In [18]:
# 필요 라이브러리
import json
import os
import pandas as pd

from tabulate import tabulate
from collections import OrderedDict

In [4]:
def _print_table(frame):
    """Print a DataFrame as a 'pretty' tabulate grid, headers from columns, no index."""
    print(tabulate(frame, headers='keys', tablefmt='pretty', showindex=False))


def print_structure(file_path):
    """Print a human-readable summary of the structure of a JSON file.

    The file name and the Python type of the decoded document are always
    printed.  Then, depending on the top-level type:

    * dict -- a table of flattened key paths and value type names, followed
      by a one-row sample taken from ``data['data'][0]`` when ``'data'`` is
      a non-empty list.
    * list -- the list length, the flattened structure of the first item and
      a one-row sample of that item (nothing more for an empty list).

    Only the first element of any list is inspected, so the report describes
    a representative item rather than every item.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        None. Everything is written to stdout.
    """
    # Load eagerly so the file handle is closed before any printing happens.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"파일명: {os.path.basename(file_path)}")
    print(f"데이터 타입: {type(data)}")

    if isinstance(data, dict):
        print("키 구조:")
        structure = []
        for key, value in data.items():
            if isinstance(value, list) and len(value) > 0:
                # Describe a non-empty list through its first element.
                nested = get_structure(value[0])
            elif isinstance(value, dict):
                nested = get_structure(value)
            else:
                # Scalars and empty lists are reported under the key itself.
                nested = [['', type(value).__name__]]

            for sub_key, sub_type in nested:
                # get_structure yields an empty path for a bare scalar (e.g. the
                # first element of a list of strings); label it "key", not "key.".
                label = f"{key}.{sub_key}" if sub_key else key
                structure.append([label, sub_type])

        _print_table(pd.DataFrame(structure, columns=['키', '타입']))

        print("\n데이터 샘플:")
        if 'data' in data and isinstance(data['data'], list) and len(data['data']) > 0:
            _print_table(pd.DataFrame([data['data'][0]]))

    elif isinstance(data, list):
        print(f"리스트 길이: {len(data)}")
        if len(data) > 0:
            print("첫 번째 항목 구조:")
            _print_table(pd.DataFrame(get_structure(data[0]), columns=['키', '타입']))

            print("\n데이터 샘플:")
            _print_table(pd.DataFrame([data[0]]))

    print("\n")

def get_structure(obj, prefix=''):
    """Flatten a decoded JSON value into ``[key_path, type_name]`` pairs.

    Nested dict keys are joined with ``.``.  A non-empty list is described by
    its first element only, under the same path as the list itself.  Any
    other value (a scalar, ``None`` or an empty list) yields a single pair
    holding ``prefix`` and the value's type name.

    Args:
        obj: The value to describe.
        prefix: Key path accumulated so far; empty at the top level.

    Returns:
        A list of two-element lists ``[path, type_name]`` in traversal order.
    """
    # Non-empty list: the first element stands in for the whole list.
    if isinstance(obj, list) and obj:
        return get_structure(obj[0], prefix)

    # Leaf value (including an empty list): one row for the current path.
    if not isinstance(obj, dict):
        return [[prefix, type(obj).__name__]]

    rows = []
    for name, child in obj.items():
        path = name if not prefix else f"{prefix}.{name}"
        if isinstance(child, (dict, list)):
            rows += get_structure(child, path)
        else:
            rows.append([path, type(child).__name__])
    return rows

In [5]:
# Directories that contain the labelled JSON files (training / validation splits)
train_path = '../datasets/KoreanError/Training/labelled_data/'
valid_path = '../datasets/KoreanError/Validation/labelled_data/'

In [6]:
# Inspect the structure of the training data: only the first *.json entry
# returned by os.listdir is printed, then the loop stops.
for filename in os.listdir(train_path):
    if not filename.endswith('.json'):
        continue
    print_structure(os.path.join(train_path, filename))
    break

파일명: 띄어쓰기문장부호오류.json
데이터 타입: <class 'dict'>
키 구조:
+--------------------------------------+------+
|                  키                  | 타입 |
+--------------------------------------+------+
|           info.description           | str  |
|            info.data_name            | str  |
|        info.data_description         | str  |
|             info.creator             | str  |
|           info.distributor           | str  |
|             info.version             | str  |
|        data.metadata_info.id         | str  |
|      data.metadata_info.source       | str  |
|     data.annotation.err_sentence     | str  |
|  data.annotation.err_sentence_spell  | str  |
|     data.annotation.cor_sentence     | str  |
|  data.annotation.cor_sentence_spell  | str  |
|    data.annotation.errors.err_idx    | int  |
| data.annotation.errors.err_location  | int  |
|   data.annotation.errors.err_text    | str  |
|   data.annotation.errors.cor_text    | str  |
|  data.annotation.errors.err_details  | 

In [7]:
# Inspect the structure of the validation data: only the first *.json entry
# returned by os.listdir is printed, then the loop stops.
for filename in os.listdir(valid_path):
    if not filename.endswith('.json'):
        continue
    print_structure(os.path.join(valid_path, filename))
    break

파일명: 띄어쓰기문장부호오류.json
데이터 타입: <class 'dict'>
키 구조:
+--------------------------------------+------+
|                  키                  | 타입 |
+--------------------------------------+------+
|           info.description           | str  |
|            info.data_name            | str  |
|        info.data_description         | str  |
|             info.creator             | str  |
|           info.distributor           | str  |
|             info.version             | str  |
|        data.metadata_info.id         | str  |
|      data.metadata_info.source       | str  |
|     data.annotation.err_sentence     | str  |
|  data.annotation.err_sentence_spell  | str  |
|     data.annotation.cor_sentence     | str  |
|  data.annotation.cor_sentence_spell  | str  |
|    data.annotation.errors.err_idx    | int  |
| data.annotation.errors.err_location  | int  |
|   data.annotation.errors.err_text    | str  |
|   data.annotation.errors.cor_text    | str  |
|  data.annotation.errors.err_details  | 

In [8]:
def extract_sents(input_file, output_file):
    """Write a reduced copy of a labelled file holding only the sentence pairs.

    Reads ``input_file``, walks the list stored under its ``'data'`` key and
    keeps, for each item, only ``annotation.err_sentence`` and
    ``annotation.cor_sentence``.  The resulting list of two-key dicts is
    written to ``output_file`` as UTF-8 JSON (non-ASCII text kept as-is,
    2-space indent).

    Args:
        input_file: Path of the source JSON file.
        output_file: Path of the JSON file to create or overwrite.
    """
    wanted = ('err_sentence', 'cor_sentence')

    with open(input_file, 'r', encoding='utf-8') as src:
        records = json.load(src)['data']

    pairs = [
        {field: record['annotation'][field] for field in wanted}
        for record in records
    ]

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(pairs, dst, ensure_ascii=False, indent=2)

In [9]:
# Parent directories of the labelled_data folders; the cells below use them as
# the destination for the extracted per-file JSON and for the merged datasets.
train_dir = "../datasets/KoreanError/Training"
valid_dir = "../datasets/KoreanError/Validation"

In [10]:
# Run the sentence-pair extraction for every JSON file (train): each file in
# train_path is written under the same file name into train_dir.
for filename in os.listdir(train_path):
    if not filename.endswith('.json'):
        continue
    input_file = os.path.join(train_path, filename)
    output_file = os.path.join(train_dir, filename)
    extract_sents(input_file, output_file)

In [11]:
# Run the sentence-pair extraction for every JSON file (valid): each file in
# valid_path is written under the same file name into valid_dir.
for filename in os.listdir(valid_path):
    if not filename.endswith('.json'):
        continue
    input_file = os.path.join(valid_path, filename)
    output_file = os.path.join(valid_dir, filename)
    extract_sents(input_file, output_file)

In [22]:
def merge_json_files(input_dir, output_file):
    """Merge every ``*.json`` file of a directory into one JSON list.

    Each input file must hold a list of dicts with ``err_sentence`` and
    ``cor_sentence`` keys.  Every record is copied into the merged list with
    an extra leading ``id`` field set to the source file name without its
    extension.  The merged list is written to ``output_file`` as UTF-8 JSON
    (non-ASCII text kept as-is, 2-space indent).

    ``output_file`` may live inside ``input_dir``; it is never read as an
    input, so calling the function again yields the same result instead of
    folding the previous merge back in.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.json`` files.
        output_file: Path of the merged file to create or overwrite.

    Returns:
        The number of records written to ``output_file``.
    """
    # Normalised form of the output path, used to recognise it in the listing.
    output_key = os.path.normcase(os.path.abspath(output_file))
    merged_data = []

    # os.listdir returns entries in arbitrary order; sort for a reproducible merge.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(input_dir, filename)
        if os.path.normcase(os.path.abspath(file_path)) == output_key:
            # A merged file left by a previous run is output, not input.
            continue

        file_id = os.path.splitext(filename)[0]  # file name without its extension

        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for item in data:
            merged_data.append(OrderedDict([
                ('id', file_id),
                ('err_sentence', item['err_sentence']),
                ('cor_sentence', item['cor_sentence'])
            ]))

    # Save the merged records to the new file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2)

    return len(merged_data)  # total number of merged records

In [23]:
# Merge each split into a single file and print the train / validation record counts.
# NOTE(review): each merged file is written into the same directory that is
# scanned for *.json inputs -- verify that re-running this cell does not fold a
# previous train_data.json / valid_data.json back into the result.
train_len = merge_json_files(train_dir, os.path.join(train_dir, "train_data.json"))
valid_len = merge_json_files(valid_dir, os.path.join(valid_dir, "valid_data.json"))

print(f"Training Data Length : {train_len}, Validation Data Length : {valid_len}")

Training Data Length : 214300, Validation Data Length : 29050
