In [17]:
from langconv import *
from tqdm import tqdm
import json
import re

In [18]:
def load_jsonl_file(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    :param file_path: path of the UTF-8 encoded ``.jsonl`` file to read
    :return: list with one parsed JSON value per non-blank line, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        # ``line.strip()`` is empty for blank lines, which json.loads rejects.
        return [json.loads(line) for line in f if line.strip()]


def save_jsonl_file(file_path, tuples):
    """Write records to a JSON Lines file, one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    Non-ASCII characters are written as-is (``ensure_ascii=False``) using
    UTF-8 encoding.

    :param file_path: path of the file to create or overwrite
    :param tuples: iterable of JSON-serialisable records
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in tuples
        )


In [19]:
def Traditional2Simplified(sentence):
    """Convert the Traditional Chinese characters in a sentence to Simplified.

    :param sentence: the sentence to convert
    :return: the sentence with its Traditional characters converted to
        Simplified characters
    """
    to_simplified = Converter('zh-hans')
    return to_simplified.convert(sentence)

def Simplified2Traditional(sentence):
    """Convert the Simplified Chinese characters in a sentence to Traditional.

    :param sentence: the sentence to convert
    :return: the sentence with its Simplified characters converted to
        Traditional characters
    """
    to_traditional = Converter('zh-hant')
    return to_traditional.convert(sentence)

# Conversion helpers adapted from: https://blog.csdn.net/wds2006sdo/article/details/53583367

In [22]:
# Data directories: the first holds the files that still need conversion,
# the second receives the converted copies.
lang = ["trans/zh-needTrans", "trans/zh-cn-trans-extend"]
# Full predicate list, kept for reference:
#PRETICATES = ["P1412", "P1376", "P1303", "P530", "P495", "P449", "P364", "P264", "P190", "P136", "P106", "P103", "P47", "P37", "P36", "P30", "P27", "P20", "P19", "P17"]
# NOTE: the name is spelled "PRETICATES" on purpose here so that any other
# cell referring to it keeps working.
PRETICATES = ["P449", "P264"]

# file_paths[directory][file_name] -> path of that file inside the directory.
file_paths = {}

for i in lang:
    file_path_temp = {}
    for predicate in PRETICATES:
        # Each predicate has one file per region.
        file_names = [predicate+"_general_ASIA.jsonl", predicate+"_general_WESTERN_COUNTRIES.jsonl"]
        for j in file_names:
            # Join with "/" rather than "\\": a forward slash is accepted by
            # open() on Windows as well as POSIX, whereas a backslash is an
            # ordinary file-name character on POSIX and yields a wrong path.
            file_path_temp[j] = i + "/" + j
    file_paths[i] = file_path_temp


In [23]:
# Convert every input file to Simplified Chinese and save the result under
# the same file name in the output directory.
for i in file_paths["trans/zh-needTrans"]:
    zhcn_triples = []
    file_path_zh = file_paths["trans/zh-needTrans"][i]
    file_path_zhcn = file_paths["trans/zh-cn-trans-extend"][i]
    for triple in tqdm(load_jsonl_file(file_path_zh)):
        translated_sub = Traditional2Simplified(triple["sub_label"])

        # Convert each object label, then drop duplicates.
        # dict.fromkeys keeps the first occurrence of every label in its
        # original order, so the saved file is identical from run to run;
        # list(set(...)) would order the strings arbitrarily because string
        # hashing is randomized per interpreter process.
        # Assumes triple["obj_label"] is a list of label strings.
        translated_obj = list(dict.fromkeys(
            Traditional2Simplified(obj) for obj in triple["obj_label"]
        ))
        # Optional filter (currently disabled): keep only labels consisting
        # entirely of characters in the \u4e00-\u9fff range.
        #translated_obj = [a for a in translated_obj if re.match(r'^[\u4e00-\u9fff]+$', a)]

        # The record is updated in place; all other keys are left untouched.
        triple["sub_label"] = translated_sub
        triple["obj_label"] = translated_obj

        zhcn_triples.append(triple)
    save_jsonl_file(file_path_zhcn, zhcn_triples)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4721.65it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4408.96it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6205.58it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2636.14it/s]
