In [1]:
import os

import numpy as np
import pandas as pd
import translators as ts
from deep_translator import GoogleTranslator
from tqdm import tqdm

In [2]:
# Column names that translate_data excludes from translation.
drop_columns = [
    'answer', 'ccid', 'context_id', 'course_id', 'course_order',
    'create_time', 'end', 'enroll_time', 'exercise_id', 'gender',
    'graph_predict', 'ground_truth', 'id', 'language', 'location',
    'name_en', 'problem_id', 'resource_id', 'score', 'sign',
    'start', 'text_predict', 'type', 'user_id', 'year_of_birth',
]

# Dictionary keys that translate_dict leaves untouched.
drop_keys = [
    'resource_id',
    'chapter',
]

In [45]:
def save_file(df, name):
    """Write ``df`` to ``translated/<name>_translated.json`` and return it.

    The ``translated`` directory is created on first use. The frame is
    serialised as a JSON array of records with non-ASCII characters kept
    verbatim (``force_ascii=False``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    name : str
        Base name of the output file (without suffix or extension).

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in.
    """
    # exist_ok=True makes this a no-op when the directory is already there and
    # avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs('translated', exist_ok=True)
    df.to_json(f'translated/{name}_translated.json', orient='records', force_ascii=False)
    return df

In [36]:
def translate_text(text, type_trans):
    """Translate one string to Vietnamese with the selected backend.

    Parameters
    ----------
    text : object
        Value to translate. Only non-blank ``str`` values are sent to a
        backend; anything else (``None``, NaN, numbers, empty or
        whitespace-only strings) is returned unchanged.
    type_trans : str
        ``'deep_translator'`` uses ``GoogleTranslator(source='auto', target='vi')``;
        ``'translator'`` uses ``translators.translate_text(..., to_language='vi')``.
        Any other value leaves ``text`` untouched.

    Returns
    -------
    object
        The backend's translation, or ``text`` itself when nothing was
        translated.
    """
    # Missing cells may arrive as None or as a float NaN; neither (nor a blank
    # string) is a meaningful payload for a translation request.
    if not isinstance(text, str) or not text.strip():
        return text

    if type_trans == 'deep_translator':
        return GoogleTranslator(source='auto', target='vi').translate(text=text)
    if type_trans == 'translator':
        return ts.translate_text(text, to_language='vi', if_ignore_empty_query=True)
    # Unknown backend name: deliberately a no-op, as before.
    return text

In [37]:
def translate_list(lst, type_trans):
    """Translate every element of a list of strings or a list of dicts.

    The element kind is decided from the first item that is not ``None``:
    strings are passed to ``translate_text`` and dicts to ``translate_dict``.
    Lists of any other kind are returned unchanged.

    Parameters
    ----------
    lst : object
        Normally a list. Values that are not a list/tuple (for example a
        missing cell holding ``None`` or NaN) and empty sequences are returned
        as they are.
    type_trans : str
        Backend selector forwarded to ``translate_text`` / ``translate_dict``.

    Returns
    -------
    object
        A new list with translated items, or ``lst`` itself when there was
        nothing to translate.
    """
    if not isinstance(lst, (list, tuple)) or len(lst) == 0:
        return lst

    # First non-None item decides how the whole list is handled; an all-None
    # list yields sample=None and falls through untouched.
    sample = next((item for item in lst if item is not None), None)

    if isinstance(sample, str):
        return [translate_text(item, type_trans) for item in lst]
    if isinstance(sample, dict):
        # None placeholders are kept in position instead of being handed to
        # translate_dict, which expects a mapping.
        return [item if item is None else translate_dict(item, type_trans) for item in lst]
    return lst

In [38]:
def translate_dict(dct, type_trans):
    """Translate the string and list values of a dict in place.

    Keys listed in the module-level ``drop_keys`` are skipped. String values
    go through ``translate_text`` and list values through ``translate_list``;
    values of any other type are left as they are.

    Parameters
    ----------
    dct : object
        Normally a dict. Anything else (for example ``None`` or NaN from a
        missing cell) is returned unchanged.
    type_trans : str
        Backend selector forwarded to the translate helpers.

    Returns
    -------
    object
        The same ``dct`` object, mutated in place.
    """
    if not isinstance(dct, dict):
        return dct

    # Only existing keys are reassigned, so the dict size never changes while
    # iterating. Plain membership keeps the original key objects (no NumPy
    # coercion), so non-string keys keep working.
    for key, value in dct.items():
        if key in drop_keys:
            continue
        if isinstance(value, list):
            dct[key] = translate_list(value, type_trans)
        elif isinstance(value, str):
            dct[key] = translate_text(value, type_trans)
        # NOTE(review): nested dict values are not recursed into, matching the
        # previous behaviour - confirm whether that is intended.
    return dct

In [42]:
def translate_data(df, name, type_trans, checkpoint=None, startpos=0):
    """Translate the text-bearing columns of ``df`` in place, with checkpoints.

    Every column not listed in the module-level ``drop_columns`` is inspected;
    the type of its first non-null value selects the helper used for the whole
    column (``str`` -> ``translate_text``, ``list`` -> ``translate_list``,
    ``dict`` -> ``translate_dict``). Columns of any other type are skipped.
    Rows are addressed by position, so the frame's index labels do not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to translate. It is mutated in place.
    name : str
        Base name handed to ``save_file``.
    type_trans : str
        Backend selector forwarded to the translate helpers.
    checkpoint : int, optional
        Save after every ``checkpoint`` processed rows. ``None`` (or a value
        <= 0) means "only save at the end".
    startpos : int, default 0
        Position of the first row to translate.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If the output file already exists and the
        user does not answer ``y``, ``df`` is returned untouched.

    Notes
    -----
    The existing output file is never read back: rows before ``startpos`` are
    saved exactly as they currently are in ``df``.
    """
    if os.path.exists(f'translated/{name}_translated.json'):
        ans = input(f"Do you want to overwrite the exist file: {name}_translated.json? (Y/N)")
        if ans.lower() != 'y':
            return df

    # A missing or non-positive interval would make the modulo below fail.
    if checkpoint is None or checkpoint <= 0:
        checkpoint = max(len(df), 1)

    translators_by_kind = (
        (str, translate_text),
        (list, translate_list),
        (dict, translate_dict),
    )

    # (column position, expected cell type, helper) for every column to process.
    plan = []
    for col_pos, col_name in enumerate(df.columns):
        if col_name in drop_columns:
            continue
        # Use the first non-null value so a missing first row does not cause
        # the whole column to be skipped.
        non_null = df.iloc[:, col_pos].dropna()
        if non_null.empty:
            continue
        sample = non_null.iloc[0]
        for kind, helper in translators_by_kind:
            if isinstance(sample, kind):
                plan.append((col_pos, kind, helper))
                break

    print("Start pos:", startpos)
    for idx in tqdm(range(startpos, len(df)), desc="Translating..."):
        for col_pos, kind, helper in plan:
            value = df.iat[idx, col_pos]
            # Null or unexpectedly-typed cells are left alone.
            if isinstance(value, kind):
                df.iat[idx, col_pos] = helper(value, type_trans)

        if (idx - startpos + 1) % checkpoint == 0:
            # df is mutated in place, so persisting it is all a checkpoint needs.
            save_file(df, name)

    save_file(df, name)
    return df

In [57]:
# Base name of the input file (without extension) and an optional directory.
name = 'example'
folder = ''

# Normalise the directory into a prefix that ends with '/', or leave it empty.
if folder:
    folder = f'{folder}/'

# The input is read as JSON Lines (one record per line).
df = pd.read_json(folder + name + '.json', lines=True)
df

Unnamed: 0,id,name,name_en,sign,about,motto
0,S_1,清华大学,Tsinghua University,thu,简称“清华”，由中华人民共和国教育部直属，中央直管副部级建制，位列“211工程”、“985工...,"自强不息,厚德载物"
1,S_2,北京大学,Peking University,PKU,北京大学（Peking University），简称“北大”，是中华人民共和国教育部直属的全...,博学、审问、慎思、明辨
2,S_3,武汉大学,Wuhan University,whu,武汉大学（Wuhan University）简称“武大”，是中华人民共和国教育部直属的综合性...,自强 弘毅 求是 拓新
3,S_4,苏州大学,Soochow University,suda,苏州大学（Soochow University），坐落于历史文化名城苏州，是教育部与江苏省人...,养天地正气，法古今完人
4,S_5,四川大学,Sichuan University,scu,四川大学（Sichuan University），简称“川大”，坐落于四川省会成都，是教育部...,
...,...,...,...,...,...,...
424,S_1402,高雄医学大学,Kaohsiung Medical University,Kaohsiung Medical University,高雄医学大学（Kaohsiung Medical University）源自1954年创校的...,
425,S_1405,黑龙江农垦职业学院,Heilongjiang Agricultural ReclamationVocationa...,nkzy,黑龙江农垦职业学院（Heilongjiang Agricultural Reclamatio...,
426,S_1448,江西财经大学,Jiangxi University of Finance And Economics,jxufe,江西财经大学（Jiangxi University of Finance and Econo...,
427,S_1453,贵州交通职业技术学院,GuiZhou Communications Polytechnic,gzjtzy,贵州交通职业技术学院是一所以交通为特色的理工类高职院校。创办于1958年，走过国家“示范校”...,知行合一 德技双馨


In [48]:
# Translate rows from position 30 onwards, saving after every 10 rows.
# translate_data saves the whole frame, so rows before startpos are written
# exactly as they currently are in df.
translate_data(
    df,
    name,
    type_trans='deep_translator',
    checkpoint=10,
    startpos=30,
)

Start pos: 30


Translating...:   9%|█████▌                                                           | 34/399 [01:04<11:34,  1.90s/it]


KeyboardInterrupt: 