# Generate Panlex Word Translation Dictionary
This notebook generates word-level translation dictionaries from the PanLex database.

In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os, pickle
import pandas as pd
# Every relative path used later (the PanLex archive, its extracted CSVs and the
# generated .pkl files) resolves against this Drive folder.
os.chdir("/content/drive/My Drive/Colab Notebooks/panlex-experiment/")

In [11]:
!wget https://db.panlex.org/panlex-20230401-csv.zip

--2023-05-06 03:08:56--  https://db.panlex.org/panlex-20230401-csv.zip
Resolving db.panlex.org (db.panlex.org)... 208.70.31.123
Connecting to db.panlex.org (db.panlex.org)|208.70.31.123|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1274847653 (1.2G) [application/zip]
Saving to: ‘panlex-20230401-csv.zip’


2023-05-06 03:36:34 (751 KB/s) - ‘panlex-20230401-csv.zip’ saved [1274847653/1274847653]



In [15]:
# List the working directory to check that the archive and its extracted folder are present.
!ls

panlex-20230401-csv  panlex-20230401-csv.zip  panlex-experiment.ipynb


In [14]:
!unzip panlex-20230401-csv.zip

Archive:  panlex-20230401-csv.zip
   creating: panlex-20230401-csv/
  inflating: panlex-20230401-csv/source.csv  
  inflating: panlex-20230401-csv/denotation_class.csv  
  inflating: panlex-20230401-csv/format.csv  
  inflating: panlex-20230401-csv/denotation_prop.csv  
  inflating: panlex-20230401-csv/expr.csv  
  inflating: panlex-20230401-csv/langvar_cldr_char.csv  
  inflating: panlex-20230401-csv/definition.csv  
  inflating: panlex-20230401-csv/source_format.csv  
  inflating: panlex-20230401-csv/lang_code.csv  
  inflating: panlex-20230401-csv/source_langvar.csv  
  inflating: panlex-20230401-csv/langvar.csv  
  inflating: panlex-20230401-csv/langvar_char.csv  
  inflating: panlex-20230401-csv/meaning_prop.csv  
  inflating: panlex-20230401-csv/LICENSE.txt  
  inflating: panlex-20230401-csv/source_license.csv  
  inflating: panlex-20230401-csv/meaning_class.csv  
  inflating: panlex-20230401-csv/meaning.csv  
  inflating: panlex-20230401-csv/denotation.csv  


## Load required tables

In [None]:
# Treat only empty fields as missing. With the pandas defaults, expression texts that
# are literally "nan", "null", "NA", "None", ... are parsed as NaN, which would silently
# corrupt or drop those words in the generated dictionaries.
expr_df = pd.read_csv(
    "./panlex-20230401-csv/expr.csv",
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Denotation table: its 'expr' and 'meaning' columns are used below to link
# expressions to the meanings they denote.
denotation_df = pd.read_csv("./panlex-20230401-csv/denotation.csv")

In [None]:
# Treat only empty fields as missing. With the pandas defaults, a code or name that is
# spelled like an NA token (e.g. a lang_code of "nan") is parsed as NaN and can then
# never be matched by the lang_code lookup below.
langvar_df = pd.read_csv(
    "./panlex-20230401-csv/langvar.csv",
    keep_default_na=False,
    na_values=[''],
)

## Generate Dictionary

In [43]:
def create_translation_dictionary(
    src_lang_code,
    src_var_code,
    dst_lang_code,
    dst_var_code,
):
  """Build a {source text: target text} mapping between two language varieties.

  Reads the module-level tables `langvar_df`, `expr_df` and `denotation_df`.
  A source expression is paired with a target expression when both have a
  denotation row pointing at the same `meaning`. When a source text has
  several candidate translations, only the first pairing is kept.

  Args:
    src_lang_code: value matched against langvar_df['lang_code'] for the source.
    src_var_code: value matched against langvar_df['var_code'] for the source.
    dst_lang_code: value matched against langvar_df['lang_code'] for the target.
    dst_var_code: value matched against langvar_df['var_code'] for the target.

  Returns:
    dict mapping each distinct source text to one target text.

  Raises:
    ValueError: if a (lang_code, var_code) pair does not identify exactly one
      row of langvar_df.
  """

  def _langvar_id(lang_code, var_code):
    # Resolve the single langvar id for this language variety, failing loudly
    # (with the offending codes in the message) when it is missing or ambiguous.
    matches = langvar_df.loc[
        (langvar_df['lang_code'] == lang_code) & (langvar_df['var_code'] == var_code),
        'id',
    ]
    if len(matches) != 1:
      raise ValueError(
          f'expected exactly one langvar row for lang_code={lang_code!r}, '
          f'var_code={var_code!r}; found {len(matches)}'
      )
    return matches.item()

  def _meaning_texts(langvar_id, txt_column):
    # One row per (meaning, expression text) for the given language variety.
    exprs = expr_df.loc[expr_df['langvar'] == langvar_id, ['id', 'txt']]
    # Pre-filter denotations so the merge only touches rows of this variety.
    denotations = denotation_df.loc[
        denotation_df['expr'].isin(exprs['id']),
        ['meaning', 'expr'],
    ]
    merged = pd.merge(denotations, exprs, left_on='expr', right_on='id')
    return merged.rename(columns={'txt': txt_column})[['meaning', txt_column]]

  src_txt_df = _meaning_texts(_langvar_id(src_lang_code, src_var_code), 'src_txt')
  dst_txt_df = _meaning_texts(_langvar_id(dst_lang_code, dst_var_code), 'dst_txt')

  # Pair texts that share a meaning; keep the first translation per source text.
  src_to_dst_df = (
      pd.merge(src_txt_df, dst_txt_df, on='meaning')
      .drop_duplicates('src_txt')
  )
  return dict(zip(src_to_dst_df['src_txt'], src_to_dst_df['dst_txt']))

def create_save_translation_dictionary(
    src_lang,
    src_lang_code,
    src_var_code,
    dst_lang,
    dst_lang_code,
    dst_var_code,
):
  """Create a translation dictionary and pickle it to '<src_lang>_to_<dst_lang>.pkl'.

  The language/variety codes are forwarded to `create_translation_dictionary`;
  `src_lang` and `dst_lang` are only used to build the output file name, which
  is written relative to the current working directory.

  Returns:
    The dictionary that was written to disk.
  """
  lookup_codes = dict(
      src_lang_code=src_lang_code,
      src_var_code=src_var_code,
      dst_lang_code=dst_lang_code,
      dst_var_code=dst_var_code,
  )
  word_map = create_translation_dictionary(**lookup_codes)

  filename = f'{src_lang}_to_{dst_lang}.pkl'
  with open(filename, 'wb') as out_file:
    pickle.dump(word_map, out_file)
    print(f'[INFO] dictionary {filename} saved successfully.')
  return word_map

def generate_word_translator_pickle_files(
  language_details,
  base_lang,
  base_language_details,
):
  """Write translation pickles in both directions between the base language and every other language.

  For each key in `language_details`, two dictionaries are created and saved
  via `create_save_translation_dictionary`: base -> language, then
  language -> base.

  Args:
    language_details: mapping of language name to a dict with 'lang_code'
      and 'var_code' entries.
    base_lang: key of the base language inside `base_language_details`.
    base_language_details: mapping with the same layout as
      `language_details`, holding the base language's codes.
  """
  for lang, lang_info in list(language_details.items()):
    base_info = base_language_details[base_lang]
    # (source name, source codes, target name, target codes), base -> lang first.
    directions = (
        (base_lang, base_info, lang, lang_info),
        (lang, lang_info, base_lang, base_info),
    )
    for src_name, src_info, dst_name, dst_info in directions:
      create_save_translation_dictionary(
          src_lang=src_name,
          src_lang_code=src_info['lang_code'],
          src_var_code=src_info['var_code'],
          dst_lang=dst_name,
          dst_lang_code=dst_info['lang_code'],
          dst_var_code=dst_info['var_code'],
      )

In [23]:
# test case
# src_to_dst_dict = create_translation_dictionary(
#     src_lang_code='sun',
#     src_var_code=0,
#     dst_lang_code='ind',
#     dst_var_code=0,
# )

In [None]:
# test case
# create_save_translation_dictionary(
#     src_lang='sun',
#     src_lang_code='sun',
#     src_var_code=0,
#     dst_lang='ind',
#     dst_lang_code='ind',
#     dst_var_code=0,
# )

In [58]:
# Languages to pair with the base language. Each key is the short name used in the
# generated pickle file names; 'lang_code' and 'var_code' are the values looked up in
# the langvar table to select the language variety.
language_details = {
    'abs' : {'lang_code' : 'abs', 'var_code' : 0},
    'btk' : {'lang_code' : 'bya', 'var_code' : 0},  # NOTE(review): key and lang_code differ; confirm 'bya' is the intended language for 'btk'
    'bew' : {'lang_code' : 'bew', 'var_code' : 0},
    'bhp' : {'lang_code' : 'bhp', 'var_code' : 0},
    'jav' : {'lang_code' : 'jav', 'var_code' : 0},
    'mad' : {'lang_code' : 'mad', 'var_code' : 0},
    'mak' : {'lang_code' : 'mfp', 'var_code' : 0},  # NOTE(review): key and lang_code differ; confirm 'mfp' (rather than 'mak') is intended
    'min' : {'lang_code' : 'min', 'var_code' : 0},
    'mui' : {'lang_code' : 'mui', 'var_code' : 0},
    'rej' : {'lang_code' : 'rej', 'var_code' : 0},
    'sun' : {'lang_code' : 'sun', 'var_code' : 0},
}

# Codes for the base language; same layout as language_details.
base_language_details = {
    'ind' : {'lang_code' : 'ind', 'var_code' : 0},
}

# Writes '<base>_to_<lang>.pkl' and '<lang>_to_<base>.pkl' for every entry above.
generate_word_translator_pickle_files(
  language_details = language_details,
  base_lang = 'ind',
  base_language_details = base_language_details,
)

[INFO] dictionary ind_to_abs.pkl saved successfully.
[INFO] dictionary abs_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_btk.pkl saved successfully.
[INFO] dictionary btk_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_bew.pkl saved successfully.
[INFO] dictionary bew_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_bhp.pkl saved successfully.
[INFO] dictionary bhp_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_jav.pkl saved successfully.
[INFO] dictionary jav_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_mad.pkl saved successfully.
[INFO] dictionary mad_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_mak.pkl saved successfully.
[INFO] dictionary mak_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_min.pkl saved successfully.
[INFO] dictionary min_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_mui.pkl saved successfully.
[INFO] dictionary mui_to_ind.pkl saved successfully.
[INFO] dictionary ind_to_rej.pkl saved success

In [59]:
# Reload one generated dictionary as a sanity check. Only the read needs the
# file handle, so the printing happens after the file has been closed.
with open('ind_to_rej.pkl', 'rb') as fp:
    translator = pickle.load(fp)
print('translator dictionary')
print(translator)

translator dictionary
{'enam': 'enum', 'air': 'biyoa', 'hitam': 'məluəw', 'hujan': 'ujən', 'berjalan': 'bəpanəw', 'aku': 'uku', 'saya': 'uku', 'zon': 'uku', 'kuning': 'kuniŋ', 'kuniŋ': 'kuniŋ', 'semua': 'kətə', 'sepuluh': 'poloaʔ', 'satu': 'do', 'ini': 'dio', 'anjing': 'kuyuʔ', 'anjiŋ': 'kuyuʔ', 'mati': 'matiɛ', 'mata': 'maṱəy', 'ikan': 'kan', 'tangan': 'taŋən', 'taŋan': 'taŋən', 'malam': 'kəlmən', 'kecil': 'titiʔ', 'dan': 'ŋen [dan]', 'burung': 'buruŋ', 'buruŋ': 'buruŋ', 'angin': 'aŋin', 'aŋin': 'aŋin', 'datang': 'təko', 'serta': 'təko', 'ikut': 'təko', 'dataŋ': 'təko', 'mendataŋ': 'təko', 'mendataŋi': 'təko', 'mendekati': 'təko', 'jalan': 'dalən', 'kepala': 'uləw', 'telinga': 'tiʔuʔ', 'kupiŋ': 'tiʔuʔ', 'teliŋa': 'tiʔuʔ', 'mulut': 'bebea[ʔ]', 'api': 'opoy', 'awan': 'awan', 'besar': 'loy', 'rambut': 'buʔ', 'berenang': 'bər̃naŋ', 'berenaŋ': 'bər̃naŋ', 'bulan': 'bulən', 'dingin': 'səŋãʔ', 'diŋin': 'səŋãʔ', 'langit': 'lɛŋɛt', 'laŋit': 'lɛŋɛt', 'laut': 'lautʔ', 'danau': 'danuəw', 'bagus': 