# Split the people data between the source and target languages

In [1]:
import pandas as pd 
import os 
from sklearn.model_selection import train_test_split
import json

# Maps a two-letter language code to the English name of that language.
# fill_the_templates() uses it to build the "Answer in <language>." instruction.
_LANG_NAME = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "bn": "Bengali",
    "cs": "Czech",
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gl": "Galician",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Khmer",
    "ko": "Korean",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "my": "Burmese",
    "ne": "Nepali",
    "nl": "Dutch",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "si": "Sinhala",
    "sl": "Slovene",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "xh": "Xhosa",
    "zh": "Chinese",
}



In [2]:
# Language codes for this run: index 0 is the source language, index 1 the
# target language (that is how read_files_and_split() indexes the list).
languages = ["en", "zh"]

def read_files_and_split(languages):
    """Load both template files and the people table, then split the table.

    Args:
        languages: Sequence whose first element is the source language code
            and whose second element is the target language code. Each code
            selects ``templates/<code>_formats.json``.

    Returns:
        A 5-tuple ``(src_json, tgt_json, src_lang_train, tgt_lang_train,
        tgt_lang_test)``: the two template tables as read by
        ``pd.read_json``, the complete ``data/people.csv`` table, and the two
        halves produced by splitting that table 50/50.
    """
    def _load_templates(code):
        # Template files live at templates/<code>_formats.json.
        return pd.read_json(os.path.join("templates", code + "_formats.json"))

    src_json = _load_templates(languages[0])
    tgt_json = _load_templates(languages[1])

    # The full, unsplit table is what gets returned as the source-language
    # training set; only the target language receives a train/test split.
    people = pd.read_csv(os.path.join("data", "people.csv"))

    # A fixed random_state keeps the split reproducible between runs.
    halves = train_test_split(people, test_size=0.5, random_state=42)
    tgt_lang_train, tgt_lang_test = halves

    return src_json, tgt_json, people, tgt_lang_train, tgt_lang_test




def fill_the_templates(dataframe, template_json, language, typ, for_CLA_format=False):
    """Render every template for every person and write the result as JSON.

    For each row of ``dataframe`` and each entry under each key of
    ``template_json``, ``{name}`` is substituted in both the question and the
    answer. The answer additionally gets ``{location}`` filled from the
    ``city`` column for key ``"Place of living"``, and ``{date}`` filled from
    ``birth_date`` / ``death_date`` for keys ``"Birth"`` / ``"Death"``.

    The output goes to ``data/<language>/<typ>.json``, or
    ``data/<language>/<typ>_cla.json`` when ``for_CLA_format`` is true. The
    path is printed before rendering starts.

    Args:
        dataframe: Table of people, iterated row by row. Needs a ``name``
            column, plus ``city`` / ``birth_date`` / ``death_date`` when the
            corresponding template keys are present.
        template_json: Mapping-like object (a dict or a DataFrame) from a
            category key to an iterable of ``{"question": ..., "answer": ...}``
            entries.
        language: Language code; names the output sub-directory and, in CLA
            mode, is looked up in ``_LANG_NAME``.
        typ: Output file stem, e.g. ``"train"`` or ``"test"``.
        for_CLA_format: When true, each record holds ``sent0``, ``sent1``,
            ``clm_text`` (question + "Answer in <language>." + answer) and
            ``clm_prompt_len`` (character length of ``clm_text`` minus the
            answer). Otherwise each record holds ``prompt`` and ``answer``.

    Raises:
        KeyError: If a required column or template field is missing, or if
            ``language`` is not in ``_LANG_NAME`` in CLA mode. The output file
            is not touched in that case.
    """
    out_dir = os.path.join("data", language)
    os.makedirs(out_dir, exist_ok=True)
    file_name = f"{typ}_cla.json" if for_CLA_format else f"{typ}.json"
    path = os.path.join(out_dir, file_name)
    print(path)

    # Render all records in memory BEFORE opening the output file: opening in
    # "w" mode truncates it, so a failure mid-loop would otherwise leave an
    # empty, invalid JSON file behind (and destroy any earlier good output).
    all_data = []
    for _, person in dataframe.iterrows():
        # .keys() (not bare iteration) so dicts and DataFrames behave alike.
        for key in template_json.keys():
            for value in template_json[key]:
                name = person["name"]
                question = value["question"].replace("{name}", name)
                answer = value["answer"].replace("{name}", name)

                # Category-specific placeholders only occur in the answer.
                if key == "Place of living":
                    answer = answer.replace("{location}", person["city"])
                if key == "Birth":
                    answer = answer.replace("{date}", str(person["birth_date"]))
                if key == "Death":
                    answer = answer.replace("{date}", str(person["death_date"]))

                if for_CLA_format:
                    clm_text = f"{question} Answer in {_LANG_NAME[language]}. {answer}"
                    record = {
                        "sent0": "",
                        "sent1": "",
                        "clm_text": clm_text,
                        # Length of everything that precedes the answer.
                        "clm_prompt_len": len(clm_text) - len(answer),
                    }
                else:
                    record = {"prompt": question, "answer": answer}
                all_data.append(record)

    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(all_data, outfile, ensure_ascii=False, indent=2)
                    

# Load the templates and the people table once, then write every dataset file.
src_json, tgt_json, src_lang_train, tgt_lang_train, tgt_lang_test = read_files_and_split(languages)

# Plain prompt/answer files. The held-out people (tgt_lang_test) are rendered
# in both languages, so each language gets a test file over the same rows.
for frame, templates, lang_code, split_name in (
    (src_lang_train, src_json, languages[0], "train"),
    (tgt_lang_train, tgt_json, languages[1], "train"),
    (tgt_lang_test, tgt_json, languages[1], "test"),
    (tgt_lang_test, src_json, languages[0], "test"),
):
    fill_the_templates(frame, templates, lang_code, split_name, for_CLA_format=False)

# CLA-format files are written for the two training sets only.
for frame, templates, lang_code, split_name in (
    (src_lang_train, src_json, languages[0], "train"),
    (tgt_lang_train, tgt_json, languages[1], "train"),
):
    fill_the_templates(frame, templates, lang_code, split_name, for_CLA_format=True)
# fill_the_templates(tgt_lang_test, tgt_json, languages[1], "test", for_CLA_format=True)


data/en/train.json
data/zh/train.json
data/zh/test.json
data/en/test.json
data/en/train_cla.json
data/zh/train_cla.json
