In [11]:
import pandas as pd
# from src.translation_utils import *
from src.dataset_utils import *
from src.cot_utils_copy import *

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
def get_translated_dataset_df(name,lang):
    """
    Loads a translated dataset from the directory in the requested language.
    
    Parameters:
    name: name of the dataset ['mgsm', 'xcopa', 'xstorycloze', 'mkqa', 'pawsx', 'xnli' or 'xlsum']
    lang: language of the dataset to load.
    
    Returns:
    Dataset in the specified language as dataframe
    """
    lang = language_codes[lang]

    if name == "mgsm":
        
        df = pd.read_csv('./datasets/mgsm/mgsm_' + lang + '.csv',sep=';') 
        
        return df
    
    elif name == "xcopa":
       
        df = pd.read_csv('./datasets/xcopa/xcopa_' + lang + '.csv',sep=';') 
        
        return df
    
    elif name == 'msvamp':

        df = pd.read_csv('./datasets/msvamp/msvamp_' + lang + '.csv',sep=';') 

        return df
    
    elif name == 'coinflip':

        df = pd.read_csv('./datasets/coinflip/coinflip_' + lang + '.csv',sep=';') 

        return df
    
    elif name == 'shuffled_objects':

        df = pd.read_csv('./datasets/shuffled_objects/shuffled_objects_' + lang + '.csv',sep=';') 

        return df
    
    elif name == 'xstorycloze':

        df = pd.read_csv('../datasets/xstorycloze/' + lang + '.csv',sep=',') 

        return df
    
    else:
        print("Dataset name is not correctly specified. Please input 'mgsm' or 'xcopa'.")

def get_prompt(row,task,prompt_setting,instr_lang):
    """
    Generate a string prompt for a given promptsetting,task and instruction language.
    
    Parameters:
    row: dataframe row containing the prompt input.
    prompt_setting: different prompting techniques: 'basic', 'cot'
    task: dataset.
    instr_lang: language of the instruction.

    Returns:    
    String prompt.
    """

    instr_lang = language_codes[instr_lang]

    def generate_message(string,**kwargs):
        return string.format(**kwargs)
    
    instructions = pd.read_csv("../datasets/translated_instructions.csv",sep=';')
    instructions.set_index(instructions['language'],inplace=True)
    instructions = instructions.drop('Unnamed: 0',axis=1)
    instructions = instructions.drop('language',axis=1)

    if task == 'xcopa':

        if prompt_setting == 'cot':

            cot = instructions.loc[instr_lang]['cot']

        elif prompt_setting == 'basic':

            cot = ''

        question = row['question']
        
        if question == 'cause':

            return generate_message(instructions.loc[instr_lang]['xcopa_cause'],
                            premise = row['premise'],
                            choice1 = row['choice1'],
                            choice2 = row['choice2'],
                            question = row['question'],
                            cot=cot)
            
        elif question == 'effect':

            return generate_message(instructions.loc[instr_lang]['xcopa_effect'],
                            premise = row['premise'],
                            choice1 = row['choice1'],
                            choice2 = row['choice2'],
                            question = row['question'],
                            cot=cot)

    elif task == 'mgsm':

        if prompt_setting == 'cot':
        
            return generate_message(instructions.loc[instr_lang]['mgsm_cot'],
                                question = row['question'])
        
        elif prompt_setting == 'basic':
            
            return generate_message(instructions.loc[instr_lang]['mgsm_basic'],
                                question = row['question'])  
        
    elif task == 'msvamp':

        if prompt_setting == 'cot':

            return generate_message(instructions.loc[instr_lang]['mgsm_cot'],
                                question = row['m_query'])
        
        elif prompt_setting == 'basic':

            return generate_message(instructions.loc[instr_lang]['mgsm_basic'],
                                question = row['m_query'])   
        
    elif task == 'xstorycloze':

        if prompt_setting == 'cot':

            cot = instructions.loc[instr_lang]['cot']

            return generate_message(instructions.loc[instr_lang]['xstorycloze'],
                                    input_sentence_1 = row['input_sentence_1'], 
                                    input_sentence_2 = row['input_sentence_2'],
                                    input_sentence_3 = row['input_sentence_3'],
                                    input_sentence_4 = row['input_sentence_4'],
                                    sentence_quiz1 = row['sentence_quiz1'],
                                    sentence_quiz2 = row['sentence_quiz2'],
                                    cot = cot)

        elif prompt_setting == 'basic':

            cot = ''
            
            return generate_message(instructions.loc[instr_lang]['xstorycloze'],
                                    input_sentence_1 = row['input_sentence_1'], 
                                    input_sentence_2 = row['input_sentence_2'],
                                    input_sentence_3 = row['input_sentence_3'],
                                    input_sentence_4 = row['input_sentence_4'],
                                    sentence_quiz1 = row['sentence_quiz1'],
                                    sentence_quiz2 = row['sentence_quiz2'],
                                    cot = cot)

In [27]:
df = get_translated_dataset_df("xstorycloze","Basque")
df
promptlist = []

for index, row in df.iterrows():
    promptlist.append(get_prompt(row,"xstorycloze","basic","Basque"))


In [27]:
def extract_part(answer):

    if "Answer:" in answer:
        return answer.split("Answer:", 1)[1].strip()
    return answer  

df[0] = df[0].apply(extract_part)
df.to_csv('../results/xstorycloze/llama/test.csv', header=None, index=False)


In [1]:
len(['Afrikaans','Arabic','Balinese', "Basque",'Belarusian','Bengali','Tibetan', 'Bosnian', 'Chinese',
                                 'Bulgarian', 'Catalan', 'Czech', 'Danish', 'Estonian','Haitian','Indonesian','Italian','Khmer', 'Korean', 'Lao', 'Maithili', 
                                 'Malayalam', 'Marathi', 'Dutch', 'Norwegian', 'Nepali', 'German', 'Polish', 'Greek',
                                 'Portuguese','Quechua','Russian','French','Romanian','Finnish','Hebrew','Slovak','Hindi',
                                 'Croatian','Hungarian','Swedish','Japanese','Javanese',"Armenian", "Bulgarian", 
                                 "Burmese", "Cantonese", "Malay", "Serbian", "Slovenian", "Spanish", 'Swahili',"Tagalog", 'Tamil',
                                 'Thai',"Telugu", 'Turkish',"Ukrainian", "Urdu",'Vietnamese', "Zulu"])

61

In [6]:
langs = ["afr_Latn","arb_Arab","ban_Latn","bel_Cyrl","ben_Beng","bos_Latn","bul_Cyrl",
"ces_Latn", "cat_Latn","dan_Latn", "deu_Latn","eng_Latn","ell_Grek","est_Latn", 
"fin_Latn", "fra_Latn","hat_Latn", "heb_Hebr","hin_Deva","hun_Latn", "hrv_Latn", "hye_Armn", 
"ind_Latn", "ita_Latn","jav_Latn", "jpn_Jpan","khm_Khmr","kor_Hang", 
"lao_Laoo","mai_Deva", "mal_Mlym", "mar_Deva", "mya_Mymr", "nno_Latn",
"nld_Latn", "npi_Deva","pol_Latn","por_Latn", "slk_Latn","quy_Latn","ron_Latn", "rus_Cyrl", 
"slv_Latn", "spa_Latn", "srp_Cyrl", "swe_Latn", "swh_Latn", "tam_Taml", "tel_Telu", 
"tgl_Latn", 'tha_Thai',"tur_Latn","ukr_Cyrl", "urd_Arab", "vie_Latn" , 'yue_Hant', "zho_Hant", "zsm_Latn","zul_Latn","eus_Latn"]

In [32]:
df = pd.read_csv('../datasets/xstorycloze/bod_Tibt.csv',sep=';')
df.to_csv('../datasets/xstorycloze/bod_Tibt3.csv',sep=',')