In [None]:
from google.cloud import translate_v2 as translate
import os
from google_auth_oauthlib import flow
import pandas as pd
import json
import time
from tqdm import tqdm
import random

In [None]:
def test():
    """Smoke-test the translation client.

    Sends the fixed phrase "Hello world" to the module-level
    ``translate_client`` with target language code ``"mk"`` and prints the
    raw result object returned by ``translate``.
    """
    print(translate_client.translate("Hello world", target_language="mk"))

In [None]:
def read_and_prepare_data(path):
    """Load a JSON Lines file into a flattened DataFrame.

    Every non-blank line of the file is parsed as one JSON document and the
    resulting records are flattened with ``pd.json_normalize``, so nested
    objects become dot-separated columns (e.g. ``passage.text``).

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding.
    with open(path, encoding="utf-8") as f:
        # Iterate over the file instead of using str.splitlines(): splitlines()
        # also breaks on characters such as U+2028 or U+0085, which may legally
        # appear inside a JSON string and would cut a record in half.
        # Each line is parsed exactly once; blank lines are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.json_normalize(records)

In [None]:
def multirc_preprocessing(original_dataset_dest):
    """Collect the MultiRC fields needed for translation into a plain dict.

    Loads the JSONL file at ``original_dataset_dest`` through
    ``read_and_prepare_data`` and returns a dict keyed by DataFrame row index.
    Each value holds the row's ``idx``, its ``passage.text`` (stored as
    ``passage_text``) and its ``passage.questions`` (stored as ``questions``).
    """
    passages_df = read_and_prepare_data(original_dataset_dest)

    return {
        row_index: {
            "idx": record["idx"],
            "passage_text": record["passage.text"],
            "questions": record["passage.questions"],
        }
        for row_index, record in tqdm(passages_df.iterrows())
    }
    

### Import credentials

In [None]:
# The Google client resolves application-default credentials when it is
# constructed, so the credentials path has to be in the environment *before*
# `translate.Client()` runs; otherwise a fresh "Restart & Run All" fails or
# picks up whatever other ambient credentials happen to be configured.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "translation_api_credentials.json"

# NOTE(review): the v2 `translate` call is believed to return HTML-escaped text
# unless `format_="text"` is passed — confirm whether entities such as `&#39;`
# in the translated outputs are acceptable for these datasets.
translate_client = translate.Client()

In [None]:
# Point Google client libraries at the credentials JSON file (relative path,
# resolved against the current working directory).
# NOTE(review): this presumably has to be in effect before `translate.Client()`
# is constructed — confirm the cell order works on a fresh kernel.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="translation_api_credentials.json"

In [None]:
# Smoke test: translate a fixed phrase and print the result to confirm that the
# client and its credentials work before starting the long translation runs.
test()

In [None]:
# Dataset splits processed by the per-task loops; each name is interpolated
# into the input path ("<Task>/{dtype}.jsonl") and the output file name.
dtypes = ["train", "test", "val"]

## 1. MULTIRC

## Translation

In [None]:
# Translate every MultiRC split to target language "mk": each passage, each of
# its questions and each candidate answer. Results are stored per split as
# New/multirc_<split>_mk.json, keyed by the row index of the source file.
for dtype in dtypes:
    print(dtype)

    original_dataset_dest = f"MultiRC/{dtype}.jsonl"
    translated_dataset_dest = f"New/multirc_{dtype}_mk.json"
    multi_rc = multirc_preprocessing(original_dataset_dest)
    multi_rc_dict = {}

    for index, row in tqdm(multi_rc.items()):
        translated_passage = translate_client.translate(
            row['passage_text'], target_language='mk')["translatedText"]

        translated_questions_lst = []
        for question in row['questions']:
            # Each question is a dict with 'idx', 'question' and 'answers'.
            current_question_dict = {
                'idx': question['idx'],
                'question': translate_client.translate(
                    question['question'], target_language='mk')["translatedText"],
            }

            current_answers = []
            for ans in question['answers']:
                # Each answer is a dict with 'idx', 'text' and (except in the
                # test split) 'label'.
                answer_translation = {'idx': ans['idx']}

                if dtype != "test":
                    answer_translation['label'] = ans['label']

                try:
                    answer_translation['text'] = translate_client.translate(
                        ans['text'], target_language='mk')["translatedText"]
                except Exception:
                    # Best effort: keep the untranslated answer when the API
                    # rejects the text. `Exception` (instead of a bare
                    # `except:`) keeps Ctrl-C / kernel interrupts working.
                    answer_translation['text'] = ans['text']

                current_answers.append(answer_translation)

            current_question_dict['answers'] = current_answers
            translated_questions_lst.append(current_question_dict)

        # Store the record only once it is complete, so the dict never holds a
        # half-translated passage if a request fails midway.
        multi_rc_dict[index] = {
            'passage': translated_passage,
            'questions': translated_questions_lst,
        }

        # Checkpoint after every passage (each one involves many API calls).
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
            json.dump(multi_rc_dict, outfile, ensure_ascii=False)

    # Final write; also produces an (empty) output file when the split has no rows.
    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(multi_rc_dict, outfile, ensure_ascii=False)

## 2. BOOLQ

In [None]:
# Translate the question and passage of every BoolQ row to target language
# "mk" and store each split as New/boolq_<split>_mk.json, keyed by row index.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"BoolQ/{dtype}.jsonl"
    translated_dataset_dest = f"New/boolq_{dtype}_mk.json"
    df_final = read_and_prepare_data(original_dataset_dest)

    boolq_dict = {}
    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to give tqdm a real progress bar.
    for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
        translated_question = translate_client.translate(row['question'], target_language='mk')["translatedText"]
        translated_passage = translate_client.translate(row['passage'], target_language='mk')["translatedText"]

        # Store the record only once both translations succeeded.
        boolq_dict[index] = {
            'question': translated_question,
            'passage': translated_passage,
        }

        # Labels are copied for every split except "test".
        if dtype != "test":
            boolq_dict[index]['label'] = row['label']

        # Periodic checkpoint so an interrupted run keeps most of its work.
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(boolq_dict, outfile, ensure_ascii=False)

    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(boolq_dict, outfile, ensure_ascii=False)


## 3. COPA

In [None]:
# Translate premise and both choices of every COPA row to target language "mk"
# and store each split as New/copa_<split>_mk.json, keyed by row index.
# The 'question' field is copied through untranslated.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"COPA/{dtype}.jsonl"
    translated_dataset_dest = f"New/copa_{dtype}_mk.json"
    df_final = read_and_prepare_data(original_dataset_dest)

    copa_dict = {}
    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to give tqdm a real progress bar.
    for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
        translated_premise = translate_client.translate(row['premise'], target_language='mk')["translatedText"]
        translated_choice1 = translate_client.translate(row['choice1'], target_language='mk')["translatedText"]
        translated_choice2 = translate_client.translate(row['choice2'], target_language='mk')["translatedText"]

        # Store the record only once all translations succeeded.
        copa_dict[index] = {
            'question': row['question'],
            'premise': translated_premise,
            'choice1': translated_choice1,
            'choice2': translated_choice2,
        }

        # Labels are copied for every split except "test".
        if dtype != "test":
            copa_dict[index]['label'] = row['label']

        # Periodic checkpoint so an interrupted run keeps most of its work.
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(copa_dict, outfile, ensure_ascii=False)

    # Short pause once per split (kept from the original flow).
    time.sleep(0.5)
    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(copa_dict, outfile, ensure_ascii=False)

## 4. RTE

In [None]:
# Translate premise and hypothesis of every RTE row to target language "mk"
# and store each split as New/rte_<split>_mk.json, keyed by row index.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"RTE/{dtype}.jsonl"
    translated_dataset_dest = f"New/rte_{dtype}_mk.json"
    df_final = read_and_prepare_data(original_dataset_dest)

    current_dict = {}

    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to give tqdm a real progress bar.
    for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
        translated_premise = translate_client.translate(row['premise'], target_language='mk')["translatedText"]
        translated_hypothesis = translate_client.translate(row['hypothesis'], target_language='mk')["translatedText"]

        # Store the record only once both translations succeeded.
        current_dict[index] = {
            'hypothesis': translated_hypothesis,
            'premise': translated_premise,
        }

        # Labels are copied for every split except "test".
        if dtype != "test":
            current_dict[index]['label'] = row['label']

        # Periodic checkpoint so an interrupted run keeps most of its work.
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(current_dict, outfile, ensure_ascii=False)

    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(current_dict, outfile, ensure_ascii=False)

    # Short pause once per split (kept from the original flow).
    time.sleep(0.5)

    

## 5. WIC

In [None]:
# Translate the target word and both sentences of every WiC row to target
# language "mk" and store each split as New/wic_<split>_mk.json, keyed by row
# index.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"WiC/{dtype}.jsonl"
    translated_dataset_dest = f"New/wic_{dtype}_mk.json"

    df_final = read_and_prepare_data(original_dataset_dest)

    current_dict = {}

    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to give tqdm a real progress bar.
    for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
        translated_word = translate_client.translate(row['word'], target_language='mk')["translatedText"]
        translated_sentence1 = translate_client.translate(row['sentence1'], target_language='mk')["translatedText"]
        translated_sentence2 = translate_client.translate(row['sentence2'], target_language='mk')["translatedText"]

        # Store the record only once all translations succeeded.
        # NOTE(review): start*/end* are copied unchanged from the source row;
        # presumably they are offsets into the original sentences and no longer
        # line up with the translated text — confirm before relying on them.
        current_dict[index] = {
            'word': translated_word,
            'sentence1': translated_sentence1,
            'sentence2': translated_sentence2,
            'start1': row['start1'],
            'start2': row['start2'],
            'end1': row['end1'],
            'end2': row['end2'],
        }

        # Labels are copied for every split except "test".
        if dtype != "test":
            current_dict[index]['label'] = row['label']

        # Periodic checkpoint so an interrupted run keeps most of its work.
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(current_dict, outfile, ensure_ascii=False)

    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(current_dict, outfile, ensure_ascii=False)

    # Short pause once per split (kept from the original flow).
    time.sleep(0.5)

    

## 6. AX-b

In [None]:
# Translate both sentences of every AX-b row to target language "mk" and store
# the result as New/ax-b_mk.json, keyed by row index. The label and the four
# annotation columns are copied through unchanged.
original_dataset_dest = "AX-b/AX-b.jsonl"
df_final = read_and_prepare_data(original_dataset_dest)
translated_dataset_dest = "New/ax-b_mk.json"
current_dict = {}
# iterrows() is a generator without a length, so the row count is passed
# explicitly to give tqdm a real progress bar.
for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
    translated_sentence1 = translate_client.translate(row['sentence1'], target_language='mk')["translatedText"]
    translated_sentence2 = translate_client.translate(row['sentence2'], target_language='mk')["translatedText"]

    # Store the record only once both translations succeeded.
    current_dict[index] = {
        'label': row['label'],
        'sentence1': translated_sentence1,
        'sentence2': translated_sentence2,
        'logic': row['logic'],
        'predicate-argument-structure': row['predicate-argument-structure'],
        'lexical-semantics': row['lexical-semantics'],
        'knowledge': row['knowledge'],
    }

    # Periodic checkpoint so an interrupted run keeps most of its work.
    # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
    # characters that the platform default encoding may not support.
    if index % 500 == 0:
        with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
            json.dump(current_dict, outfile, ensure_ascii=False)

with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
    json.dump(current_dict, outfile, ensure_ascii=False)

# time.sleep(0.5)



## 7. AX-g

In [None]:
# Translate hypothesis and premise of every AX-g row to target language "mk"
# and store the result as New/ax-g_mk.json, keyed by row index. The label and
# pair_id are copied through unchanged.
original_dataset_dest = "AX-g/AX-g.jsonl"
df_final = read_and_prepare_data(original_dataset_dest)
translated_dataset_dest = "New/ax-g_mk.json"
current_dict = {}

# iterrows() is a generator without a length, so the row count is passed
# explicitly to give tqdm a real progress bar.
for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
    translated_hypothesis = translate_client.translate(row['hypothesis'], target_language='mk')["translatedText"]
    translated_premise = translate_client.translate(row['premise'], target_language='mk')["translatedText"]

    # Store the record only once both translations succeeded.
    current_dict[index] = {
        'label': row['label'],
        'pair_id': row['pair_id'],
        'hypothesis': translated_hypothesis,
        'premise': translated_premise,
    }

    # Periodic checkpoint so an interrupted run keeps most of its work.
    # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
    # characters that the platform default encoding may not support.
    if index % 500 == 0:
        with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
            json.dump(current_dict, outfile, ensure_ascii=False)

with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
    json.dump(current_dict, outfile, ensure_ascii=False)

# time.sleep(0.5)



## 8. CB

In [None]:
# Translate premise and hypothesis of every CB row to target language "mk" and
# store each split as New/cb_<split>_mk.json, keyed by row index.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"CB/{dtype}.jsonl"
    translated_dataset_dest = f"New/cb_{dtype}_mk.json"

    df_final = read_and_prepare_data(original_dataset_dest)

    current_dict = {}

    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to give tqdm a real progress bar.
    for index, row in tqdm(df_final.iterrows(), total=len(df_final)):
        translated_premise = translate_client.translate(row['premise'], target_language='mk')["translatedText"]
        translated_hypothesis = translate_client.translate(row['hypothesis'], target_language='mk')["translatedText"]

        # Store the record only once both translations succeeded.
        current_dict[index] = {
            'premise': translated_premise,
            'hypothesis': translated_hypothesis,
        }

        # Labels are copied for every split except "test".
        if dtype != "test":
            current_dict[index]['label'] = row['label']

        # Periodic checkpoint so an interrupted run keeps most of its work.
        # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
        # characters that the platform default encoding may not support.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(current_dict, outfile, ensure_ascii=False)

    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(current_dict, outfile, ensure_ascii=False)

    # Short pause once per split (kept from the original flow).
    time.sleep(0.5)

    

## 9. WSC

In [None]:
# WSC: for each split, translate the text and both target spans to "mk" and
# write the records (keyed by row index) to New/wsc_<split>_mk.json.
for dtype in dtypes:
    print(dtype)
    original_dataset_dest = f"WSC/{dtype}.jsonl"
    translated_dataset_dest = f"New/wsc_{dtype}_mk.json"

    df_final = read_and_prepare_data(original_dataset_dest)
    current_dict = {}

    for index, row in tqdm(df_final.iterrows()):
        # Register the (initially empty) record first, then fill it in.
        entry = current_dict[index] = {}

        translated_text = translate_client.translate(
            row['text'], target_language='mk')["translatedText"]
        translated_span1 = translate_client.translate(
            row['target.span1_text'], target_language='mk')["translatedText"]
        translated_span2 = translate_client.translate(
            row['target.span2_text'], target_language='mk')["translatedText"]

        entry['text'] = translated_text
        # Span indices are copied from the source row; only the span texts are translated.
        entry['target'] = {
            'span1_index': row['target.span1_index'],
            'span2_index': row['target.span2_index'],
            'span1_text': translated_span1,
            'span2_text': translated_span2,
        }

        # Every split except "test" also carries its label.
        if dtype != "test":
            entry['label'] = row['label']

        # Checkpoint on every 500th row index.
        if index % 500 == 0:
            with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
                json.dump(current_dict, outfile, ensure_ascii=False)

    # Final write for the split, followed by a short pause.
    with open(translated_dataset_dest, "w", encoding="utf-8") as outfile:
        json.dump(current_dict, outfile, ensure_ascii=False)

    time.sleep(0.5)

    