In [6]:
import warnings
warnings.filterwarnings("ignore")

Translation Functions

In [4]:
def translate(texts, model, tokenizer, language="fr", batch_size=32):
    """Translate a collection of texts with a seq2seq model/tokenizer pair.

    Args:
        texts: Iterable of source texts (list, pandas Series, ...). Every item
            is formatted into a string, so non-string values are stringified.
        model: Object exposing ``generate(**encoded)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        language: Target language code. Any value other than ``"en"`` is
            prepended to each text as a ``>>code<<`` token; ``"en"`` leaves
            the text unprefixed.
        batch_size: Maximum number of texts sent through the model per
            ``generate`` call. ``None`` processes everything in one batch.

    Returns:
        list[str]: One decoded translation per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Prepare the text data into the format expected by the model
    prefix = "" if language == "en" else f">>{language}<< "
    src_texts = [f"{prefix}{text}" for text in texts]

    # Nothing to translate: skip the tokenizer/model entirely
    if not src_texts:
        return []

    step = len(src_texts) if batch_size is None else batch_size
    # Only move inputs when the model advertises a device (e.g. a GPU model)
    device = getattr(model, "device", None)

    translated_texts = []
    for start in range(0, len(src_texts), step):
        batch = src_texts[start:start + step]

        # Tokenize via the tokenizer's __call__; prepare_seq2seq_batch is
        # deprecated. padding/truncation reproduce its defaults.
        encoded = tokenizer(batch, return_tensors="pt",
                            padding=True, truncation=True)
        if device is not None and hasattr(encoded, "to"):
            encoded = encoded.to(device)

        # Generate translation using model
        generated = model.generate(**encoded)

        # Convert the generated token indices back into text
        translated_texts.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

    return translated_texts

def back_translate(texts, target_model, target_tokenizer, source_model, source_tokenizer, target_lang="fr", source_lang="en"):
    """Round-trip texts through a pivot language and back.

    The texts are first translated with ``target_model``/``target_tokenizer``
    into ``target_lang``; that intermediate output is then translated with
    ``source_model``/``source_tokenizer`` into ``source_lang``.

    Returns:
        The result of the second ``translate`` call, i.e. the texts after the
        full round trip.
    """
    # Forward leg: source language -> pivot (target) language
    pivot_texts = translate(
        texts,
        target_model,
        target_tokenizer,
        language=target_lang,
    )

    # Return leg: pivot language -> source language
    return translate(
        pivot_texts,
        source_model,
        source_tokenizer,
        language=source_lang,
    )

Load models and tokenizers

In [None]:
from transformers import MarianMTModel, MarianTokenizer

def _load_marian(checkpoint):
    """Load the tokenizer first, then the model, for one Marian checkpoint."""
    return (
        MarianTokenizer.from_pretrained(checkpoint),
        MarianMTModel.from_pretrained(checkpoint),
    )


# Forward direction, used as the "target" model in back_translate
# (checkpoint name suggests English -> Romance languages).
target_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
target_tokenizer, target_model = _load_marian(target_model_name)

# Return direction, used as the "source" model in back_translate
# (checkpoint name suggests Romance languages -> English).
en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
en_tokenizer, en_model = _load_marian(en_model_name)

Translation test

In [9]:
# Smoke test: round-trip three short English sentences through Spanish.
en_texts = ['This is so cool', 'I hated the food', 'They were very helpful']
source_lang, target_lang = "en", "es"

aug_texts = back_translate(
    en_texts,
    target_model=target_model,
    target_tokenizer=target_tokenizer,
    source_model=en_model,
    source_tokenizer=en_tokenizer,
    target_lang=target_lang,
    source_lang=source_lang,
)

# Show the originals and their back-translations, separated by a newline
print(en_texts, "\n", aug_texts)

['This is so cool', 'I hated the food', 'They were very helpful'] 
 ['This is so great.', 'I hated food.', 'They were very helpful.']


Actual dataset translation: performed on the validation sets (model-aware and model-agnostic)

In [19]:
import json
from pathlib import Path

import pandas as pd

# Build the paths with pathlib so the directory separator is right on every
# OS; a backslash inside a string literal is only a separator on Windows.
data_dir = Path("Jack_Data")
file_path = data_dir / "val.model-agnostic.json"
file_transformed_path = data_dir / "val.model-agnostic-backtranslated.json"

# Read explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which can mis-decode non-ASCII text in the JSON.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,hyp,ref,src,tgt,model,task,labels,label,p(Hallucination)
0,Resembling or characteristic of a weasel.,tgt,The writer had just entered into his eighteent...,Resembling a weasel (in appearance).,,DM,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,0.2
1,Alternative form of sheath knife,tgt,Sailors ' and fishermen 's <define> sheath - k...,.,,DM,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,0.8
2,(obsolete) A short period of time.,tgt,"As to age , Bead could not form any clear impr...","(poetic) An instant, a short moment.",,DM,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0
3,(slang) An incel.,tgt,Because redpillers are usually normies or <def...,"(incel, _, slang) A man of a slightly lower ra...",,DM,"[Not Hallucination, Not Hallucination, Halluci...",Not Hallucination,0.2
4,"An island in Lienchiang County, Taiwan.",tgt,On the second day of massive live - fire drill...,"An island in Dongyin, Lienchiang, Taiwan, in t...",,DM,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0


In [None]:
# Shared arguments for both back-translation passes; the language codes are
# the ones set in the translation test cell.
bt_kwargs = dict(
    target_model=target_model,
    target_tokenizer=target_tokenizer,
    source_model=en_model,
    source_tokenizer=en_tokenizer,
    source_lang=source_lang,
    target_lang=target_lang,
)

# Add a "<column>_bt" column holding the back-translated text for both the
# hypothesis and the target columns.
for column in ("hyp", "tgt"):
    df[f"{column}_bt"] = back_translate(df[column], **bt_kwargs)


In [None]:
# Write the frame, including any columns added above, as a JSON array with
# one object per row.
df.to_json(file_transformed_path, orient="records")