In [24]:
import os
from pathlib import Path
# Redirect the Hugging Face cache to this directory.
# NOTE(review): this assignment runs before `transformers` is imported further
# down, presumably so the library picks the location up at import time — keep
# this cell first and confirm against the Hugging Face docs.
os.environ['HF_HOME'] = '/data1/malto/cache'

# Root folder for the dataset files read and written by the cells below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
BASE_DIR = Path("/data1/malto/shroom")

In [25]:
import warnings
# NOTE(review): "ignore" with no category/module silences *every* warning for
# the rest of the session, including deprecation notices and pandas warnings
# raised by later cells. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

Translation Functions

In [26]:
def translate(texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with a seq2seq model/tokenizer pair.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, ...). A bare
            string is treated as a batch containing that single text rather
            than being iterated character by character.
        model: Object exposing ``generate(**encoded)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        language: Target language code. For any value other than ``"en"`` the
            code is prepended to each text as a ``>>code<<`` prefix; for
            ``"en"`` the texts are passed through unprefixed.

    Returns:
        list[str]: The decoded translations, in the same order as ``texts``.
        An empty input yields an empty list without calling the model.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Prepare the text data into the format expected by the model.
    if language == "en":
        src_texts = [f"{text}" for text in texts]
    else:
        src_texts = [f">>{language}<< {text}" for text in texts]

    # Nothing to translate: skip tokenization/generation entirely.
    if not src_texts:
        return []

    # Tokenize via the tokenizer's __call__ (prepare_seq2seq_batch is
    # deprecated). Padding to the longest text and truncating to the model
    # limit are requested explicitly so the batch forms rectangular tensors.
    encoded = tokenizer(src_texts,
                        return_tensors='pt',
                        padding=True,
                        truncation=True)

    # Generate translation using model
    translated = model.generate(**encoded)

    # Convert the generated token indices back into text
    translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)

    return translated_texts

def back_translate(texts, target_model, target_tokenizer, source_model, source_tokenizer, target_lang="fr", source_lang="en"):
    """Round-trip ``texts`` through a pivot language and back.

    The texts are first translated into ``target_lang`` with
    ``target_model``/``target_tokenizer`` and the result is then translated
    into ``source_lang`` with ``source_model``/``source_tokenizer``.

    Returns:
        The output of the second ``translate`` call.
    """
    # Each hop is (model, tokenizer, language to translate into), in order.
    hops = (
        (target_model, target_tokenizer, target_lang),
        (source_model, source_tokenizer, source_lang),
    )

    current_texts = texts
    for hop_model, hop_tokenizer, hop_language in hops:
        current_texts = translate(current_texts, hop_model, hop_tokenizer,
                                  language=hop_language)

    return current_texts

Load models and tokenizers

In [27]:
from transformers import MarianMTModel, MarianTokenizer

# Forward hop: this tokenizer/model pair is passed to back_translate as the
# "target" side, i.e. it translates the original texts into the pivot language.
# (Checkpoint name suggests English -> Romance languages — confirm on the model card.)
target_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name)


# Return hop: this pair is passed to back_translate as the "source" side, i.e.
# it translates the pivot-language texts back into the original language.
# (Checkpoint name suggests Romance languages -> English — confirm on the model card.)
en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name)

Translation test

In [28]:
# Small smoke test: back-translate three short sentences and print them next to
# the originals for a visual comparison.
en_texts = ['This is so cool', 'I hated the food', 'They were very helpful']
# Language codes for the round trip. These module-level names are reused by the
# dataset back-translation cell further down, so this cell must run first.
source_lang="en"
target_lang="es"

aug_texts = back_translate(en_texts, 
                           target_model=target_model, target_tokenizer=target_tokenizer,
                           source_model=en_model, source_tokenizer=en_tokenizer, 
                           source_lang=source_lang, target_lang=target_lang)
print(en_texts,"\n",aug_texts)

['This is so cool', 'I hated the food', 'They were very helpful'] 
 ['This is so great.', 'I hated food.', 'They were very helpful.']


Actual dataset translation: performed on the validation sets (model-aware and model-agnostic)

In [29]:
import json
import pandas as pd

file_path = BASE_DIR / "val.model-agnostic.json"
file_transformed_path = BASE_DIR / "val.model-agnostic-backtranslated.json"

# The input holds one JSON object per line. Parse each line on its own instead
# of gluing the lines into a hand-built "[...]" string: that approach is
# quadratic in file size and breaks on blank lines or an empty file.
# The encoding is explicit so decoding does not depend on the platform default.
with open(file_path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df.head()

Unnamed: 0,labels,label,model,ref,hyp,task,tgt,p(Hallucination),src,C-W
0,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,,tgt,Resembling or characteristic of a weasel.,DM,Resembling a weasel (in appearance).,0.2,The writer had just entered into his eighteent...,1.01
1,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,,tgt,Alternative form of sheath knife,DM,.,0.8,Sailors ' and fishermen 's <define> sheath - k...,1.01
2,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,,tgt,(obsolete) A short period of time.,DM,"(poetic) An instant, a short moment.",0.0,"As to age , Bead could not form any clear impr...",1.01
3,"[Not Hallucination, Not Hallucination, Halluci...",Not Hallucination,,tgt,(slang) An incel.,DM,"(incel, _, slang) A man of a slightly lower ra...",0.2,Because redpillers are usually normies or <def...,1.01
4,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,,tgt,"An island in Lienchiang County, Taiwan.",DM,"An island in Dongyin, Lienchiang, Taiwan, in t...",0.0,On the second day of massive live - fire drill...,1.01


In [30]:
# Keep only the first 10 rows so the back-translation below stays fast.
# NOTE(review): everything downstream, including the saved file, therefore only
# contains these 10 rows — remove this cell for a full run.
# .copy() makes the subset an independent frame, so adding columns to it later
# is not a write to a slice of the original frame.
df = df.iloc[:10].copy()

In [31]:
# Back-translate each text column into a sibling "<column>_bt" column.
# Pairs are (column to read, column to write), processed in this order.
for src_col, dst_col in (("hyp", "hyp_bt"), ("tgt", "tgt_bt")):
    df[dst_col] = back_translate(
        df[src_col],
        target_model=target_model,
        target_tokenizer=target_tokenizer,
        source_model=en_model,
        source_tokenizer=en_tokenizer,
        source_lang=source_lang,
        target_lang=target_lang,
    )


In [32]:
# Display the frame so the original hyp/tgt texts can be compared with their
# back-translated hyp_bt/tgt_bt counterparts.
df

Unnamed: 0,labels,label,model,ref,hyp,task,tgt,p(Hallucination),src,C-W,hyp_bt,tgt_bt
0,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,,tgt,Resembling or characteristic of a weasel.,DM,Resembling a weasel (in appearance).,0.2,The writer had just entered into his eighteent...,1.01,Likeness or characteristic of a coma.,Likeness of a weasel (in appearance).
1,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,,tgt,Alternative form of sheath knife,DM,.,0.8,Sailors ' and fishermen 's <define> sheath - k...,1.01,Alternative form of sheath knife,- Why not?
2,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,,tgt,(obsolete) A short period of time.,DM,"(poetic) An instant, a short moment.",0.0,"As to age , Bead could not form any clear impr...",1.01,(obsolete) A short period of time.,"One moment, one short moment."
3,"[Not Hallucination, Not Hallucination, Halluci...",Not Hallucination,,tgt,(slang) An incel.,DM,"(incel, _, slang) A man of a slightly lower ra...",0.2,Because redpillers are usually normies or <def...,1.01,A cell.,A man of a slightly lower ranking on a scale o...
4,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,,tgt,"An island in Lienchiang County, Taiwan.",DM,"An island in Dongyin, Lienchiang, Taiwan, in t...",0.0,On the second day of massive live - fire drill...,1.01,"An island in Lienchiang County, Taiwan.","An island in Dongyin, Lienchiang, Taiwan, in t..."
5,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,,tgt,Alternative form of blue-bearded,DM,"Having thick, dark facial hair.",1.0,Grouped in the center of the hall were about t...,1.01,Alternative form of blue beard,"He's got thick, dark facial hair."
6,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,,tgt,(baseball) A sacrifice bunt.,DM,(baseball) A ball that has been intentionally ...,0.4,The <define> sacrifice bunt </define> was fiel...,1.01,A sacrificial bunt.,A ball that has been hit intentionally gently ...
7,"[Hallucination, Hallucination, Hallucination, ...",Hallucination,,tgt,"(Australia, New Zealand, slang) Drunk.",DM,(AU) Lost in the bush. [from 19th c.],1.0,� She changes her shape depending on which way...,1.01,"(Australia, New Zealand, jargon)",(AU) Lost in the bush.
8,"[Hallucination, Not Hallucination, Not Halluci...",Not Hallucination,,tgt,(linguistics) The study of the relationships b...,DM,The ontology of ontology.,0.4,The <define> metaontology </define> debate has...,1.01,The study of the relationships between words a...,Ontology of ontology.
9,"[Not Hallucination, Hallucination, Hallucinati...",Hallucination,,tgt,(uncountable) The quality or state of being in...,DM,"The ability to solve difficult problems, often...",0.6,Poverty is the mother of <define> ingenuity </...,1.01,(uncountable) The quality or state of being in...,"The ability to solve difficult problems, often..."


In [None]:
# Write the augmented frame to disk with orient='records' (one object per row).
# NOTE(review): the loading cell parses the input as one JSON object per line,
# whereas this call is made without lines=True — confirm that downstream
# readers expect this layout. Only the rows still present in `df` are written.
df.to_json(file_transformed_path, orient='records')