# Translation of the test sentences with four machine translation systems

<ul>
    <li>OpusMT</li>
    <li>M2M-100</li>
    <li>NLLB-200</li>
    <li>DeepL</li>
</ul>

In [1]:
import sentencepiece as spm
import pandas as pd
from transformers import pipeline, AutoTokenizer, TFMarianMTModel, AutoModelForSeq2SeqLM
import deepl 
import ctranslate2

  from .autonotebook import tqdm as notebook_tqdm


## Source sentence extraction

In [2]:
# Read the English source sentences, one per line, stripping surrounding
# whitespace (including the trailing newline).
# Forward slashes are used so the literal contains no invalid "\s" escape
# sequences and the path resolves on every operating system.
# NOTE(review): the file is read as raw text lines, not parsed as CSV, so any
# CSV quoting would be kept verbatim -- confirm the file is plain
# one-sentence-per-line text.
with open('datasets/source/sentences_en.csv', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f]

## Machine translations

### OpusMT

In [None]:
# Language pair handled by the OpusMT (Marian) checkpoint: source, target.
src, trg = "en", "it"

# Hub identifier of the checkpoint, built from the language pair.
model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# from_pt=True asks the TensorFlow model class to load PyTorch weights.
model = TFMarianMTModel.from_pretrained(model_name, from_pt=True)

def translate_text_marian(text):
    """Translate ``text`` with the module-level Marian ``tokenizer`` and ``model``.

    The text is tokenized into TensorFlow tensors, passed to
    ``model.generate`` and decoded with special tokens skipped. Only the first
    decoded sequence is returned.

    NOTE(review): ``tokenizer`` and ``model`` are module-level names that the
    NLLB-200 section rebinds; call this only while the OpusMT objects are the
    ones bound to those names.
    """
    encoded = tokenizer(text, return_tensors="tf")
    generated = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

# Translation: run every source sentence through OpusMT, keeping input order.
trad_lines_marian = [translate_text_marian(line) for line in lines]

# Saving: one translation per row, without index column or header row.
# Forward slashes replace the original backslashes, which formed invalid
# "\c" / "\s" escape sequences and only resolved as a path on Windows.
df = pd.DataFrame(trad_lines_marian)
df.to_csv("datasets/candidates/sentences_it_opusmt.csv", index=False, header=False)

### M2M-100

In [4]:
# Translation pipeline backed by the M2M-100 (418M) checkpoint; the task name
# carries the English -> Italian direction.
# NOTE(review): `translator` is a module-level name that the DeepL section
# rebinds to a different object.
translator = pipeline("translation_en_to_it", model="facebook/m2m100_418M")

# Translation: the pipeline returns a list of dicts per call; keep the text of
# the first result for each sentence.
trad_lines_m2m100 = [translator(line)[0]['translation_text'] for line in lines]

# Saving: one translation per row, without index column or header row.
# Forward slashes replace the original backslashes, which formed invalid
# "\c" / "\s" escape sequences and only resolved as a path on Windows.
df = pd.DataFrame(trad_lines_m2m100)
df.to_csv("datasets/candidates/sentences_it_m2m100.csv", index=False, header=False)

### NLLB-200

In [None]:
# Tokenizer and seq2seq model are both loaded from the same NLLB-200 checkpoint.
# NOTE(review): this rebinds the module-level `tokenizer` and `model` names
# that translate_text_marian also reads.
nllb_checkpoint = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(nllb_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(nllb_checkpoint)

def translate_text_nllb200(text, src_lang="eng_Latn", tgt_lang="ita_Latn"):
    """Translate ``text`` with the module-level NLLB-200 ``tokenizer`` and ``model``.

    Args:
        text: The sentence to translate.
        src_lang: NLLB language code of the input text.
        tgt_lang: NLLB language code of the desired output.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        skipped.
    """
    # Apply the requested source language; previously `src_lang` was accepted
    # but never used, so any non-default value was silently ignored.
    tokenizer.src_lang = src_lang
    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Force the first generated token to be the target-language code.
    # convert_tokens_to_ids replaces the deprecated `lang_code_to_id` mapping.
    output_ids = model.generate(
        input_ids=input_ids,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Translation: run every source sentence through NLLB-200, keeping input order.
trad_lines_nllb200 = [translate_text_nllb200(line) for line in lines]

# Saving: one translation per row, without index column or header row.
# Forward slashes replace the original backslashes, which formed invalid
# "\c" / "\s" escape sequences and only resolved as a path on Windows.
df = pd.DataFrame(trad_lines_nllb200)
df.to_csv("datasets/candidates/sentences_it_nllb200.csv", index=False, header=False)

### DeepL

In [None]:
import os

# Read the DeepL API key from the DEEPL_AUTH_KEY environment variable so the
# secret is not stored in the notebook. When the variable is unset this falls
# back to the empty string the notebook used before.
# NOTE(review): the DeepL client presumably rejects an empty key -- confirm.
auth_key = os.environ.get("DEEPL_AUTH_KEY", "")
# NOTE(review): this rebinds the module-level `translator` name that the
# M2M-100 section also assigns.
translator = deepl.Translator(auth_key)

def translate_text_deepl(text):
    """Translate ``text`` into Italian with the module-level DeepL ``translator``.

    Only the target language is passed to the API; no source language is
    specified. Returns the ``text`` attribute of the API result.
    """
    return translator.translate_text(text, target_lang="it").text

# Translation: send every source sentence to DeepL, keeping input order.
trad_lines_deepl = [translate_text_deepl(line) for line in lines]

# Saving: one translation per row, without index column or header row.
# Forward slashes replace the original backslashes, which formed invalid
# "\c" / "\s" escape sequences and only resolved as a path on Windows.
df = pd.DataFrame(trad_lines_deepl)
df.to_csv("datasets/candidates/sentences_it_deepl.csv", index=False, header=False)