In [1]:
import pandas as pd
import ctranslate2
import sentencepiece as spm
from sacrebleu import sentence_bleu

In [2]:
path_to_corpus = './Corpora/corpus_df.pkl'
corpus_df = pd.read_pickle(path_to_corpus)
path_to_syntethic = '.\Corpora\syhtethic.pkl'
synthetic_df = pd.read_pickle(path_to_syntethic)

In [3]:
corpus_df

Unnamed: 0,Dutch_Sentence,English_Sentence,Dutch Sentence Length,English Sentence Length
0,Veel Afrikaanse vrouwen gaan naar huis met een...,"Many African women return home with a fistula,...",12,12
1,Beschouw ons niet als de moppentrommel van de ...,Do not consider us as the joke bin of the univ...,9,11
2,Er is al een patiëntje geholpen met onze techniek,Our technique has already helped a patient,9,7
3,De schedelcontouren worden uit de data van de ...,The cranial contours are filtered from the sca...,17,17
4,Nadat we voor die schedels de computerprocedur...,After we ran the computerized procedure for th...,38,40
...,...,...,...,...
310,Ik werk er mee aan een Europees actieplan voor...,I am working on a European action plan for tel...,20,19
311,Nederland is in de eerste plaats een groot lab...,"The Netherlands is, first and foremost, a huge...",12,14
312,Wij zijn als het ware het oog en oor van Vlaan...,In a sense were the eyes and ears of Flanders ...,14,11
313,De Nederlandse staatsveiligheid kon zich niet ...,The Dutch State Security Service couldnt belie...,15,16


In [4]:
synthetic_df

Unnamed: 0,Dutch_Sentence,English_Sentence,Dutch Sentence Length,English Sentence Length
315,Een appel per dag houdt de arts bezig,An apple a day keeps the doctor away,8,8
316,Geld is het root van alle kwaad,Money is the root of all evil,7,7
317,Een lege maag is een slechte raadgever,An empty stomach is not a good advisor,7,8
318,Een vogel in de hand is beter dan tien in de l...,A bird in the hand is worth two in the bush,12,11
319,Een wolf in schaapskleren,A wolf in sheeps clothing,4,5
...,...,...,...,...
388,Hou nou op met die gekke grapjes! Ze zijn echt...,Stop making those silly jokes! Theyre not funn...,17,15
389,"Doe maar wat je wilt, ik heb geen zin om te di...","Do what you want, I dont feel like discussing ...",15,11
390,Kun je het raam even dichtdoen? Het is koud bu...,Can you close the window for a moment? Its col...,20,22
391,"Bel hem eens op, hij zal nou wel thuis zijn en...","Call him once, hell probably be home by now an...",21,20


In [5]:
translator = ctranslate2.Translator("./nllb-200-600M-int8/", device="cpu")
sp = spm.SentencePieceProcessor("./Labs/flores200_sacrebleu_tokenizer_spm.model")

In [6]:
corpus_df['Translated_Dutch'] = ''
corpus_df['BLEU_score']=0
src_lang = 'dutch_Latn'
tgt_lang = 'eng_Latn'

index = 0
for row in corpus_df.iterrows():
    #get the dutch and english sentence from the dataframe
    dutch_sent = str(corpus_df["Dutch_Sentence"][index])
    reference_sentence = str(corpus_df['English_Sentence'][index])

    #pre processing
    source_sentences = [dutch_sent.strip()]
    target_prefix = [[tgt_lang]] * len(source_sentences)

    #encode the stripped dutch sentence
    source_sents_subworded = sp.encode_as_pieces(source_sentences)
    source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]

    #translate the encoded subword token
    translated_sentence_encoded = translator.translate_batch(source_sents_subworded, batch_type='tokens',target_prefix=target_prefix )
    translated_tokens = translated_sentence_encoded[0].hypotheses[0]

    #recompose a sentence from the translated encoded tokens
    translated_sentence = sp.decode(translated_tokens)
    translated_sentence = translated_sentence.replace(tgt_lang,'').lstrip()

    #calculate sentence bleu between translated sentence and the reference
    sentence_bleu_computed = sentence_bleu(translated_sentence,[reference_sentence]) 

    #assign the translated sentence a spot in the dataframe and add the bleu score for the translated sentence
    corpus_df.loc[index,'Translated_Dutch'] = translated_sentence
    corpus_df.loc[index,'BLEU_score']= sentence_bleu_computed.score
    index+=1
    
    

  corpus_df.loc[index,'BLEU_score']= sentence_bleu_computed.score


In [8]:
synthetic_df['Translated_Dutch'] = ''
synthetic_df['BLEU_score']=0
src_lang = 'dutch_Latn'
tgt_lang = 'eng_Latn'

index = 315
for row in synthetic_df.iterrows():
    #get the dutch and english sentence from the dataframe
    dutch_sent = str(synthetic_df["Dutch_Sentence"][index])
    reference_sentence = str(synthetic_df['English_Sentence'][index])

    #pre processing
    source_sentences = [dutch_sent.strip()]
    target_prefix = [[tgt_lang]] * len(source_sentences)

    #encode the stripped dutch sentence
    source_sents_subworded = sp.encode_as_pieces(source_sentences)
    source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]

    #translate the encoded subword token
    translated_sentence_encoded = translator.translate_batch(source_sents_subworded, batch_type='tokens',target_prefix=target_prefix )
    translated_tokens = translated_sentence_encoded[0].hypotheses[0]

    #recompose a sentence from the translated encoded tokens
    translated_sentence = sp.decode(translated_tokens)
    translated_sentence = translated_sentence.replace(tgt_lang,'').lstrip()

    #calculate sentence bleu between translated sentence and the reference
    sentence_bleu_computed = sentence_bleu(translated_sentence,[reference_sentence]) 

    #assign the translated sentence a spot in the dataframe and add the bleu score for the translated sentence
    synthetic_df.loc[index,'Translated_Dutch'] = translated_sentence
    synthetic_df.loc[index,'BLEU_score']= sentence_bleu_computed.score
    index+=1
    
    

  synthetic_df.loc[index,'BLEU_score']= sentence_bleu_computed.score


In [13]:
corpus_df.describe()

Unnamed: 0,Dutch Sentence Length,English Sentence Length,BLEU_score
count,315.0,315.0,315.0
mean,16.990476,18.184127,24.151696
std,10.740567,11.519206,23.268744
min,2.0,1.0,0.0
25%,9.0,10.0,5.939585
50%,15.0,16.0,16.809638
75%,23.0,24.0,35.05133
max,67.0,72.0,100.0


In [14]:
synthetic_df

Unnamed: 0,Dutch_Sentence,English_Sentence,Dutch Sentence Length,English Sentence Length,Translated_Dutch,BLEU_score
315,Een appel per dag houdt de arts bezig,An apple a day keeps the doctor away,8,8,One call a day keeps the arts busy.,29.847459
316,Geld is het root van alle kwaad,Money is the root of all evil,7,7,Money is the root of all evil.,84.089642
317,Een lege maag is een slechte raadgever,An empty stomach is not a good advisor,7,8,A foolish man is a bad consultant.,6.567275
318,Een vogel in de hand is beter dan tien in de l...,A bird in the hand is worth two in the bush,12,11,A bird in the hand is better than ten in the air.,42.803206
319,Een wolf in schaapskleren,A wolf in sheeps clothing,4,5,A wolf in sheep's clothing,42.728701
...,...,...,...,...,...,...
388,Hou nou op met die gekke grapjes! Ze zijn echt...,Stop making those silly jokes! Theyre not funn...,17,15,"Stop with those crazy jokes, they really aren'...",32.160571
389,"Doe maar wat je wilt, ik heb geen zin om te di...","Do what you want, I dont feel like discussing ...",15,11,"Do what you will, I have no desire to discuss ...",14.458925
390,Kun je het raam even dichtdoen? Het is koud bu...,Can you close the window for a moment? Its col...,20,22,It's cold outside and there's wind blowing aro...,12.139459
391,"Bel hem eens op, hij zal nou wel thuis zijn en...","Call him once, hell probably be home by now an...",21,20,"Give him a call, he'll be home soon and I can ...",7.133515


In [10]:
corpus_df.to_pickle('.\Corpora\corpus_with_MT_translations.pkl')
synthetic_df.to_pickle('.\Corpora\sythetic_with_MT_translations.pkl')

In [None]:
# path_to_corpus = '.\Corpora\corpus_with_MT_translations.pkl'
# corpus_with_translations_df = pd.read_pickle(path_to_corpus)