In [None]:
#### Cell to run
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00

In [None]:
#### Cell to run

import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from transformers import MarianMTModel, MarianTokenizer, AdamW
import torch.distributed as dist
from tqdm import tqdm
from transformers import pipeline
import evaluate
import shutil
from torch.utils.data import Dataset, DataLoader, ConcatDataset

In [None]:
#### Cell to run

# Loading Train, Test and Validation data of multilingual data

Dutch_train_data = pd.read_csv('Dutch_train.csv')
Dutch_val_data = pd.read_csv('Dutch_validation.csv')
Dutch_test_data = pd.read_csv('Dutch_test.csv')

Dutch_train_data = Dutch_train_data.drop(columns = ['Unnamed: 0'])
Dutch_val_data = Dutch_val_data.drop(columns = ['Unnamed: 0'])
Dutch_test_data = Dutch_test_data.drop(columns = ['Unnamed: 0'])

German_train_data = pd.read_csv('German_train.csv')
German_val_data = pd.read_csv('German_validation.csv')
German_test_data = pd.read_csv('German_test.csv')

German_train_data = German_train_data.drop(columns = ['Unnamed: 0'])
German_val_data = German_val_data.drop(columns = ['Unnamed: 0'])
German_test_data = German_test_data.drop(columns = ['Unnamed: 0'])

Spanish_train_data = pd.read_csv('Spanish_train.csv')
Spanish_val_data = pd.read_csv('Spanish_validation.csv')
Spanish_test_data = pd.read_csv('Spanish_test.csv')

Spanish_train_data = Spanish_train_data.drop(columns = ['Unnamed: 0'])
Spanish_val_data = Spanish_val_data.drop(columns = ['Unnamed: 0'])
Spanish_test_data = Spanish_test_data.drop(columns = ['Unnamed: 0'])

#### MarianMT

Three models for three languages

Spanish Use:  model_name = "Helsinki-NLP/opus-mt-en-es"

German Use: model_name  = "Helsinki-NLP/opus-mt-en-de"

Dutch Use: model_name = "Helsinki-NLP/opus-mt-en-nl"

So training requires separate datasets which makes it three models for MarianMT

In [None]:
model_name = "Helsinki-NLP/opus-mt-en-nl"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [None]:
src_text = Dutch_test_data[1:4]['English'].to_list()

translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
tgt_text

['En, zeg ik tegen mevrouw Reding, dank u voor de steun en de hulp die u tot nu toe hebt gegeven, ga door met het goede werk, we staan aan uw kant.',
 'Ik zou u een paar voorbeelden willen geven om deze tegenstrijdigheden te illustreren.',
 'De stemming vindt morgen om 11.00 uur plaats.']

In [None]:
# Create new csv files to store inferences for MarianMT

Dutch_outputs_df = Dutch_test_data.copy()
Dutch_outputs_df ['MarianMT-pretrained'] = None

German_outputs_df = German_test_data.copy()
German_outputs_df ['MarianMT-pretrained'] = None

Spanish_outputs_df = Spanish_test_data.copy()
Spanish_outputs_df ['MarianMT-pretrained'] = None

In [None]:
# Dutch model

src_text = Dutch_test_data['English'].to_list()

model_name = "Helsinki-NLP/opus-mt-en-nl"
translator = pipeline("translation_en_to_nl", model=model_name, tokenizer=model_name)

translations = []
for i in tqdm(range(0, len(src_text)), desc="Processing Batches", unit="sentences"):
    translations.append(translator(src_text[i])[0].get('translation_text'))

# Run inference
Dutch_outputs_df['MarianMT-pretrained'] = translations

Processing Batches: 100%|██████████| 1468/1468 [36:54<00:00,  1.51s/sentences]


In [None]:
# German model

src_text = German_test_data['English'].to_list()

model_name = "Helsinki-NLP/opus-mt-en-de"
translator = pipeline("translation_en_to_de", model=model_name, tokenizer=model_name)

translations = []
for i in tqdm(range(0, len(src_text)), desc="Processing Batches", unit="sentences"):
    translations.append(translator(src_text[i])[0].get('translation_text'))

# Run inference
German_outputs_df['MarianMT-pretrained']  = translations

Processing Batches: 100%|██████████| 1478/1478 [36:02<00:00,  1.46s/sentences] 


In [None]:
# Spanish

src_text = Spanish_test_data['English'].to_list()
model_name = "Helsinki-NLP/opus-mt-en-es"
translator = pipeline("translation_en_to_es", model=model_name, tokenizer=model_name)

translations = []
for i in tqdm(range(0, len(src_text)), desc="Processing Batches", unit="sentences"):
    translations.append(translator(src_text[i])[0].get('translation_text'))

# Run inference
Spanish_outputs_df['MarianMT-pretrained']  = translations

Processing Batches: 100%|██████████| 1476/1476 [38:23<00:00,  1.56s/sentences] 


In [None]:
# Save the dataframes as csv for evaluation

Dutch_outputs_df.to_csv('Dutch outputs_MarianMT.csv',index=False)
German_outputs_df.to_csv('German outputs_MarianMT.csv',index=False)
Spanish_outputs_df.to_csv('Spanish outputs_MarianMT.csv',index=False)

In [None]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/marianivethaantonypushparaj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marianivethaantonypushparaj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/marianivethaantonypushparaj/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
len(Dutch_outputs_df['Dutch'].to_list())

1468

In [None]:
# Compute metrics

# bleu_metric = load_metric("bleu",trust_remote_code=True)
# rouge_metric = load_metric("rouge",trust_remote_code=True)
# meteor_metric = load_metric("meteor",trust_remote_code=True)

# Dutch model

predictions = Dutch_outputs_df['MarianMT-pretrained'].to_list()
references = Dutch_outputs_df['Dutch'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for Dutch Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for Dutch Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for Dutch Model: {meteor_score}")

###########################################################################################

# German model

predictions = German_outputs_df['MarianMT-pretrained'].to_list()
references = German_outputs_df['German'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for German Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for German Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for German Model: {meteor_score}")

###########################################################################################

# Spanish model

predictions = Spanish_outputs_df['MarianMT-pretrained'].to_list()
references = Spanish_outputs_df['Spanish'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for Spanish Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for Spanish Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for Spanish Model: {meteor_score}")

BLEU score for Dutch Model: {'bleu': 0.23670363486569077, 'precisions': [0.5444265719155883, 0.28440960557421924, 0.17631714876033058, 0.11498510347048876], 'brevity_penalty': 1.0, 'length_ratio': 1.028951853956177, 'translation_length': 41653, 'reference_length': 40481}
ROUGE score for Dutch Model: {'rouge1': 0.54880173272648, 'rouge2': 0.30545784259474906, 'rougeL': 0.4876714408446511, 'rougeLsum': 0.4878248561064088}
METEOR score for Dutch Model: {'meteor': 0.4984006043414953}
BLEU score for German Model: {'bleu': 0.24035835851731063, 'precisions': [0.5453654860587792, 0.2882448085150788, 0.17962284628951294, 0.1182023742227247], 'brevity_penalty': 1.0, 'length_ratio': 1.0151468788249693, 'translation_length': 39810, 'reference_length': 39216}
ROUGE score for German Model: {'rouge1': 0.5291796376818525, 'rouge2': 0.30559721360263276, 'rougeL': 0.48128744855697725, 'rougeLsum': 0.4811391734559068}
METEOR score for German Model: {'meteor': 0.5008671134114652}
BLEU score for Spanish Mo

### Dutch Model finetuning (en-nl)

In [None]:
#### Cell to run

def preprocess_texts(texts, src_lang_code, tgt_lang_code):
    processed_src_texts = []
    processed_tgt_texts = []
    for src_text, tgt_text in texts:
        src_text = f"{src_text} {tokenizer.eos_token} {src_lang_code}"
        tgt_text = f"{tgt_lang_code} {tgt_text} {tokenizer.eos_token}"
        processed_src_texts.append(src_text)
        processed_tgt_texts.append(tgt_text)
    return processed_src_texts, processed_tgt_texts

In [None]:
#### Cell to run

class TranslationDataset(Dataset):
    def __init__(self, texts, tokenizer, src_lang_code, tgt_lang_code):
        self.texts = texts
        self.tokenizer = tokenizer
        self.src_lang_code = src_lang_code
        self.tgt_lang_code = tgt_lang_code
        self.src_texts, self.tgt_texts = preprocess_texts(texts, src_lang_code, tgt_lang_code)

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        inputs = self.tokenizer(src_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        labels = self.tokenizer(tgt_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

        input_ids = inputs.input_ids.squeeze()
        attention_mask = inputs.attention_mask.squeeze()
        labels = labels.input_ids.squeeze()

        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


In [None]:
model_name = "Helsinki-NLP/opus-mt-en-nl"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
# torch.quantization.prepare(model, inplace=True)

english_dutch_texts = list(zip(Dutch_train_data['English'].tolist(), Dutch_train_data['Dutch'].tolist()))

dutch_train_datasets = TranslationDataset(english_dutch_texts, tokenizer, "en", "nl")
# dutch_train_dataloader = DataLoader(dutch_datasets, batch_size=128, shuffle=True)

english_dutch_texts = list(zip(Dutch_val_data['English'].tolist(), Dutch_val_data['Dutch'].tolist()))  # For Dutch translation
dutch_eval_datasets = TranslationDataset(english_dutch_texts, tokenizer, "en", "nl")
# dutch_eval_dataloader = DataLoader(dutch_datasets, batch_size=128, shuffle=True)



In [None]:
##### Run this cell

# from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir='./logs',
    push_to_hub=False,
    report_to='none',
    logging_steps=10,
    dataloader_num_workers=2,
)




In [None]:
##### Run this cell

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dutch_train_datasets,
    eval_dataset=dutch_eval_datasets,
    tokenizer=tokenizer,
)

In [None]:
##### Run this cell

trainer.train()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,1.7154,1.569334
2,1.2921,1.382931
3,1.3095,1.313312
4,1.1919,1.278212
5,1.1747,1.269117


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[67027]], 'fo

TrainOutput(global_step=14675, training_loss=1.469979932320382, metrics={'train_runtime': 834.6414, 'train_samples_per_second': 70.312, 'train_steps_per_second': 17.582, 'total_flos': 1989325545799680.0, 'train_loss': 1.469979932320382, 'epoch': 5.0})

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/MarianMT_dutch_model_weights.pth')
# shutil.copy('MarianMT_dutch_model_weights.pth', '/content/drive/MyDrive')

In [49]:
from transformers import pipeline

In [94]:
model_name = "Helsinki-NLP/opus-mt-en-nl"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model.load_state_dict(torch.load('/content/drive/MyDrive/MarianMT_dutch_model_weights.pth'),strict=False)



<All keys matched successfully>

In [96]:
translations = []
Dutch_outputs_df = pd.read_csv('Dutch outputs_MarianMT.csv')
src_text = Dutch_outputs_df['English'].to_list()
model.to('cuda')

translator = pipeline("translation_en_to_nl", model=model, tokenizer=tokenizer,device = 'cuda')
translated_texts = [translator(text)[0]['translation_text'] for text in src_text]

Dutch_outputs_df['MarianMT-finetuned']  = translated_texts

In [99]:
Dutch_outputs_df.to_csv('Dutch outputs_MarianMT_finetuned.csv',index=False)

In [98]:
# Compute metrics

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

# Dutch model

predictions = Dutch_outputs_df['MarianMT-finetuned'].apply(lambda x: x[0] if isinstance(x, list) else x)
references = Dutch_outputs_df['Dutch'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for Dutch Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for Dutch Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for Dutch Model: {meteor_score}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


BLEU score for Dutch Model: {'bleu': 0.2070843491637414, 'precisions': [0.5238598388047966, 0.25428265524625265, 0.14892884569552206, 0.09269972451790634], 'brevity_penalty': 1.0, 'length_ratio': 1.0053111336182408, 'translation_length': 40696, 'reference_length': 40481}
ROUGE score for Dutch Model: {'rouge1': 0.5181524404497124, 'rouge2': 0.2702332215846026, 'rougeL': 0.45348876259616644, 'rougeLsum': 0.45335005224894187}
METEOR score for Dutch Model: {'meteor': 0.4587905497275448}


#### German model

In [65]:
#### Cell to run


model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
# torch.quantization.prepare(model, inplace=True)

english_german_texts = list(zip(German_train_data['English'].tolist(), German_train_data['German'].tolist()))

german_train_datasets = TranslationDataset(english_german_texts, tokenizer, "en", "de")
# german_train_dataloader = DataLoader(german_datasets, batch_size=128, shuffle=True)

english_german_texts = list(zip(German_val_data['English'].tolist(), German_val_data['German'].tolist()))  # For German translation
german_eval_datasets = TranslationDataset(english_german_texts, tokenizer, "en", "de")
# german_eval_dataloader = DataLoader(german_datasets, batch_size=128, shuffle=True)



In [62]:
##### Run this cell

# from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir='./logs',
    push_to_hub=False,
    report_to='none',
    logging_steps=10,
    dataloader_num_workers=2,
)




In [66]:
##### Run this cell

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=german_train_datasets,
    eval_dataset=german_eval_datasets,
    tokenizer=tokenizer,
)

In [67]:
trainer.train()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,1.596,1.506231
2,1.2995,1.32962
3,1.1764,1.264535
4,1.1256,1.237693
5,0.9441,1.229653


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[58100]], 'fo

TrainOutput(global_step=14775, training_loss=1.3874550982054115, metrics={'train_runtime': 852.17, 'train_samples_per_second': 69.352, 'train_steps_per_second': 17.338, 'total_flos': 2003393367244800.0, 'train_loss': 1.3874550982054115, 'epoch': 5.0})

In [68]:
torch.save(model.state_dict(), '/content/drive/MyDrive/MarianMT_german_model_weights.pth')
# shutil.copy('MarianMT_dutch_model_weights.pth', '/content/drive/MyDrive')

In [89]:
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model.load_state_dict(torch.load('/content/drive/MyDrive/MarianMT_german_model_weights.pth'),strict=False)



<All keys matched successfully>

In [91]:
translations = []
German_outputs_df = pd.read_csv('German outputs_MarianMT.csv')
src_text = German_outputs_df['English'].to_list()
model.to('cuda')

translator = pipeline("translation_en_to_de", model=model, tokenizer=tokenizer,device = 'cuda')
translated_texts = [translator(text)[0]['translation_text'] for text in src_text]

German_outputs_df['MarianMT-finetuned']  = translated_texts

In [92]:
German_outputs_df.to_csv('German outputs_MarianMT_finetuned.csv',index=False)

In [93]:
# Compute metrics

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

# Dutch model

predictions = German_outputs_df['MarianMT-finetuned'].apply(lambda x: x[0] if isinstance(x, list) else x)
references = German_outputs_df['German'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for German Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for German Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for German Model: {meteor_score}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


BLEU score for German Model: {'bleu': 0.21858831789182193, 'precisions': [0.5290947174823746, 0.26721529068224054, 0.15956124112397424, 0.1019067918658909], 'brevity_penalty': 0.9982645096493903, 'length_ratio': 0.9982660138718891, 'translation_length': 39148, 'reference_length': 39216}
ROUGE score for German Model: {'rouge1': 0.5045687844070336, 'rouge2': 0.2779183874667054, 'rougeL': 0.45437414486358935, 'rougeLsum': 0.4545721738380622}
METEOR score for German Model: {'meteor': 0.47222057766621955}


#### Spanish Model

In [80]:
#### Cell to run

model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
# torch.quantization.prepare(model, inplace=True)

english_spanish_texts = list(zip(Spanish_train_data['English'].tolist(), Spanish_train_data['Spanish'].tolist()))

spanish_train_datasets = TranslationDataset(english_spanish_texts, tokenizer, "en", "es")
# spanish_train_dataloader = DataLoader(spanish_datasets, batch_size=128, shuffle=True)

english_spanish_texts = list(zip(Spanish_val_data['English'].tolist(), Spanish_val_data['Spanish'].tolist()))  # For Spanish translation
spanish_eval_datasets = TranslationDataset(english_spanish_texts, tokenizer, "en", "es")
# spanish_eval_dataloader = DataLoader(spanish_datasets, batch_size=128, shuffle=True)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [81]:
##### Run this cell

# from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir='./logs',
    push_to_hub=False,
    report_to='none',
    logging_steps=10,
    dataloader_num_workers=2,
)




In [82]:
##### Run this cell

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=german_train_datasets,
    eval_dataset=german_eval_datasets,
    tokenizer=tokenizer,
)

In [83]:
trainer.train()

  self.pid = os.fork()


Epoch,Training Loss,Validation Loss
1,3.8248,3.677544
2,3.2753,3.209465


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'fo

Epoch,Training Loss,Validation Loss
1,3.8248,3.677544
2,3.2753,3.209465
3,3.1096,2.988768
4,2.8921,2.879969
5,2.859,2.842788


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
  self.pid = os.fork()
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'fo

TrainOutput(global_step=14775, training_loss=3.4441784489820453, metrics={'train_runtime': 860.023, 'train_samples_per_second': 68.719, 'train_steps_per_second': 17.18, 'total_flos': 2003393367244800.0, 'train_loss': 3.4441784489820453, 'epoch': 5.0})

In [85]:
torch.save(model.state_dict(), '/content/drive/MyDrive/MarianMT_spanish_model_weights.pth')
# shutil.copy('MarianMT_dutch_model_weights.pth', '/content/drive/MyDrive')

In [86]:
translations = []
Spanish_outputs_df = pd.read_csv('Spanish outputs_MarianMT.csv')
src_text = Spanish_outputs_df['English'].to_list()
model.to('cuda')

translator = pipeline("translation_en_to_es", model=model, tokenizer=tokenizer)
translated_texts = [translator(text)[0]['translation_text'] for text in src_text]

Spanish_outputs_df['MarianMT-finetuned']  = translated_texts

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [87]:
Spanish_outputs_df.to_csv('Spanish outputs_MarianMT_finetuned.csv',index=False)

In [88]:
# Compute metrics

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

# Dutch model

predictions = Spanish_outputs_df['MarianMT-finetuned'].apply(lambda x: x[0] if isinstance(x, list) else x)
references = Spanish_outputs_df['Spanish'].to_list()


bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print(f"BLEU score for spanish Model: {bleu_score}")

# Compute ROUGE score
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
print(f"ROUGE score for spanish Model: {rouge_score}")

# Compute METEOR score
meteor_score = meteor_metric.compute(predictions=predictions, references=references)
print(f"METEOR score for spanish Model: {meteor_score}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


BLEU score for spanish Model: {'bleu': 0.010400569254565567, 'precisions': [0.11136637499014607, 0.012230734874772392, 0.00426349816529792, 0.0020149103364900263], 'brevity_penalty': 1.0, 'length_ratio': 2.018342993521991, 'translation_length': 88797, 'reference_length': 43995}
ROUGE score for spanish Model: {'rouge1': 0.13546525599457826, 'rouge2': 0.026081957756840357, 'rougeL': 0.11003373250781623, 'rougeLsum': 0.10979627457516289}
METEOR score for spanish Model: {'meteor': 0.13811856773690753}
