In [1]:
!pip install transformers
!pip install datasets
!pip install sacrebleu
!pip install sentencepiece
!pip install jiwer
!pip install evaluate



In [2]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric
import torch
from sacrebleu import corpus_bleu
import pandas as pd
from datasets import Dataset
import jiwer
import evaluate
import nltk

In [None]:
!mkdir -p "mbart"
%cd mbart

In [None]:
# Tokenizer
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Load data from files
train_data = pd.read_csv("train.hi", sep="\t", header=None, names=["input_text"])
train_labels = pd.read_csv("train.en", sep="\t", header=None, names=["target_text"])
test_data = pd.read_csv("test.hi", sep="\t", header=None, names=["input_text"])
test_labels = pd.read_csv("test.en", sep="\t", header=None, names=["target_text"])
valid_data = pd.read_csv("valid.hi", sep="\t", header=None, names=["input_text"])
valid_labels = pd.read_csv("valid.en", sep="\t", header=None, names=["target_text"])

# Combine input and target text into single DataFrames
train_df = pd.concat([train_data, train_labels], axis=1)
test_df = pd.concat([test_data, test_labels], axis=1)
valid_df = pd.concat([valid_data, valid_labels], axis=1)

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
valid_dataset = Dataset.from_pandas(valid_df)


In [3]:
# Tokenize function
def tokenize_data(example):
    input = tokenizer(example["input_text"], max_length=128, truncation=True, padding="max_length")
    target = tokenizer(example["target_text"], max_length=128, truncation=True, padding="max_length")
    return {"input_ids": input["input_ids"],
            "attention_mask": input["attention_mask"],
            "labels": target["input_ids"]}


In [None]:
# Map the tokenization function to the datasets
tokenized_train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=["input_text", "target_text"])
tokenized_valid_dataset = valid_dataset.map(tokenize_data, batched=True, remove_columns=["input_text", "target_text"])

In [None]:
# Model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [None]:
# Training arguments
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     predict_with_generate=True,
#     evaluation_strategy="epoch",
#     logging_dir="./logs",
#     save_strategy="epoch",
#     num_train_epochs=3,
#     learning_rate=5e-5,
#     weight_decay=0.01,
#     warmup_steps=500,
#     save_total_limit=3,
# )

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=1e-5, # lower learning rate
    weight_decay=0.03,
    warmup_steps=2000, # increase warmup steps
    save_total_limit=3,
    lr_scheduler_type="linear", # use linear learning rate schedule
    load_best_model_at_end=True,
    metric_for_best_model="loss", # use validation loss for early stopping
)


# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

Make sure to replace the dataset file paths with the paths to your dataset files. This code snippet will fine-tune the mBART model for 3 epochs with a learning rate of 5e-5 and a batch size of 4. You can adjust these hyperparameters based on your computational resources and the size of your dataset.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir -p mbart
%cd "mbart"

In [None]:
!mkdir -p results

In [None]:
# !cp -r '/content/mbart/results/checkpoint-5040' '/content/drive/MyDrive/'
!cp -r '/content/drive/MyDrive/checkpoint-5040' '/content/mbart/results'

In [5]:
%cd "Hinglish-English"

C:\Users\Mahendranath\Hinglish-English


In [4]:
pwd

'C:\\Users\\Mahendranath\\Hinglish-English'

In [6]:
# Load the fine-tuned model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("./CMU_27.71")  # Replace XXXX with the desired checkpoint number
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")


In [7]:
# Load data from files
test_data = pd.read_csv("27.71/test.hi", sep="\t", header=None, names=["input_text"])
test_labels = pd.read_csv("27.71/test.en", sep="\t", header=None, names=["target_text"])

# Combine input and target text into single DataFrames
test_df = pd.concat([test_data, test_labels], axis=1)

# Convert DataFrames to Datasets
test_dataset = Dataset.from_pandas(test_df)

# Map the tokenization function to the test dataset
tokenized_test_dataset = test_dataset.map(tokenize_data, batched=True, remove_columns=["input_text", "target_text"])

# Extract Hinglish and English sentences
test_hinglish = test_data["input_text"].tolist()
test_english = test_labels["target_text"].tolist()

# Reduce the size of the test data (e.g., use only the first 100 sentences)
test_hinglish_subset = test_hinglish[:15]
test_english_subset = test_english[:15]

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

In [8]:
# # Generate translations
translations = []
for text in test_hinglish:
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(**inputs)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translations.append(translation)


for i in range(15):
  print("Test : ",test_english[i])
  print("Translation : ",translations[i])

# Calculate sacreBLEU score
bleu_score = corpus_bleu(translations, [test_english]).score
print(f'sacreBLEU score: {bleu_score}')



# device = torch.cuda.current_device() if torch.cuda.is_available() else -1

# translated_sentences = []

# for sentence in test_hinglish_subset:
#     input_ids = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
#     translations = model.generate(input_ids, num_return_sequences=1, max_new_tokens=200, no_repeat_ngram_size=2)
#     translated_text = tokenizer.batch_decode(translations, skip_special_tokens=True)
#     translated_sentences.extend(translated_text)




Test :  hi
Translation :  hi
Test :  Marvel's The Avengers is a 2012 American superhero film based on the Marvel Comics superhero team of the same name, produced by Marvel Studios and distributed by Walt DisneyStudios Motion Pictures
Translation :  Marvel's The Avengers is a 2012 American superhero movie, based on the Marvel Comics superhero series, which stars Marvel's Standing Beauty and Walt Disney World animated by Pixar.
Test :  Hello. How are you? I am not entirely sure about what question to ask, so I'll just ask: do you think the critics were fair in their critique of the movie?
Translation :  Hello. How are you? I'm not sure exactly what to ask, so I'll just go through it: Do you think the characters were successful in the adaptation of the movie?
Test :  I agree with them that Ruffalo was great in the movie. 
Translation :  I agree they were great in the whole drama.
Test :  I agree them great in the Avengers   movie. 
Translation :  I agree, the Avengers were great in the mo

In [9]:
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [10]:
meteor = evaluate.load('meteor')

[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading omw-1.4: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [11]:
# Create an empty list to store results
results = []

# Iterate through each prediction-reference pair
for prediction, reference in zip(translations, test_english):
    result = meteor.compute(predictions=[prediction], references=[reference])
    results.append(result['meteor'])

In [12]:
# Calculate the average METEOR score for all pairs
average_meteor = sum(results) / len(results)

print("Average METEOR score:", average_meteor)

Average METEOR score: 0.5597351697692411


In [13]:
print("METEOR scores for each pair:", results)


METEOR scores for each pair: [0.5, 0.5406060606060606, 0.7269718698290126, 0.4766949152542372, 0.6974657995066158, 0.8675523349436391, 0.0, 0.6305084745762711, 0.5, 0.9914604139528151, 0.864795918367347, 0.9995, 0.8675523349436391, 0.8819444444444444, 0.25, 0.5033238366571701, 0.6914285714285714, 0.46875, 0.45731707317073167, 0.9375, 0.8819444444444444, 0.3703703703703703, 0.0, 0.7211538461538461, 0.2380952380952381, 0.5888888888888889, 0.0, 0.8627717391304347, 0.2380952380952381, 0.6918367346938775, 0.28615384615384615, 0.32763532763532766, 0.3333333333333333, 0.8819444444444444, 0.4206349206349207, 0.30340136054421774, 0.17241379310344826, 0.4213483146067415, 0.5095890410958904, 0.5439024390243904, 0.41296296296296303, 0.39821428571428563, 0.8861997310842195, 0.24109014675052406, 0.7471655328798186, 0.39473684210526316, 0.843770667989418, 0.7621951219512195, 0.9866898148148149, 0.9217759872424357, 0.28434684684684686, 0.5952380952380953, 0.8375715193897013, 0.5975308641975308, 0.1351

In [14]:
from sacrebleu import corpus_ter

In [15]:
# Calculate TER using SacreBLEU
ter_score = corpus_ter(translations, [test_english])

print(f'TER score: {ter_score.score}')

TER score: 58.912495498739645


In [16]:
# Calculate WER
wer_score=0
for i in range(len(test_english)):
    wer = jiwer.wer(translations[i], test_english[i])
    wer_score+=wer

print("WER:", wer_score/len(test_english))

WER: 0.670149089492499


Regarding the batch size and epochs, it depends on your computational resources and the size of your dataset. Since your dataset is relatively small, you can try increasing the number of epochs to 10 or even 20, and observe the performance on the validation set. If the performance on the validation set starts to degrade, you might be overfitting, and it’s time to stop training. As for the batch size, you can try increasing it to 8, 16, or 32, depending on your GPU memory. A larger batch size may speed up training and result in more stable convergence, but you may need to adjust the learning rate accordingly.

Remember that increasing the batch size may require more GPU memory, and increasing the number of epochs may lead to longer training times. You can experiment with different hyperparameters to find the best combination for your specific dataset and available resources.

Given that you have a 15 GB GPU, you can try the following hyperparameters:

per_device_train_batch_size: 8 or 16 (depending on memory usage)
num_train_epochs: 10 to 20 (monitor the validation loss to avoid overfitting)
learning_rate: 3e-5
Remember that increasing the batch size may require you to decrease the learning rate slightly. The suggested learning rate of 3e-5 should work well with a batch size of 8 or 16, but you might need to fine-tune the learning rate based on your specific case.

You can update the Seq2SeqTrainingArguments in the fine-tuning code with these new hyperparameters:

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,  # or 16
    per_device_eval_batch_size=8,  # or 16
    predict_with_generate=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_strategy="epoch",
    learning_rate=3e-5, # lower learning rate
    weight_decay=0.03,
    warmup_steps=2000, # increase warmup steps
    save_total_limit=3,
    lr_scheduler_type="linear", # use linear learning rate schedule
    load_best_model_at_end=True,
    metric_for_best_model="loss", # use validation loss for early stopping
)

It’s important to monitor the training process and the model’s performance on the validation set. If the validation loss starts increasing or plateaus, you can stop training and try different hyperparameters or adjust the number of epochs.

The Seq2SeqTrainer in the Hugging Face Transformers library will log the model’s performance on the validation set at the end of each epoch if you set the evaluation_strategy to "epoch" in the Seq2SeqTrainingArguments. In your case, it’s already set to "epoch".

To see the logged information, you can check the log files generated in the logging_dir specified in the Seq2SeqTrainingArguments. In your case, the log files will be saved in the ./logs directory.

You can also monitor the training progress in real-time by running tensorboard in your terminal:

In [None]:
tensorboard --logdir ./logs

After running the above command, open your browser and go to http://localhost:6006 to see the TensorBoard interface with the training and validation metrics plotted.

Keep an eye on the validation loss curve. If it starts increasing or plateaus, it may indicate that the model is overfitting or has reached its best performance on the validation set. You can then stop the training and try adjusting the number of epochs or other hyperparameters.