In [None]:
# Install the necessary libraries
# NOTE(review): none of these installs pins a version, so a fresh run may pull
# different releases than the ones this notebook was written against - consider
# pinning. Using `%pip` instead of `!pip` would also make sure the packages are
# installed into the environment of the running kernel.
!pip install transformers datasets evaluate accelerate
!pip install torch
!pip install -U nltk
!pip install sacrebleu
!pip install wandb

In [None]:
# Log in to the Hugging Face Hub.
# The training arguments further down set push_to_hub=True and the trained
# model is pushed with trainer.push_to_hub(), so authenticate before training.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Log in to Weights & Biases (wandb).
# NOTE(review): no other cell in this notebook references wandb explicitly;
# presumably the trainer reports to it automatically once it is installed - confirm.
import wandb
wandb.login()

In [None]:
# Import the necessary libraries
# NOTE(review): AutoModelForSeq2SeqLM is listed twice in the first import, and
# TrainingArguments, Trainer and Dataset are not used by any cell visible in
# this notebook.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import csv
import evaluate
import numpy as np
import torch
# Last expression of the cell: reports whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Obtain the train and test datasets. Split the datasets for training.
# The WMT16 de-en "train" split is divided 80/20 into the "train" and "test"
# parts that the later cells tokenize and sample from. A fixed seed keeps the
# split identical across kernel restarts, and chaining the two calls means
# `train_data` is only ever bound to the already-split dataset.
train_data = load_dataset("wmt16", name="de-en", split="train").train_test_split(
    test_size=0.2,
    seed=42,
)
# The WMT16 "test" split is kept aside for the final translation/evaluation cells.
test_data = load_dataset("wmt16", name="de-en", split="test")

In [None]:
# Load the T5-model tokenizer
# `checkpoint` is reused by later cells: it is passed to the data collator and
# to AutoModelForSeq2SeqLM.from_pretrained when the model is loaded.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Preprocess the training data
# Keys used to pick the source and target sentence out of each translation pair.
source_lang = "en"
target_lang = "de"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate English to German: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Every entry of ``examples["translation"]`` is indexed with ``source_lang``
    and ``target_lang``. The source sentence gets ``prefix`` prepended and is
    passed to the tokenizer as the input text; the target sentence is passed
    as ``text_target``. The tokenizer is called with ``max_length=128`` and
    truncation enabled, and its result is returned unchanged.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(prefix + pair[source_lang])
        target_texts.append(pair[target_lang])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

# Apply preprocess_function to train_data in batches (the function receives a
# batch of examples per call because batched=True).
# NOTE(review): the next cell keeps only 50,000 rows per split, so tokenizing
# everything first may be more work than needed - consider subsetting earlier.
tokenized_data = train_data.map(preprocess_function, batched=True)

In [None]:
# Truncate the dataset
# Each split is shuffled with a fixed seed and cut down to at most SUBSET_SIZE
# examples. The size is clamped to the split length so that a split with fewer
# rows than SUBSET_SIZE is used whole instead of failing on out-of-range indices.
SUBSET_SIZE = 50000
SHUFFLE_SEED = 42


def _shuffled_subset(split, size=SUBSET_SIZE, seed=SHUFFLE_SEED):
    """Return up to `size` rows of `split` after a seeded shuffle."""
    return split.shuffle(seed=seed).select(range(min(size, len(split))))


small_train_dataset = _shuffled_subset(tokenized_data["train"])
small_test_dataset = _shuffled_subset(tokenized_data["test"])

In [None]:
# Pad the sentences
# The collator is handed to the trainer below as its `data_collator`.
# NOTE(review): `model` receives the checkpoint *string* here rather than a
# model object (the model itself is only loaded in a later cell) - confirm
# this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# Create a function to include the sacrebleu metric during training
# `metric` is read as a global by compute_metrics below.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Each label is additionally wrapped in a one-element list, so the second
    returned value is a list of lists.

    Returns:
        tuple ``(preds, labels)`` with the cleaned values.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict with ``"bleu"`` (the ``"score"`` entry of the metric result) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Labels were already handled this way; predictions can carry the
    # same marker, so both are mapped to the pad token before decoding. This
    # also keeps the marker from being counted as a generated token below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the T5 model
# The weights come from the same `checkpoint` as the tokenizer, and
# device_map="cuda" requests placement on the GPU at load time.
# NOTE(review): this presumably fails on a machine without CUDA - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda")
# Disabled alternative that falls back to the CPU when no GPU is available:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model = model.to(device)
# device

In [None]:
# Define the training hyperparameters and pass the training arguments
training_args = Seq2SeqTrainingArguments(
    # Local output directory; with push_to_hub=True below,
    # NOTE(review): presumably also the name of the Hub repository - confirm.
    output_dir="t5_trained_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # Evaluation predictions are generated sequences, which compute_metrics
    # decodes back to text for the BLEU computation.
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    # Requires the Hugging Face login performed at the top of the notebook.
    push_to_hub=True,
)

# Wire together the model, the 50,000-example train/eval subsets, the padding
# collator and the metric function defined in the earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Train the model
# Runs fine-tuning with the arguments configured above (2 epochs, evaluation
# at the end of each epoch via compute_metrics).
trainer.train()

In [None]:
# Push the model to the hugging face hub
# Needs the Hugging Face login from the top of the notebook.
trainer.push_to_hub()

In [None]:
# Load the model and tokenizer back from the Hub repository
# "Markie-TheHenry/t5_trained_model" and create a translator from the pipeline.
# NOTE(review): the repository id is hard-coded to one account; presumably it is
# the repository created by trainer.push_to_hub() above - confirm, and adjust
# when running under a different account.
# This rebinds the notebook-level names `tokenizer` and `model` that the
# training cells (compute_metrics, trainer) also use.
tokenizer = AutoTokenizer.from_pretrained("Markie-TheHenry/t5_trained_model")
model = AutoModelForSeq2SeqLM.from_pretrained("Markie-TheHenry/t5_trained_model")
translator = pipeline("translation_en_to_de", model=model, tokenizer=tokenizer)

In [None]:
# Translate the sentences in the test dataset and write them into a file.
# Each CSV row holds the English input, the model translation and the German
# ground truth.
# The `with` block guarantees the file is closed even if a translation raises,
# and newline='' is what the csv module expects so that it controls the line
# endings itself (otherwise extra blank rows can appear on some platforms).
# NOTE(review): the file is written with the platform default encoding, which
# matches how the following cell reads it back; consider an explicit
# encoding="utf-8" in both places.
with open('google_t5_trained_translation.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Sample Input', 'Translation', 'Ground Truth'])
    # Iterate over the rows directly so each example is fetched only once.
    for example in test_data:
        pair = example["translation"]
        text = pair["en"]
        output = translator(text)
        prediction = output[0]["translation_text"]
        ground_truth = pair["de"]
        writer.writerow([text, prediction, ground_truth])

In [None]:
# Read the predictions and references from the created csv file
# Column 1 holds the model translation and column 2 the ground truth;
# column 0 (the English input) is not needed for scoring.
filename = "google_t5_trained_translation.csv"
predictions = []
references = []
# newline='' lets the csv module handle line endings itself.
with open(filename, 'r', newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader, None)  # skip the header row (no error if the file is empty)
    for row in csvreader:
        if not row:
            # Blank lines come back as empty rows; skip them instead of
            # failing on row[1].
            continue
        predictions.append(row[1])
        references.append(row[2])

In [None]:
# Evaluate the bleu metric
# `predictions` and `references` are the parallel lists of strings read from
# the CSV file (one reference per prediction).
# NOTE(review): training used the "sacrebleu" metric while this cell loads
# "bleu"; the two presumably report on different scales - confirm before
# comparing the numbers.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print("Bleu Metric:", results["bleu"])

In [None]:
# Evaluate the meteor metric
# Uses the same `predictions` / `references` lists as the BLEU cell and rebinds
# `results`, so the BLEU result is no longer available afterwards.
meteor = evaluate.load('meteor')
results = meteor.compute(predictions=predictions, references=references)
print("Meteor Metric", results['meteor'])

In [None]:
 # Run the nvidia-smi command-line tool to show the current GPU status.
 !nvidia-smi