# Libraries Used

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import evaluate

# Relative Paths for models and data

In [2]:
models_path = "../models/"
data_path = "../data/"

# Loading and Preprocessing data

In [3]:
df = pd.read_csv(data_path + "filtered_reviews.csv", usecols=["Id","Summary", "Text", "ProductId"])
df.dropna(subset=["Summary", "Text"], inplace=True)

df = df.sample(10000, random_state=42) # Testing fintetuning with a small dataset currently

df = df.rename(columns={"Summary": "target_text", "Text": "input_text"})

dataset = Dataset.from_pandas(df) # converting to HF Dataset format since we will be using T5 transformer model from HF

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Step 4: Preprocessing function (from this paper for encoding summarization task page 47: https://arxiv.org/pdf/1910.10683)
def preprocess_function(example):
    input_text = "summarize: " + example["input_text"]
    model_inputs = tokenizer(
        input_text, max_length=512, truncation=True, padding="max_length"
    )

    labels = tokenizer(
        example["target_text"], max_length=64, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=False)

# Train/test Split
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Finetuning T5

In [5]:
training_args = TrainingArguments(
    output_dir= models_path + "t5_summarizer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir= models_path + "logs",
    logging_steps=100,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained(models_path + "t5_summarizer")
tokenizer.save_pretrained(models_path + "t5_summarizer")


  0%|          | 0/3375 [00:00<?, ?it/s]

{'loss': 3.7543, 'grad_norm': 1.6389862298965454, 'learning_rate': 4.8592592592592596e-05, 'epoch': 0.09}
{'loss': 0.5825, 'grad_norm': 1.141833782196045, 'learning_rate': 4.711111111111111e-05, 'epoch': 0.18}
{'loss': 0.476, 'grad_norm': 0.7669867873191833, 'learning_rate': 4.5629629629629636e-05, 'epoch': 0.27}
{'loss': 0.4486, 'grad_norm': 0.7971938252449036, 'learning_rate': 4.414814814814815e-05, 'epoch': 0.36}
{'loss': 0.4361, 'grad_norm': 0.5160514116287231, 'learning_rate': 4.266666666666667e-05, 'epoch': 0.44}
{'loss': 0.4571, 'grad_norm': 0.5493866205215454, 'learning_rate': 4.1185185185185186e-05, 'epoch': 0.53}
{'loss': 0.4261, 'grad_norm': 0.5630396008491516, 'learning_rate': 3.97037037037037e-05, 'epoch': 0.62}
{'loss': 0.4342, 'grad_norm': 0.4969012141227722, 'learning_rate': 3.8222222222222226e-05, 'epoch': 0.71}
{'loss': 0.4169, 'grad_norm': 0.5673686861991882, 'learning_rate': 3.674074074074074e-05, 'epoch': 0.8}
{'loss': 0.4256, 'grad_norm': 0.7762271165847778, 'lear

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.39003264904022217, 'eval_runtime': 10.8735, 'eval_samples_per_second': 91.967, 'eval_steps_per_second': 11.496, 'epoch': 1.0}
{'loss': 0.4242, 'grad_norm': 0.5085411071777344, 'learning_rate': 3.22962962962963e-05, 'epoch': 1.07}
{'loss': 0.4209, 'grad_norm': 0.6219936609268188, 'learning_rate': 3.0814814814814816e-05, 'epoch': 1.16}
{'loss': 0.4126, 'grad_norm': 0.6410334706306458, 'learning_rate': 2.9333333333333336e-05, 'epoch': 1.24}
{'loss': 0.4113, 'grad_norm': 0.5282003283500671, 'learning_rate': 2.7851851851851853e-05, 'epoch': 1.33}
{'loss': 0.4157, 'grad_norm': 0.5552299618721008, 'learning_rate': 2.6370370370370373e-05, 'epoch': 1.42}
{'loss': 0.4068, 'grad_norm': 0.8922046422958374, 'learning_rate': 2.488888888888889e-05, 'epoch': 1.51}
{'loss': 0.4087, 'grad_norm': 0.522246778011322, 'learning_rate': 2.340740740740741e-05, 'epoch': 1.6}
{'loss': 0.4004, 'grad_norm': 0.746623694896698, 'learning_rate': 2.1925925925925926e-05, 'epoch': 1.69}
{'loss': 0.4186, 

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3831178545951843, 'eval_runtime': 11.2894, 'eval_samples_per_second': 88.579, 'eval_steps_per_second': 11.072, 'epoch': 2.0}
{'loss': 0.3925, 'grad_norm': 0.5014961361885071, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.04}
{'loss': 0.4093, 'grad_norm': 0.4830264449119568, 'learning_rate': 1.4518518518518521e-05, 'epoch': 2.13}
{'loss': 0.4062, 'grad_norm': 0.8073115944862366, 'learning_rate': 1.3037037037037036e-05, 'epoch': 2.22}
{'loss': 0.4107, 'grad_norm': 0.5469932556152344, 'learning_rate': 1.1555555555555556e-05, 'epoch': 2.31}
{'loss': 0.3978, 'grad_norm': 0.5385692715644836, 'learning_rate': 1.0074074074074074e-05, 'epoch': 2.4}
{'loss': 0.3982, 'grad_norm': 0.736526608467102, 'learning_rate': 8.592592592592593e-06, 'epoch': 2.49}
{'loss': 0.397, 'grad_norm': 0.6076058149337769, 'learning_rate': 7.111111111111112e-06, 'epoch': 2.58}
{'loss': 0.4044, 'grad_norm': 0.4985710680484772, 'learning_rate': 5.62962962962963e-06, 'epoch': 2.67}
{'loss': 0.4104, '

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3816366195678711, 'eval_runtime': 11.2954, 'eval_samples_per_second': 88.532, 'eval_steps_per_second': 11.066, 'epoch': 3.0}
{'train_runtime': 864.1731, 'train_samples_per_second': 31.244, 'train_steps_per_second': 3.905, 'train_loss': 0.5195833700674551, 'epoch': 3.0}


('../models/t5_summarizer\\tokenizer_config.json',
 '../models/t5_summarizer\\special_tokens_map.json',
 '../models/t5_summarizer\\spiece.model',
 '../models/t5_summarizer\\added_tokens.json')

# Testing finetune T5

# Evaluation Metrics

In [None]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")  
meteor = evaluate.load("meteor")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initial model Loading
initial_model = T5ForConditionalGeneration.from_pretrained("t5-small")
initial_model.to(device)

# Finetuned model Loading
finetuned_model_path = "../models/t5_summarizer"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path)
finetuned_model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [14]:
def evaluate_model(model_to_eval, dataset, tokenizer, max_input_length=512, max_target_length=64):
    model_to_eval.eval()
    predictions = []
    references = []

    for example in dataset.select(range(200)):
        input_text = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=max_input_length)
        input_ids = input_ids.to(model_to_eval.device)
        with torch.no_grad():
            output_ids = model_to_eval.generate(input_ids, max_length=max_target_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example["target_text"])
    
    results = {}
    # ROUGE-L
    rouge_results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    results["rougeL"] = rouge_results["rougeL"]
    # BLEU score
    bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    results["bleu"] = bleu_results["bleu"]
    # METEOR score
    meteor_results = meteor.compute(predictions=predictions, references=references)
    results["meteor"] = meteor_results["meteor"]

    return results

# Initial model
initial_results = evaluate_model(initial_model, eval_dataset, tokenizer)
print("Initial Model Metrics:")
print(f"ROUGE-L (Initial pre-trained model): {initial_results['rougeL']:.4f}")
print(f"BLEU (Initial pre-trained model): {initial_results['bleu']:.4f}")
print(f"METEOR (Initial pre-trained model): {initial_results['meteor']:.4f}")

# Fine-tuned model
finetuned_results = evaluate_model(finetuned_model, eval_dataset, tokenizer)
print("\nFine-tuned Model Metrics:")
print(f"ROUGE-L (Fine-tuned model): {finetuned_results['rougeL']:.4f}")
print(f"BLEU (Fine-tuned model): {finetuned_results['bleu']:.4f}")
print(f"METEOR (Fine-tuned model): {finetuned_results['meteor']:.4f}")

Initial Model Metrics:
ROUGE-L (Initial pre-trained model): 0.0890
BLEU (Initial pre-trained model): 0.0036
METEOR (Initial pre-trained model): 0.1272

Fine-tuned Model Metrics:
ROUGE-L (Fine-tuned model): 0.1432
BLEU (Fine-tuned model): 0.0000
METEOR (Fine-tuned model): 0.0865


A 60% increase in ROUGE accuracy. However, BLEU and METEOR dropped significantly in accuracy. A higher BLEU score for the initial model supports the reasoning of data leakage for the initial model (which is possible since T5 HF model is trained on public available data - our dataset).  