# Libraries Used

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
import evaluate
import random

# Relative Paths for models and data

In [None]:
models_path = "../models/"
data_path = "../data/"

# Loading and Preprocessing data

In [None]:
df = pd.read_csv(
    data_path + "FilteredReviews.csv", usecols=["Id", "Summary", "Text", "ProductId"]
)
df.dropna(subset=["Summary", "Text"], inplace=True)

df = df.sample(10000, random_state=42)

df = df.rename(columns={"Summary": "target_text", "Text": "input_text"})

dataset = Dataset.from_pandas(df)

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Step 4: Preprocessing function (from this paper for encoding summarization task page 47: https://arxiv.org/pdf/1910.10683)


def preprocess_function(example):
    input_text = "summarize: " + example["input_text"]
    model_inputs = tokenizer(
        input_text, max_length=512, truncation=True, padding="max_length"
    )

    labels = tokenizer(
        example["target_text"], max_length=64, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=False)

# Train/test Split
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Finetuning T5

In [4]:
training_args = TrainingArguments(
    output_dir=models_path + "t5_summarizer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=models_path + "logs",
    logging_steps=100,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained(models_path + "t5_summarizer_balanced")
tokenizer.save_pretrained(models_path + "t5_summarizer_balanced")

  0%|          | 0/3375 [00:00<?, ?it/s]

{'loss': 3.57, 'grad_norm': 1.0318259000778198, 'learning_rate': 4.8562962962962964e-05, 'epoch': 0.09}
{'loss': 0.6033, 'grad_norm': 0.8597363829612732, 'learning_rate': 4.708148148148148e-05, 'epoch': 0.18}
{'loss': 0.4909, 'grad_norm': 0.7873992919921875, 'learning_rate': 4.5600000000000004e-05, 'epoch': 0.27}
{'loss': 0.4401, 'grad_norm': 0.6481276750564575, 'learning_rate': 4.411851851851852e-05, 'epoch': 0.36}
{'loss': 0.4497, 'grad_norm': 0.5863837003707886, 'learning_rate': 4.263703703703704e-05, 'epoch': 0.44}
{'loss': 0.4617, 'grad_norm': 0.5111042261123657, 'learning_rate': 4.115555555555556e-05, 'epoch': 0.53}
{'loss': 0.4511, 'grad_norm': 0.6567074656486511, 'learning_rate': 3.967407407407408e-05, 'epoch': 0.62}
{'loss': 0.4448, 'grad_norm': 0.7687165141105652, 'learning_rate': 3.8192592592592594e-05, 'epoch': 0.71}
{'loss': 0.45, 'grad_norm': 0.45338940620422363, 'learning_rate': 3.671111111111111e-05, 'epoch': 0.8}
{'loss': 0.4129, 'grad_norm': 0.4568389654159546, 'learn

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.40407595038414, 'eval_runtime': 11.4445, 'eval_samples_per_second': 87.378, 'eval_steps_per_second': 10.922, 'epoch': 1.0}
{'loss': 0.4319, 'grad_norm': 0.5494844317436218, 'learning_rate': 3.226666666666667e-05, 'epoch': 1.07}
{'loss': 0.4128, 'grad_norm': 0.5478624701499939, 'learning_rate': 3.078518518518519e-05, 'epoch': 1.16}
{'loss': 0.426, 'grad_norm': 0.4588949382305145, 'learning_rate': 2.9303703703703704e-05, 'epoch': 1.24}
{'loss': 0.4261, 'grad_norm': 0.5116302371025085, 'learning_rate': 2.782222222222222e-05, 'epoch': 1.33}
{'loss': 0.4145, 'grad_norm': 0.43846842646598816, 'learning_rate': 2.6340740740740744e-05, 'epoch': 1.42}
{'loss': 0.4048, 'grad_norm': 1.2250356674194336, 'learning_rate': 2.485925925925926e-05, 'epoch': 1.51}
{'loss': 0.4215, 'grad_norm': 0.6696498990058899, 'learning_rate': 2.337777777777778e-05, 'epoch': 1.6}
{'loss': 0.4101, 'grad_norm': 0.5202645063400269, 'learning_rate': 2.1896296296296297e-05, 'epoch': 1.69}
{'loss': 0.4105, 'g

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3983560800552368, 'eval_runtime': 16.7363, 'eval_samples_per_second': 59.75, 'eval_steps_per_second': 7.469, 'epoch': 2.0}
{'loss': 0.4049, 'grad_norm': 0.5594578981399536, 'learning_rate': 1.597037037037037e-05, 'epoch': 2.04}
{'loss': 0.4007, 'grad_norm': 0.5810642242431641, 'learning_rate': 1.448888888888889e-05, 'epoch': 2.13}
{'loss': 0.4294, 'grad_norm': 0.5252114534378052, 'learning_rate': 1.3007407407407407e-05, 'epoch': 2.22}
{'loss': 0.4099, 'grad_norm': 0.5973052978515625, 'learning_rate': 1.1525925925925926e-05, 'epoch': 2.31}
{'loss': 0.405, 'grad_norm': 0.5804758071899414, 'learning_rate': 1.0044444444444446e-05, 'epoch': 2.4}
{'loss': 0.4284, 'grad_norm': 0.5606054663658142, 'learning_rate': 8.562962962962962e-06, 'epoch': 2.49}
{'loss': 0.4244, 'grad_norm': 0.7815588712692261, 'learning_rate': 7.081481481481482e-06, 'epoch': 2.58}
{'loss': 0.3999, 'grad_norm': 0.40906208753585815, 'learning_rate': 5.600000000000001e-06, 'epoch': 2.67}
{'loss': 0.4176, 'g

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3964140713214874, 'eval_runtime': 11.8643, 'eval_samples_per_second': 84.287, 'eval_steps_per_second': 10.536, 'epoch': 3.0}
{'train_runtime': 953.5335, 'train_samples_per_second': 28.316, 'train_steps_per_second': 3.539, 'train_loss': 0.5228604888916015, 'epoch': 3.0}


('../models/t5_summarizer_balanced\\tokenizer_config.json',
 '../models/t5_summarizer_balanced\\special_tokens_map.json',
 '../models/t5_summarizer_balanced\\spiece.model',
 '../models/t5_summarizer_balanced\\added_tokens.json')

# Testing finetune T5

# Evaluation Metrics

In [8]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initial model Loading
initial_model = T5ForConditionalGeneration.from_pretrained("t5-small")
initial_model.to(device)

# Finetuned model Loading
finetuned_model_path = "../models/t5_summarizer_balanced"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path)
finetuned_model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [10]:
def evaluate_model(
    model_to_eval, dataset, tokenizer, max_input_length=512, max_target_length=64
):
    model_to_eval.eval()
    predictions = []
    references = []

    for example in dataset.select(range(200)):
        input_text = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        input_ids = input_ids.to(model_to_eval.device)
        with torch.no_grad():
            output_ids = model_to_eval.generate(input_ids, max_length=max_target_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example["target_text"])

    results = {}
    # ROUGE-L
    rouge_results = rouge.compute(
        predictions=predictions, references=references, use_stemmer=True
    )
    results["rougeL"] = rouge_results["rougeL"]
    # BLEU score
    bleu_results = bleu.compute(
        predictions=predictions, references=[[ref] for ref in references]
    )
    results["bleu"] = bleu_results["bleu"]
    # METEOR score
    meteor_results = meteor.compute(predictions=predictions, references=references)
    results["meteor"] = meteor_results["meteor"]

    return results

In [11]:
# Initial model
initial_results = evaluate_model(initial_model, eval_dataset, tokenizer)
print("Initial Model Metrics:")
print(f"ROUGE-L (Initial pre-trained model): {initial_results['rougeL']:.4f}")
print(f"BLEU (Initial pre-trained model): {initial_results['bleu']:.4f}")
print(f"METEOR (Initial pre-trained model): {initial_results['meteor']:.4f}")

# Fine-tuned model
finetuned_results = evaluate_model(finetuned_model, eval_dataset, tokenizer)
print("\nFine-tuned Model Metrics:")
print(f"ROUGE-L (Fine-tuned model): {finetuned_results['rougeL']:.4f}")
print(f"BLEU (Fine-tuned model): {finetuned_results['bleu']:.4f}")
print(f"METEOR (Fine-tuned model): {finetuned_results['meteor']:.4f}")

Initial Model Metrics:
ROUGE-L (Initial pre-trained model): 0.1027
BLEU (Initial pre-trained model): 0.0059
METEOR (Initial pre-trained model): 0.1401

Fine-tuned Model Metrics:
ROUGE-L (Fine-tuned model): 0.1614
BLEU (Fine-tuned model): 0.0289
METEOR (Fine-tuned model): 0.0999


In [None]:
def sample_outputs(
    initial_model,
    finetuned_model,
    dataset,
    tokenizer,
    device,
    sample_size=5,
    max_input_length=512,
    max_target_length=64,
    target_text=True,
):

    indices = list(range(len(dataset)))
    random_indices = random.sample(indices, sample_size)

    for idx in random_indices:
        example = dataset[idx]
        input_str = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(
            input_str, return_tensors="pt", truncation=True, max_length=max_input_length
        ).to(device)

        # Initial model
        with torch.no_grad():
            initial_output_ids = initial_model.generate(
                input_ids, max_length=max_target_length
            )
        initial_output = tokenizer.decode(
            initial_output_ids[0], skip_special_tokens=True
        )

        # Fine-tuned model
        with torch.no_grad():
            finetuned_output_ids = finetuned_model.generate(
                input_ids, max_length=max_target_length
            )
        finetuned_output = tokenizer.decode(
            finetuned_output_ids[0], skip_special_tokens=True
        )

        # Display the outputs
        print("=" * 50)
        print(f"Example ID: {idx}")
        print("Input Text:")
        print(example["input_text"])
        if target_text:
            print("\nReference Summary:")
            print(example["target_text"])
        print("\nInitial Model Output:")
        print(initial_output)
        print("\nFine-tuned Model Output:")
        print(finetuned_output)
        print("=" * 50 + "\n")

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

sample_outputs(initial_model, model, eval_dataset, tokenizer, device, sample_size=5)

Based on some of the generated summaries, We can notice that there might be data imbalance as most of the fine-tuned outputs are positive even if the review is negative. Further investigation required. Also, some of the reviews from the dataset does not have a good summarization. Some examples are mentioned below:

==================================================

Example ID: 25 (Irrelevant summary)

Input Text:
Good flavor, but not real sweet.  I add a little stevia for my sweet tooth.

Reference Summary:
Mom of 5

Initial Model Output:
good flavor, but not real sweet. add a little stevia for my sweet tooth.

Fine-tuned Model Output:
Good flavor, but not real sweet

==================================================

==================================================

Example ID: 759 (Imbalanced data)

Input Text:
This Item Taste Like Dirt.. I've Prob Used it 4 Times & Now It's Just Sitting in MY Freezer.. I Have A High Tolerance for Nasty Stuff.. Just Don't Really Like this Product.. Something In Grinding It Up Makes It Taste Nasty.. The Hulled Seeds Nutiva Sells Are Way Better.. If You Want Good Tasting Hemp Protein Powder It's $15/lb @ Earthshiftproducts.com  but It Taste Wayyy Better Actually Taste Good From Earthshift..

Reference Summary:
Taste Really Gross

Initial Model Output:
I've Prob Used it 4 times & Now It's Just Sitting in MY Freezer.. I have a high tolerance for Nasty Stuff..

Fine-tuned Model Output:
Good Taste

==================================================

# Testing on test dataset for presentation

In [5]:
test_df = pd.read_csv(data_path + "test_final_presentation.csv", usecols=["Review"])
test_df.rename(columns={"Review": "input_text"}, inplace=True)

test_dataset = Dataset.from_pandas(test_df)  # Convert to Hugging Face Dataset format

sample_outputs(
    initial_model,
    finetuned_model,
    test_dataset,
    finetuned_tokenizer,
    device,
    sample_size=13,
    target_text=False,
)

Example ID: 8
Input Text:
I've been looking for a tasty cola that does not contain aspertame. This drink does not meet these standards. Tasty it is not. I tried to drink it and served it to four family members. They all disliked it. Zevia gets two stars though because it is an effective cleaner when it is mixed with baking soda.

Initial Model Output:
cola does not contain aspertame. it is an effective cleaner when mixed with baking soda.

Fine-tuned Model Output:
Not aspertame

Example ID: 7
Input Text:
This cream soda is delicious but beware of packaging because one can was almost empty and the low box and plastic wrapping it was banged up. I suggest just buying it at a brick and morter store like walmart.

Initial Model Output:
cream soda is delicious but beware of packaging because one can was almost empty.

Fine-tuned Model Output:
Cream soda

Example ID: 2
Input Text:
Love this drink with a little squeeze of lemon. Other people I have shared this with enjoy the squeeze of lemon a