In [1]:
# Load csv file
import pandas as pd

df = pd.read_csv("reviews.csv")

In [2]:
df.shape

(568454, 10)

In [3]:
list(df.columns)

['Id',
 'ProductId',
 'UserId',
 'ProfileName',
 'HelpfulnessNumerator',
 'HelpfulnessDenominator',
 'Score',
 'Time',
 'Summary',
 'Text']

In [4]:
# Remove missing values n duplicates

df = df[["Text", "Summary"]].dropna().drop_duplicates()

In [5]:
df.shape

(394967, 2)

In [6]:
# Add length columns
df["text_len"] = df["Text"].apply(lambda x: len(str(x).split()))
df["summary_len"] = df["Summary"].apply(lambda x: len(str(x).split()))


In [7]:
# Filter: keep only reasonably long reviews and summaries
filtered_df = df[(df["text_len"] > 100) & (df["summary_len"] > 5)]

# Sample 10,000 rows randomly
sampled_df = filtered_df.sample(n=1000, random_state=42).reset_index(drop=True)

# Drop extra columns now
final_df = sampled_df[["Text", "Summary"]]

In [8]:
# Check shape
print(f"✅ Final dataset size: {final_df.shape}")
print(final_df.head(3))

✅ Final dataset size: (1000, 2)
                                                Text  \
0  My daughter is 7 months old, and we started ba...   
1  I starter drinking Milo since I was two years ...   
2  Ingredients: Spices, salt, onion, paprika, gar...   

                                             Summary  
0  portable, liked by a kid who doesn't like gree...  
1  Liked It When I was a Kid; Enjoying it Now tha...  
2  Here are the actual ingredients from the packa...  


### Prepare the data for T5 format

In [9]:
# Add T5-format input and target columns
final_df["input_text"] = "summarize: " + final_df["Text"]
final_df["target_text"] = final_df["Summary"]

# Drop old columns to keep only what's needed
t5_df = final_df[["input_text", "target_text"]]
print(t5_df.head(2))


                                          input_text  \
0  summarize: My daughter is 7 months old, and we...   
1  summarize: I starter drinking Milo since I was...   

                                         target_text  
0  portable, liked by a kid who doesn't like gree...  
1  Liked It When I was a Kid; Enjoying it Now tha...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["input_text"] = "summarize: " + final_df["Text"]


In [10]:
t5_df.to_csv("t5_amazon_10k.csv", index=False)

### Tokenize the dataset for T5-small

In [11]:
## Load Tokenizer and Define Preprocessing Function

from transformers import AutoTokenizer

# Load the tokenizer for T5-small
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Define preprocessing/tokenization function
def preprocess_function(example):
    model_inputs = tokenizer(
        example["input_text"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    labels = tokenizer(
        example["target_text"],
        max_length=64,
        padding="max_length",
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [12]:
## Convert pandas DataFrame to Hugging Face Dataset

from datasets import Dataset

# Convert to Hugging Face Dataset object
dataset = Dataset.from_pandas(t5_df)

# Tokenize
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Check sample
tokenized_dataset[0]


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'input_text': "summarize: My daughter is 7 months old, and we started baby food right before 6 months. She has never been a fan of most green things (spring veggie mixes, green beans, etc) even though we keep trying them. But she likes this! It tastes more like pear sauce than anything (I tried it).<br /><br />It's also nicely portable. My little one is too young to squeeze it right into her mouth, but I can carefully squeeze some onto a spoon if we're out (if I'm home, I usually just squeeze it into a bowl). I also reseals well (the cap screws back on) and is easily tossed back into the diaper back if it's not finished. I would definitely buy this again and in other flavors.",
 'target_text': "portable, liked by a kid who doesn't like green things",
 'input_ids': [21603,
  10,
  499,
  3062,
  19,
  489,
  767,
  625,
  6,
  11,
  62,
  708,
  1871,
  542,
  269,
  274,
  431,
  767,
  5,
  451,
  65,
  470,
  118,
  3,
  9,
  1819,
  13,
  167,
  1442,
  378,
  41,
  14662,
  30642,

### Fine-Tune t5-small Using Hugging Face Trainer

In [13]:
## Load the model

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [14]:
## Define ROUGE for summarization

import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects newline after each sentence
    decoded_preds = ["\n".join(pred.strip().split(". ")) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split(". ")) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 2) for k, v in result.items()}


In [15]:
## Split dataset

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [16]:
## TrainingArguments + Trainer

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./t5_summarizer_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # logging_dir="./logs",
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
    # predict_with_generate=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)




  trainer = Trainer(


In [17]:
## TRain the model

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=100, training_loss=3.4182504272460936, metrics={'train_runtime': 551.7132, 'train_samples_per_second': 1.45, 'train_steps_per_second': 0.181, 'total_flos': 108273441177600.0, 'train_loss': 3.4182504272460936, 'epoch': 1.0})

In [24]:
# Save the model and tokenizer
trainer.save_model("./t5_summarizer_model")
tokenizer.save_pretrained("./t5_summarizer_model")


('./t5_summarizer_model\\tokenizer_config.json',
 './t5_summarizer_model\\special_tokens_map.json',
 './t5_summarizer_model\\spiece.model',
 './t5_summarizer_model\\added_tokens.json',
 './t5_summarizer_model\\tokenizer.json')

## Run Predictions with Fine-Tuned Model

In [25]:
## Load some test examples

# Take some test samples from your original dataset (not tokenized)
import random

sample_texts = final_df.sample(5, random_state=42)["Text"].tolist()

In [26]:
## Define a prediction function

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the saved model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./t5_summarizer_model")
tokenizer = T5Tokenizer.from_pretrained("./t5_summarizer_model")
model.eval()

def summarize(text, max_input_length=512, max_target_length=64):
    input_text = "summarize: " + text
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    ).to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [29]:
## Run predictions

for i, text in enumerate(sample_texts):
    print(f"\n🔹 Review {i+1}:")
    print(f" Original:\n{text[:500]}...\n")
    print(f" Summary:\n{summarize(text)}")


🔹 Review 1:
 Original:
Ok, my kid is VERY finicky. Things have to be made a certain way, etc. She also has food allergies, which in turn exacerbate her eczema, as well as seriously limiting what she can eat, even healthy stuff. Flax is good for eczema, but she doesn't like it added to cereal, won't do the oil, even when mixed w/maple syrup, etc. Enter these WONDERFUL crackers. Dehydrated at low temps to preserve the oil (& thus all the nutritional benefits), with no artificial colors or flavors, no gluten or other com...

 Summary:


🔹 Review 2:
 Original:
I've tried this tea by the bag and it does tend to be milder than your average supermarket Tetley or Lipton. But I've been drinking Bigelow Green Tea that way for about a year and I wanted to revisit Earl Grey via a Keurig-clone machine I got for my birthday over the summer. After I went through a couple of "variety packs" I picked up an Ekobrew Refillable basket and started drinking expresso done that way for my morning coffee. I a