In [None]:
from datasets import load_dataset, Dataset
import random
from evaluate import load
import os
import pandas as pd
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
import kagglehub
import evaluate
import torch

In [None]:
# Download latest version of amazon fine food reviews dataset
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)

In [None]:
# parse the csv for the summary and text columns and use them for the input and target
df = pd.read_csv("Reviews.csv", usecols=["Id", "Summary", "Text", "ProductId"])
df.dropna(subset=["Summary", "Text"], inplace=True)
df = df.sample(20000, random_state=42)
df = df.rename(columns={"Summary": "target_text", "Text": "input_text"})
dataset = Dataset.from_pandas(df)
print(len(dataset))

In [None]:
# intialize the pretrained tokenizer and model
tokenizer = BartTokenizer.from_pretrained("lucadiliello/bart-small")
model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

print(len(dataset))

In [None]:
# preproccessing function for tokenizing and padding the dataset
def preprocess_function(examples):
    # Tokenize the input text with padding and truncation to a max length of 512

    return tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ) | {
        "labels": tokenizer(  # Tokenize the target text separately, with a shorter max length for summaries
            examples["target_text"],
            max_length=64,
            truncation=True,
            padding="max_length",
        )[
            "input_ids"
        ]
    }

In [None]:
# tokenize dataset and slit into train and eval
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

In [None]:
# for debugging purpose using a smaller set
small_train = train_dataset  # .select(range(10))

In [None]:
# train the model using these hyperparams and save to ./bart_summarizer folder
training_args = TrainingArguments(
    output_dir="./bart_summarizer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,  # number of epochs is small enough to train quicker but also to learn enough
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# save model to ./bart_summarizer folder
model.save_pretrained("./bart_summarizer")
tokenizer.save_pretrained("./bart_summarizer")

In [27]:
# load rouge score evaluator
rouge = evaluate.load("rouge")

In [28]:
# call the initial model and the finetuned model so we can compare them
initial_model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

finetuned_model = BartForConditionalGeneration.from_pretrained("./bart_summarizer")
finetuned_tokenizer = BartTokenizer.from_pretrained("./bart_summarizer")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [26]:
# evaluation function to get the rouge score
def evaluate_model(
    model_to_eval, dataset, tokenizer, max_input_length=512, max_target_length=64
):
    model_to_eval.eval()
    predictions = []
    references = []

    # evaluate on the first 200 examples
    for example in dataset.select(range(200)):
        input_text = example["input_text"]
        input_ids = tokenizer.encode(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        input_ids = input_ids.to(model_to_eval.device)

        # generate prediction from the model
        with torch.no_grad():
            output_ids = model_to_eval.generate(input_ids, max_length=max_target_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example["target_text"])

    results = {}
    # ROUGE-L
    rouge_results = rouge.compute(
        predictions=predictions, references=references, use_stemmer=True
    )
    results["rougeL"] = rouge_results["rougeL"]

    return results


# Initial model
initial_results = evaluate_model(initial_model, eval_dataset, tokenizer)
print("Initial Model Metrics:")
print(f"ROUGE-L (Initial pre-trained model): {initial_results['rougeL']:.4f}")

# Fine-tuned model
finetuned_results = evaluate_model(finetuned_model, eval_dataset, tokenizer)
print("\nFine-tuned Model Metrics:")
print(f"ROUGE-L (Fine-tuned model): {finetuned_results['rougeL']:.4f}")

Initial Model Metrics:
ROUGE-L (Initial pre-trained model): 0.0711

Fine-tuned Model Metrics:
ROUGE-L (Fine-tuned model): 0.1335


In [31]:
# simple test to see if a coherent summary is generated
input_text = "These cinnamon bears have great flavor and do not taste sugar free.  My only issue is that they should be softer."
inputs = finetuned_tokenizer.encode(
    input_text, return_tensors="pt", max_length=512, truncation=True
)
summary_ids = finetuned_model.generate(
    inputs, max_length=64, num_beams=4, early_stopping=True
)

summary = finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)

Generated Summary: Great cinnamon bears


In [32]:
# Sample evaluations from the eval dataset and the finetuned model
print(eval_dataset)
indices = list(range(len(eval_dataset)))
random_indices = random.sample(indices, 5)

for idx in random_indices:
    # get the example from the eval dataset
    example = eval_dataset[idx]

    input_str = example["input_text"]
    print("\nInput:", input_str)
    # use finetuned model to generate a summary, tokenize the input and generate the summary
    inputs = finetuned_tokenizer(
        input_str, return_tensors="pt", max_length=512, truncation=True
    )

    summary_ids = finetuned_model.generate(
        **inputs, max_length=64, num_beams=4, early_stopping=True
    )
    summary = finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print("Generated Summary:", summary)

Dataset({
    features: ['Id', 'ProductId', 'target_text', 'input_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

Input: I am a vegetarian body builder. I have tried many types of plant protein supplimentation. Hemp is the gold standard of the plant kingdom, much like eggs are the gold standard of the animal kingdom. Their amino acid profiles are both very complete and similar.<br /><br />Most hemp proteins have a massive amount of fiber though and if you are taking in as many grams of protein as recommended for body building, you will be way too regular I'll put it that way.<br /><br />This 70% formula has less fiber and more protein. In one lb/16oz container there are 15 servings of 21g of protein 0g fiber. It is superior for that alone and is well worth the extra cost per/gram of protein.<br /><br />It disolves easily in water, you can hand mix with a spoon for 1min and your good with not one clump.<br /><br />It is sweeter than normal he