In [None]:
from datasets import load_dataset, Dataset
import random
from evaluate import load
import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import kagglehub
import evaluate
import torch

In [None]:
# Download latest version of amazon fine food reviews dataset
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)


In [None]:
df = pd.read_csv("Reviews.csv", usecols=["Id","Summary", "Text", "ProductId"])
df.dropna(subset=["Summary", "Text"], inplace=True)
df = df.sample(20000, random_state=42)
df = df.rename(columns={"Summary": "target_text", "Text": "input_text"})
dataset = Dataset.from_pandas(df)
print(len(dataset))           

In [None]:
tokenizer = BartTokenizer.from_pretrained("lucadiliello/bart-small")
model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

print(len(dataset))           

In [None]:
def preprocess_function(examples):
    return tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ) | {
        "labels": tokenizer(
            examples["target_text"],
            max_length=64,
            truncation=True,
            padding="max_length"
        )["input_ids"]
    }


In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]


In [None]:
small_train = train_dataset#.select(range(10))

In [None]:
training_args = TrainingArguments(
    output_dir= "./bart_summarizer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir= "./logs",
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained("./bart_summarizer")
tokenizer.save_pretrained("./bart_summarizer")


In [27]:
rouge = evaluate.load("rouge")

In [28]:
initial_model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

finetuned_model = BartForConditionalGeneration.from_pretrained("./bart_summarizer")
finetuned_tokenizer = BartTokenizer.from_pretrained("./bart_summarizer")



You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [26]:
def evaluate_model(model_to_eval, dataset, tokenizer, max_input_length=512, max_target_length=64):
    model_to_eval.eval()
    predictions = []
    references = []

    for example in dataset.select(range(200)):
        input_text = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=max_input_length)
        input_ids = input_ids.to(model_to_eval.device)
        with torch.no_grad():
            output_ids = model_to_eval.generate(input_ids, max_length=max_target_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example["target_text"])
    
    results = {}
    # ROUGE-L
    rouge_results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    results["rougeL"] = rouge_results["rougeL"]

    return results

# Initial model
initial_results = evaluate_model(initial_model, eval_dataset, tokenizer)
print("Initial Model Metrics:")
print(f"ROUGE-L (Initial pre-trained model): {initial_results['rougeL']:.4f}")

# Fine-tuned model
finetuned_results = evaluate_model(finetuned_model, eval_dataset, tokenizer)
print("\nFine-tuned Model Metrics:")
print(f"ROUGE-L (Fine-tuned model): {finetuned_results['rougeL']:.4f}")

Initial Model Metrics:
ROUGE-L (Initial pre-trained model): 0.0711

Fine-tuned Model Metrics:
ROUGE-L (Fine-tuned model): 0.1335


In [31]:
input_text ="These cinnamon bears have great flavor and do not taste sugar free.  My only issue is that they should be softer."
inputs = finetuned_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = finetuned_model.generate(inputs, max_length=64, num_beams=4, early_stopping=True)

summary = finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)


Generated Summary: Great cinnamon bears


In [30]:

print(eval_dataset)
indices = list(range(len(eval_dataset)))
random_indices = random.sample(indices, 5)

for idx in random_indices:
    example = eval_dataset[idx]  

    # build the prompt string
    input_str = "summarize: " + example["input_text"]
    print("\nInput:", input_str)

    inputs = finetuned_tokenizer(
        input_str,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )

    # 5) generate and decode the first element
    summary_ids = finetuned_model.generate(
        **inputs,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )
    summary = finetuned_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True
    )

    print("Generated Summary:", summary)


Dataset({
    features: ['Id', 'ProductId', 'target_text', 'input_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})

Input: summarize: I thought that these bars were very tasty if a bit on the expensive side.  My main complaint is that the bars disintegrate as you open up the packaging. Since the bars are also very sticky remain stuck to the rapper as they break apart so you are stuck having to pick off pieces of the bar from the inside of the wrapper (which is difficult because the wrapper is plastic not foil). I don't know if I got an old box of these or if they're all like this. Regardless I probably won't risk getting another box like the first.
Generated Summary: delicious

Input: summarize: This cat food is great. My big tom cat eats it as well as my two tiny doggies. Its all natural and doesnt make them sick or throw up. Nothing like that. That animals normally do with cheap processed pet food. And Amazon's price is much cheaper than the