In [72]:
from datasets import load_dataset, Dataset
import random
from evaluate import load
import os
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import kagglehub
import evaluate
import torch



In [73]:
# Download latest version
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")

print("Path to dataset files:", path)

for dirname, _, filenames in os.walk('/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


Path to dataset files: /Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2
/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2/database.sqlite
/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2/Reviews.csv
/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2/hashes.txt


In [74]:
df = pd.read_csv("/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2/" + "reviews.csv", usecols=["Id","Summary", "Text", "ProductId"])
df.dropna(subset=["Summary", "Text"], inplace=True)
df = df.sample(10000, random_state=42)
df = df.rename(columns={"Summary": "target_text", "Text": "input_text"})
dataset = Dataset.from_pandas(df)
print(len(dataset))           

10000


In [75]:
def load_balanced_reviews(csv_path):
    df = pd.read_csv(csv_path, usecols=["Id","Score","Summary", "Text", "ProductId"])
    # Filter for 1, 2, and 5 star reviews
    low = df[df.Score.isin([1,2])]

    high = df[df['Score'] == 5]


    #n_neg = len(negative)
    #target_pos = n_neg  
    
    #positive_down = positive.sample(n=target_pos, random_state=42, replace=False)
    
    #balanced = pd.concat([negative, positive_down]) \
                 #.sample(frac=1, random_state=42) \
                 #.reset_index(drop=True)
    #return balanced




    n_samples = min(len(low), len(high))
    low_balanced = low.sample(n=n_samples, random_state=42)
    high_balanced = high.sample(n=n_samples, random_state=42)
    balanced_df = pd.concat([low_balanced, high_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)
    balanced_df.to_csv("FilteredReviews.csv", index=False)
    return balanced_df

# Calls the function
balanced_reviews = load_balanced_reviews("/Users/milez/.cache/kagglehub/datasets/snap/amazon-fine-food-reviews/versions/2/" + "reviews.csv")
balanced_reviews.dropna(subset=["Summary", "Text"], inplace=True)
balanced_reviews = balanced_reviews.sample(10000, random_state=42)
balanced_reviews = balanced_reviews.rename(columns={"Summary": "target_text", "Text": "input_text"})
dataset1 = Dataset.from_pandas(balanced_reviews)

In [76]:
tokenizer = BartTokenizer.from_pretrained("lucadiliello/bart-small")
model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

print(len(dataset))           

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


10000


In [77]:
def preprocess_function(examples):
    return tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ) | {
        "labels": tokenizer(
            examples["target_text"],
            max_length=64,
            truncation=True,
            padding="max_length"
        )["input_ids"]
    }


In [78]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [79]:
small_train = train_dataset#.select(range(1000))

In [80]:
training_args = TrainingArguments(
    output_dir= "./bart_summarizer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir= "./logs",
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

model.save_pretrained("./bart_summarizer")
tokenizer.save_pretrained("./bart_summarizer")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.939,0.892749
2,0.5641,0.530795
3,0.519,0.522671
4,0.5172,0.519291




('./bart_summarizer/tokenizer_config.json',
 './bart_summarizer/special_tokens_map.json',
 './bart_summarizer/vocab.json',
 './bart_summarizer/merges.txt',
 './bart_summarizer/added_tokens.json')

In [81]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")  
meteor = evaluate.load("meteor")


[nltk_data] Downloading package wordnet to /Users/milez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/milez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/milez/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [111]:
initial_model = BartForConditionalGeneration.from_pretrained("lucadiliello/bart-small")

finetuned_model = BartForConditionalGeneration.from_pretrained("./bart_summarizer3")
finetuned_tokenizer = BartTokenizer.from_pretrained("./bart_summarizer3")



You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels will be overwritten to 2.


In [112]:
def evaluate_model(model_to_eval, dataset, tokenizer, max_input_length=512, max_target_length=64):
    model_to_eval.eval()
    predictions = []
    references = []

    for example in dataset.select(range(200)):
        input_text = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=max_input_length)
        input_ids = input_ids.to(model_to_eval.device)
        with torch.no_grad():
            output_ids = model_to_eval.generate(input_ids, max_length=max_target_length)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)
        references.append(example["target_text"])
    
    results = {}
    # ROUGE-L
    rouge_results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    results["rougeL"] = rouge_results["rougeL"]
    # BLEU score
    bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    results["bleu"] = bleu_results["bleu"]
    # METEOR score
    meteor_results = meteor.compute(predictions=predictions, references=references)
    results["meteor"] = meteor_results["meteor"]

    return results

# Initial model
initial_results = evaluate_model(initial_model, eval_dataset, tokenizer)
print("Initial Model Metrics:")
print(f"ROUGE-L (Initial pre-trained model): {initial_results['rougeL']:.4f}")
print(f"BLEU (Initial pre-trained model): {initial_results['bleu']:.4f}")
print(f"METEOR (Initial pre-trained model): {initial_results['meteor']:.4f}")

# Fine-tuned model
finetuned_results = evaluate_model(finetuned_model, eval_dataset, tokenizer)
print("\nFine-tuned Model Metrics:")
print(f"ROUGE-L (Fine-tuned model): {finetuned_results['rougeL']:.4f}")
print(f"BLEU (Fine-tuned model): {finetuned_results['bleu']:.4f}")
print(f"METEOR (Fine-tuned model): {finetuned_results['meteor']:.4f}")

Initial Model Metrics:
ROUGE-L (Initial pre-trained model): 0.0681
BLEU (Initial pre-trained model): 0.0068
METEOR (Initial pre-trained model): 0.1373

Fine-tuned Model Metrics:
ROUGE-L (Fine-tuned model): 0.0645
BLEU (Fine-tuned model): 0.0102
METEOR (Fine-tuned model): 0.0417


In [110]:
# Load your fine-tuned model and tokenizer
#tokenizer = BartTokenizer.from_pretrained("path_to_your_fine_tuned_model")
#model = BartForConditionalGeneration.from_pretrained("path_to_your_fine_tuned_model")

# Sample input text
#input_text = "This Item Taste Like Dirt.. I've Prob Used it 4 Times & Now It's Just Sitting in MY Freezer.. I Have A High Tolerance for Nasty Stuff.. Just Don't Really Like this Product.. Something In Grinding It Up Makes It Taste Nasty.. The Hulled Seeds Nutiva Sells Are Way Better.. If You Want Good Tasting Hemp Protein Powder It's $15/lb @ Earthshiftproducts.com but It Taste Wayyy Better Actually Taste Good From Earthshift.."
#input_text = "This compact wireless speaker delivers excellent sound quality with impressive battery life."
input_text ="These cinnamon bears have great flavor and do not taste sugar free.  My only issue is that they should be softer."
# Tokenize the input
inputs = finetuned_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate summary
summary_ids = finetuned_model.generate(inputs, max_length=64, num_beams=4, early_stopping=True)

# Decode and print the summary
summary = finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)


Generated Summary: Great flavor


In [106]:
def sample_outputs(initial_model, finetuned_model, dataset, tokenizer, device, sample_size=5, max_input_length=512, max_target_length=64):

    indices = list(range(len(dataset)))
    random_indices = random.sample(indices, sample_size)
    
    for idx in random_indices:
        example = dataset[idx]
        input_str = "summarize: " + example["input_text"]
        input_ids = tokenizer.encode(input_str, return_tensors="pt",
                                     truncation=True, max_length=max_input_length).to(device)
        
        # Initial model
        with torch.no_grad():
            initial_output_ids = initial_model.generate(input_ids, max_length=max_target_length)
        initial_output = tokenizer.decode(initial_output_ids[0], skip_special_tokens=True)
        
        # Fine-tuned model
        with torch.no_grad():
            finetuned_output_ids = finetuned_model.generate(input_ids, max_length=max_target_length)
        finetuned_output = tokenizer.decode(finetuned_output_ids[0], skip_special_tokens=True)
        
        # Display the outputs
        print("=" * 50)
        print(f"Example ID: {idx}")
        print("Input Text:")
        print(example["input_text"])
        print("\nReference Summary:")
        print(example["target_text"])
        print("\nInitial Model Output:")
        print(initial_output)
        print("\nFine-tuned Model Output:")
        print(finetuned_output)
        print("=" * 50 + "\n")

device = "cuda" if torch.cuda.is_available() else "cpu"

#sample_outputs(initial_model, model, eval_dataset, tokenizer, device, sample_size=5)

In [109]:

print(eval_dataset)
indices = list(range(len(eval_dataset)))
random_indices = random.sample(indices, 5)

for idx in random_indices:
    example = eval_dataset[idx]  

    # build the prompt string
    input_str = "summarize: " + example["input_text"]
    print("\nInput:", input_str)

    inputs = finetuned_tokenizer(
        input_str,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )

    # 5) generate and decode the first element
    summary_ids = finetuned_model.generate(
        **inputs,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )
    summary = finetuned_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True
    )

    print("Generated Summary:", summary)


Dataset({
    features: ['Id', 'ProductId', 'target_text', 'input_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

Input: summarize: Best peanut brittle ever!  However, I was disappointed in the amount of loose peanuts and candy material in the bottom of the box.  In addition, the shipping cost doubled the cost of my order; outrageous!<br />Donald
Generated Summary: Best peanut brittle ever!

Input: summarize: Nutritionists recommend that you get a serving of vegetables with every meal, but how to accomplish that with a breakfast has always been a tough one for me since I don't like starchy vegetables like potatoes, and aside from cutting up a tomato, few other things go well with yogurt, egg whites, oatmeal, or any of the other stuff I like in the morning. A simple glass of V8 works perfectly. I've always loved V8 juice for that reason, but found it just a tad too salty. The low sodium version has made V8 perfect in my opinion. It has more f