In [None]:
# Install required packages
!pip install transformers==4.21.0 datasets==2.4.0 rouge-score==0.1.2 nltk==3.7
!pip install sentencepiece==0.1.97 evaluate==0.2.2
!pip install wordcloud matplotlib seaborn torch

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [None]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset as HFDataset
from rouge_score import rouge_scorer
import evaluate
import os
import json

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
df = pd.read_csv('/content/Reviews.csv')
print(f"‚úÖ Dataset loaded successfully! Shape: {df.shape}")

In [None]:
print("\nDataset columns:")
print(df.columns.tolist())

In [None]:
print("Missing values:")
print(df.isnull().sum())

In [None]:
df = df.dropna(subset=['Text', 'Summary'])
print(f"Dataset shape after removing missing values: {df.shape}")

print("Dataset statistics:")
print(f"Total reviews: {len(df)}")
print(f"Average text length: {df['Text'].str.len().mean():.2f} characters")
print(f"Average summary length: {df['Summary'].str.len().mean():.2f} characters")

# **Data Visualization**
# Four-panel overview of the review data: scores, character lengths, word counts.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes

# Styling shared by the three histograms.
hist_style = dict(bins=50, alpha=0.7, edgecolor='black')

# Panel 1: number of reviews per star rating.
score_counts = df['Score'].value_counts().sort_index()
ax1.bar(score_counts.index, score_counts.values, color='lightblue', alpha=0.7, edgecolor='black')
ax1.set(
    xlabel='Score (1-5)',
    ylabel='Count',
    title='Distribution of Review Scores',
    xticks=range(1, 6),
)

# Panel 2: review length in characters (x axis clipped to 5000).
text_lengths = df['Text'].str.len()
ax2.hist(text_lengths, color='lightcoral', **hist_style)
ax2.set(
    xlabel='Text Length (characters)',
    ylabel='Frequency',
    title='Distribution of Review Text Lengths',
    xlim=(0, 5000),
)

# Panel 3: summary length in characters.
summary_lengths = df['Summary'].str.len()
ax3.hist(summary_lengths, color='lightgreen', **hist_style)
ax3.set(
    xlabel='Summary Length (characters)',
    ylabel='Frequency',
    title='Distribution of Summary Lengths',
)

# Panel 4: review length in whitespace-separated words (x axis clipped to 500).
# The word count is stored on the frame as a new column.
df['Text_Word_Count'] = df['Text'].str.split().str.len()
ax4.hist(df['Text_Word_Count'], color='plum', **hist_style)
ax4.set(
    xlabel='Word Count',
    ylabel='Frequency',
    title='Distribution of Word Counts in Reviews',
    xlim=(0, 500),
)

plt.tight_layout()
plt.show()

In [None]:
print("Generating word cloud...")
# Cap the sample at the number of available rows (sampling more rows than exist
# raises a ValueError) and seed it so the figure is reproducible across runs.
wordcloud_sample_size = min(10000, len(df))
text_corpus = ' '.join(
    df['Text'].astype(str).sample(n=wordcloud_sample_size, random_state=42)
)
wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(text_corpus)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - Most Frequent Words in Amazon Food Reviews')
plt.show()

**Data Preprocessing**

In [None]:
class TextPreprocessor:
    """Normalise raw review text and summaries before tokenisation.

    Cleaning lower-cases the text, strips HTML tags and URLs, keeps only
    ASCII letters, whitespace and the sentence punctuation ``. ! ?``, and
    collapses runs of whitespace.
    """

    # Compiled once at class-creation time instead of on every call.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'http\S+')
    _DISALLOWED_RE = re.compile(r'[^a-zA-Z\s\.\!\?]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # stop_words / lemmatizer are prepared here but are not used by the
        # methods of this class.
        try:
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except LookupError:
            # NLTK raises LookupError when a corpus has not been downloaded.
            # Only that case is treated as "run without NLTK resources";
            # any other error is a real bug and should surface.
            self.stop_words = set()
            self.lemmatizer = None

    def clean_text(self, text):
        """Clean and preprocess text.

        Args:
            text: Raw text. Anything that is not a ``str`` (e.g. NaN) yields "".

        Returns:
            The cleaned, lower-cased string.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()
        # Strip tags first and replace them with a space: reviews use
        # "<br />" as a separator, and deleting it outright would glue the
        # neighbouring words together ("great<br />taste" -> "greattaste").
        text = self._HTML_TAG_RE.sub(' ', text)
        # URLs are removed after tags so the greedy \S+ cannot swallow markup
        # or text that was attached to a link.
        text = self._URL_RE.sub('', text)
        text = self._DISALLOWED_RE.sub('', text)
        text = self._WHITESPACE_RE.sub(' ', text).strip()
        return text

    def preprocess_for_summarization(self, text, summary):
        """Preprocess text and summary for summarization task.

        Returns:
            A ``(clean_text, clean_summary)`` tuple, each produced by
            :meth:`clean_text`.
        """
        clean_text = self.clean_text(text)
        clean_summary = self.clean_text(summary)
        return clean_text, clean_summary

In [None]:
preprocessor = TextPreprocessor()

In [None]:
print("\nPreprocessing demonstration:")
sample_idx = 0
sample_text = df.iloc[sample_idx]['Text']
sample_summary = df.iloc[sample_idx]['Summary']

print("Original Text:")
print(sample_text[:200] + "...")
print("\nOriginal Summary:")
print(sample_summary)

clean_text, clean_summary = preprocessor.preprocess_for_summarization(sample_text, sample_summary)
print("\nCleaned Text:")
print(clean_text[:200] + "...")
print("\nCleaned Summary:")
print(clean_summary)

In [None]:
print("\nProcessing dataset...")
SAMPLE_SIZE = 5000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Clean every sampled (review, summary) pair and keep only those whose cleaned
# lengths fall inside the accepted character and word-count windows.
processed_data = []
for raw_text, raw_summary, raw_score in zip(
    df_sample['Text'], df_sample['Summary'], df_sample['Score']
):
    cleaned_body, cleaned_headline = preprocessor.preprocess_for_summarization(raw_text, raw_summary)

    body_out_of_range = not (50 <= len(cleaned_body) <= 2000)
    headline_out_of_range = not (10 <= len(cleaned_headline) <= 200)
    too_few_words = len(cleaned_body.split()) <= 10 or len(cleaned_headline.split()) <= 3
    if body_out_of_range or headline_out_of_range or too_few_words:
        continue

    processed_data.append({
        'text': cleaned_body,
        'summary': cleaned_headline,
        'score': int(raw_score)
    })

processed_df = pd.DataFrame(processed_data)
print(f"Processed dataset size: {len(processed_df)}")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_df, temp_df = train_test_split(processed_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
print(f"\nDataset splits:")
print(f"Training set: {len(train_df)}")
print(f"Validation set: {len(val_df)}")
print(f"Test set: {len(test_df)}")

In [None]:
train_dataset = HFDataset.from_pandas(train_df[['text', 'summary']])
val_dataset = HFDataset.from_pandas(val_df[['text', 'summary']])
test_dataset = HFDataset.from_pandas(test_df[['text', 'summary']])

**BART Model Setup**

In [None]:
MODEL_NAME = "facebook/bart-large-cnn"
print(f"\nLoading BART-large model: {MODEL_NAME}")

In [None]:
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    print("‚úÖ BART-large-cnn model loaded successfully!")
except Exception as e:
    print(f"‚ùå Error loading BART-large-cnn: {e}")
    print("üîÑ Falling back to BART-base...")
    MODEL_NAME = "facebook/bart-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
model.to(device)
print(f"Model device: {next(model.parameters()).device}")

In [None]:
def preprocess_function(examples):
    """Tokenize the texts and prepare for model input.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` with the
            columns ``'text'`` (model input) and ``'summary'`` (target).

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry
        holding the token ids of the summaries.

    Sequences are truncated but deliberately NOT padded here. Padding labels to
    ``max_length`` fills them with the tokenizer's pad id, which the loss then
    treats as real targets. Leaving them unpadded lets the
    ``DataCollatorForSeq2Seq`` used by the trainer pad each batch dynamically
    and fill label padding with -100, which the loss ignores.
    """
    model_inputs = tokenizer(
        examples['text'],
        max_length=512,
        truncation=True,
    )

    # Kept for compatibility with the Transformers release this notebook pins;
    # newer releases offer `tokenizer(text_target=...)` instead.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['summary'],
            max_length=128,
            truncation=True,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(preprocess_function, batched=True, batch_size=16)
tokenized_val = val_dataset.map(preprocess_function, batched=True, batch_size=16)
print("Tokenization completed successfully!")

**Model Implementation**

In [None]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores for the trainer's evaluation loop.

    Args:
        eval_pred: ``(predictions, labels)`` as passed by the trainer; both are
            arrays of token ids, with ignored positions marked as -100.

    Returns:
        Dict mapping each ROUGE key to a float rounded to 4 decimals. If
        anything fails, the error is printed and zeros are returned so that
        training is not aborted by a metrics problem.
    """
    try:
        predictions, labels = eval_pred
        # Some model outputs arrive as a tuple whose first item holds the ids.
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # -100 is not a valid token id; swap it for the pad id before decoding.
        pad_id = tokenizer.pad_token_id
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Load the metric once and reuse it for every later evaluation round.
        rouge = getattr(compute_metrics, "_rouge", None)
        if rouge is None:
            rouge = evaluate.load('rouge')
            compute_metrics._rouge = rouge

        result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )

        def _as_float(value):
            # Older `evaluate` releases return aggregate score objects that
            # expose `.mid.fmeasure`; newer ones return plain floats. Calling
            # round() on the former raises, which previously ended up in the
            # except-branch and reported all-zero ROUGE.
            if hasattr(value, "mid"):
                return float(value.mid.fmeasure)
            return float(value)

        return {k: round(_as_float(v), 4) for k, v in result.items()}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

In [None]:
# Settings shared by both spellings of the evaluation-strategy argument.
# Keeping them in one place stops the two code paths drifting apart.
common_training_kwargs = dict(
    output_dir="./bart-amazon-reviews",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=50,
    save_steps=50,  # must line up with eval_steps for load_best_model_at_end
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The string "none" disables all reporting integrations. Passing the value
    # None does not: on older Transformers releases it falls back to reporting
    # to every installed integration.
    report_to="none",
    dataloader_pin_memory=False,
)

try:
    # Try with eval_strategy (newer versions)
    training_args = Seq2SeqTrainingArguments(
        eval_strategy="steps",
        **common_training_kwargs,
    )
    print("‚úÖ Using eval_strategy (newer Transformers version)")

except TypeError as e:
    # Only an "unexpected keyword eval_strategy" error means an older release;
    # anything else is re-raised with its original traceback.
    if "eval_strategy" not in str(e):
        raise
    # Fallback to evaluation_strategy (older versions)
    training_args = Seq2SeqTrainingArguments(
        evaluation_strategy="steps",
        **common_training_kwargs,
    )
    print("‚úÖ Using evaluation_strategy (older Transformers version)")

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

**Model Training**

In [None]:
print("Starting training...")
try:
    train_result = trainer.train()
    trainer.save_model("./bart-amazon-food-reviews")
    tokenizer.save_pretrained("./bart-amazon-food-reviews")
    print("‚úÖ BART-large training completed successfully!")
    print(f"Final training loss: {train_result.training_loss:.4f}")

except Exception as e:
    print(f"‚ùå Training failed: {e}")
    print("Using pre-trained BART model without fine-tuning")

**Model Evaluation**

In [None]:
def generate_summary_bart(text, model, tokenizer, max_length=128, min_length=30):
    """Generate summary using BART model.

    Args:
        text: Source text to summarise.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum length (in tokens) of the generated summary.
        min_length: Requested minimum length (in tokens). It is clamped so it
            always stays below ``max_length``.

    Returns:
        The decoded summary, or the string "Summary generation failed" if any
        step raises (the error is printed).
    """
    try:
        # Put the inputs on the model's own device instead of relying on a
        # notebook-level `device` global, which may be undefined or differ.
        inputs = tokenizer(
            text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).to(model.device)

        # A minimum that is not below the maximum is contradictory; keep the
        # default of 30 whenever max_length allows it.
        effective_min_length = max(0, min(min_length, max_length - 1))

        summary_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            min_length=effective_min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error generating summary: {e}")
        return "Summary generation failed"

In [None]:
def evaluate_bart_model(model, tokenizer, test_data, max_samples=20):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        model: Model handed to ``generate_summary_bart``.
        tokenizer: Tokenizer handed to ``generate_summary_bart``.
        test_data: Iterable of records with ``'text'`` and ``'summary'`` keys.
        max_samples: Number of leading records to evaluate.

    Returns:
        ``(averages, sample_results)``: ``averages`` maps 'rouge1', 'rouge2'
        and 'rougeL' to the mean F-measure over the evaluated records, and
        ``sample_results`` holds one dict per record with the texts and its
        ROUGE-1 F-measure. The first three records are also printed.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    f_measures = {name: [] for name in metric_names}
    sample_results = []

    for position, record in enumerate(test_data):
        if position >= max_samples:
            break

        candidate = generate_summary_bart(record['text'], model, tokenizer)
        reference = record['summary']
        rouge = scorer.score(reference, candidate)

        for name in metric_names:
            f_measures[name].append(rouge[name].fmeasure)

        rouge1_f = rouge['rouge1'].fmeasure
        sample_results.append({
            'original_text': record['text'],
            'actual_summary': reference,
            'generated_summary': candidate,
            'rouge1': rouge1_f
        })

        # Show the first few examples for a quick qualitative look.
        if position < 3:
            print(f"\nüìù Sample {position+1}:")
            print(f"Original: {record['text'][:150]}...")
            print(f"Actual: {reference}")
            print(f"Generated: {candidate}")
            print(f"ROUGE-1: {rouge1_f:.4f}")

    averages = {name: float(np.mean(values)) for name, values in f_measures.items()}
    return averages, sample_results

# Prefer the checkpoint saved after fine-tuning. If it cannot be loaded (for
# example because training failed and nothing was saved), evaluate with the
# in-memory `model` / `tokenizer` instead.
try:
    eval_model = AutoModelForSeq2SeqLM.from_pretrained("./bart-amazon-food-reviews").to(device)
    eval_tokenizer = AutoTokenizer.from_pretrained("./bart-amazon-food-reviews")
    print("‚úÖ Loaded fine-tuned BART model for evaluation")
except Exception:
    # `except Exception` rather than a bare `except`, so that interrupting the
    # kernel (KeyboardInterrupt) or SystemExit is not swallowed here.
    eval_model = model
    eval_tokenizer = tokenizer
    print("‚ÑπÔ∏è Using pre-trained BART model for evaluation")

In [None]:
print("\nRunning comprehensive evaluation...")
evaluation_results, sample_results = evaluate_bart_model(eval_model, eval_tokenizer, test_df.to_dict('records'), max_samples=20)

In [None]:
print("\nüìä BART Model Evaluation Results:")
for metric, score in evaluation_results.items():
    print(f"  {metric.upper()}: {score:.4f}")

**Results Visualization**

In [None]:
def plot_bart_results(results):
    """Draw a bar chart of metric scores with the value printed above each bar.

    Args:
        results: Mapping of metric name to score; scores are plotted on a
            fixed 0-1 y axis.
    """
    metric_names = list(results)
    metric_values = [results[name] for name in metric_names]
    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(metric_names, metric_values, color=palette, alpha=0.8, edgecolor='black')

    ax.set_xlabel('ROUGE Metrics')
    ax.set_ylabel('Score')
    ax.set_title('BART-Large Performance on Amazon Food Reviews')
    ax.set_ylim(0, 1)
    ax.grid(axis='y', alpha=0.3)

    # Annotate every bar with its score, centred slightly above the bar top.
    for bar, value in zip(bars, metric_values):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'{value:.4f}',
                ha='center', va='bottom', fontweight='bold', fontsize=12)

    fig.tight_layout()
    plt.show()

plot_bart_results(evaluation_results)

**Deployment**

In [5]:
class AmazonReviewSummarizer:
    """Wrapper that cleans review text and summarises it with a BART model.

    The constructor loads a checkpoint from ``model_path`` when that path
    exists, otherwise "facebook/bart-large-cnn". If loading raises, it prints
    the error and loads "facebook/bart-base" instead.
    """

    def __init__(self, model_path=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.preprocessor = TextPreprocessor()

        try:
            use_fine_tuned = bool(model_path and os.path.exists(model_path))
            checkpoint = model_path if use_fine_tuned else "facebook/bart-large-cnn"

            self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(self.device)

            if use_fine_tuned:
                print("‚úÖ Loaded fine-tuned BART model")
            else:
                print("‚úÖ Loaded pre-trained BART-large-cnn model")
        except Exception as load_error:
            print(f"‚ùå Error loading model: {load_error}")
            fallback = "facebook/bart-base"
            self.tokenizer = AutoTokenizer.from_pretrained(fallback)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(fallback).to(self.device)

    def summarize_review(self, text, max_length=128):
        """Summarise one review.

        Returns the generated summary, a fixed notice when the cleaned text
        has fewer than five words, or an error string if anything raises.
        """
        try:
            cleaned, _ = self.preprocessor.preprocess_for_summarization(text, "")
            if len(cleaned.split()) >= 5:
                return generate_summary_bart(cleaned, self.model, self.tokenizer, max_length)
            return "Text too short for summarization"
        except Exception as err:
            return f"Error in summarization: {err}"

    def summarize_batch(self, texts, max_length=128):
        """Summarise each text in ``texts`` and return the summaries in order."""
        return [self.summarize_review(item, max_length) for item in texts]

üöÄ ULTRA-EFFICIENT SUMMARIZATION SYSTEM
Using pre-trained models - No training required!


Device set to use cpu


Loading Amazon reviews...
Loaded 568427 reviews


Your max_length is set to 80, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)



üìù TESTING SUMMARIZATION SYSTEM


Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üìã Review 1:
Original (54 words): I am a gluten free girl and finding safe grain alternatives has been a challenge. Ancient Harvest Qu...
Actual Summary: Ancient Harvest Quinoa Flakes Indispensable
Generated Summary: Ancient Harvest Quinoa Flakes are great for a hot breakfast cereal and are great to use in baking .
Compression: 54 ‚Üí 19 words
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üìã Review 2:
Original (397 words): I am so appalled at this finding as my dog has been taking these treats for 8 years, now this explai...
Actual Summary: BEWARE - MADE IN CHINA
Generated Summary: my dog became so ill back in October 2011 and still no diagnosis . doctors kept pumping me for money and told me all tests were negative . 6 months later of my dog vomiting and bloody diarrhea we started prednizone .
Compression: 397 ‚Üí 42 words
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üìã Review 3:
Original (133 words): I've given my cats a few different kinds of treats daily & these are by far their favorites.... They...
Actual Summary: Beachside Party Mix
Generated Summary: thier breath is so bad from eating other soft chewy treats . they cost $1.50 a pack in the store so they are even cheaper here .
Compression: 133 ‚Üí 27 words
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üìã Review 4:
Original (96 words): It is advertised as the original 10 calorie Propel Fitness Water, but it's only Propel ZERO! If you ...
Actual Summary: THIS IS NOT 10 CALORIE PROPEL!!!!
Generated Summary: Propel Zero is advertised as the original 10 calorie Propel Fitness Water . but it's only Propel ZERO!
Compression: 96 ‚Üí 18 words
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üìã Review 5:
Original (97 words): I've been using Freeze Dried Liver treats to train and reward my dogs for about 20 years. It is like...
Actual Summary: This it like dog heroin.  There's nothing my dog won't do for liver treats.
Generated Summary: freezeddried liver treats train and reward my dogs for 20 years . my current dog, Samson, will sit in front of the pantry and drool .
Compression: 97 ‚Üí 26 words
--------------------------------------------------------------------------------

üè¢ BUSINESS APPLICATION - BATCH PROCESSING


Device set to use cpu


Processing business reviews batch...

üìä BATCH PROCESSING RESULTS:

Review 1 [NEUTRAL]:
Summary: Error: cannot access local variable 'clean_text' where it is not associated with a value
Sentiment: neutral
--------------------------------------------------

Review 2 [NEGATIVE]:
Summary: Error: cannot access local variable 'clean_text' where it is not associated with a value
Sentiment: negative
--------------------------------------------------

Review 3 [NEUTRAL]:
Summary: Error: cannot access local variable 'clean_text' where it is not associated with a value
Sentiment: neutral
--------------------------------------------------

Review 4 [POSITIVE]:
Summary: Error: cannot access local variable 'clean_text' where it is not associated with a value
Sentiment: positive
--------------------------------------------------

Review 5 [NEUTRAL]:
Summary: Error: cannot access local variable 'clean_text' where it is not associated with a value
Sentiment: neutral
---------------------------------

**Test the System**

In [None]:
print("\nüß™ Testing the Summarization System...")
# Point the summarizer at the checkpoint saved after training. The class checks
# whether the path exists and otherwise loads the pre-trained model, so this is
# safe even when fine-tuning did not complete.
summarizer = AmazonReviewSummarizer(model_path="./bart-amazon-food-reviews")

# Never request more rows than the test split holds, and seed the draw so the
# demo shows the same reviews on every run.
demo_sample_count = min(3, len(test_df))
real_samples = test_df.sample(n=demo_sample_count, random_state=42).to_dict('records')
real_reviews = [sample['text'] for sample in real_samples]
real_summaries = [sample['summary'] for sample in real_samples]

print("Generating summaries for real Amazon reviews...")
generated_summaries = summarizer.summarize_batch(real_reviews)

for i, (review, actual, generated) in enumerate(zip(real_reviews, real_summaries, generated_summaries)):
    print(f"\nüìã Review {i+1}:")
    print(f"Original ({len(review.split())} words): {review[:100]}...")
    print(f"Actual Summary: {actual}")
    print(f"Generated Summary: {generated}")
    print("-" * 80)


**Final Results Summary**

In [None]:
# Single source of truth for the project's ROUGE-1 goal; it feeds both the
# reported target and the pass/fail flag, which previously each hard-coded 0.5.
TARGET_ROUGE1 = 0.5
achieved_rouge1 = float(evaluation_results['rouge1'])

# Collect everything worth reporting into one JSON-serialisable dict.
results_summary = {
    'model': str(MODEL_NAME),
    'dataset': 'Amazon Fine Food Reviews',
    'original_dataset_size': int(len(df)),
    'processed_dataset_size': int(len(processed_df)),
    'training_samples': int(len(train_df)),
    'test_samples': int(len(test_df)),
    'performance_metrics': evaluation_results,
    'target_rouge_score': TARGET_ROUGE1,
    'achieved_rouge_score': achieved_rouge1,
    'project_target_achieved': bool(achieved_rouge1 >= TARGET_ROUGE1),
    # At most the first two demo reviews, with the review text cut to 100 chars.
    'sample_predictions': [
        {
            'original_text': str(review[:100]) + "...",
            'actual_summary': str(actual),
            'generated_summary': str(generated)
        }
        for review, actual, generated in list(
            zip(real_reviews, real_summaries, generated_summaries)
        )[:2]
    ]
}

print("\n" + "="*70)
print("üéØ BART-LARGE AMAZON FOOD REVIEWS SUMMARIZATION")
print("="*70)

print(f"\nüìä FINAL RESULTS SUMMARY:")
print(f"ü§ñ Model: {results_summary['model']}")
print(f"üìÅ Dataset: {results_summary['dataset']}")
print(f"üìà Original reviews: {results_summary['original_dataset_size']:,}")
print(f"üéØ Target ROUGE-1: ‚â•{results_summary['target_rouge_score']}")
print(f"üèÜ Achieved ROUGE-1: {results_summary['achieved_rouge_score']:.4f}")

if results_summary['project_target_achieved']:
    print("‚úÖ üéâ Project target achieved with BART-large on real Amazon data!")
else:
    print("‚ö†Ô∏è Project target not fully achieved.")
    print("üí° Try training with more data or longer training time.")

In [None]:
# Persist the summary as JSON. Because ensure_ascii=False writes non-ASCII
# characters verbatim, the file is opened explicitly as UTF-8; the platform
# default encoding may not be able to represent them.
try:
    with open('amazon_reviews_summarization_results.json', 'w', encoding='utf-8') as f:
        json.dump(results_summary, f, indent=2, ensure_ascii=False)
    print("üíæ Results saved to 'amazon_reviews_summarization_results.json'")
except Exception as e:
    print(f"‚ùå Error saving results: {e}")

print(f"\nüöÄ DUAL NLP SYSTEM WITH REAL AMAZON DATA COMPLETED! üéâ")
print("‚ú® Your production-ready Amazon review summarization system is ready!")