# Sentiment Analysis - Amazon Food Reviews

**Dataset:** Amazon Fine Food Reviews  
**Models:** Word2Vec + XGBoost + DistilBERT  
**Goal:** Binary sentiment classification (Positive/Negative)

## 1. Setup

In [16]:
!pip install -q gensim xgboost transformers datasets torch scikit-learn nltk evaluate

import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

import os
os.environ['WANDB_DISABLED'] = 'true'

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda


In [2]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')

## 2. Load Data

In [3]:
df = pd.read_csv('Reviews.csv')
print(f"Loaded {len(df):,} reviews")
df.head()

Loaded 88,916 reviews


Unnamed: 0,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpfulness_ratio,date,full_text
0,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1.0,2011-04-27,Good Quality Dog Food I have bought several of...
1,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0.0,2012-09-07,Not as Advertised Product arrived labeled as J...
2,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1.0,2008-08-18,"""Delight"" says it all This is a confection tha..."
3,B000UA0QIQ,A395BORC6FGVXV,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,1.0,2011-06-13,Cough Medicine If you are looking for the secr...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,0.0,2012-10-21,Great taffy Great taffy at a great price. The...


## 3. Prepare Data

In [4]:
# Binary sentiment: 1 (Positive) if Score >= 4, else 0 (Negative)
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

# Sample 50k reviews
df_sample = df[['Text', 'Sentiment']].sample(n=50000, random_state=42).dropna()

print(f"Sample size: {len(df_sample):,}")
print(df_sample['Sentiment'].value_counts())

Sample size: 50,000
Sentiment
1    38546
0    11454
Name: count, dtype: int64


## 4. Text Preprocessing

In [6]:
nltk.download('punkt_tab', quiet=True)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    return tokens

df_sample['tokens'] = df_sample['Text'].apply(clean_text)

## 5. Split Data

In [7]:
train_df, test_df = train_test_split(df_sample, test_size=0.2, random_state=42, stratify=df_sample['Sentiment'])

print(f"Train: {len(train_df):,} | Test: {len(test_df):,}")

Train: 40,000 | Test: 10,000


## 6. Train Word2Vec

In [8]:
w2v = Word2Vec(
    sentences=train_df['tokens'].tolist(),
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=10
)

w2v.save('word2vec.model')
print(f"Vocabulary: {len(w2v.wv):,} words")

Vocabulary: 25,481 words


## 7. Vectorization

In [9]:
def text_to_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

X_train = np.array([text_to_vector(tokens, w2v) for tokens in train_df['tokens']])
y_train = train_df['Sentiment'].values

X_test = np.array([text_to_vector(tokens, w2v) for tokens in test_df['tokens']])
y_test = test_df['Sentiment'].values

print(f"X_train: {X_train.shape} | X_test: {X_test.shape}")

X_train: (40000, 100) | X_test: (10000, 100)


## 8. Train XGBoost

In [10]:
xgb = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(classification_report(y_test, y_pred_xgb, target_names=['Negative', 'Positive']))

import pickle
with open('xgboost.pkl', 'wb') as f:
    pickle.dump(xgb, f)

XGBoost Accuracy: 0.8625
              precision    recall  f1-score   support

    Negative       0.80      0.54      0.64      2291
    Positive       0.87      0.96      0.91      7709

    accuracy                           0.86     10000
   macro avg       0.84      0.75      0.78     10000
weighted avg       0.86      0.86      0.85     10000



## 9. Train BERT

In [11]:
# Prepare BERT data
BERT_SAMPLE = 10000 if device == 'cuda' else 5000
indices = np.random.choice(len(df_sample), min(BERT_SAMPLE, len(df_sample)), replace=False)
df_bert = df_sample.iloc[indices][['Text', 'Sentiment']].copy()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_bert['Text'].tolist(), df_bert['Sentiment'].tolist(), test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})

In [12]:
# Load DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Train
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16 if device == 'cuda' else 8,
    per_device_eval_batch_size=16 if device == 'cuda' else 8,
    eval_strategy='epoch',
    save_strategy='no',
    logging_steps=100,
    report_to='none',
    seed=42
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset, compute_metrics=compute_metrics)
trainer.train()

results = trainer.evaluate()
bert_accuracy = results['eval_accuracy']

print(f"BERT Accuracy: {bert_accuracy:.4f}")

model.save_pretrained('./bert_model')
tokenizer.save_pretrained('./bert_model')

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0976,0.486681,0.893
2,0.0854,0.552406,0.8965


BERT Accuracy: 0.8965


('./bert_model/tokenizer_config.json',
 './bert_model/special_tokens_map.json',
 './bert_model/vocab.txt',
 './bert_model/added_tokens.json',
 './bert_model/tokenizer.json')

## 10. Model Comparison

In [18]:
print(f"\nXGBoost: {xgb_accuracy:.4f}")
print(f"BERT:    {bert_accuracy:.4f}")

best_model = "BERT" if bert_accuracy > xgb_accuracy else "XGBoost"
print(f"\nBest: {best_model}")


XGBoost: 0.8625
BERT:    0.8965

Best: BERT


## 11. Testing

In [19]:
def predict_xgb(text):
    tokens = clean_text(text)
    vector = text_to_vector(tokens, w2v).reshape(1, -1)
    pred = xgb.predict(vector)[0]
    proba = xgb.predict_proba(vector)[0]
    sentiment = "Positive" if pred == 1 else "Negative"
    return sentiment, proba[pred]

def predict_bert(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs, dim=-1).item()
    sentiment = "Positive" if pred == 1 else "Negative"
    return sentiment, probs[0][pred].item()

In [20]:
test_sentences = [
    "this product is good",
    "this product is not good",
    "this product is bad",
    "this product is not bad",
    "not good at all",
    "not bad actually",
    "this is great",
    "this is not great"
]

for i, sentence in enumerate(test_sentences, 1):
    xgb_sent, xgb_conf = predict_xgb(sentence)
    bert_sent, bert_conf = predict_bert(sentence)
    print(f"{i}. {sentence}")
    print(f"   XGBoost: {xgb_sent} ({xgb_conf:.2%})")
    print(f"   BERT: {bert_sent} ({bert_conf:.2%})")
    print()

1. this product is good
   XGBoost: Positive (96.96%)
   BERT: Positive (99.94%)

2. this product is not good
   XGBoost: Positive (96.96%)
   BERT: Negative (99.97%)

3. this product is bad
   XGBoost: Negative (81.38%)
   BERT: Negative (99.97%)

4. this product is not bad
   XGBoost: Negative (81.38%)
   BERT: Positive (99.03%)

5. not good at all
   XGBoost: Positive (90.63%)
   BERT: Negative (99.95%)

6. not bad actually
   XGBoost: Negative (90.29%)
   BERT: Negative (65.63%)

7. this is great
   XGBoost: Positive (99.82%)
   BERT: Positive (99.95%)

8. this is not great
   XGBoost: Positive (99.82%)
   BERT: Negative (99.95%)



In [21]:
from google.colab import files


!zip -r bert_model.zip bert_model
files.download('bert_model.zip')


print(" BERT downloaded ")

  adding: bert_model/ (stored 0%)
  adding: bert_model/model.safetensors (deflated 8%)
  adding: bert_model/tokenizer.json (deflated 71%)
  adding: bert_model/config.json (deflated 45%)
  adding: bert_model/special_tokens_map.json (deflated 42%)
  adding: bert_model/tokenizer_config.json (deflated 75%)
  adding: bert_model/vocab.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 BERT downloaded 
