# Phase 1:
## 1. Data Wrangling & EDA


## 1.2. Install Required Libraries

In [None]:
!pip install datasets pandas matplotlib

## 1.3. Load the Amazon Reviews Data

In [None]:
from datasets import load_dataset
import pandas as pd

# Load only the first 10,000 training rows to keep the notebook fast.
ds = load_dataset("amazon_polarity", split="train[:10000]")

# Dataset.to_pandas() is the library's own conversion to a DataFrame; it avoids
# building the frame by iterating over the dataset in Python.
df = ds.to_pandas()
df.head()


In [None]:
# Class balance check: number of reviews for each value of the label column.
print(df['label'].value_counts())


In [None]:
# Number of missing entries in each column.
print(df.isna().sum())

In [None]:
# Eyeball five randomly chosen reviews (label, title and body).
df.loc[:, ['label', 'title', 'content']].sample(n=5)


In [None]:
# Count whitespace-separated tokens per review with the vectorised .str accessors.
# A missing review body yields NaN here instead of raising AttributeError.
df['word_count'] = df['content'].str.split().str.len()
df[['word_count']].describe()


In [None]:
import matplotlib.pyplot as plt

# Histogram of review lengths (in words), drawn on an explicit Axes object.
fig, ax = plt.subplots()
df['word_count'].hist(bins=30, edgecolor='black', ax=ax)
ax.set_xlabel('Number of Words per Review')
ax.set_ylabel('Number of Reviews')
ax.set_title('Review Length Distribution')
plt.show()


In [None]:
from collections import Counter

# Flatten every review into lower-cased, whitespace-separated tokens.
all_words = [token for review in df['content'] for token in review.lower().split()]

# The 20 most frequent tokens across the whole sample, as (token, count) pairs.
common_words = Counter(all_words).most_common(20)
print("Top 20 words:", common_words)


In [None]:
# Write the current frame to CSV in the working directory, without the index column.
df.to_csv("amazon_reviews_raw.csv", index=False)
# google.colab is only importable inside Google Colab; this import comes after
# the write so the CSV exists even if the import fails elsewhere.
from google.colab import files
# Hand the file just written to Colab's download helper.
files.download("amazon_reviews_raw.csv")


In [None]:
# Start the cleaned column as a lower-cased copy of the raw review text;
# the original `content` column is left untouched.
df['clean_content'] = df['content'].str.lower()


In [None]:
import string

# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
df['clean_content'] = df['clean_content'].str.translate(punctuation_table)


In [None]:
# Delete every run of digits from the cleaned text.
df['clean_content'] = df['clean_content'].str.replace(r'\d+', '', regex=True)


In [None]:
# Collapse each run of whitespace into a single space, then trim leading and
# trailing whitespace.
df['clean_content'] = df['clean_content'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [None]:
!pip install nltk
import nltk
nltk.download('stopwords')


In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list includes the negation words below. Removing
# them would turn "not good" into "good", destroying exactly the signal a
# sentiment classifier relies on, so they are kept in the text.
NEGATION_WORDS = {'no', 'nor', 'not'}
stop_words = set(stopwords.words('english')) - NEGATION_WORDS

def remove_stopwords(text):
    """Return `text` without the tokens that appear in `stop_words`.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces. Matching is exact, so `text` is expected to be
    lower-cased already.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)

df['clean_content'] = df['clean_content'].apply(remove_stopwords)


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
import spacy

# Only lemmas are needed, so the dependency parser and the named-entity
# recogniser are switched off to cut processing time.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize(text):
    """Return `text` with every token replaced by its lemma, joined by spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

def lemmatize_all(texts, batch_size=256):
    """Lemmatize an iterable of strings in batches and return a list of strings.

    Produces the same per-text output as `lemmatize`, but streams the texts
    through `nlp.pipe` instead of calling the pipeline once per text.
    """
    return [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]

# Lemmatize every row: processing only a prefix of the frame would leave the
# column with a mix of lemmatized and non-lemmatized text, i.e. inconsistent
# features for the models trained on it.
df['clean_content'] = lemmatize_all(df['clean_content'].tolist())


In [None]:
# Keep only reviews whose cleaned text has more than three tokens.
token_counts = df['clean_content'].str.split().str.len()
df = df[token_counts > 3]
print("After removing short reviews:", df.shape)


In [None]:
# Compare raw and cleaned text side by side for five random reviews.
df.loc[:, ['content', 'clean_content']].sample(n=5)


In [None]:
# Write the cleaned frame to CSV in the working directory, without the index column.
df.to_csv("amazon_reviews_cleaned.csv", index=False)
# google.colab is only importable inside Google Colab; this import comes after
# the write so the CSV exists even if the import fails elsewhere.
from google.colab import files
# Hand the file just written to Colab's download helper.
files.download("amazon_reviews_cleaned.csv")


In [None]:
from sklearn.model_selection import train_test_split

# Cleaned review text is the feature; the label column is the target.
X, y = df['clean_content'], df['label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with max_features capped at 5,000.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training split only, then reuse the fitted vectorizer for the test
# split so that nothing is learned from the held-out reviews.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features, allowing up to 200 solver iterations.
clf = LogisticRegression(max_iter=200)
clf.fit(X=X_train_vec, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out split.
y_pred = clf.predict(X_test_vec)

# Compute each metric first, then print them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)


In [None]:
import joblib

# Persist the fitted classifier and the fitted vectorizer; both are needed to
# score new text later.
joblib.dump(clf, "logreg_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
# google.colab is only importable inside Google Colab; this import comes after
# the dumps so both files exist even if the import fails elsewhere.
from google.colab import files
# Hand both files to Colab's download helper.
files.download("logreg_model.joblib")
files.download("tfidf_vectorizer.joblib")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
# on an explicit Axes object.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
!pip install transformers datasets scikit-learn


In [None]:
# google.colab is only importable inside Google Colab.
from google.colab import files
# Open Colab's upload helper and keep whatever it returns in `uploaded`.
uploaded = files.upload()


In [None]:
import pandas as pd

# Read the cleaned CSV through the same relative path it was written with, so
# the cell does not depend on the working directory being Colab's /content.
df = pd.read_csv('amazon_reviews_cleaned.csv')

# Text that was empty when saved comes back from CSV as NaN; drop such rows
# along with any row that has no label.
df = df.dropna(subset=['clean_content', 'label'])

In [None]:
# Preview the first five rows of the reloaded frame.
df.head(n=5)

In [None]:
# Rename the cleaned column to `text` and keep just the two columns used for fine-tuning.
df = df.rename(columns={'clean_content': 'text'})[['text', 'label']]

# Drop rows whose text is empty once surrounding whitespace is stripped.
has_text = df['text'].str.strip().astype(bool)
df = df[has_text]


In [None]:
# Preview the first five rows after the reshaping above.
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Plain Python lists are what the tokenizer and the dataset wrapper consume.
texts = df['text'].tolist()
labels = df['label'].tolist()

# 80/20 train/validation split with a fixed seed for repeatability.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"  # Or "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Both splits are tokenized with identical settings: truncate to at most 128
# tokens and pad within each call.
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)


In [None]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` must provide `.items()` yielding (field name, per-sample
    sequence) pairs; `labels` is an indexable sequence with one entry per
    sample. Each item is a dict of tensors: one per encoding field, plus the
    sample's label under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its labels.
train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)


In [None]:
import os

# Set the WANDB_DISABLED environment variable for this process before training starts.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# OPTIONAL: Use only a small subset for quick testing (up to 2000 training and
# 500 validation samples). Sizes are clamped to the data actually available;
# a Subset built from out-of-range indices would raise IndexError once accessed.
small_train_dataset = torch.utils.data.Subset(
    train_dataset, range(min(2000, len(train_dataset)))
)
small_val_dataset = torch.utils.data.Subset(
    val_dataset, range(min(500, len(val_dataset)))
)

# Pretrained encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Only 1 epoch for speed
    per_device_train_batch_size=16,  # Bigger batch = faster (if you have GPU memory)
    per_device_eval_batch_size=32,   # Same for eval
    eval_strategy="epoch",
    save_strategy="no",  # Don't save checkpoints
    logging_dir='./logs',
    logging_steps=100,   # Log less frequently
    report_to=[],        # Turn off all reporting (no wandb, no tensorboard)
    disable_tqdm=False,  # Progress bar (set True if it slows down Colab)
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,  # Use subset
    eval_dataset=small_val_dataset,     # Use subset
)

trainer.train()


In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with (the small
# validation subset); the returned value is shown as the cell output.
trainer.evaluate()


In [None]:
import numpy as np

# Run the fine-tuned model over the full validation dataset (not just the
# small subset used during training).
preds = trainer.predict(val_dataset)

# The predicted class is the index of the highest score along axis 1.
logits = preds.predictions
y_pred = np.argmax(logits, axis=1)
from sklearn.metrics import classification_report, confusion_matrix

val_report = classification_report(val_labels, y_pred)
val_confusion = confusion_matrix(val_labels, y_pred)
print(val_report)
print(val_confusion)


In [None]:
# Store the model and its tokenizer in the same directory.
finetuned_dir = "./finetuned_distilbert"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)
