## **Goal: Predict whether an Amazon review is Positive or Negative, based only on the text**

#### **Step 1: Load the Dataset**

In [1]:
# Step 1: Load the dataset
import pandas as pd

# Read the CSV file from the data folder
df = pd.read_csv("data/Reviews.csv")

# Check the shape (rows, columns)
print("Dataset shape:", df.shape)

# View the first 5 rows
df.head()

Dataset shape: (568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


#### **Step 2: Select the Columns As Needed**

In [2]:
# Step 2: Keep only the 'Text' and 'Score' columns
df = df[['Text', 'Score']]

# Show a sample
df.head()


Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


#### **Step 3: Data Prepping: Select the Columns As Needed**

In [3]:
# Step 3: Filter the dataset to remove Score 3
df = df[df['Score'] != 3]

# Create new binary sentiment labels
df['Sentiment'] = df['Score'].apply(lambda x: 'Positive' if x > 3 else 'Negative')

# Check value counts
print(df['Sentiment'].value_counts())
df.head()

Sentiment
Positive    443777
Negative     82037
Name: count, dtype: int64


Unnamed: 0,Text,Score,Sentiment
0,I have bought several of the Vitality canned d...,5,Positive
1,Product arrived labeled as Jumbo Salted Peanut...,1,Negative
2,This is a confection that has been around a fe...,4,Positive
3,If you are looking for the secret ingredient i...,2,Negative
4,Great taffy at a great price. There was a wid...,5,Positive


#### **Step 4: Text Cleaning for NLP**

In [11]:
# Step 4: Clean the review text
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Define a clean_text function
def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))  # remove punctuation/numbers
    text = text.lower().split()  # lowercase and tokenize
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    return ' '.join(text)

# Apply it to the 'Text' column
df['Cleaned_Review'] = df['Text'].apply(clean_text)

# Show a few cleaned results
df[['Text', 'Cleaned_Review']].head()

# Remove rows where the cleaned review is empty
df = df[df['Cleaned_Review'].str.strip() != '']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mayeshamalihaproma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### **Step 5: Convert Text to Numbers (TF-IDF Vectorization)**

In [12]:
# Step 5: Convert text into numeric form using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer (Use only the top 5000 important words)
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text (Learns vocab and transforms text)
X = vectorizer.fit_transform(df['Cleaned_Review']).toarray()

# Labels (Positive or Negative)
y = df['Sentiment']

import numpy as np

print("NaNs in X:", np.isnan(X).sum())
print("Infs in X:", np.isinf(X).sum())
print("Zero vectors (empty reviews):", np.sum(np.sum(X, axis=1) == 0))

NaNs in X: 0
Infs in X: 0
Zero vectors (empty reviews): 8


#### **Step 6: Train-Test Split**

In [13]:
# Step 6: Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Insert this RIGHT HERE ⬇️
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### **Step 7: Train a Classifier (Logistic Regression)**

In [None]:
# Step 7: Train a classifier (Logistic Regression)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
