In [1]:
import pandas as pd
import numpy as np
import nltk
import ssl
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
import contractions

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# ssl._create_default_https_context = ssl._create_unverified_context
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
 

: 

In [2]:
# ! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

## Read Data

In [3]:
file_path = 'amazon_reviews_us_Office_Products_v1_00.tsv'

data = pd.read_csv(file_path, sep='\t', usecols=['review_body', 'star_rating'], low_memory=False)


## Keep Reviews and Ratings

In [4]:
data.rename(columns={'review_body': 'Review', 'star_rating': 'Rating'}, inplace=True)


 ## We form three classes and select 20000 reviews randomly from each class.



In [5]:
RANDOM_NUM = 6
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce')

#drop NAN Value
data = data.dropna(subset=['Rating'])
data = data.dropna(subset=['Review'])

print("Statistics of the ratings:")
print(data['Rating'].value_counts().sort_values(ascending=False))

# Rating > 3 -> 1 positive，Rating <= 2 -> 0 negative，Rating == 3 -> None
data['Sentiment'] = data['Rating'].apply(lambda x: 1 if x > 3 else (0 if x <= 2 else None))

sentiment_counts = data['Sentiment'].value_counts(dropna=False)
print("Positive Count: "+str(sentiment_counts.get(1, 0)) + ", Negative Count: "+ str(sentiment_counts.get(0, 0)) + ", Neutral Count: " + str(data['Sentiment'].isna().sum() ))

# drop neutral
data = data.dropna(subset=['Sentiment'])

# get positive && negative 10,000 comments
positive_reviews = data[data['Sentiment'] == 1].sample(10000, random_state=RANDOM_NUM)
negative_reviews = data[data['Sentiment'] == 0].sample(10000, random_state=RANDOM_NUM)


Statistics of the ratings:
Rating
5.0    1582769
4.0     418358
1.0     306980
3.0     193683
2.0     138388
Name: count, dtype: int64
Positive Count: 2001127, Negative Count: 445368, Neutral Count: 193683


# Data Cleaning



# Pre-processing

In [6]:
# print sample output
print("Three sample reviews before data cleaning + preprocessing:")
random_reviews = positive_reviews.sample(n=3, random_state=RANDOM_NUM)
print(random_reviews[['Review', 'Rating']])

# print average len
print(f"Average review length before cleaning (Positive): {positive_reviews['Review'].apply(len).mean():.2f} characters")
print(f"Average review length before cleaning (Negative): {negative_reviews['Review'].apply(len).mean():.2f} characters")

# 1. lower case
positive_reviews['Review'] = positive_reviews['Review'].str.lower()
negative_reviews['Review'] = negative_reviews['Review'].str.lower()
# 2. remove HTML
positive_reviews['Review'] = positive_reviews['Review'].astype(str).apply(lambda x: BeautifulSoup(x, "html.parser").get_text() if "<" in x or ">" in x else x)
negative_reviews['Review'] = negative_reviews['Review'].astype(str).apply(lambda x: BeautifulSoup(x, "html.parser").get_text() if "<" in x or ">" in x else x)

# 3. remove URL
positive_reviews['Review'] = positive_reviews['Review'].apply(lambda x: re.sub(r"http\S+|www\S+", "", x))
negative_reviews['Review'] = negative_reviews['Review'].apply(lambda x: re.sub(r"http\S+|www\S+", "", x))

# 4. remove non-alphabetical
positive_reviews['Review'] = positive_reviews['Review'].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))
negative_reviews['Review'] = negative_reviews['Review'].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))


# 5. remove extra spaces
positive_reviews['Review'] = positive_reviews['Review'].apply(lambda x: re.sub(r"\s+", " ", x).strip())
negative_reviews['Review'] = negative_reviews['Review'].apply(lambda x: re.sub(r"\s+", " ", x).strip())


# 6. perform contractions
positive_reviews['Review'] = positive_reviews['Review'].apply(contractions.fix)
negative_reviews['Review'] = negative_reviews['Review'].apply(contractions.fix)


# print average len
print(f"Average review length after cleaning (Positive): {positive_reviews['Review'].apply(len).mean():.2f} characters")
print(f"Average review length after cleaning (Negative): {negative_reviews['Review'].apply(len).mean():.2f} characters")

# random_reviews = positive_reviews.sample(n=5, random_state=RANDOM_NUM)
# print(random_reviews[['Review', 'Rating']])

Three sample reviews before data cleaning + preprocessing:
                                                    Review  Rating
602720                                  They hold pencils!     5.0
1361624  I have been using these batteries for over a m...     5.0
1067633  Nice and Compact - does a perfect job punching...     5.0
Average review length before cleaning (Positive): 271.36 characters
Average review length before cleaning (Negative): 386.62 characters
Average review length after cleaning (Positive): 257.95 characters
Average review length after cleaning (Negative): 366.48 characters


## remove the stop words 

In [7]:
print(f"Average review length before preprocessing (Positive): {positive_reviews['Review'].apply(len).mean():.2f} characters")
print(f"Average review length before preprocessing (Negative): {negative_reviews['Review'].apply(len).mean():.2f} characters")
def remove_stop_words(review):
    # tokens = word_tokenize(review)
    tokens = review.split(" ")
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

positive_reviews['Review'] = positive_reviews['Review'].apply(remove_stop_words)

# random_reviews = positive_reviews.sample(n=5, random_state=RANDOM_NUM)
# print(random_reviews[['Review', 'Rating']])


Average review length before preprocessing (Positive): 257.95 characters
Average review length before preprocessing (Negative): 366.48 characters


## perform lemmatization  

In [8]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def perform_lemmatization(review):
    lemmatizer = WordNetLemmatizer()
    tokens = review.split(" ")
    pos_tags = nltk.pos_tag(tokens)
    
    lemmatized_tokens = []
    for word, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)
        lemmatized_tokens.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    
    return ' '.join(lemmatized_tokens)

positive_reviews['Review'] = positive_reviews['Review'].apply(perform_lemmatization)

print(f"Average review length after preprocessing (Positive): {positive_reviews['Review'].apply(len).mean():.2f} characters")
print(f"Average review length after preprocessing (Negative): {negative_reviews['Review'].apply(len).mean():.2f} characters")

print("Three sample reviews after data cleaning + preprocessing:")
random_reviews = positive_reviews.sample(n=3, random_state=RANDOM_NUM)
print(random_reviews[['Review', 'Rating']])


Average review length after preprocessing (Positive): 156.88 characters
Average review length after preprocessing (Negative): 366.48 characters
Three sample reviews after data cleaning + preprocessing:
                                                    Review  Rating
602720                                         hold pencil     5.0
1361624  use battery month phone charge like new great ...     5.0
1067633  nice compact perfect job punch planner insert ...     5.0


# TF-IDF Feature Extraction

In [10]:
all_reviews = pd.concat([positive_reviews, negative_reviews], ignore_index=True)

reviews = all_reviews['Review']
labels = all_reviews['Sentiment']

tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english') 

tfidf_features = tfidf_vectorizer.fit_transform(reviews)

tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df['Sentiment'] = labels.values

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=RANDOM_NUM)

# all_reviews = pd.concat([positive_reviews, negative_reviews], ignore_index=True)

# reviews = all_reviews['Review']
# labels = all_reviews['Sentiment']

# X_train_texts, X_test_texts, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=RANDOM_NUM)

# tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')  
# X_train = tfidf_vectorizer.fit_transform(X_train_texts)  
# X_test = tfidf_vectorizer.transform(X_test_texts)        

# tfidf_train_df = pd.DataFrame(X_train.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
# tfidf_train_df['Sentiment'] = y_train.values  
# print(X_train)
# print(y_train)

# Perceptron

In [11]:
perceptron = Perceptron()
perceptron.fit(X_train, y_train)

y_train_pred = perceptron.predict(X_train)

y_test_pred = perceptron.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Precision: {train_precision:.4f}")
print(f"Training Recall: {train_recall:.4f}")
print(f"Training F1-Score: {train_f1:.4f}")

print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing Precision: {test_precision:.4f}")
print(f"Testing Recall: {test_recall:.4f}")
print(f"Testing F1-Score: {test_f1:.4f}")


Training Accuracy: 0.9330
Training Precision: 0.9283
Training Recall: 0.9385
Training F1-Score: 0.9334
Testing Accuracy: 0.9220
Testing Precision: 0.9154
Testing Recall: 0.9300
Testing F1-Score: 0.9226


# SVM

In [12]:
svm_model = SVC(kernel='linear', random_state=RANDOM_NUM)
svm_model.fit(X_train, y_train)

y_train_pred = svm_model.predict(X_train)

y_test_pred = svm_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Precision: {train_precision:.4f}")
print(f"Training Recall: {train_recall:.4f}")
print(f"Training F1-Score: {train_f1:.4f}")

print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing Precision: {test_precision:.4f}")
print(f"Testing Recall: {test_recall:.4f}")
print(f"Testing F1-Score: {test_f1:.4f}")

Training Accuracy: 0.9496
Training Precision: 0.9383
Training Recall: 0.9624
Training F1-Score: 0.9502
Testing Accuracy: 0.9400
Testing Precision: 0.9284
Testing Recall: 0.9535
Testing F1-Score: 0.9408


# Logistic Regression

In [13]:
logistic_model = LogisticRegression(random_state=RANDOM_NUM)
logistic_model.fit(X_train, y_train)

y_train_pred = logistic_model.predict(X_train)

y_test_pred = logistic_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Precision: {train_precision:.4f}")
print(f"Training Recall: {train_recall:.4f}")
print(f"Training F1-Score: {train_f1:.4f}")

print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing Precision: {test_precision:.4f}")
print(f"Testing Recall: {test_recall:.4f}")
print(f"Testing F1-Score: {test_f1:.4f}")


Training Accuracy: 0.9463
Training Precision: 0.9356
Training Recall: 0.9586
Training F1-Score: 0.9470
Testing Accuracy: 0.9415
Testing Precision: 0.9345
Testing Recall: 0.9495
Testing F1-Score: 0.9420


# Naive Bayes

In [14]:
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(reviews)

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Precision: {train_precision:.4f}")
print(f"Training Recall: {train_recall:.4f}")
print(f"Training F1-Score: {train_f1:.4f}")

print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing Precision: {test_precision:.4f}")
print(f"Testing Recall: {test_recall:.4f}")
print(f"Testing F1-Score: {test_f1:.4f}")


Training Accuracy: 0.9337
Training Precision: 0.9285
Training Recall: 0.9394
Training F1-Score: 0.9339
Testing Accuracy: 0.9365
Testing Precision: 0.9324
Testing Recall: 0.9425
Testing F1-Score: 0.9374
