In [None]:
# ! pip install bs4 # in case you don't have it installed
# ! pip install contractions
# # Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
import os
os.chdir('/content/drive/Shared drives/USC_CSCI544-Applied NLP/HWs/HW1') # where the files for this project are

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read Data

In [None]:
df=pd.read_table('amazon_reviews_us_Office_Products_v1_00.tsv', on_bad_lines='skip')


  df=pd.read_table('amazon_reviews_us_Office_Products_v1_00.tsv', on_bad_lines='skip')


## Keep Reviews and Ratings

In [None]:
df = df[['review_body', 'star_rating']]

 ## We form three classes and select 20000 reviews randomly from each class.



In [None]:
## Create three class labels

df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')

# df['sentiment'] = np.where(df['star_rating'] <= 2, 0,  # Negative: 0
#                           np.where(df['star_rating'] > 3, 2, 1))  # Positive: 2, Neutral: 1

df = df.dropna(subset=['review_body'], how='all')

df['sentiment'] = np.where(df['star_rating'] > 3, 1, 0)  # Positive: 1, Negative: 0

# Print review counts per class
print("Before Discard")
print("Number of positive reviews:", df[df['sentiment'] == 1].shape[0])
print("Number of negative reviews:", df[df['sentiment'] == 0].shape[0])
print("Number of neutral reviews (discarded):", len(df[df['star_rating'] != 3]))

df = df[df['star_rating'] != 3]  # Discard neutral reviews (rating 3)

# Print review counts per class
print("Number of positive reviews:", df[df['sentiment'] == 1].shape[0])
print("Number of negative reviews:", df[df['sentiment'] == 0].shape[0])
print("Number of neutral reviews (discarded):", len(df[df['star_rating'] == 3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = np.where(df['star_rating'] > 3, 1, 0)  # Positive: 1, Negative: 0


Before Discard
Number of positive reviews: 2001122
Number of negative reviews: 639035
Number of neutral reviews (discarded): 2446471
Number of positive reviews: 2001122
Number of negative reviews: 445349
Number of neutral reviews (discarded): 0


In [None]:
# Randomly select 20,000 positive reviews
positive_reviews = df[df['sentiment'] == 1].sample(100000, random_state=42)

# Randomly select 20,000 negative reviews
negative_reviews = df[df['sentiment'] == 0].sample(100000, random_state=42)

# Concatenate the selected reviews to form the downsized DataFrame
downsized_df = pd.concat([positive_reviews, negative_reviews])

# # Print the first few rows of the downsized DataFrame
# print("Downsized DataFrame:")
# print(downsized_df.head())

## Print review counts per class
print("Number of positive reviews:", downsized_df[downsized_df['sentiment'] == 1].shape[0])
print("Number of negative reviews:", downsized_df[downsized_df['sentiment'] == 0].shape[0])
print("Number of neutral reviews (discarded):", len(downsized_df[downsized_df['star_rating'] == 3]))

Number of positive reviews: 100000
Number of negative reviews: 100000
Number of neutral reviews (discarded): 0


# Data Cleaning



# Pre-processing

## perform lemmatization, remove stop words, tokenize etc

In [None]:
def clean_text(text):
    # Remove HTML tags
    if isinstance(text, str) and text:
      soup = BeautifulSoup(text, 'html.parser')
      text = soup.get_text()

      # Remove special characters and digits
      text = re.sub(r'[^a-zA-Z\s]', '', text)

      # Remove URLs
      text = re.sub(r'https?://\S+', '', text)

      # Convert to lowercase
      text = text.lower()

      # Tokenize
      words = nltk.word_tokenize(text)

      # Remove stop words
      stop_words = set(nltk.corpus.stopwords.words('english'))
      words = [word for word in words if word not in stop_words]

      # Lemmatize
      lemmatizer = WordNetLemmatizer()
      words = [lemmatizer.lemmatize(word) for word in words]

      return ' '.join(words)
    else:
      return ''  # Return an empty string for empty or non-string inputs

downsized_df['clean_review'] = downsized_df['review_body'].apply(clean_text)

# Print the average length of the cleaned reviews
print("Average length of cleaned reviews:", downsized_df['clean_review'].str.len().mean())

  soup = BeautifulSoup(text, 'html.parser')


Average length of cleaned reviews: 191.54224


# TF-IDF Feature Extraction

In [None]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(downsized_df['clean_review'])
y = downsized_df['sentiment']

# shape of the feature matrix
print("Shape of feature matrix (X):", X.shape)

## Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Shape of feature matrix (X): (200000, 127016)


# Perceptron

In [None]:
perceptron_classifier = Perceptron(max_iter=1000)
perceptron_classifier.fit(X_train, y_train)
y_pred_perceptron = perceptron_classifier.predict(X_test)
print("Perceptron Accuracy:", accuracy_score(y_test, y_pred_perceptron))
print("Perceptron Classification Report:\n", classification_report(y_test, y_pred_perceptron))


Perceptron Accuracy: 0.853525
Perceptron Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86     20007
           1       0.87      0.83      0.85     19993

    accuracy                           0.85     40000
   macro avg       0.85      0.85      0.85     40000
weighted avg       0.85      0.85      0.85     40000



# Logistic Regression

In [None]:
logreg_classifier = LogisticRegression(multi_class='ovr')  # Handle multi-class classification
logreg_classifier.fit(X_train, y_train)
y_pred_logreg = logreg_classifier.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))


Logistic Regression Accuracy: 0.894875
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90     20007
           1       0.90      0.89      0.89     19993

    accuracy                           0.89     40000
   macro avg       0.89      0.89      0.89     40000
weighted avg       0.89      0.89      0.89     40000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Naive Bayes

In [None]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.8574
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86     20007
           1       0.88      0.83      0.85     19993

    accuracy                           0.86     40000
   macro avg       0.86      0.86      0.86     40000
weighted avg       0.86      0.86      0.86     40000



# SVM

In [None]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.895575
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90     20007
           1       0.90      0.89      0.90     19993

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000

