In [3]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz

# download dataset: https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz






## 1 Read Data

In [5]:
import requests
import gzip
df=pd.read_table('/content/amazon_reviews_us_Office_Products_v1_00.tsv',delimiter='\t',header=0,on_bad_lines='skip')
df.head()



  df=pd.read_table('/content/amazon_reviews_us_Office_Products_v1_00.tsv',delimiter='\t',header=0,on_bad_lines='skip')


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31
3,US,52782374,R1PR37BR7G3M6A,B00D7H8XB6,868449945,AmazonBasics 12-Sheet High-Security Micro-Cut ...,Office Products,1,2.0,3.0,N,Y,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,2015-08-31
4,US,24045652,R3BDDDZMZBZDPU,B001XCWP34,33521401,"Derwent Colored Pencils, Inktense Ink Pencils,...",Office Products,4,0.0,0.0,N,Y,Four Stars,Gorgeous colors and easy to use,2015-08-31


## Keep Reviews and Ratings

In [6]:
df=df[['star_rating','review_body']]

# 'star_rating', 'review_body'


In [7]:
df.head()

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use


 ## We form two classes and select 50000 reviews randomly from each class.



In [8]:
class1 = [1, 2, 3]
class2 = [4, 5]

class1 = df[df['star_rating'].isin(class1)].sample(n=50000, random_state=42,replace=True)
class2 = df[df['star_rating'].isin(class2)].sample(n=50000, random_state=42,replace=True)

balanced_df = pd.concat([class1, class2])
balanced_df = balanced_df.sample(frac=1, random_state=42)

print(class1.head(15))
print(class2.head(15))
# print output
print('')
print('')
print('')

# print Average length before data cleaning
class_1_clean=class1['review_body'].str.len().mean()
class_2_clean=class2['review_body'].str.len().mean()
balanced_clean=balanced_df['review_body'].str.len().mean()
print("Average length class 1 reviews before data is clean",class_1_clean)
print("Average length class 2 reviews before data is clean",class_2_clean)
print("Average length of balanced_df reviews before data is clean",balanced_clean)



        star_rating                                        review_body
654228            2                            Different than expected
844223            3                         its ink, what more to say.
768424            3  the color doesn't have the best quality althou...
1858111           3  Does not state in description that you need a ...
1371080           1  Opened box and looked pretty good. A closer lo...
643315            3  I liked the binder.  The color was ligher than...
597041            1  Blue light never goes out.<br />Clarity power ...
1136860           3  item works as described and is good value for ...
330870            1  At this point it deserves none but I will give...
795637            1  I'm a 19 year old technologically inclined col...
2269554         3.0  This is a nice product if it works well but i ...
2125933           3  It is a good pen but again I would recommend t...
487107            2  It didn't work. The other person said that the...
189496

In [9]:
print(balanced_df)

        star_rating                                        review_body
26707             5                     Awesome deal - great print job
624926            5  Great product! Exactly what we were looking fo...
1308258           1                                               Fair
642442            5  We are converting many videos to DVD's and the...
1815100           4  Needed this for my school got it fast. Would o...
...             ...                                                ...
922637            3  I bought those for Operation Christmas Childre...
398620            5  So glad I only needed this and don't have to r...
33368             5                            Exactly what we wanted.
895661            1  fabric of ribbon is very thin and hard to work...
1715693           1  Cute pens but not one worked:( I bought them f...

[100000 rows x 2 columns]


# 2 Data Cleaning



In [10]:
#- convert all reviews into lowercase
balanced_df['review_body']=balanced_df['review_body'].str.lower()


In [11]:
#- remove the URLs from the reviews
balanced_df['review_body']=balanced_df['review_body'].str.replace(r'http\S+|www\S+|https\S+','',case=False)
#- remove the HTMLs from the reviews
balanced_df['review_body']=balanced_df['review_body'].str.replace(r'<.*?>','')

  balanced_df['review_body']=balanced_df['review_body'].str.replace(r'http\S+|www\S+|https\S+','',case=False)
  balanced_df['review_body']=balanced_df['review_body'].str.replace(r'<.*?>','')


In [12]:
#- remove non-alphabetical characters
balanced_df['clean_review_body'] = balanced_df['review_body'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)
print(balanced_df)

        star_rating                                        review_body  \
26707             5                     awesome deal - great print job   
624926            5  great product! exactly what we were looking fo...   
1308258           1                                               fair   
642442            5  we are converting many videos to dvd's and the...   
1815100           4  needed this for my school got it fast. would o...   
...             ...                                                ...   
922637            3  i bought those for operation christmas childre...   
398620            5  so glad i only needed this and don't have to r...   
33368             5                            exactly what we wanted.   
895661            1  fabric of ribbon is very thin and hard to work...   
1715693           1  cute pens but not one worked:( i bought them f...   

                                         clean_review_body  
26707                        awesome deal  great p

In [13]:
#- remove extra spaces
balanced_df['clean_review_body']=balanced_df['clean_review_body'].str.strip()

In [14]:
#- perform contractions
# import library
!pip install contractions



In [15]:
import contractions
balanced_df['clean_review_body']=balanced_df['clean_review_body'].apply(lambda x:'' if pd.isna(x)or not isinstance(x,str) else x)
balanced_df['clean_review_body']=balanced_df['clean_review_body'].apply(lambda x: contractions.fix(x))


In [16]:
# Remove column name 'review_body'
balanced_df=balanced_df.drop(['review_body'], axis=1)

In [17]:
balanced_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 26707 to 1715693
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   star_rating        100000 non-null  object
 1   clean_review_body  100000 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [18]:
print(balanced_df)

        star_rating                                  clean_review_body
26707             5                      awesome deal  great print job
624926            5  great product exactly what we were looking for...
1308258           1                                               fair
642442            5  we are converting many videos to dvds and thes...
1815100           4  needed this for my school got it fast would or...
...             ...                                                ...
922637            3  i bought those for operation christmas childre...
398620            5  so glad i only needed this and do not have to ...
33368             5                             exactly what we wanted
895661            1  fabric of ribbon is very thin and hard to work...
1715693           1  cute pens but not one worked i bought them for...

[100000 rows x 2 columns]


In [19]:
# Average length of reviews before and after data cleaning (with a comma between them)
balanced_clean=balanced_df['clean_review_body'].str.len().mean()

In [20]:
print("Average length of balanced_df reviews after data is clean",balanced_clean)

Average length of balanced_df reviews after data is clean 266.17936


# 3 Pre-processing

## remove the stop words

In [21]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

balanced_df['filtered_reviews'] = balanced_df['clean_review_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

print(balanced_df)

        star_rating                                  clean_review_body  \
26707             5                      awesome deal  great print job   
624926            5  great product exactly what we were looking for...   
1308258           1                                               fair   
642442            5  we are converting many videos to dvds and thes...   
1815100           4  needed this for my school got it fast would or...   
...             ...                                                ...   
922637            3  i bought those for operation christmas childre...   
398620            5  so glad i only needed this and do not have to ...   
33368             5                             exactly what we wanted   
895661            1  fabric of ribbon is very thin and hard to work...   
1715693           1  cute pens but not one worked i bought them for...   

                                          filtered_reviews  
26707                         awesome deal great p

In [23]:
# Remove column name 'clean_review_body'
balanced_df=balanced_df.drop(['clean_review_body'], axis=1)

In [24]:
print(balanced_df)

        star_rating                                   filtered_reviews
26707             5                       awesome deal great print job
624926            5  great product exactly looking wish two pages s...
1308258           1                                               fair
642442            5  converting many videos dvds labels essential w...
1815100           4  needed school got fast would order form paper ...
...             ...                                                ...
922637            3  bought operation christmas children project un...
398620            5  glad needed replace tv easy install works expe...
33368             5                                     exactly wanted
895661            1                       fabric ribbon thin hard work
1715693           1  cute pens one worked bought daughter friends u...

[100000 rows x 2 columns]


## perform lemmatization  

In [25]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)



In [26]:
balanced_df['filtered_reviews'] = balanced_df['filtered_reviews'].apply(lemmatize_text)

In [27]:
type(balanced_df)

pandas.core.frame.DataFrame

In [28]:
# Average length of reviews before and after data preprocessing (with comma between them))
balanced_clean=balanced_df['filtered_reviews'].str.len().mean()

In [29]:
print("Average length of balanced_df reviews after data is pre-processed",balanced_clean)

Average length of balanced_df reviews after data is pre-processed 165.4256


In [30]:
## Train Test Split
# Split your dataset into 80% training dataset and 20% testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(balanced_df['filtered_reviews'], balanced_df['star_rating'],test_size=0.2)

# 4 TF-IDF and BoW Feature Extraction

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
u = TfidfVectorizer()
#transformed train reviews
tv_train_reviews=u.fit_transform(X_train)
#transformed test reviews
tv_test_reviews=u.transform(X_test)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)



Tfidf_train: (80000, 72335)
Tfidf_test: (20000, 72335)


In [33]:
#Countervectorize
u1=CountVectorizer()

#transformed train reviews
cv_train_reviews=u1.fit_transform(X_train)
#transformed test reviews
cv_test_reviews=u1.transform(X_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)


BOW_cv_train: (80000, 72335)
BOW_cv_test: (20000, 72335)


In [None]:
y_train = np.array(y_train)
y_test = np.array(y_test)

cv_train_array = cv_train_reviews.toarray()
tv_train_array = tv_train_reviews.toarray()
cv_test_array = cv_test_reviews.toarray()
tv_test_array = tv_test_reviews.toarray()

# 5 Perceptron Using Both Features

In [None]:
#Train a Perceptron model on your training dataset using the sklearn built-in implementation.
#Report Precision, Recall, and f1-score for training Perceptron using both BoW and TF-IDF features.
# These 6 values should be printed in two separate lines by the .py file
# for first BoW and then TF-IDF as follows - Precision Recall F1 - Precision Recall F1

# MULTILAYER PERCEPTRON CLASSIFIER
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(cv_train_array,y_train)
predmlp = mlp.predict(cv_test_array)
print("Confusion Matrix for Multilayer Perceptron Classifier:")
print(confusion_matrix(y_test,predmlp))
print("Score:",round(accuracy_score(y_test,predmlp)*100,2))
print("Classification Report:")
print(classification_report(y_test,predmlp))


mlp1 = MLPClassifier()
mlp1.fit(tv_train_array,y_train)
predmlp1 = mlp1.predict(tv_test_array)
print("Confusion Matrix for Multilayer Perceptron Classifier:")
print(confusion_matrix(y_test,predmlp1))
print("Score:",round(accuracy_score(y_test,predmlp1)*100,2))
print("Classification Report:")
print(classification_report(y_test,predmlp1))


# 6 SVM Using Both Features

In [None]:
#Train an SVM model on your training datasets using the sklearn built-in im- plementation.
#Report Precision, Recall, and f1-score
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=10,random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_array,y_train)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_array,y_train)
print(svm_tfidf)


In [None]:
#Predicting the model for bag of words
svm_bow_predict=svm.predict(cv_test_array)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm.predict(tv_test_array)
print(svm_tfidf_predict)

In [None]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(y_test,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(y_test,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

In [None]:
#Classification report for bag of words
svm_bow_report=classification_report(y_test,svm_bow_predict)
print(svm_bow_report)
#Classification report for tfidf features
svm_tfidf_report=classification_report(y_test,svm_tfidf_predict)
print(svm_tfidf_report)

# 7 Logistic Regression Using Both Features

In [1]:
#Train a Logistic Regression model on your training datasets using the sklearn built-in implementation.
#Report Precision, Recall, and f1-score

from sklearn.linear_model import LogisticRegression,SGDClassifier
#training the model
lr=LogisticRegression(penalty='l2',max_iter=10,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_array,y_train)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_array,y_train)
print(lr_tfidf)


NameError: ignored

In [None]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_array)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_array)
print(lr_tfidf_predict)

In [None]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

In [None]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

In [None]:
#Classification report for bag of words
lr_bow_report=classification_report(y_test,lr_bow_predict)
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(y_test,lr_tfidf_predict)
print(lr_tfidf_report)

# 8 Naive Bayes Using Both Features

In [None]:
#Train a Naive Bayes model on your training dataset using the sklearn built- in implementation
#Report Precision, Recall, and f1-score
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_array,y_train)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_array,y_train)
print(mnb_tfidf)


In [None]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_array)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_array)
print(mnb_tfidf_predict)

In [None]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(y_test,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(y_test,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

In [None]:
#Classification report for bag of words
mnb_bow_report=classification_report(y_test,mnb_bow_predict)
print(mnb_bow_report)
#Classification report for tfidf features
mnb_tfidf_report=classification_report(y_test,mnb_tfidf_predict)
print(mnb_tfidf_report)