In [1]:
# imports liberaries
import warnings 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import json 
import nltk
import spacy
import string
import unicodedata
from bs4 import BeautifulSoup
from textblob import TextBlob 
from nltk.stem import WordNetLemmatizer

from IPython import display 
display.set_matplotlib_formats('svg')
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
#load the dataset
data = pd.read_csv("/content/gdrive/MyDrive/Reviews.csv")
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
#check the shape
data.shape

(568454, 10)

In [5]:
data = data[1:5000]

In [6]:
data = data[['Score','Text']]
data.head()

Unnamed: 0,Score,Text
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...
5,4,I got a wild hair for taffy and ordered this f...


In [7]:
# check whether it's having null values or not
data.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of       Score   Text
1     False  False
2     False  False
3     False  False
4     False  False
5     False  False
...     ...    ...
4995  False  False
4996  False  False
4997  False  False
4998  False  False
4999  False  False

[4999 rows x 2 columns]>

In [8]:
# check unique categories
data['Score'].value_counts()

5    3133
4     712
1     469
3     395
2     290
Name: Score, dtype: int64

In [9]:
# if rating is above 3 we will consider it as 1 else as 0.
data['Score'] = data['Score'].apply(lambda x: 1 if x > 3 else 0) # positive as 1 and negative as 0

#### visualiza words using WordCloud

In [10]:
data['Score'].value_counts()

1    3845
0    1154
Name: Score, dtype: int64

In [11]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

# get the text based on scores!
score_zero = data['Text'][data['Score'] == 0]
score_one = data['Text'][data['Score'] == 1]

#word tokenize the data(convert to string)
def list_tokenizer(Score):
    Scores = " ".join(Score)
    Scores = nltk.word_tokenize(Scores)
    return str(Scores)

# visulizer
def word_cloud(Score, number):
    wc = WordCloud(background_color= 'pink', max_font_size= 45, max_words= 10)
    wc.generate(Score)
    plt.imshow(wc, interpolation= 'bilinear')
    plt.title(f'WorldCloud for {number}')
    plt.axis('off')



In [13]:
# Let's visualizing the Scores
#word_cloud(list_tokenizer(score_zero), 0)

In [14]:
#word_cloud(list_tokenizer(score_one), 0)

#### Preprocessing

In [14]:
# 1. lowering words
data['Text'] = data['Text'].str.lower()
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled as jumbo salted peanut...
2,1,this is a confection that has been around a fe...
3,0,if you are looking for the secret ingredient i...
4,1,great taffy at a great price. there was a wid...
5,1,i got a wild hair for taffy and ordered this f...


In [15]:
# 2. remove punctuation!
data['Text'] = data['Text'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled as jumbo salted peanut...
2,1,this is a confection that has been around a fe...
3,0,if you are looking for the secret ingredient i...
4,1,great taffy at a great price there was a wide...
5,1,i got a wild hair for taffy and ordered this f...


In [16]:
#3. remove stopwords
from spacy.lang.en.stop_words import STOP_WORDS
data['Text'] = data['Text'].apply(lambda x: " ".join([i for i in x.split() if i not in STOP_WORDS]))
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection centuries light pillowy citrus gela...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


In [17]:
#4. remove url and tags! 
data['Text'] = data['Text'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection centuries light pillowy citrus gela...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


In [18]:
# 5. remove html tags! 
data['Text'] = data['Text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection centuries light pillowy citrus gela...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


In [20]:
# 6. spelling correction 
# %time 
#data['Text'] = data['Text'].apply(lambda x: TextBlob(x).correct())

#data.head()

In [19]:
# 7. remove emails
data['Text'] = data['Text'].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', '', x))
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection centuries light pillowy citrus gela...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


In [20]:
# 8. remove extra spaces
data['Text'] = data['Text'].apply(lambda x: " ".join(x.split()))
data.head()

Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection centuries light pillowy citrus gela...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


In [21]:
# 9. lemmatizer 
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
%time
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

data["Text"] = data["Text"].apply(lambda text: lemmatize_words(text))
data.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.34 µs


Unnamed: 0,Score,Text
1,0,product arrived labeled jumbo salted peanutsth...
2,1,confection century light pillowy citrus gelati...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered pound bag taffy en...


#### Convert text to numbers

In [22]:
# data split 
from sklearn.model_selection import train_test_split 
xtrain, xtest, ytrain, ytest = train_test_split(data['Text'], data['Score'], test_size = 0.3)

In [23]:
xtrain.shape
#xtest.shape

(3499,)

In [24]:
# method 1: Bag of words(BOW)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

xtrain_bow = vectorizer.fit_transform(xtrain).toarray()
xtest_bow = vectorizer.transform(xtest).toarray()


In [25]:
# method 2: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer  

tf_vectorizer = TfidfVectorizer()

# let's convert 
xtrain_tf = tf_vectorizer.fit_transform(xtrain).toarray()
xtest_tf = tf_vectorizer.transform(xtest).toarray()

#### model: Gaussina NB


In [26]:
from sklearn.naive_bayes import GaussianNB

#BOW
clf_bow = GaussianNB().fit(xtrain_bow, ytrain)  # fitting 
prediction_bow = clf_bow.predict(xtest_bow)  # predictions

#TF-IDF
clf_tf = GaussianNB().fit(xtrain_tf, ytrain)
prediction_tf = clf_tf.predict(xtest_tf)

#### Model evaluation

In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

def metrics(prediction, actual): 
    print('Confusion_matrix \n', confusion_matrix(actual, prediction))
    print('\nAccuracy:', accuracy_score(actual, prediction))
    print('\nclassification_report\n')
    print(classification_report(actual, prediction))
    
#BOW   
metrics(prediction_bow, ytest)

Confusion_matrix 
 [[164 182]
 [431 723]]

Accuracy: 0.5913333333333334

classification_report

              precision    recall  f1-score   support

           0       0.28      0.47      0.35       346
           1       0.80      0.63      0.70      1154

    accuracy                           0.59      1500
   macro avg       0.54      0.55      0.53      1500
weighted avg       0.68      0.59      0.62      1500



In [28]:
#TF-IDF
metrics(prediction_tf, ytest)

Confusion_matrix 
 [[160 186]
 [424 730]]

Accuracy: 0.5933333333333334

classification_report

              precision    recall  f1-score   support

           0       0.27      0.46      0.34       346
           1       0.80      0.63      0.71      1154

    accuracy                           0.59      1500
   macro avg       0.54      0.55      0.52      1500
weighted avg       0.68      0.59      0.62      1500

