In [1]:
import warnings
warnings.filterwarnings('ignore')

### Best Practices
1. Preprocessing and Cleaning
2. Train Test Split
3. BOW,TF-IDF,Word2Vec
4. Train ML Algo

### Load DataSet

In [2]:
import pandas as pd
data = pd.read_csv('Data/all_kindle_review.csv')

In [3]:
data.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400


In [4]:
df = data[['reviewText','rating']]

In [5]:
df.head(2)

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5


In [6]:
df.shape

(12000, 2)

In [7]:
df.isna().sum()

reviewText    0
rating        0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0.1    12000 non-null  int64 
 1   Unnamed: 0      12000 non-null  int64 
 2   asin            12000 non-null  object
 3   helpful         12000 non-null  object
 4   rating          12000 non-null  int64 
 5   reviewText      12000 non-null  object
 6   reviewTime      12000 non-null  object
 7   reviewerID      12000 non-null  object
 8   reviewerName    11962 non-null  object
 9   summary         11998 non-null  object
 10  unixReviewTime  12000 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 1.0+ MB


In [9]:
df.rating.unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [10]:
df.rating.value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

### Preprocessing

In [11]:
# positive review is 1 and negative review is 0
df.rating = data.rating.apply(lambda x:0 if x<3 else 1)

In [12]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1


In [13]:
df.rating.unique()

array([1, 0], dtype=int64)

In [14]:
df.rating.value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [15]:
df.reviewText = df.reviewText.str.lower()

In [16]:
df.head(2)

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1


In [17]:
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 

In [18]:
## Removing special characters
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))

In [19]:
## Remove the stopswords
df['reviewText']=df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))

In [20]:
## Remove url
df['reviewText']=df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))

In [21]:
## Remove html tags
df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

In [22]:
## Remove any additional spaces
df['reviewText']=df['reviewText'].apply(lambda x: " ".join(x.split()))

In [23]:
df.head(2)

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1


In [24]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

In [25]:
def lem_w(text):
    return ' '.join([lem.lemmatize(w) for w in text.split()])

In [26]:
df.reviewText = df.reviewText.apply(lambda x: lem_w((x)))

In [27]:
df.head(3)

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,1


### Model Training with BOW & TFIDF

In [28]:
from sklearn.model_selection import train_test_split 

In [31]:
Xtrain,Xtest,ytrain,ytest = train_test_split(df.reviewText,df.rating, test_size = 0.20,random_state=42)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
Xtrain_bow = bow.fit_transform(Xtrain).toarray()
Xtest_bow = bow.transform(Xtest).toarray()

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
Xtrain_tfidf = tfidf.fit_transform(Xtrain).toarray()
Xtest_tfidf = tfidf.transform(Xtest).toarray()

In [39]:
Xtrain_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
from sklearn.naive_bayes import GaussianNB

In [45]:
nb_Model = GaussianNB().fit(Xtrain_bow,ytrain) 

In [49]:
nb_Model_tf = GaussianNB().fit(Xtrain_tfidf,ytrain)

In [51]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [53]:
ypred_bow = nb_Model.predict(Xtest_bow)
ypred_tf = nb_Model_tf.predict(Xtest_tfidf)

In [54]:
accuracy_score(ypred_bow,ytest),accuracy_score(ypred_tf,ytest)

(0.5745833333333333, 0.57875)

In [59]:
print('bow :- \n',confusion_matrix(ypred_bow,ytest),'\n\n','tfidf :- \n', confusion_matrix(ypred_tf,ytest))

bow :- 
 [[499 717]
 [304 880]] 

 tfidf :- 
 [[488 696]
 [315 901]]


# Word2Vec

In [66]:
df.head(2)

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1


In [67]:
df.shape

(12000, 2)

In [79]:
import gensim

In [65]:
docs = list(df.reviewText)
len(docs)

12000

In [72]:
words = [w.split() for w in docs]
len(words)

12000

In [76]:
model = gensim.models.Word2Vec(words)

In [78]:
model.wv.index_to_key[:5]

['book', 'story', 'read', 'one', 'character']

In [80]:
model.corpus_count

12000

In [81]:
model.epochs

5

In [85]:
import numpy as np
def avgWV(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis = 0)

In [86]:
X = []
for w in words:
    X.append(avgWV(w))

In [88]:
len(X),len(X[0])

(12000, 100)

In [92]:
X[0].reshape(1,-1).shape

(1, 100)

In [95]:
dfwv = pd.DataFrame()
for i in range(0,len(X)):
    dfwv = pd.concat([dfwv,pd.DataFrame(X[i].reshape(1,-1))],axis = 0,ignore_index = True)

In [96]:
dfwv.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.193763,0.279106,-0.044467,0.141857,0.107871,-0.466126,-0.077451,0.532469,-0.052136,-0.206572,...,0.379132,0.002741,-0.029775,0.052788,0.407013,0.348429,0.057285,-0.391147,0.126682,-0.090979
1,-0.217396,0.109931,-0.097034,0.233596,0.21889,-0.565343,0.380792,0.703182,-0.369581,-0.368192,...,0.534721,0.116042,-0.076574,-0.065245,0.525329,0.113267,0.120367,-0.285023,0.243395,-0.445458
2,-0.248413,0.211981,-0.03736,0.168468,0.220052,-0.591495,0.343882,0.729237,-0.243123,-0.345097,...,0.47526,0.176894,0.039547,-0.036995,0.414921,0.137021,0.13771,-0.272912,0.1126,-0.203263


In [97]:
dfwv['output'] = df.rating

In [98]:
dfwv.sample()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,output
7553,-0.314995,0.063043,0.089129,0.323853,0.353265,-0.498281,0.694354,0.827807,-0.285841,-0.50872,...,0.069026,-0.195198,-0.234344,0.527728,0.126993,0.3772,-0.114625,-0.052858,-0.372876,1


In [99]:
X = dfwv.drop(['output'],axis = 1)
y = dfwv.output

In [101]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.25,random_state = 42)

In [103]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=120,max_depth=5,n_jobs=-1)

In [104]:
rfc.fit(Xtrain,ytrain)

In [106]:
ypred = rfc.predict(Xtest)

In [107]:
accuracy_score(ytest,ypred)

0.754

In [109]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.71      0.48      0.57      1024
           1       0.77      0.90      0.83      1976

    accuracy                           0.75      3000
   macro avg       0.74      0.69      0.70      3000
weighted avg       0.75      0.75      0.74      3000



In [110]:
confusion_matrix(ytest,ypred)

array([[ 490,  534],
       [ 204, 1772]], dtype=int64)