## Best Practices : 
1. Preprocessing and cleaning
2. Train Test Split
3. BOW, TFIDF, Word2Vec
4. Train ML Algorithms


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [32]:
data = pd.read_csv('all_kindle_review .csv')

In [33]:
data = data[['reviewText', 'rating']]

In [34]:
data['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [35]:
data['rating'].value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [36]:
data.shape

(12000, 2)

### Data preprocessing

In [37]:
# 0 for negative review and 1 for positive 
data['rating'] = data['rating'].apply(lambda x : 0 if x < 3 else 1)

In [38]:
data['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

## Text Preprocessing

In [39]:
data['reviewText'] = data['reviewText'].str.lower()

In [40]:
data.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [66]:
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
# Removing special characters :
data['reviewText'] = data['reviewText'].apply(lambda x: re.sub('[^a-z A-Z]', '', x))

# Remove the stopwords
data['reviewText'] = data['reviewText'].apply(lambda x : ' '.join([i for i in x.split() if i not in stopwords.words('english')]))

# Remove URLs
data['reviewText'] = data['reviewText'].apply(lambda x : re.sub(r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#?&//=]+", '', str(x)))

# remove html tags
data['reviewText'] = data['reviewText'].apply(lambda x : BeautifulSoup(x, 'html.parser').get_text())

# Remove any additional extra space 
data['reviewText'] = data['reviewText'].apply(lambda x : ' '.join(x.split()))

In [67]:
data

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read want put read one sitting sex...,1
2,ill start saying first four books expecting co...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1
...,...,...
11995,valentine cupid vampire jena ian another vampi...,1
11996,read seven books series apocalypticadventure o...,1
11997,book really cuppa situation man capturing woma...,1
11998,tried use charge kindle even register charging...,0


In [None]:
# test re
# print(re.sub('[^a-z A-Z]', ' ', 'my@ name! is harsh'))




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting html5lib
  Using cached html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib
Successfully installed html5lib-1.1



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [68]:
# lemmetizer

from nltk.stem import WordNetLemmatizer

In [69]:
lemmtizer = WordNetLemmatizer()

def lemmetize_word(text):
    return ' '.join([lemmtizer.lemmatize(j) for j in text.split()])

In [72]:
data['reviewText'] = data['reviewText'].apply(lemmetize_word)

In [73]:
data

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read want put read one sitting sex...,1
2,ill start saying first four book expecting con...,1
3,aggie angela lansbury carry pocketbook instead...,1
4,expect type book library pleased find price right,1
...,...,...
11995,valentine cupid vampire jena ian another vampi...,1
11996,read seven book series apocalypticadventure on...,1
11997,book really cuppa situation man capturing woma...,1
11998,tried use charge kindle even register charging...,0


In [75]:
### Train Test Split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data['reviewText'], data['rating'])

In [77]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [79]:
bow = CountVectorizer()
x_train_bow = bow.fit_transform(x_train).toarray()
x_test_bow = bow.transform(x_test).toarray()

tfidf = TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(x_train).toarray()
x_test_tfidf = tfidf.transform(x_test).toarray()

In [82]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

In [81]:
gnb = GaussianNB()
mnb = MultinomialNB()

In [83]:
def model_train(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    return accuracy_score(y_test, y_pred)

In [84]:
model_train(gnb, x_train_bow, x_test_bow, y_train, y_test)

0.5986666666666667

In [85]:
model_train(gnb, x_train_tfidf, x_test_tfidf, y_train, y_test)

0.595

In [86]:
model_train(mnb, x_train_tfidf, x_test_tfidf, y_train, y_test)

0.6933333333333334

In [None]:
model_train(mnb, x_train_bow, x_test_bow, y_train, y_test)