## Best Practices

1. Preprocessing And Cleaning(feature engineering)
2. Train Test Split
3. Bow, Tf-Idf, Word2Vec
4. Train ML algorithms

In [1]:
# Load the dataset
import pandas as pd
data = pd.read_csv('datasets/all_kindle_review.csv')

In [2]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [3]:
df = data[['reviewText','rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [None]:
df.shape

(12000, 2)

In [None]:
## missing values
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [6]:
df['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [7]:
df['rating'].value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [8]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


## Preprocessing and Cleaning

In [9]:
df = df.copy()

In [10]:
## positive review is 1 and negative review is 0
df['rating'] = df['rating'].apply(lambda x:0 if x<3 else 1)

In [11]:
df['rating'].unique()

array([1, 0], dtype=int64)

In [12]:
## 1. Lower All the Cases
df['reviewText'] = df['reviewText'].str.lower()

In [None]:
df.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [14]:
import re
import nltk
from nltk.corpus import stopwords

In [15]:
from bs4 import BeautifulSoup

In [16]:
## Removing Special Characters
df['reviewText'] = df['reviewText'].apply(lambda x:re.sub('[^a-z A-Z 0-9-]+','',x))

## Removing the stopwords
# df['reviewText'] = df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

## Removing url
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', str(x)))

## Removing  html tags
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

## Removing any additional spaces
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

In [17]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [18]:
## Lemmatizer 
from  nltk.stem import WordNetLemmatizer

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [21]:
df['reviewText'] = df['reviewText'].apply(lambda x:lemmatize_words(x))

In [22]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,1
3,aggie angela lansbury carry pocketbook instead...,1
4,expect type book library pleased find price right,1


In [23]:
## Train Test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df['reviewText'],df['rating'],test_size=0.20)

In [24]:
X_train.shape

(9600,)

## Implementing Word2Vec

In [25]:
import gensim

In [26]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

In [27]:
# Tokenize the reviews
X_train_tokens = [word_tokenize(review.lower()) for review in X_train]
X_test_tokens = [word_tokenize(review.lower()) for review in X_test]

In [28]:
X_train_tokens

[['iam',
  'literary',
  'critic',
  'tough',
  'time',
  'spelling',
  'correctly',
  'said',
  'give',
  'author',
  'one',
  'star',
  'writing',
  'book',
  'seriesiam',
  'huge',
  'fan',
  'avid',
  'reader',
  'military',
  'space',
  'opera',
  'strong',
  'heroine',
  'gave',
  'work',
  'try',
  'hooked',
  'enough',
  'buy',
  'last',
  'novel',
  'see',
  'ended',
  'hence',
  'second',
  'star',
  'bashing',
  'work',
  'iam',
  'prude',
  'religious',
  'do-gooder',
  'sexual',
  'relation',
  'deleted',
  'wouldnt',
  'much',
  'story',
  'left',
  'whats',
  'left',
  'believeable',
  'least',
  'any-one',
  'military',
  'experience',
  'common',
  'sensethe',
  'fact',
  'series',
  'huge',
  'potential',
  'science',
  'isnt',
  'bad',
  'main',
  'character',
  'loveable',
  'many',
  'battle',
  'scene',
  'well',
  'described',
  'five',
  'star',
  'work'],
 ['enjoyable',
  'story',
  'start',
  'twist',
  'plot',
  'change',
  'constantly',
  'trying',
  'figure

1. vector_size=100: Dimension of word embeddings.
2. window=5: Context window size.
3. min_count=2: Ignores words that appear less than twice.
4. sg=1: Skip-gram model (use sg=0 for CBOW).

In [29]:
# Train Word2Vec on training tokens
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=2, workers=4, sg=1)

In [None]:
#to get all the Vocabulary
word2vec_model.wv.index_to_key

['book',
 'story',
 'read',
 'one',
 'character',
 'like',
 'good',
 'would',
 'really',
 'love',
 'time',
 'get',
 'author',
 'reading',
 'series',
 'much',
 'first',
 'well',
 'even',
 'didnt',
 'short',
 'know',
 'way',
 'could',
 'great',
 'make',
 'little',
 '-',
 'sex',
 'dont',
 'thing',
 'want',
 'two',
 'think',
 'find',
 'plot',
 'also',
 'romance',
 'im',
 'life',
 'end',
 'go',
 'see',
 'enjoyed',
 'never',
 'scene',
 'take',
 'woman',
 'kindle',
 'written',
 'many',
 'lot',
 'say',
 'work',
 'bit',
 'thought',
 'going',
 'give',
 'found',
 'year',
 'writing',
 'interesting',
 'liked',
 'got',
 'loved',
 'novel',
 'feel',
 'another',
 'still',
 'better',
 'back',
 'though',
 'man',
 'enough',
 'come',
 'people',
 'hot',
 'reader',
 'made',
 'something',
 'review',
 'cant',
 'part',
 'free',
 'page',
 'need',
 'star',
 'friend',
 'bad',
 'keep',
 'new',
 'world',
 'wasnt',
 'doesnt',
 'relationship',
 'enjoy',
 'felt',
 'recommend',
 'together',
 'next',
 'start',
 'best',
 

In [31]:
# word2vec_model.corpus_count
word2vec_model.corpus_total_words

528255

In [32]:
word2vec_model.epochs

5

In [33]:
def avg_word2vec(tokens, model, vector_size):
    """Compute the average Word2Vec for a list of tokens."""
    vectors = [model.wv[word] for word in tokens if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

In [35]:
# Convert reviews to average Word2Vec vectors
X_train_avg = np.array([avg_word2vec(tokens, word2vec_model, 100) for tokens in X_train_tokens])
X_test_avg = np.array([avg_word2vec(tokens, word2vec_model, 100) for tokens in X_test_tokens])

In [36]:
X_train_avg.shape

(9600, 100)

In [37]:
y_train.shape

(9600,)

In [38]:
X_train_avg[0]

array([-6.85541704e-02,  1.33944884e-01,  1.18232341e-02,  7.59147853e-02,
       -2.52504945e-02, -3.50893080e-01,  1.30040899e-01,  4.45157170e-01,
       -1.57052010e-01, -2.46515930e-01,  4.36806343e-02, -2.64905483e-01,
       -9.60062817e-02,  1.60317779e-01,  6.68976009e-02, -2.48591840e-01,
        6.30708337e-02, -2.68512726e-01, -1.19125351e-01, -4.20164883e-01,
        7.38725513e-02,  6.27173334e-02,  1.00196011e-01, -3.87590267e-02,
       -6.05193600e-02,  1.64753437e-01, -1.32651001e-01, -7.53857568e-02,
       -9.64473188e-02,  8.28819536e-03,  1.90445215e-01,  3.61470804e-02,
        8.14405456e-02, -2.12549433e-01, -1.11483715e-01,  2.33321130e-01,
        8.25535432e-02, -1.79540232e-01, -4.65773270e-02, -3.21557015e-01,
       -3.53576802e-02, -2.05681667e-01,  1.73125900e-02,  1.59416333e-01,
        3.59129429e-01, -5.17748110e-02, -1.61928728e-01,  2.00907048e-02,
        1.40834510e-01,  1.21113181e-01,  5.86525276e-02, -1.83487639e-01,
       -1.02905214e-01, -

In [39]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB().fit(X_train_avg,y_train)

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [41]:
y_pred = nb_model.predict(X_test_avg)

In [42]:
print(accuracy_score(y_test, y_pred))

0.7479166666666667


In [43]:
confusion_matrix(y_test, y_pred)

array([[ 593,  184],
       [ 421, 1202]], dtype=int64)

In [44]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.58      0.76      0.66       777
           1       0.87      0.74      0.80      1623

    accuracy                           0.75      2400
   macro avg       0.73      0.75      0.73      2400
weighted avg       0.78      0.75      0.75      2400



In [46]:
from sklearn.linear_model import LogisticRegression
# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train_avg, y_train)

# Predict on the test set
y_pred_LR = log_reg.predict(X_test_avg)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_LR)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_LR)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_LR)}")

Accuracy: 0.81125
Confusion Matrix:
[[ 475  302]
 [ 151 1472]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.61      0.68       777
           1       0.83      0.91      0.87      1623

    accuracy                           0.81      2400
   macro avg       0.79      0.76      0.77      2400
weighted avg       0.81      0.81      0.81      2400



In [48]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Perform Grid Search
grid_search = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_avg, y_train)

# Best model evaluation
best_model = grid_search.best_estimator_
y_pred_GS = best_model.predict(X_test_avg)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_GS)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_GS)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred_GS)}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'C': 10}
Accuracy: 0.8120833333333334
Confusion Matrix:
[[ 482  295]
 [ 156 1467]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.62      0.68       777
           1       0.83      0.90      0.87      1623

    accuracy                           0.81      2400
   macro avg       0.79      0.76      0.77      2400
weighted avg       0.81      0.81      0.81      2400

