In [2]:
from gensim.models import Word2Vec,KeyedVectors
import spacy

In [3]:
import pandas as pd
import re
df = pd.read_csv(r"all_kindle_reviews .csv")

In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


#Best Practices

-> Preprocessing and cleaning
-> Train Test Split
-> BOW, TF-IDF, Word2Vec
-> Train ML algorithms

In [5]:
df=df[['reviewText','rating']]

In [6]:
df.shape

(12000, 2)

In [7]:
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [8]:
df['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [9]:
df['rating'].value_counts()

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [10]:
#preprocessing and cleaning

#rating < 3 --> negative else positive

df['rating']=df['rating'].apply(lambda x:0 if x < 3 else 1)

In [11]:
df['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [12]:
## converting to lower case

df['reviewText'] = df['reviewText'].str.lower()

In [13]:
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import lxml


In [14]:
df.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1


In [15]:
nlp = spacy.load('en_core_web_sm')

In [16]:
X = df['reviewText']
y = df['rating']

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [18]:
X_train.shape,X_test.shape

((8040,), (3960,))

In [19]:
X_train

10028    pretending to live a life you don't so you can...
5677     was just getting into the story when bammm it ...
9622     this book has some great historical facts but ...
3232     really enjoyed this story.  keeps you guessing...
10528    william sent me a copy of this book, my very f...
                               ...                        
11964    i downloaded this book before reading any revi...
5191     this was by far one of the hottest books i've ...
5390     even though this book was free i had some rese...
860      a little too mushy and &#34;must take care of ...
7270     this book was good. it has a good set up of ch...
Name: reviewText, Length: 8040, dtype: object

In [20]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()

X_train_bow = bow.fit_transform(X_train).toarray()

X_test_bow = bow.transform(X_test).toarray()



In [21]:
from sklearn.ensemble import RandomForestClassifier

model1 = RandomForestClassifier()



In [22]:
X_train_token = [sentence.split() for sentence in X_train]
X_test_token = [sentence.split() for sentence in X_test]

In [64]:
w2v = Word2Vec(sentences=X_train_token,vector_size=100,window=5,min_count=1,workers=4,epochs=5)
def document_to_vector(doc, model):
    words = [word for word in doc if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)  # Handle empty cases
    return np.mean(model.wv[words], axis=0)

In [65]:
X_train_word2vec = np.array([document_to_vector(doc, w2v) for doc in X_train_token])
X_test_word2vec = np.array([document_to_vector(doc, w2v) for doc in X_test_token])

# Now fit your model using X_train_word2vec
model1.fit(X_train_word2vec, y_train)

In [66]:
y_pred = model1.predict(X_test_word2vec)

Without cleaning the dataset

In [68]:
from sklearn.metrics import accuracy_score,classification_report

accuracy_score(y_pred,y_test)

0.7340909090909091

In [69]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.38      0.71      0.50       737
           1       0.92      0.74      0.82      3223

    accuracy                           0.73      3960
   macro avg       0.65      0.72      0.66      3960
weighted avg       0.82      0.73      0.76      3960



for converting a dataset to w2v
1. Data cleaning
2. Train W2V model from gensim library
3. Before training in W2V, the training data should tokenized
4. We should convert each data to list of words
5. These W2V provides embeddings for each indiviual words but for   ML models it requires numerical features for entire sentence
6. So we should create a function to convert sentences to vectors
7. words = [word for word in doc if word in model.wv]
8. Then the vector is of size 100 we should be avg
9. Training the model
10. Prediction of the model
11. For better prediction score we should perform data preprocessing and cleaning

with cleaning the dataset

In [29]:
X

0        jace rankin may be short, but he's nothing to ...
1        great short read.  i didn't want to put it dow...
2        i'll start by saying this is the first of four...
3        aggie is angela lansbury who carries pocketboo...
4        i did not expect this type of book to be in li...
                               ...                        
11995    valentine cupid is a vampire- jena and ian ano...
11996    i have read all seven books in this series. ap...
11997    this book really just wasn't my cuppa.  the si...
11998    tried to use it to charge my kindle, it didn't...
11999    taking instruction is a look into the often hi...
Name: reviewText, Length: 12000, dtype: object

Cleaning the dataset

In [34]:
import string

import nltk
from nltk.corpus import stopwords

X = [sentence.lower() for sentence in X]

X = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in X]

import re

X = [re.sub(r'\d+', '', sentence) for sentence in X]


# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

X = [' '.join([word for word in sentence.split() if word not in stop_words]) for sentence in X]

from nltk.stem import WordNetLemmatizer

# Download necessary data
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()






[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [36]:
X_train1,X_test1,y_train1,y_test1 = train_test_split(X,y,test_size=0.25,random_state=42)

In [38]:
X_train1_token = [sentence.split() for sentence in X_train1]
X_test1_token = [sentence.split() for sentence in X_test1]

In [71]:
X_train1_token

[['short',
  'enough',
  'read',
  'chili',
  'cooking',
  'keeps',
  'interest',
  'beginning',
  'make',
  'decision',
  'happened',
  'end',
  'wasnt',
  'problem',
  'mei',
  'felt',
  'got',
  'moneys',
  'worth',
  'better',
  'snickers',
  'lot',
  'less',
  'calories',
  'uumluumluuml'],
 ['jolene',
  'benate',
  'top',
  'world',
  'coveted',
  'spot',
  'warrant',
  'squad',
  'female',
  'team',
  'member',
  'damn',
  'good',
  'job',
  'everyone',
  'knew',
  'one',
  'thing',
  'missing',
  'existence',
  'personal',
  'life',
  'love',
  'one',
  'true',
  'love',
  'stripped',
  'away',
  'six',
  'long',
  'years',
  'ago',
  'day',
  'still',
  'felt',
  'acute',
  'pain',
  'losing',
  'husband',
  'paul',
  'benate',
  'jo',
  'knew',
  'try',
  'move',
  'give',
  'heart',
  'push',
  'meet',
  'someone',
  'new',
  'couldnt',
  'itpaul',
  'wasnt',
  'dead',
  'everyone',
  'assumed',
  'cut',
  'jolene',
  'life',
  'working',
  'classified',
  'projects',
  'ter

In [41]:
#lemmatization

X_train1 = [' '.join([lemmatizer.lemmatize(word) for word in sentence]) for sentence in X_train1_token]
X_test1 = [' '.join(lemmatizer.lemmatize(word) for word in sentence) for sentence in X_test1_token]

In [42]:
X_train1

['short enough read chili cooking keep interest beginning make decision happened end wasnt problem mei felt got money worth better snicker lot less calorie uumluumluuml',
 'jolene benate top world coveted spot warrant squad female team member damn good job everyone knew one thing missing existence personal life love one true love stripped away six long year ago day still felt acute pain losing husband paul benate jo knew try move give heart push meet someone new couldnt itpaul wasnt dead everyone assumed cut jolene life working classified project terrified someone would use get taken possibility away keep tab beautiful wife knew hadnt moved someone else hadnt could stand longer decided brought current military complex working someone trying get shot time started protecting hisjolene abducted protection detail shipped someone idea going decided keeping cool head would best line defense pay attention could find opportunity escape nothing could shocked coming face face husband mourned yea

In [70]:
w2v1 = Word2Vec(sentences=X_train1_token,workers=1,min_count=4,window=5,vector_size=100,epochs=5)

def doc_to_vec(doc,model):
    words = [word for word in doc if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words],axis=0)

In [60]:
X_train1 = np.array([doc_to_vec(doc, w2v) for doc in X_train1])
X_test1 = np.array([doc_to_vec(doc, w2v) for doc in X_test1])


In [61]:
model1.fit(X_train1,y_train1)

In [62]:
y_pred1 = model1.predict(X_test1)

In [63]:
accuracy_score(y_pred1,y_test1)

0.6586666666666666

In [73]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()  # Default `max_iter` is 200, which corresponds to the number of epochs

model.fit(X_train1,y_train1)

accuracy_score(model.predict(X_test1),y_test1)

0.6586666666666666

In [79]:
X_train1[4]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [77]:
type(X_test1)

numpy.ndarray

In [82]:
# Checking words that are not in the Word2Vec model's vocabulary
missing_words = [word for sentence in X_test_token for word in sentence if word not in w2v.wv]
print("Missing words from the test data:", len(missing_words))


Missing words from the test data: 20062
