In [58]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
from tqdm import tqdm


1. Cleaning and Preprocessing the data

2. Split the training and testing data 

3. Apply BagOfEords, TFIDF, Word2Vec algorithms

4. apply the machine learning algorithm



In [2]:
df = pd.read_csv('all_kindle_review.csv')

In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
df = df[['rating','reviewText']]

In [5]:
df.head()

Unnamed: 0,rating,reviewText
0,3,"Jace Rankin may be short, but he's nothing to ..."
1,5,Great short read. I didn't want to put it dow...
2,3,I'll start by saying this is the first of four...
3,3,Aggie is Angela Lansbury who carries pocketboo...
4,4,I did not expect this type of book to be in li...


In [6]:
# check for null values

df.isnull().sum()

rating        0
reviewText    0
dtype: int64

In [None]:
# check for the value count of the ratings, we can give positive rating(1) if rating is more or equal to 3 and negative rating wif rating is less than 3

print('Unique count of the ratings:::',df['rating'].value_counts()) # checking if the data set is imbalanced or balanced

print('unique values of the ratings:',df['rating'].unique()) # checking for the unique value ratings

df['rating']=df['rating'].apply(lambda x: 0 if x<3  else 1)

df.head()

Unique count of the ratings::: rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64
unique values of the ratings: [3 5 4 2 1]


Unnamed: 0,rating,reviewText
0,1,"Jace Rankin may be short, but he's nothing to ..."
1,1,Great short read. I didn't want to put it dow...
2,1,I'll start by saying this is the first of four...
3,1,Aggie is Angela Lansbury who carries pocketboo...
4,1,I did not expect this type of book to be in li...


In [17]:
df['reviewText'] = df['reviewText'].str.lower()
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub('[^A-Z a-z 0-9]+','',x))
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y  not in stopwords.words('english')]))

In [19]:
# apply lemmatization
lematizer = WordNetLemmatizer()
df['reviewText'] = df['reviewText'].apply(lambda x: lematizer.lemmatize(x))

In [20]:
df['reviewText'].head()

0    jace rankin may short hes nothing mess man hau...
1    great short read didnt want put read one sitti...
2    ill start saying first four books wasnt expect...
3    aggie angela lansbury carries pocketbooks inst...
4    expect type book library pleased find price right
Name: reviewText, dtype: object

In [37]:
# Split the training and test data using train_test_split
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(df['reviewText'],df['rating'],test_size=0.25,random_state=42)


In [38]:
print("training input feature size:",X_train.size)
print("training output feature size:",Y_train.size)
print("test input feature size:",X_test.size)
print("test output feature size:",Y_test.size)


training input feature size: 9000
training output feature size: 9000
test input feature size: 3000
test output feature size: 3000


In [39]:
# Apply the BOW for converting the data into vectors

from sklearn.feature_extraction.text import CountVectorizer
bow_model = CountVectorizer(ngram_range=(1,1))

X_train_converted=bow_model.fit_transform(X_train)
X_test_converted=bow_model.transform(X_test)


In [40]:
X_train_converted = X_train_converted
X_test_converted = X_test_converted

In [42]:
from sklearn.naive_bayes import MultinomialNB

naive_model = MultinomialNB()
naive_model.fit(X_train_converted,Y_train)

Y_test_predicted_BOW=naive_model.predict(X_test_converted)

from sklearn.metrics import confusion_matrix,accuracy_score

print('confusion matrix :',confusion_matrix(y_true=Y_test,y_pred=Y_test_predicted_BOW))
print('accuracy score ',accuracy_score(y_true=Y_test,y_pred=Y_test_predicted_BOW))

confusion matrix : [[ 698  326]
 [ 149 1827]]
accuracy score  0.8416666666666667


In [43]:
# Apply TFIDF model to convert the data  to vectors
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer()
X_train_converted_tfidf= tfidf_vector.fit_transform(X_train)
X_test_converted_tfidf = tfidf_vector.transform(X_test)


from sklearn.naive_bayes import MultinomialNB

naive_model = MultinomialNB()
naive_model.fit(X_train_converted_tfidf,Y_train)

Y_test_predicted_tfidf=naive_model.predict(X_test_converted_tfidf)

from sklearn.metrics import confusion_matrix,accuracy_score

print('confusion matrix :',confusion_matrix(y_true=Y_test,y_pred=Y_test_predicted_tfidf))
print('accuracy score ',accuracy_score(y_true=Y_test,y_pred=Y_test_predicted_tfidf))

confusion matrix : [[  83  941]
 [   2 1974]]
accuracy score  0.6856666666666666


In [71]:
# Apply Word2Vec model to convert the data into vectors

from gensim.models import Word2Vec
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score



# Prepare data for Word2Vec: tokenize sentences
X_train_tokens = X_train.apply(lambda x: x.split())
X_test_tokens = X_test.apply(lambda x: x.split())

# Train Word2Vec model on tokenized training data
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4, epochs=25)

# Function to get average Word2Vec vector for a document
def average_word2vec(tokens, model, vector_size=100):
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(vectors, axis=0)

# Transform train and test data to average Word2Vec vectors
X_train_avg_w2v = np.vstack([average_word2vec(tokens, word2vec_model) for tokens in X_train_tokens])
X_test_avg_w2v = np.vstack([average_word2vec(tokens, word2vec_model) for tokens in X_test_tokens])

# Fit Naive Bayes (GaussianNB is suitable for continuous features like Word2Vec)

gnb = GaussianNB()
gnb.fit(X_train_avg_w2v, Y_train)

# Predict on test set
Y_test_predicted_w2v = gnb.predict(X_test_avg_w2v)


print('confusion matrix :', confusion_matrix(y_true=Y_test, y_pred=Y_test_predicted_w2v))
print('accuracy score ', accuracy_score(y_true=Y_test, y_pred=Y_test_predicted_w2v))

# Predict a sample data
sample_text = "this book was very interesting and enjoyable"
sample_tokens = sample_text.lower().split()
sample_vector = average_word2vec(sample_tokens, word2vec_model).reshape(1, -1)
sample_prediction = gnb.predict(sample_vector)
print("Sample prediction:", sample_prediction[0])


confusion matrix : [[ 839  185]
 [ 539 1437]]
accuracy score  0.7586666666666667
Sample prediction: 1


In [76]:
X_train_tokens_sample = X_train.apply(lambda x: x.split())
X_test_tokens_sample = X_test.apply(lambda x: x.split())


In [74]:
word2vec_model_sample = Word2Vec(sentences=X_train_tokens_sample,vector_size=100,epochs=25)

In [77]:
X_train_tokens_sample    

11675    [short, enough, read, chili, cooking, keeps, i...
10950    [jolene, benate, top, world, coveted, spot, wa...
4256     [man, load, one, lets, see, 35, year, old, vir...
10088    [characters, good, like, way, main, character,...
9030     [much, say, hot, good, story, quick, read, wor...
                               ...                        
11964    [downloaded, book, reading, reviews, usually, ...
5191     [far, one, hottest, books, ive, ever, gotten, ...
5390     [even, though, book, free, reservations, based...
860      [little, mushy, 34must, take, care, women, fol...
7270     [book, good, good, set, charaterswith, backgro...
Name: reviewText, Length: 9000, dtype: object

In [78]:
X_train_tokens_sample = X_train.apply(lambda x: x.split())
X_test_tokens_sample = X_test.apply(lambda x: x.split())

word2vec_model_new = Word2Vec(sentences=X_train_tokens_sample,epochs=25)

