In [1]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("bharadwaj6/kindle-reviews")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/kindle-reviews


In [2]:
import pandas as pd
import numpy as np

In [3]:
path

'/kaggle/input/kindle-reviews'

In [4]:
import pandas as pd
import os

# Assuming the dataset file is named 'kindle_reviews.csv'
file_name = 'kindle_reviews.csv'
full_file_path = os.path.join(path, file_name)

# Read the data into a pandas DataFrame
df = pd.read_csv(full_file_path)

# Display the first few rows of the DataFrame to confirm
print(df.head())

   Unnamed: 0        asin helpful  overall  \
0           0  B000F83SZQ  [0, 0]        5   
1           1  B000F83SZQ  [2, 2]        4   
2           2  B000F83SZQ  [2, 2]        4   
3           3  B000F83SZQ  [1, 1]        5   
4           4  B000F83SZQ  [0, 1]        4   

                                          reviewText   reviewTime  \
0  I enjoy vintage books and movies so I enjoyed ...   05 5, 2014   
1  This book is a reissue of an old one; the auth...   01 6, 2014   
2  This was a fairly interesting read.  It had ol...   04 4, 2014   
3  I'd never read any of the Amy Brewster mysteri...  02 19, 2014   
4  If you like period pieces - clothing, lingo, y...  03 19, 2014   

       reviewerID                         reviewerName             summary  \
0  A1F6404F1VG29J                           Avidreader  Nice vintage story   
1   AN0N05A9LIJEQ                             critters        Different...   
2   A795DMNCJILA6                                  dot               Oldie

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [6]:
main_df = df[['reviewText', 'overall']].iloc[:25000]
main_df.shape

(25000, 2)

In [7]:
## missing values
main_df.isnull().sum()

Unnamed: 0,0
reviewText,0
overall,0


In [8]:
main_df.dropna(inplace=True)

In [9]:
main_df.shape

(25000, 2)

In [10]:
main_df['overall'].unique()

array([5, 4, 3, 2, 1])

In [11]:
main_df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
5,11381
4,7473
3,3564
2,1482
1,1100


In [12]:
## preprocessing & cleaning
main_df['overall'] = main_df['overall'].apply(lambda x: 1 if x > 3 else 0)

In [13]:
main_df['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
1,18854
0,6146


In [14]:
## Data become highly imbalanced we need to apply SMOTE later to handle this.

In [15]:
## 1. Lower all the words.
main_df['reviewText'] = main_df['reviewText'].str.lower()

In [16]:
main_df.head()

Unnamed: 0,reviewText,overall
0,i enjoy vintage books and movies so i enjoyed ...,1
1,this book is a reissue of an old one; the auth...,1
2,this was a fairly interesting read. it had ol...,1
3,i'd never read any of the amy brewster mysteri...,1
4,"if you like period pieces - clothing, lingo, y...",1


In [17]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
from bs4 import BeautifulSoup

In [19]:
## Removing special characters
main_df['reviewText'] = main_df['reviewText'].apply(lambda x: re.sub("[^a-z A-Z 0-9-]+", "", x))
## removing stopwords
main_df['reviewText'] = main_df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Removing URL's
main_df['reviewText'] = main_df['reviewText'].apply(lambda x: re.sub(r"(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", "", str(x)))
## Removing html tags
main_df['reviewText'] = main_df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
## Removing any additional spaces
main_df['reviewText'] = main_df['reviewText'].apply(lambda x: " ".join(x.split()))

In [20]:
main_df.head()

Unnamed: 0,reviewText,overall
0,enjoy vintage books movies enjoyed reading boo...,1
1,book reissue old one author born 1910 era say ...,1
2,fairly interesting read old- style terminology...,1
3,id never read amy brewster mysteries one reall...,1
4,like period pieces - clothing lingo enjoy myst...,1


In [21]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [22]:
main_df['reviewText'] = main_df['reviewText'].apply(lambda x: lemmatize_words(x))
main_df.head()

Unnamed: 0,reviewText,overall
0,enjoy vintage book movie enjoyed reading book ...,1
1,book reissue old one author born 1910 era say ...,1
2,fairly interesting read old- style terminology...,1
3,id never read amy brewster mystery one really ...,1
4,like period piece - clothing lingo enjoy myste...,1


In [23]:
X = main_df['reviewText']
y = main_df['overall']

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [25]:
X_train.shape, y_train.shape

((20000,), (20000,))

In [26]:
## Applying Bag of Words vectorizer
from sklearn.feature_extraction.text import CountVectorizer
bw = CountVectorizer(max_features=2500, ngram_range=(1,2))
X_train_bw = bw.fit_transform(X_train).toarray()
X_test_bw = bw.transform(X_test).toarray()

In [27]:
## Applying TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf = CountVectorizer(max_features=2500, ngram_range=(1,2))
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()

In [28]:
X_train_bw.shape, X_train_tf.shape

((20000, 2500), (20000, 2500))

## Model Creation

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [30]:
model_bw = MultinomialNB()
model_bw.fit(X_train_bw, y_train)
y_pred_train_bw = model_bw.predict(X_train_bw)
y_pred_test_bw = model_bw.predict(X_test_bw)
print("Train Accuracy Score with BoW: ",accuracy_score(y_train, y_pred_train_bw))
print("Test Accuracy Score with BoW: ", accuracy_score(y_test, y_pred_test_bw))

Train Accuracy Score with BoW:  0.8313
Test Accuracy Score with BoW:  0.8176


In [31]:
model_tf = MultinomialNB()
model_tf.fit(X_train_tf, y_train)
y_pred_train_tf = model_tf.predict(X_train_tf)
y_pred_test_tf = model_tf.predict(X_test_tf)
print("Train Accuracy Score with BoW: ", accuracy_score(y_train, y_pred_train_tf))
print("Test Accuracy Score with BoW: ", accuracy_score(y_test, y_pred_test_tf))

Train Accuracy Score with BoW:  0.8313
Test Accuracy Score with BoW:  0.8176


## GOOGLE-WORD TO VEC Implementation.

In [32]:
!pip install gensim



In [35]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300")




In [37]:
def vectorize(tokens, model, dim = 300):
    vec = []
    review = [model[word] for word in tokens if word in model]
    review = " ".join(review)
    vec.append(review)
    if len(vec) == 0:
        return np.zeros(dim)
    return np.mean(vec, axis = 0)

In [49]:
def vectorize(text, model, dim=300):
    words = text.split()
    valid_words = [model[word] for word in words if word in model]
    if not valid_words:
        return np.zeros(dim)
    return np.mean(valid_words, axis=0)

X_train_vectors = X_train.apply(lambda x: vectorize(x, model))

In [53]:
X_test_vectors = X_test.apply(lambda x: vectorize(x, model))

In [54]:
X_train_vectors

Unnamed: 0,reviewText
23311,"[0.09367439, -0.018481353, 0.0023028245, 0.085..."
23623,"[0.049963664, 0.035877317, -0.051309984, 0.096..."
1020,"[0.046395518, 0.008264726, -0.017038284, 0.097..."
12645,"[0.067020595, -0.024576822, -0.019731794, 0.11..."
1533,"[0.059573174, 0.024373373, -0.014482498, 0.148..."
...,...
21575,"[0.10348038, 0.0073641925, -0.002718321, 0.039..."
5390,"[0.055655424, -0.0017694585, -0.033528946, 0.1..."
860,"[0.04634916, -0.0016150841, -0.0738138, 0.1077..."
15795,"[0.026751708, 0.08647461, -0.013614909, 0.1287..."


In [55]:
model_w2v = MultinomialNB()
model_w2v.fit(X_train_vectors, y_train)
y_pred_train_w2v = model_w2v.predict(X_train_vectors)
y_pred_test_w2v = model_w2v.predict(X_test_vectors)
print("Train Accuracy Score with BoW: ", accuracy_score(y_train, y_pred_train_w2v))
print("Test Accuracy Score with BoW: ", accuracy_score(y_test, y_pred_test_w2v))

ValueError: setting an array element with a sequence.

In [57]:
from sklearn.naive_bayes import GaussianNB

In [58]:
# Convert the Series of NumPy arrays into a 2D NumPy array
X_train_vectors_np = np.vstack(X_train_vectors.values)
X_test_vectors_np = np.vstack(X_test_vectors.values)

model_w2v = GaussianNB()
# Fit the model using the 2D NumPy arrays
model_w2v.fit(X_train_vectors_np, y_train)
y_pred_train_w2v = model_w2v.predict(X_train_vectors_np)
y_pred_test_w2v = model_w2v.predict(X_test_vectors_np)
print("Train Accuracy Score with W2V: ", accuracy_score(y_train, y_pred_train_w2v))
print("Test Accuracy Score with W2V: ", accuracy_score(y_test, y_pred_test_w2v))

Train Accuracy Score with W2V:  0.69505
Test Accuracy Score with W2V:  0.6876
