In [22]:
!pip install nltk
!pip install bs4
!pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
Using cached scipy-1.16.3-cp312-cp312-win_amd64.whl (38.6 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn

   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- 

In [23]:
import pandas as pd
import re 
import nltk
import sklearn
from nltk.corpus import stopwords
from bs4 import BeautifulSoup


In [4]:
df=pd.read_csv('all_kindle_review.csv')

In [5]:
df=df[['reviewText','rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [6]:
## missing values
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [7]:
df['rating'].unique()

array([3, 5, 4, 2, 1])

In [8]:
## Preprocessing and cleaning 
df['rating']=df['rating'].apply(lambda x:0 if x<3 else 1)
df['reviewText']=df['reviewText'].str.lower()

In [10]:
df['reviewText']=df['reviewText'].apply(lambda x:re.sub('[^a-z A-Z 0-9]+','',x))
df['reviewText']=df['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
df['reviewText'] = df['reviewText'].astype(str)
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
df['reviewText']=df['reviewText'].apply(lambda x:" ".join(x.split()))

In [11]:
df.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [16]:
## Applying lemmitization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...


True

In [17]:
lemmatizer=WordNetLemmatizer()

In [18]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [19]:
df['reviewText']=df['reviewText'].apply(lambda x: lemmatize_words(x))

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(df['reviewText'],df['rating'],test_size=0.25)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()


In [30]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(9000, 35801))

In [31]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb_model_bow=nb.fit(X_train_bow,y_train)
nb_model_tfidf=nb.fit(X_train_tfidf,y_train)

In [33]:
from sklearn.metrics import confusion_matrix,accuracy_score
y_pred_bow=nb_model_bow.predict(X_test_bow)
y_pred_tfidf=nb_model_tfidf.predict(X_test_tfidf)

In [34]:
print("Accuracy score of BOW",accuracy_score(y_test,y_pred_bow))
print("Accuracy score of TFIDF",accuracy_score(y_test,y_pred_tfidf))

Accuracy score of BOW 0.6253333333333333
Accuracy score of TFIDF 0.5916666666666667
