In [16]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords


In [6]:
data= pd.read_csv('all_kindle_review.csv')
data.head()
df= data[['reviewText','rating']]
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [8]:
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [9]:
df['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [10]:
df['rating']=df['rating'].apply(lambda x: 0 if x<3 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating']=df['rating'].apply(lambda x: 0 if x<3 else 1)


In [11]:
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",1
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,1
3,Aggie is Angela Lansbury who carries pocketboo...,1
4,I did not expect this type of book to be in li...,1


In [13]:
df['reviewText']=df['reviewText'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText']=df['reviewText'].str.lower()


In [20]:
#removing speciaL characters
df['reviewText']=df['reviewText'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+','',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText']=df['reviewText'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+','',x))


In [22]:
#applying stopwords
#The " ".join() method takes the filtered list of words and joins them back into a single string with spaces in between.
'''x.split()
The split() method splits the string x into a list of words, using whitespace as the delimiter.
Example: "This is a great product" becomes ['This', 'is', 'a', 'great', 'product'].'''


df['reviewText']=df['reviewText'].apply(lambda x:" ".join([i for i in x.split() if i not in stopwords.words('english')] ))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText']=df['reviewText'].apply(lambda x:" ".join([i for i in x.split() if i not in stopwords.words('english')] ))


In [24]:
#applying lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join( [lemmatizer.lemmatize(word) for word in text.split() ])

df['reviewText']=df['reviewText'].apply(lambda x:lemmatize_words(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText']=df['reviewText'].apply(lambda x:lemmatize_words(x))


### Train Test Split 

In [27]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test= train_test_split(df['reviewText'],df['rating'], test_size= 0.20,random_state= 42 )

### Converting sentences into vectors

In [31]:
'''import gensim
from gensim.models import Word2Vec

wv=Word2Vec(window=5)

# Tokenize the sentences in X_train
X_train_tokens = [sentence.split() for sentence in X_train]

# Initialize and train the Word2Vec model
wv = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
wv.build_vocab(X_train_tokens)
wv.train(X_train_tokens, total_examples=wv.corpus_count, epochs=wv.epochs)

# Transform the sentences in X_train to vectors
X_train_wv = [wv.wv[sentence] for sentence in X_train_tokens]'''

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

bow=CountVectorizer()
tf=TfidfVectorizer()

X_train_b=bow.fit_transform(X_train).toarray()
X_train_tf=tf.fit_transform(X_train).toarray()

X_test_b= bow.transform(X_test).toarray()
X_test_tf= tf.transform(X_test).toarray()



In [41]:
### Model Training
from sklearn.naive_bayes import MultinomialNB
multi=MultinomialNB()
bow_model=multi.fit(X_train_b,y_train)
tf_model=multi.fit(X_train_tf,y_train)

In [45]:
y_pred_bow=bow_model.predict(X_test_b)
y_pred_tf=tf_model.predict(X_test_tf)

In [51]:
from sklearn.metrics import accuracy_score,classification_report

bow_accur=accuracy_score(y_pred_bow,y_test)
print("Your bow accuracy is :",bow_accur)
tf_accur= accuracy_score(y_pred_tf,y_test)
print("Your tfidf accuracy is :",tf_accur)




Your bow accuracy is : 0.7033333333333334
Your tfidf accuracy is : 0.69375


In [52]:
classification= classification_report(y_pred_bow,y_test)
print(classification)

              precision    recall  f1-score   support

           0       0.12      0.96      0.21        99
           1       1.00      0.69      0.82      2301

    accuracy                           0.70      2400
   macro avg       0.56      0.83      0.51      2400
weighted avg       0.96      0.70      0.79      2400

