In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('output.csv')

In [3]:
df=df.rename(columns={' Review':'review'})

In [4]:
df

Unnamed: 0,review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [5]:
df.head(10)

Unnamed: 0,review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [6]:
df.columns

Index(['review', 'Liked'], dtype='object')

In [7]:
df.isnull().sum()

review    0
Liked     0
dtype: int64

In [8]:
df.duplicated().sum()

np.int64(4)

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
df['Liked'].value_counts()

Liked
1    499
0    497
Name: count, dtype: int64

In [12]:
df['review_len'] = df['review'].astype(str).apply(len)

In [13]:
df

Unnamed: 0,review,Liked,review_len
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59
...,...,...,...
995,I think food should have flavor and texture an...,0,66
996,Appetite instantly gone.,0,24
997,Overall I was not impressed and would not go b...,0,50
998,"The whole experience was underwhelming, and I ...",0,91


In [14]:
df.describe()

Unnamed: 0,Liked,review_len
count,996.0,996.0
mean,0.501004,58.459839
std,0.50025,32.341292
min,0.0,11.0
25%,0.0,33.0
50%,1.0,51.0
75%,1.0,80.0
max,1.0,149.0


# Text cleaning

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navi8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navi8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\navi8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def clean(doc):
    doc=doc.lower()
    doc=re.sub(r'[^a-zA-Z\s]','',doc)
    tokens=word_tokenize(doc)
    stop_words=set(stopwords.words('english'))
    filtered=[word for word in tokens if word not in stop_words]
    lemmatizer=WordNetLemmatizer()
    lemmatized=[lemmatizer.lemmatize(word) for word in filtered]
    return " ".join(lemmatized)

In [17]:
df['clean_review']=df['review'].apply(clean)

In [18]:
df

Unnamed: 0,review,Liked,review_len,clean_review
0,Wow... Loved this place.,1,24,wow loved place
1,Crust is not good.,0,18,crust good
2,Not tasty and the texture was just nasty.,0,41,tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,87,stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,59,selection menu great price
...,...,...,...,...
995,I think food should have flavor and texture an...,0,66,think food flavor texture lacking
996,Appetite instantly gone.,0,24,appetite instantly gone
997,Overall I was not impressed and would not go b...,0,50,overall impressed would go back
998,"The whole experience was underwhelming, and I ...",0,91,whole experience underwhelming think well go n...


# Bag of Words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer(ngram_range=(1,2),min_df=2,max_features=3000)
text_ct=bow.fit_transform(df['clean_review'])

In [20]:
pd.DataFrame(text_ct.toarray(),columns=bow.get_feature_names_out())

Unnamed: 0,absolutely,absolutely amazing,acknowledged,actually,added,ago,almost,also,also taste,although,...,wrap,wrong,year,year ago,yet,youd,youre,yummy,zero,zero star
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(text_ct,df['Liked'],test_size=0.2,random_state=42,stratify=df['Liked'])

In [22]:
from sklearn.naive_bayes import BernoulliNB
model=BernoulliNB()
model.fit(X_train,y_train)

In [23]:
from sklearn.metrics import  accuracy_score
y_pred=model.predict(X_test)

In [24]:
accuracy_score(y_test,y_pred)

0.785

In [25]:
y_pred=model.predict(X_train)
accuracy_score(y_train,y_pred)

0.9158291457286433

# Multinomial

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(text_ct,df['Liked'],test_size=0.2,random_state=42,stratify=df['Liked'])

In [27]:
from sklearn.naive_bayes import MultinomialNB
multi_nb=MultinomialNB()
multi_nb.fit(X_train,y_train)

In [28]:
from sklearn.metrics import  classification_report
y_pred=multi_nb.predict(X_test)

In [29]:
accuracy_score(y_test,y_pred)

0.795

In [30]:
y_pred=multi_nb.predict(X_train)
accuracy_score(y_train,y_pred)

0.907035175879397

# TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(ngram_range=(1,2),stop_words='english',max_features=3000,min_df=2)
x_tfidf=tfidf.fit_transform(df['clean_review'])

In [32]:
pd.DataFrame(x_tfidf.toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,absolutely,absolutely amazing,acknowledged,actually,added,ago,amazing,ambiance,ambience,anytime,...,wow,wrap,wrong,year,year ago,youd,youre,yummy,zero,zero star
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.572488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train_test splitting

In [46]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_tfidf,df['Liked'],test_size=0.2,random_state=42,stratify=df['Liked'])

# ModelBuilding

In [47]:
from sklearn.naive_bayes import BernoulliNB
model=BernoulliNB()
model.fit(X_train,y_train)

In [48]:
from sklearn.metrics import  accuracy_score
y_pred=model.predict(X_test)

In [49]:
accuracy_score(y_test,y_pred)

0.81

In [50]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.81
Precision: 0.81
Recall: 0.81
F1 Score: 0.81

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       100
           1       0.81      0.81      0.81       100

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200


Confusion Matrix:
 [[81 19]
 [19 81]]


In [51]:
y_pred=model.predict(X_train)
accuracy_score(y_train,y_pred)

0.9133165829145728

# MultinomialNB

In [40]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_tfidf,df['Liked'],test_size=0.2,random_state=42,stratify=df['Liked'])

In [41]:
from sklearn.naive_bayes import MultinomialNB
multi_nb=MultinomialNB()
multi_nb.fit(X_train,y_train)

In [42]:
from sklearn.metrics import  classification_report
y_pred=multi_nb.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.805
Precision: 0.7747747747747747
Recall: 0.86
F1 Score: 0.8151658767772512

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.75      0.79       100
           1       0.77      0.86      0.82       100

    accuracy                           0.81       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.81      0.80       200


Confusion Matrix:
 [[75 25]
 [14 86]]


In [44]:
y_pred=multi_nb.predict(X_train)
accuracy_score(y_train,y_pred)

0.914572864321608

In [52]:
import pickle
pickle.dump(model,open('model.pkl','wb'))
pickle.dump(tfidf,open('vectorizer.pkl','wb'))