In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
data = pd.read_csv("train.TSV",sep="\t" , quoting=3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
data["Liked"].value_counts() / len(data) *100 # a balanced data set

Liked
1    50.0
0    50.0
Name: count, dtype: float64

In [7]:
# Cleaning data 
import nltk
import re

In [8]:
# remove stop words
nltk.download("stopwords")

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [9]:
from nltk.corpus import stopwords

In [10]:
data['Review'][0]

'Wow... Loved this place.'

In [11]:
review = re.sub("[^a-zA-Z]",' ',data['Review'][0])
review

'Wow    Loved this place '

In [12]:
review = review.lower()
review

'wow    loved this place '

In [13]:
# tokenzing
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [14]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
# remove stopwords
preview = []
for word in review:
    if word not in stopwords.words("english"):
        print(word , )
        preview.append(word)

wow
loved
place


In [16]:
review = [word for word in review if word not in stopwords.words("english") ]
review

['wow', 'loved', 'place']

In [17]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [18]:
review = [ ps.stem(word) for word in review]
review

['wow', 'love', 'place']

In [19]:
review = " ".join(review)
review

'wow love place'

In [20]:
corpus = []
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
for i in range(len(data)):
    review = re.sub("[^a-zA-Z]",' ',data['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)

corpus

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberri mmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'highli recommend',
 'waitress littl slow servic',
 'place worth time let alon vega',
 'like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street taco friendli staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final blow',
 'found place acc

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000 , )

In [26]:
x = cv.fit_transform(corpus).toarray()
x.shape

(1000, 1565)

In [28]:
 y = data["Liked"].values
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [29]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.22)

In [58]:
from sklearn.naive_bayes import GaussianNB # 69.25
from sklearn.ensemble import RandomForestClassifier # 75.22
from sklearn.neural_network import MLPClassifier # 79.55
model = MLPClassifier(activation="tanh" ,learning_rate="adaptive" , verbose=4)
model.fit(x_train , y_train)

Iteration 1, loss = 0.68901731
Iteration 2, loss = 0.64201212
Iteration 3, loss = 0.60057664
Iteration 4, loss = 0.56252082
Iteration 5, loss = 0.52693591
Iteration 6, loss = 0.49219234
Iteration 7, loss = 0.46030722
Iteration 8, loss = 0.42986565
Iteration 9, loss = 0.40100476
Iteration 10, loss = 0.37466626
Iteration 11, loss = 0.35010981
Iteration 12, loss = 0.32723251
Iteration 13, loss = 0.30638403
Iteration 14, loss = 0.28692763
Iteration 15, loss = 0.26916098
Iteration 16, loss = 0.25294737
Iteration 17, loss = 0.23767494
Iteration 18, loss = 0.22388505
Iteration 19, loss = 0.21115534
Iteration 20, loss = 0.19941508
Iteration 21, loss = 0.18860519
Iteration 22, loss = 0.17864248
Iteration 23, loss = 0.16927259
Iteration 24, loss = 0.16063639
Iteration 25, loss = 0.15280907
Iteration 26, loss = 0.14542564
Iteration 27, loss = 0.13864715
Iteration 28, loss = 0.13235153
Iteration 29, loss = 0.12634462
Iteration 30, loss = 0.12085564
Iteration 31, loss = 0.11568298
Iteration 32, los

In [59]:
y_pred = model.predict(x_test)
from sklearn.metrics import recall_score , precision_score , accuracy_score , f1_score, classification_report , ConfusionMatrixDisplay , confusion_matrix

In [60]:
confusion_matrix(y_test , y_pred)

array([[93, 23],
       [24, 80]], dtype=int64)

In [61]:
accuracy_score(y_true= y_test , y_pred=y_pred)

0.7863636363636364