In [1]:
import pandas as pd

In [2]:
# Load the dataset
df= pd.read_csv('../Datasets/Restaurant_Reviews.tsv',sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
review1= df['Review'][0]

In [4]:
df.shape

(1000, 2)

In [5]:
# pattern matching ----> [^a-zA-Z]
import re

In [6]:
# REmove unwanted characters by space
review1=re.sub('[^a-zA-Z]',' ',review1)  
review1

'Wow    Loved this place '

In [7]:
# covert Upper case to Lower case
review1= review1.lower()
review1

'wow    loved this place '

In [8]:
# nltk (Natural language toolkit)
import nltk

In [9]:
# Remove stop-word
from nltk.corpus import stopwords

In [10]:
sw=stopwords.words('english')

In [11]:
len(sw)

179

#1st Review Preprocessing

In [12]:
review1

'wow    loved this place '

In [13]:
for word in review1.split(' '):
    if not word in stopwords.words('english'):
        print(word,'is not in stopwords')

wow is not in stopwords
 is not in stopwords
 is not in stopwords
 is not in stopwords
loved is not in stopwords
place is not in stopwords
 is not in stopwords


In [14]:
review1_1 =[word for word in review1.split(' ') if not word in stopwords.words('english')]
review1_1


['wow', '', '', '', 'loved', 'place', '']

In [15]:
# Stemming
from nltk.stem.porter import PorterStemmer

In [16]:
ps= PorterStemmer()
ps.stem('loved')

'love'

In [17]:
ps.stem('liked')

'like'

In [18]:
ps.stem('walking')

'walk'

In [19]:
review1_1

['wow', '', '', '', 'loved', 'place', '']

In [20]:
review1_2=[ps.stem(word) for word in review1_1]

In [21]:
review1_2 = set(review1_2)

In [22]:
# Process string
' '.join(review1_2)   # for Sequance we use RNN

' place love wow'

# All Review Preprocessing

In [23]:
# all reviews processing
all_reviews=[] # Countvectorizer having review in the list
for review in df['Review']:
    msg= re.sub('[^a-zA-Z]',' ',review)
    msg= msg.lower()
    msg1= [ps.stem(word) for word in msg.split(' ') if not word in stopwords.words('english')]
    msg1= set(msg1)
    processed_msg= ' '.join(msg1)
    all_reviews.append(processed_msg)

In [24]:
len(all_reviews)

1000

In [25]:
all_reviews

[' place love wow',
 ' crust good',
 ' tasti nasti textur',
 'steve  love bank stop late rick may recommend holiday',
 ' price select great menu',
 ' damn get want pho angri',
 ' honeslti fresh tast',
 ' could warmer ahead time made like kept tell rubber potato',
 ' great fri',
 ' touch great',
 ' servic prompt',
 ' would back go',
 ' cashier wayyy ever end overpr say still care',
 ' chicken mmmm tri ravoli cape cod cranberri',
 ' human sure hair disgust pretti',
 ' cash indic shock sign',
 ' recommend highli',
 ' slow littl waitress servic',
 ' vega time alon worth let place',
 'like ',
 ' blah burritto',
 ' food amaz',
 ' servic cute also',
 ' could beauti less interior care',
 'perform ',
 ' good velvet cake ohhh red right stuff',
 ' never ask brought salad',
 ' wall friendli staff street hole mexican taco great',
 ' warm restaur run tabl total hour around sever like overwhelm get food luke took',
 ' salmon worst sashimi',
 ' combo decent burger like beer deal also fri',
 'like  blo

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# find unique word then make the columns then covert review to matrix
cv= CountVectorizer()
X=cv.fit_transform(all_reviews).toarray()

In [28]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
X.shape

(1000, 1565)

In [30]:
y= df['Liked']

In [31]:
y.shape

(1000,)

In [32]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

In [33]:
x_train,x_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=9)

# Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
model= RandomForestClassifier(random_state=45)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.72

In [36]:
y_pre= model.predict(x_test)

In [37]:
# making the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pre)

array([[77, 18],
       [38, 67]], dtype=int64)

# GauusianNB

In [38]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)
classifier.score(x_test,y_test)

0.67

In [39]:
y_pred =classifier.predict(x_test)

In [40]:
confusion_matrix(y_test,y_pred)

array([[48, 47],
       [19, 86]], dtype=int64)