In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [52]:
ds = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3)

In [53]:
ds.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [34]:
#Preprocessing 
import re
import nltk
# nltk.download('stopwords') #Downloding stop word corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [40]:
ps = PorterStemmer()

In [56]:
corpus = []
for sent in ds['Review']:
    corpus.append(' '.join([ps.stem(word) for word in re.sub('[^a-zA-z]',' ',sent).lower().split() if word not in  stopwords.words('english')]))

In [58]:
ds['Processed text'] = corpus

In [64]:
ds.head()

Unnamed: 0,Review,Liked,Processed text
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust good
2,Not tasty and the texture was just nasty.,0,tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price


In [65]:
# Creating bag of words model
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
cv = CountVectorizer()

In [76]:
X = cv.fit_transform(corpus).toarray()
y =  ds['Liked'].values

In [75]:
# Total number of words in Corpus
count = 0
for sent in ds['Processed text']:
    count += len(sent.split(' '))
count

5486

In [88]:
from sklearn.model_selection import train_test_split

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


#### Let's try all the classification algorithms :
### 1. Logistic Regression


In [107]:
from sklearn.linear_model import LogisticRegression

In [108]:
classifier = LogisticRegression()

In [109]:
classifier.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [110]:
from sklearn.metrics import confusion_matrix

In [112]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[124,  28],
       [ 44, 104]], dtype=int64)

 Conclusion :  72 wrong predictions from total 300 sample

### 2. K Nearest Neighbour(K-NN)

In [114]:
from sklearn.neighbors import KNeighborsClassifier

In [140]:
classifier = KNeighborsClassifier(n_neighbors=5)

In [141]:
classifier.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [142]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[123,  29],
       [ 73,  75]], dtype=int64)

Conclusion : 102 wrong predictions from total 300 sample

### 3. Support Vector Machine

In [149]:
from sklearn.svm import SVC

In [166]:
classifier = SVC(kernel='rbf') 

In [167]:
classifier.fit(X_train,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [168]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[  0, 152],
       [  0, 148]], dtype=int64)

Conclusion:  *Acuracy paradox*

In [169]:
# 4. Naive Bayes

In [170]:
from sklearn.naive_bayes import GaussianNB

In [171]:
classifier = GaussianNB()

In [172]:
classifier.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [173]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[ 82,  70],
       [ 23, 125]], dtype=int64)

In [185]:
# 93 wrong out of 300

In [176]:
# 5. Random Forest
from sklearn.ensemble import RandomForestClassifier

In [178]:
classifier = RandomForestClassifier()

In [179]:
classifier.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [180]:
confusion_matrix(y_test,classifier.predict(X_test))

array([[132,  20],
       [ 68,  80]], dtype=int64)

In [181]:
# 88 wrong prediction among 300 samples