# Importing the libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\odilon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the tab-separated reviews file.
# quoting=3 is the numeric value of csv.QUOTE_NONE: quote characters inside
# the review text are treated as ordinary characters, not as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Cleaning the texts

In [4]:
# Try the cleaning regex on the first review only: every character that is
# not an ASCII letter (a-z, A-Z) is replaced by a single space.
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])

In [5]:
review

'Wow    Loved this place '

In [6]:
ps = PorterStemmer()

In [7]:
def remove_special_chars(review, stemmer=None, stop_words=None):
    """Normalise one review into a space-joined string of stemmed tokens.

    Despite its name, this function does more than strip special characters:

    1. every character that is not an ASCII letter is replaced by a space,
    2. the text is lower-cased and split on whitespace,
    3. words found in the stop-word list are dropped,
    4. each remaining word is stemmed and the stems are joined with spaces.

    Parameters
    ----------
    review : str
        Raw review text.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each kept word. Defaults to the notebook-level
        ``ps`` instance.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to
        ``stopwords.words('english')``.

    Returns
    -------
    str
        The cleaned review; an empty string if no word survives filtering.
    """
    if stemmer is None:
        stemmer = ps
    if stop_words is None:
        stop_words = stopwords.words('english')

    # Build the lookup set once per call. The previous version rebuilt it
    # inside the comprehension, i.e. once for every word of every review.
    stop_set = set(stop_words)

    # NOTE(review): with the default English list, negations are removed too
    # ("Crust is not good." becomes "crust good"), which discards sentiment
    # information; consider passing a custom stop_words list.
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_set)

In [8]:
# Clean every review with remove_special_chars.
# NOTE: this overwrites the raw 'Review' column in place, so the original text
# is no longer available afterwards, and re-running this cell without
# reloading the dataset would re-process already-cleaned text.
dataset['Review'] = dataset['Review'].apply(remove_special_chars)

In [9]:
dataset.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [10]:
# Cleaned review strings as a NumPy array, in row order; this is the text
# collection that the vectorizer is fitted on.
corpus = dataset['Review'].values

# Creating the Bag of Words model

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words vectorizer; max_features=1500 limits the vocabulary to at most
# the 1500 most frequent terms across the corpus.
cv = CountVectorizer(max_features=1500)

In [13]:
# Learn the vocabulary from the corpus and build the term-count matrix
# (one row per review); .toarray() turns the vectorizer output into a dense
# NumPy array.
X = cv.fit_transform(corpus).toarray()

In [14]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# Target labels: the 'Liked' column (1 = positive review, 0 = negative),
# selected by name rather than by column position.
y = dataset['Liked'].values

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Fit a logistic-regression classifier on the training split using the
# library defaults, with a fixed random_state.
# NOTE(review): the recorded output shows solver='liblinear', i.e. the default
# of the scikit-learn version this notebook was run with; other versions may
# use a different default solver and give slightly different scores - confirm
# before comparing numbers.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# Predicted class labels for the held-out test reviews.
y_pred = classifier.predict(X_test)

In [20]:
# Mean accuracy of the classifier on the held-out test split.
classifier.score(X_test, y_test)

0.70999999999999996

In [21]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test split: rows are the true labels, columns the
# predicted labels, so for labels {0, 1} the layout is [[tn, fp], [fn, tp]].
cm = confusion_matrix(y_test, y_pred)
cm

array([[76, 21],
       [37, 66]], dtype=int64)

In [22]:
from sklearn.metrics import classification_report

In [23]:
# Per-class precision, recall, F1 and support on the test split.
# print() is used because the report is returned as a pre-formatted string.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.67      0.78      0.72        97
          1       0.76      0.64      0.69       103

avg / total       0.72      0.71      0.71       200



In [24]:
# Re-derive the headline metrics by hand from the binary confusion matrix so
# they can be cross-checked against classifier.score and the classification
# report. For labels {0, 1}, ravel() yields the counts as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn

# Guard every denominator: a classifier that never predicts the positive
# class (or a split without positives) would otherwise yield NaN plus a
# runtime warning. An undefined metric is reported as 0.0 instead.
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(" Accuracy: {} \n Precision: {} \n Recall: {} \n F1 Score: {}".format(accuracy, precision, recall, f1))

 Accuracy: 0.71 
 Precision: 0.7586206896551724 
 Recall: 0.6407766990291263 
 F1 Score: 0.6947368421052632
