In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the tab-separated reviews file into a DataFrame.
# quoting=3 is the value of csv.QUOTE_NONE: quote characters inside a review
# are treated as ordinary text rather than as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [3]:
# Preview only the first rows instead of dumping the whole frame into the
# notebook output.
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


<h2>Cleaning the dataset for NLP</h2>

In [4]:
import re
import nltk

In [5]:
# Fetch the NLTK 'stopwords' package; the cleaning loop further down reads the
# English stop-word list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/stiffler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords

In [7]:
from nltk.stem import PorterStemmer

In [8]:
# Single Porter stemmer instance; the cleaning loop calls ps.stem() on each word.
ps = PorterStemmer()

<p>Applying the cleaning process to all 1000 reviews</p>

In [9]:
# Build `corpus`: one cleaned, stemmed, space-joined string per review.
#
# The English stop-word set is built once here. Constructing it inside the
# comprehension would rebuild the same set for every single word of every
# review, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than label-indexing with
# range(len(...)), so the loop does not depend on the frame's index labels.
for raw_review in dataset['Review']:
    # Replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Convert the review to lowercase
    review = review.lower()
    # Split the review into a list of words
    review = review.split()
    # Drop stop words and stem the words that remain
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    # Join the stemmed words back into a single string
    review = ' '.join(review)
    corpus.append(review)

<h3>Creating the bag of words Model</h3>

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with the vocabulary capped at 1500 features.
cv = CountVectorizer(max_features=1500)

In [21]:
# Feature matrix: fit the vectorizer on the cleaned corpus, then convert the
# result to a dense array.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()
# Dependent-variable vector: the values of the dataset's last column.
y = dataset.iloc[:, -1].values

<h4>Splitting the dataset</h4>

In [24]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed from
# scikit-learn, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state=0 fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

<h3>Fitting the classification model (Naive Bayes) to the training set</h3>

In [25]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier, trained on the training split.
classifier = GaussianNB()
# Kept as the cell's last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

Evaluating the model with a <b>confusion matrix</b>

In [30]:
from sklearn.metrics import confusion_matrix
# Predict on the held-out split first, then tabulate the true labels against
# those predictions.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [31]:
# Show the confusion matrix of y_test against the classifier's predictions on X_test.
cm

array([[55, 42],
       [12, 91]])

<b>Accuracy of Model</b>

In [37]:
# Report the accuracy on the test split as a percentage.
# Written as a parenthesised single-argument print so the cell runs on both
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("The model is %r percent accurate." % (classifier.score(X_test, y_test) * 100))

The model is 73.0 percent accurate.


<h3>Fitting the classification model (Decision Trees) to the training set</h3>

In [43]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using the entropy split criterion; random_state=0 keeps the fit reproducible.
# NOTE: this rebinds `classifier`, replacing the Naive Bayes model fitted earlier.
classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
# Kept as the cell's last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

<b>Accuracy of Model</b>

In [44]:
# Report the accuracy on the test split as a percentage.
# Written as a parenthesised single-argument print so the cell runs on both
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("The model is %r percent accurate." % (classifier.score(X_test, y_test) * 100))

The model is 71.0 percent accurate.
