# NLP analysis of Restaurant reviews
## using Logistic regression and random forest

In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t')
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
#for cleasing datasets import regular expression library
import re
#import natural kit library for NLP
import nltk

In [4]:
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#to remove stopwords from the datasets
from nltk.corpus import stopwords
#for stemming 
from nltk.stem.porter import PorterStemmer

In [6]:
#create empty array to add clean text
clean_data =[]
dataset.shape

(1000, 2)

In [7]:
#we need to clean 1000 reviews 
for i in range(0,1000):
    review = re.sub('[^a-zA-Z]',' ',dataset['Review'][i])
    review.lower() #change all text in smallcase
    review = review.split() #convert review into array using split
    ps = PorterStemmer() #create PorterStemmer object to extract root word
    review= [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] #loop to extract root word
    review = ' '.join(review) #concatenate the entire review to have its original form
    clean_data.append(review) #add all the reviews to clean_data

In [9]:
clean_data[1]

'crust good'

## What is fit and transform
In layman's terms, fit_transform means to do some calculation and then do transformation (say calculating the means of columns from some data and then replacing the missing values). So for training set, you need to both calculate and do transformation.

But for testing set, Machine learning applies prediction based on what was learned during the training set and so it doesn't need to calculate, it just performs the transformation.
https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models

In [10]:
#create bag of word model from review
from sklearn.feature_extraction.text import CountVectorizer
#set the max feature counter to your choice of number 
# "max_features" is attribute to experiment with to get better results
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(clean_data).toarray() # X contains corpus (dependent variable) 
y = dataset.iloc[:,1].values

In [11]:
#for training purpose we will split data into training set and test set
from sklearn.cross_validation import train_test_split #model selection



In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [13]:
#fiting using random forest model 
from sklearn.ensemble import RandomForestClassifier

In [14]:
# n_estimators can be said as number of trees, experiment with n_estimators to get better results  
model = RandomForestClassifier(n_estimators = 501, criterion = 'entropy') 
#fit the model
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=501, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Confusion matrix and accuracy, prediction calculation
Let's now define the most basic terms, which are whole numbers (not rates):
<br>
true positives (TP): These are cases in which we predicted yes (they have the disease), and they do have the disease.<br>
true negatives (TN): We predicted no, and they don't have the disease.</br>
false positives (FP): We predicted yes, but they don't actually have the disease. (Also known as a "Type I error.")<br>
false negatives (FN): We predicted no, but they actually do have the disease. (Also known as a "Type II error.")<br>
<br>
Accuracy: Overall, how often is the classifier correct?<br>
(TP+TN)/total<br>
Precision: When it predicts yes, how often is it correct?<br>
TP/predicted yes<br>
https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/

In [16]:
#prediction
y_pred = model.predict(X_test)
#calculate accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred,y_test)
accuracy

0.73499999999999999

In [17]:
#to check accuracy
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_pred)
cm

array([[74, 25],
       [28, 73]], dtype=int64)

In [25]:
accuracy_rf = (74+73)/200
accuracy_rf

0.735

In [18]:
#since sklearn.cross_validation  has been depracted we'll use sklearn.model_selection for logistic regression
from sklearn.model_selection import train_test_split
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2,random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_trn, y_trn)

In [21]:
#make prediction
y_prd = clf.predict(X_tst)
#check accuracy
ac=accuracy_score(y_prd,y_tst)
ac

0.755

In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_tst,y_prd)

array([[79, 17],
       [32, 72]], dtype=int64)

In [26]:
accuracy_lr = (79+72)/200
accuracy_lr

0.755