In [1]:
# Natural Language Processing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv(r'C:\Users\kiz4774\OneDrive\OneDrive - Baylor Scott & White Health\BACKup May 2019\ALL DEsktop FILES\PY\Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
dataset.shape

(1000, 2)

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kiz4774\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [44]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [45]:
#Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy=(cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print('Accuracy is:', Accuracy)
#Precision = TP / (TP + FP)
Precision=cm[0][0]/(cm[0][0]+cm[0][1])
print('Precision is:', Precision)
# Recall = TP / (TP + FN)
Recall = cm[0][0]/(cm[0][0]+cm[1][0])
print('Recall is:', Recall)
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print('F1 Score is:', F1_Score)

Accuracy is: 0.73
Precision is: 0.5670103092783505
Recall is: 0.8208955223880597
F1 Score is: 0.6707317073170731


In [8]:
# Now lets try another classification method, like random forest

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
# from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

  from numpy.core.umath_tests import inner1d


array([[87, 10],
       [46, 57]], dtype=int64)

In [43]:
#Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy=(cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print('Accuracy is:', Accuracy)
#Precision = TP / (TP + FP)
Precision=cm[0][0]/(cm[0][0]+cm[0][1])
print('Precision is:', Precision)
# Recall = TP / (TP + FN)
Recall = cm[0][0]/(cm[0][0]+cm[1][0])
print('Recall is:', Recall)
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print('F1 Score is:', F1_Score)

Accuracy is: 0.72
Precision is: 0.8969072164948454
Recall is: 0.6541353383458647
F1 Score is: 0.7565217391304349


In [47]:
# Now lets try another classification method, like Logistic Reg

# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[76, 21],
       [37, 66]], dtype=int64)

In [48]:
#Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy=(cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print('Accuracy is:', Accuracy)
#Precision = TP / (TP + FP)
Precision=cm[0][0]/(cm[0][0]+cm[0][1])
print('Precision is:', Precision)
# Recall = TP / (TP + FN)
Recall = cm[0][0]/(cm[0][0]+cm[1][0])
print('Recall is:', Recall)
F1_Score = 2 * Precision * Recall / (Precision + Recall)
print('F1 Score is:', F1_Score)

Accuracy is: 0.71
Precision is: 0.7835051546391752
Recall is: 0.672566371681416
F1 Score is: 0.7238095238095238
