Check the performance of different classifiers by applying the following metrics:
* Confusion Matrix
* Accuracy (what share of the predictions match the test set labels?)
* Precision (measuring exactness; when it predicts yes, how often is it correct?)
* Recall (measuring completeness; when it's actually yes, how often does it predict yes?)
* F1 Score (harmonic mean of Precision and Recall, balancing the two)
* Save the results within a dataframe and export it to a csv


In [28]:
import numpy as np
import pandas as pd
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Read the tab-separated reviews; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Build the cleaned corpus: one lower-cased, stemmed, stop-word-free string
# per review.

# The stemmer and the stop-word collection are identical for every review, so
# they are created once here instead of on every iteration (the original also
# rebuilt the stop-word set for every single word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is deliberately kept in the text -- presumably so that negations such
# as "not good" survive the stop-word filter.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Walk over every review that is actually in the frame rather than a
# hard-coded 1000 rows, so corpus always has one entry per label row.
for raw_review in dataset['Review']:
    # Replace everything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is capped at 1500 terms and the sparse
# count matrix is converted to a dense array.
cv = CountVectorizer(max_features=1500)
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()

# The target is whatever sits in the last column of the dataset.
y = dataset.iloc[:, -1].values

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
# fit() hands back the fitted estimator, so construction and training chain.
classifier = GaussianNB().fit(X_train, y_train)
# Predictions for the held-out set; scored in the next cell.
y_pred = classifier.predict(X_test)

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score

def model_performance(y_pred, y_true=None):
    """Print and collect binary-classification metrics for a set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, compared against ``y_true``.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test`` is
        used, which is how the existing single-argument calls behave.

    Returns
    -------
    tuple
        ``(None, value_list)``. The first element is always ``None`` (kept so
        that ``model_performance(...)[1]`` continues to work) and
        ``value_list`` is ``[TN, TP, FP, FN, accuracy, precision, recall, F1]``
        as plain Python numbers, with precision, recall and F1 rounded to
        three decimals.

    Notes
    -----
    A ratio whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as ``0.0`` instead of ``nan``.
    The confusion matrix is indexed as a 2x2 array, so exactly two classes
    are expected.
    """
    if y_true is None:
        # Fall back to the notebook's held-out labels.
        y_true = y_test

    cm = confusion_matrix(y_true, y_pred)
    # Rows are the true class, columns the predicted class.
    TN, FP = int(cm[0][0]), int(cm[0][1])
    FN, TP = int(cm[1][0]), int(cm[1][1])

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) counts as 0.0 rather than nan.
        return numerator / denominator if denominator else 0.0

    Accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    Precision = _ratio(TP, TP + FP)
    Recall = _ratio(TP, TP + FN)
    # F1 is computed from the unrounded precision and recall.
    F1 = _ratio(2 * Precision * Recall, Precision + Recall)

    value_list = [
        TN, TP, FP, FN,
        Accuracy,
        round(Precision, 3),
        round(Recall, 3),
        round(F1, 3),
    ]

    # Labels are kept exactly as before so saved outputs stay comparable.
    print((cm),'\n'
    'True Negatives:', TN,'\n'
    'True Positives:', TP,'\n'
    'False Positives:', FP,'\n'
    'False Negatives:', FN,'\n'
    'Accurary:', Accuracy,'\n'
    'Precision:', round(Precision, 3),'\n'
    'Recall:', round(Recall, 3),'\n'
    'F1 Score:', round(F1, 3))

    # First slot mirrors the historical ``print(...)`` return value (None).
    return None, value_list

In [34]:
# Score the Naive Bayes predictions; the metric list is the second item of
# the pair that model_performance returns.
nb_report = model_performance(y_pred)
list_nb = nb_report[1]
list_nb

[[55 42]
 [12 91]] 
True Negatives: 55 
True Positives: 91 
False Positives: 42 
False Negatives: 12 
Accurary: 0.73 
Precision: 0.684 
Recall: 0.883 
F1 Score: 0.771


[55, 91, 42, 12, 0.73, 0.684, 0.883, 0.771]

In [35]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# 30 trees split on entropy, seeded so the forest is repeatable; fit()
# returns the fitted estimator.
class_rf = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = class_rf.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_rf = model_performance(y_pred)[1]
list_rf

[[87 10]
 [45 58]] 
True Negatives: 87 
True Positives: 58 
False Positives: 10 
False Negatives: 45 
Accurary: 0.725 
Precision: 0.853 
Recall: 0.563 
F1 Score: 0.678


[87, 58, 10, 45, 0.725, 0.853, 0.563, 0.678]

In [36]:
# Support Vector Machine Classifier (linear kernel)
from sklearn.svm import SVC
class_svm = SVC(
    kernel='linear',
    random_state=0,
).fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = class_svm.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_svm = model_performance(y_pred)[1]
list_svm

[[79 18]
 [24 79]] 
True Negatives: 79 
True Positives: 79 
False Positives: 18 
False Negatives: 24 
Accurary: 0.79 
Precision: 0.814 
Recall: 0.767 
F1 Score: 0.79


[79, 79, 18, 24, 0.79, 0.814, 0.767, 0.79]

In [37]:
# Kernel SVM Classifier (RBF kernel); SVC was imported in the linear-SVM cell.
class_kern = SVC(
    kernel='rbf',
    random_state=0,
).fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = class_kern.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_kern = model_performance(y_pred)[1]
list_kern

[[89  8]
 [36 67]] 
True Negatives: 89 
True Positives: 67 
False Positives: 8 
False Negatives: 36 
Accurary: 0.78 
Precision: 0.893 
Recall: 0.65 
F1 Score: 0.753


[89, 67, 8, 36, 0.78, 0.893, 0.65, 0.753]

In [38]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
# Entropy-based splits with a fixed seed; fit() returns the fitted estimator.
class_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = class_tree.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_tree = model_performance(y_pred)[1]
list_tree

[[78 19]
 [31 72]] 
True Negatives: 78 
True Positives: 72 
False Positives: 19 
False Negatives: 31 
Accurary: 0.75 
Precision: 0.791 
Recall: 0.699 
F1 Score: 0.742


[78, 72, 19, 31, 0.75, 0.791, 0.699, 0.742]

In [39]:
# K-Nearest Neighbours Classifier
from sklearn.neighbors import KNeighborsClassifier
# 10 neighbours; Minkowski distance with p = 2 is the Euclidean distance.
class_knn = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = class_knn.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_knn = model_performance(y_pred)[1]
list_knn

[[79 18]
 [52 51]] 
True Negatives: 79 
True Positives: 51 
False Positives: 18 
False Negatives: 52 
Accurary: 0.65 
Precision: 0.739 
Recall: 0.495 
F1 Score: 0.593


[79, 51, 18, 52, 0.65, 0.739, 0.495, 0.593]

In [40]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
# fit() returns the fitted estimator, so construction and training chain.
class_log = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = class_log.predict(X_test)
# Keep only the metric list from the (None, list) pair.
list_log = model_performance(y_pred)[1]
list_log

[[80 17]
 [28 75]] 
True Negatives: 80 
True Positives: 75 
False Positives: 17 
False Negatives: 28 
Accurary: 0.775 
Precision: 0.815 
Recall: 0.728 
F1 Score: 0.769


[80, 75, 17, 28, 0.775, 0.815, 0.728, 0.769]

In [41]:
# Artificial neural network
import random

from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Seed every random source the training may draw from (Python, NumPy and
# TensorFlow) so that re-running this cell gives repeatable results as far as
# the backend allows. Without this, weight initialisation and batch shuffling
# differ on every run and the reported metrics cannot be reproduced.
random.seed(0)
np.random.seed(0)
tf.random.set_seed(0)

# Feature scaling: the scaler is fitted on the training data only and the same
# transformation is then applied to the test data.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

# Initialise the artificial neural network
ann = tf.keras.models.Sequential()

# Input layer and first hidden layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
# Second hidden layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Output layer: a single sigmoid unit
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the ANN
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the ANN on the scaled training set
ann.fit(X_train_scaled, y_train, batch_size=32, epochs=100)

# Predict the test set: threshold the sigmoid outputs at 0.5, then flatten the
# (n, 1) column to a 1-D array of 0/1 labels so y_pred has the same form as
# the predictions of the other classifiers.
y_pred = (ann.predict(X_test_scaled) > 0.5).astype(int).ravel()

# Check performance of the ANN
list_ann = model_performance(y_pred)[1]
list_ann


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

[77, 85, 20, 18, 0.81, 0.81, 0.825, 0.817]

In [42]:
# Collect every model's metric list into one table: one column per model,
# one row per metric (rows follow the order of the metric lists).
metric_names = ['True Neg', 'True Pos', 'False Pos',
                'False Neg', 'Accuracy', 'Precision',
                'Recall', 'F1 Score']
results_by_model = {
    'Naive Bayes': list_nb,
    'Random Forest': list_rf,
    'Linear SVM': list_svm,
    'Kernel SVM': list_kern,
    'Decision Tree': list_tree,
    'K-NN': list_knn,
    'Log Reg.': list_log,
    'ANN': list_ann,
}
df = pd.DataFrame(results_by_model, index=metric_names)
df

Unnamed: 0,Naive Bayes,Random Forest,Linear SVM,Kernel SVM,Decision Tree,K-NN,Log Reg.,ANN
True Neg,55.0,87.0,79.0,89.0,78.0,79.0,80.0,77.0
True Pos,91.0,58.0,79.0,67.0,72.0,51.0,75.0,85.0
False Pos,42.0,10.0,18.0,8.0,19.0,18.0,17.0,20.0
False Neg,12.0,45.0,24.0,36.0,31.0,52.0,28.0,18.0
Accuracy,0.73,0.725,0.79,0.78,0.75,0.65,0.775,0.81
Precision,0.684,0.853,0.814,0.893,0.791,0.739,0.815,0.81
Recall,0.883,0.563,0.767,0.65,0.699,0.495,0.728,0.825
F1 Score,0.771,0.678,0.79,0.753,0.742,0.593,0.769,0.817


In [43]:
# Export the comparison table; to_csv writes the row labels and the header
# row by default, so no extra arguments are needed.
df.to_csv('model_selection.csv')