Check the performance of different classifiers by applying the following metrics:
* Confusion Matrix
* Accuracy (what fraction of the predictions match the test set labels?)
* Precision (measuring exactness; when it predicts yes, how often is it correct?)
* Recall (measuring completeness; when it's actually yes, how often does it predict yes?)
* F1 Score (the harmonic mean of Precision and Recall; a compromise between the two)
* Save the results within a dataframe and export it to a csv


In [130]:
import numpy as np
import pandas as pd
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Load the tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are treated as ordinary characters.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [131]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review.
#
# The stemmer and the stop-word set do not change between reviews, so they are
# created once here instead of on every loop iteration (the set used to be
# rebuilt for every single word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is deliberately kept in the text -- presumably because negation flips
# the sentiment of a review.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every review in the file rather than a hard-coded row count, so
# the corpus always has exactly as many entries as the dataset has rows.
for raw_review in dataset['Review']:
    # Replace everything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model whose vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

# Dense feature matrix: one row per cleaned review in `corpus`.
X = cv.fit_transform(raw_documents=corpus).toarray()

# Target vector: the last column of the dataset.
y = dataset.iloc[:, -1].values

In [133]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [135]:
# Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB

# fit() returns the fitted estimator, so construction and training are chained.
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [136]:
from sklearn.metrics import confusion_matrix, accuracy_score

def model_performance(y_pred, y_true=None):
    """Print and collect binary-classification metrics for one model.

    The confusion matrix is read as ``[[TN, FP], [FN, TP]]``, i.e. the
    positive class is expected to be the second label.
    NOTE(review): this matches the 0/1 labels the notebook appears to use --
    confirm if the label encoding ever changes.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by the model.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test`` is
        used, which is what the original one-argument calls relied on.

    Returns
    -------
    tuple
        ``(None, value_list)``. The first element is always ``None`` and is
        kept only so existing ``model_performance(...)[1]`` calls keep working.
        ``value_list`` is ``[TN, TP, FP, FN, accuracy, precision, recall, F1]``
        with precision, recall and F1 rounded to three decimals.

    Notes
    -----
    A ratio whose denominator is zero (for example precision when the model
    never predicts the positive class) is reported as ``0.0`` instead of
    producing NaN or raising.
    """
    if y_true is None:
        y_true = y_test

    def _ratio(numerator, denominator):
        # Guard against empty denominators, e.g. no positive predictions.
        return numerator / denominator if denominator else 0.0

    cm = confusion_matrix(y_true, y_pred)
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    Accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    Precision = _ratio(TP, TP + FP)
    Recall = _ratio(TP, TP + FN)
    F1 = _ratio(2 * Precision * Recall, Precision + Recall)

    value_list = [
        TN,
        TP,
        FP,
        FN,
        Accuracy,
        round(Precision, 3),
        round(Recall, 3),
        round(F1, 3),
    ]

    # Human-readable report; the layout matches the previous output, with the
    # "Accuracy" label spelled correctly.
    print(cm, '\n'
          'True Negatives:', TN, '\n'
          'True Positives:', TP, '\n'
          'False Positives:', FP, '\n'
          'False Negatives:', FN, '\n'
          'Accuracy:', Accuracy, '\n'
          'Precision:', round(Precision, 3), '\n'
          'Recall:', round(Recall, 3), '\n'
          'F1 Score:', round(F1, 3))

    return None, value_list

In [137]:
# model_performance prints the report and returns a pair; element [1] is the
# list of metric values for Naive Bayes.
list_nb = model_performance(y_pred=y_pred)[1]
list_nb

[[55 42]
 [12 91]] 
True Negatives: 55 
True Positives: 91 
False Positives: 42 
False Negatives: 12 
Accurary: 0.73 
Precision: 0.684 
Recall: 0.883 
F1 Score: 0.771


[55, 91, 42, 12, 0.73, 0.684, 0.883, 0.771]

In [138]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# 30 entropy-split trees with a fixed seed; fit() returns the estimator itself.
class_rf = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = class_rf.predict(X_test)

# Element [1] of the result is the list of metric values.
list_rf = model_performance(y_pred=y_pred)[1]
list_rf

[[90  7]
 [40 63]] 
True Negatives: 90 
True Positives: 63 
False Positives: 7 
False Negatives: 40 
Accurary: 0.765 
Precision: 0.9 
Recall: 0.612 
F1 Score: 0.728


[90, 63, 7, 40, 0.765, 0.9, 0.612, 0.728]

In [139]:
# Support Vector Machine Classifier (linear kernel)
from sklearn.svm import SVC

# fit() returns the estimator itself, so it is chained onto the constructor.
class_svm = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
y_pred = class_svm.predict(X_test)

# Element [1] of the result is the list of metric values.
list_svm = model_performance(y_pred=y_pred)[1]
list_svm

[[79 18]
 [24 79]] 
True Negatives: 79 
True Positives: 79 
False Positives: 18 
False Negatives: 24 
Accurary: 0.79 
Precision: 0.814 
Recall: 0.767 
F1 Score: 0.79


[79, 79, 18, 24, 0.79, 0.814, 0.767, 0.79]

In [140]:
# Kernel SVM Classifier (RBF kernel); SVC is imported in the linear SVM cell.
class_kern = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
y_pred = class_kern.predict(X_test)

# Element [1] of the result is the list of metric values.
list_kern = model_performance(y_pred=y_pred)[1]
list_kern

[[89  8]
 [36 67]] 
True Negatives: 89 
True Positives: 67 
False Positives: 8 
False Negatives: 36 
Accurary: 0.78 
Precision: 0.893 
Recall: 0.65 
F1 Score: 0.753


[89, 67, 8, 36, 0.78, 0.893, 0.65, 0.753]

In [141]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Entropy-based splits with a fixed seed; fit() returns the estimator itself.
class_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = class_tree.predict(X_test)

# Element [1] of the result is the list of metric values.
list_tree = model_performance(y_pred=y_pred)[1]
list_tree

[[80 17]
 [32 71]] 
True Negatives: 80 
True Positives: 71 
False Positives: 17 
False Negatives: 32 
Accurary: 0.755 
Precision: 0.807 
Recall: 0.689 
F1 Score: 0.743


[80, 71, 17, 32, 0.755, 0.807, 0.689, 0.743]

In [142]:
# K-Nearest Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

# 10 neighbours, Minkowski metric with p=2; fit() returns the estimator itself.
class_knn = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
y_pred = class_knn.predict(X_test)

# Element [1] of the result is the list of metric values.
list_knn = model_performance(y_pred=y_pred)[1]
list_knn

[[88  9]
 [57 46]] 
True Negatives: 88 
True Positives: 46 
False Positives: 9 
False Negatives: 57 
Accurary: 0.67 
Precision: 0.836 
Recall: 0.447 
F1 Score: 0.582


[88, 46, 9, 57, 0.67, 0.836, 0.447, 0.582]

In [143]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so it is chained onto the constructor.
class_log = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = class_log.predict(X_test)

# Element [1] of the result is the list of metric values.
list_log = model_performance(y_pred=y_pred)[1]
list_log

[[80 17]
 [27 76]] 
True Negatives: 80 
True Positives: 76 
False Positives: 17 
False Negatives: 27 
Accurary: 0.78 
Precision: 0.817 
Recall: 0.738 
F1 Score: 0.776


[80, 76, 17, 27, 0.78, 0.817, 0.738, 0.776]

In [144]:
# Summary table: one column per classifier, one row per metric. Each column
# name sits next to the list that fills it, in the order the models were run.
df = pd.DataFrame(
    {
        'Naive Bayes': list_nb,
        'Random Forest': list_rf,
        'Linear SVM': list_svm,
        'Kernel SVM': list_kern,
        'Decision Tree': list_tree,
        'K-NN': list_knn,
        'Log Reg.': list_log,
    },
    index=[
        'True Neg', 'True Pos', 'False Pos', 'False Neg',
        'Accuracy', 'Precision', 'Recall', 'F1 Score',
    ],
)
df

Unnamed: 0,Naive Bayes,Random Forest,Linear SVM,Kernel SVM,Decision Tree,K-NN,Log Reg.
True Neg,55.0,90.0,79.0,89.0,80.0,88.0,80.0
True Pos,91.0,63.0,79.0,67.0,71.0,46.0,76.0
False Pos,42.0,7.0,18.0,8.0,17.0,9.0,17.0
False Neg,12.0,40.0,24.0,36.0,32.0,57.0,27.0
Accuracy,0.73,0.765,0.79,0.78,0.755,0.67,0.78
Precision,0.684,0.9,0.814,0.893,0.807,0.836,0.817
Recall,0.883,0.612,0.767,0.65,0.689,0.447,0.738
F1 Score,0.771,0.728,0.79,0.753,0.743,0.582,0.776


In [145]:
# Export the comparison table, keeping the metric names (index) and the
# classifier names (header) in the file.
df.to_csv(
    'model_selection.csv',
    header=True,
    index=True,
)