Assess performance of different classifiers by applying the following metrics:
* Confusion Matrix
* Accuracy (the share of predictions that match the test set labels)
* Precision (measuring exactness; when the model predicts yes, how often is it correct?)
* Recall (measuring completeness; when it's actually yes, how often does it predict yes?)
* F1 Score (compromise between Precision and Recall)
* Calculation time (seconds needed to train the model and predict the test set)

--> Save the results within a dataframe and export it to a csv

In [1]:
# Load the IMDB dataset, keeping only the 10000 most frequently
# occurring words.
from tensorflow.keras.datasets import imdb

imdb_train, imdb_test = imdb.load_data(num_words=10000)
train_data, train_labels = imdb_train
test_data, test_labels = imdb_test

In [2]:
#encoding integer sequences with multi-hot-encoding
import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D float array.

    Args:
        sequences: Sized iterable of integer index sequences, one per example.
        dimension: Number of columns of the encoded matrix.

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` holding 1.0 in
        every column whose index occurs in the matching sequence and 0.0
        everywhere else.
    """
    encoded = np.zeros((len(sequences), dimension))
    for row, word_ids in enumerate(sequences):
        # One fancy-index assignment flags all columns of this example at once.
        encoded[row, list(word_ids)] = 1.
    return encoded
# Encode the training and the test reviews with the same multi-hot scheme.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))


In [3]:
# Turn the label sequences into float32 NumPy vectors.
y_train, y_test = (
    np.array(labels, dtype="float32") for labels in (train_labels, test_labels)
)

In [4]:
# Hold out the first 10000 training examples as a validation set; the
# remaining examples form the reduced ("partial") training set.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]


In [5]:
from sklearn.metrics import confusion_matrix, accuracy_score

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def model_performance(y_pred, y_true=None, elapsed=None):
    """Print and collect confusion-matrix based metrics for binary predictions.

    Args:
        y_pred: Predicted class labels (0 or 1), one per evaluated sample.
        y_true: True class labels. When omitted, the notebook-level
            ``y_test`` is used.
        elapsed: Runtime in seconds to report. When omitted, the
            notebook-level ``used_time`` is used.

    Returns:
        A tuple ``(None, value_list)``. The first slot is always ``None`` and
        only exists because callers index the result with ``[1]``.
        ``value_list`` holds, in this order: true negatives, true positives,
        false positives, false negatives, accuracy, precision, recall,
        F1 score and the elapsed time (the last four rounded to 3 decimals).
    """
    if y_true is None:
        y_true = y_test
    if elapsed is None:
        elapsed = used_time

    # Pin the label order so the matrix is 2x2 even if one class never occurs
    # in either the true labels or the predictions.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    # Ratios fall back to 0.0 instead of producing nan when a denominator is 0
    # (e.g. the model never predicts the positive class).
    Accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
    Precision = _safe_ratio(TP, TP + FP)
    Recall = _safe_ratio(TP, TP + FN)
    # F1 is computed from the unrounded precision and recall.
    F1 = _safe_ratio(2 * Precision * Recall, Precision + Recall)

    value_list = [
        TN,
        TP,
        FP,
        FN,
        Accuracy,
        round(Precision, 3),
        round(Recall, 3),
        round(F1, 3),
        round(elapsed, 3),
    ]

    print((cm), '\n'
          'True Negatives:', TN, '\n'
          'True Positives:', TP, '\n'
          'False Positives:', FP, '\n'
          'False Negatives:', FN, '\n'
          'Accuracy:', Accuracy, '\n'
          'Precision:', round(Precision, 3), '\n'
          'Recall:', round(Recall, 3), '\n'
          'F1 Score:', round(F1, 3), '\n'
          'used time:', round(elapsed, 3))
    return None, value_list

In [6]:
import time

from sklearn.naive_bayes import GaussianNB

#Naive Bayes Classifier
# The clock starts after the imports so that only training and prediction are
# measured; perf_counter() is the clock intended for timing intervals.
start_time = time.perf_counter()
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
used_time = time.perf_counter() - start_time

# model_performance() reads used_time; [1] selects the list of metric values.
list_nb = model_performance(y_pred)[1]
list_nb

[[10832  1668]
 [ 5921  6579]] 
True Negatives: 10832 
True Positives: 6579 
False Positives: 1668 
False Negatives: 5921 
Accurary: 0.69644 
Precision: 0.798 
Recall: 0.526 
F1 Score: 0.634 
used time: 68.975


[10832, 6579, 1668, 5921, 0.69644, 0.798, 0.526, 0.634, 68.975]

In [7]:
import time

from tensorflow import keras
from tensorflow.keras import layers

# The clock starts after the imports so that only model building, training and
# prediction are measured; perf_counter() is the clock intended for timing
# intervals.
start_time = time.perf_counter()

# Two hidden ReLU layers followed by a single sigmoid output unit.
model = keras.Sequential([
    layers.Dense(16, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              # Keras documents `metrics` as a list; newer Keras versions
              # reject a bare string.
              metrics=['accuracy'])

history = model.fit(x_train,
                    y_train,
                    #20 iterations over all samples in train. data
                    epochs=20,
                    #512 samples per gradient update
                    batch_size=512,
                    #monitor loss & accur. of model with val. data
                    # NOTE(review): the test set is used for monitoring here
                    # while x_val / y_val stay unused; the per-epoch
                    # validation pass is also part of the measured time.
                    validation_data=(x_test, y_test))

y_pred_deeplearning = model.predict(x_test)

used_time = time.perf_counter() - start_time

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# Turn the sigmoid outputs into hard class labels: 1 above 0.5, otherwise 0.
# ravel() flattens the prediction array so the result is a flat list of ints.
y_pred_deeplearning_0_1 = (
    (np.asarray(y_pred_deeplearning) > 0.5).astype(int).ravel().tolist()
)

In [9]:
# Score the thresholded network predictions against the test labels.
# model_performance() reads the global used_time, which at this point still
# holds the value measured in the training cell; [1] selects the metric list.
list_deeplearning = model_performance(y_pred_deeplearning_0_1)[1]
list_deeplearning

[[10809  1691]
 [ 2052 10448]] 
True Negatives: 10809 
True Positives: 10448 
False Positives: 1691 
False Negatives: 2052 
Accurary: 0.85028 
Precision: 0.861 
Recall: 0.836 
F1 Score: 0.848 
used time: 76.541


[10809, 10448, 1691, 2052, 0.85028, 0.861, 0.836, 0.848, 76.541]

In [10]:
import time

from sklearn.ensemble import RandomForestClassifier

#Random Forest Classifier
# The clock starts after the imports so that only training and prediction are
# measured; perf_counter() is the clock intended for timing intervals.
start_time = time.perf_counter()
class_rf = RandomForestClassifier(n_estimators = 30, criterion = 'entropy', random_state = 0)
class_rf.fit(x_train, y_train)
y_pred_rf = class_rf.predict(x_test)

used_time = time.perf_counter() - start_time

# model_performance() reads used_time; [1] selects the list of metric values.
list_rf = model_performance(y_pred_rf)[1]
list_rf

[[10483  2017]
 [ 2395 10105]] 
True Negatives: 10483 
True Positives: 10105 
False Positives: 2017 
False Negatives: 2395 
Accurary: 0.82352 
Precision: 0.834 
Recall: 0.808 
F1 Score: 0.821 
used time: 99.018


[10483, 10105, 2017, 2395, 0.82352, 0.834, 0.808, 0.821, 99.018]

In [11]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# The clock starts after the import so that only training and prediction are
# measured; perf_counter() is the clock intended for timing intervals.
start_time = time.perf_counter()
class_tree = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
class_tree.fit(x_train, y_train)
y_pred = class_tree.predict(x_test)
used_time = time.perf_counter() - start_time

# model_performance() reads used_time; [1] selects the list of metric values.
list_tree = model_performance(y_pred)[1]
list_tree

[[8939 3561]
 [3550 8950]] 
True Negatives: 8939 
True Positives: 8950 
False Positives: 3561 
False Negatives: 3550 
Accurary: 0.71556 
Precision: 0.715 
Recall: 0.716 
F1 Score: 0.716 
used time: 243.844


[8939, 8950, 3561, 3550, 0.71556, 0.715, 0.716, 0.716, 243.844]

In [12]:
#K-Nearest Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

# The clock starts after the import so that only training and prediction are
# measured; perf_counter() is the clock intended for timing intervals.
start_time = time.perf_counter()
# Minkowski metric with p = 2, 10 neighbours per vote.
class_knn = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 2)
class_knn.fit(x_train, y_train)
y_pred = class_knn.predict(x_test)
used_time = time.perf_counter() - start_time

# model_performance() reads used_time; [1] selects the list of metric values.
list_knn = model_performance(y_pred)[1]
list_knn

[[7520 4980]
 [3820 8680]] 
True Negatives: 7520 
True Positives: 8680 
False Positives: 4980 
False Negatives: 3820 
Accurary: 0.648 
Precision: 0.635 
Recall: 0.694 
F1 Score: 0.664 
used time: 269.229


[7520, 8680, 4980, 3820, 0.648, 0.635, 0.694, 0.664, 269.229]

In [13]:
import pandas as pd
# Row labels follow the order in which model_performance() fills its list.
metric_names = ['True Neg', 'True Pos', 'False Pos', 'False Neg', 'Accuracy',
                'Precision', 'Recall', 'F1 Score', 'used_time']

# One entry per classifier; the insertion order defines the column order.
results_by_model = {
    'Naive Bayes': list_nb,
    'Deep Learning': list_deeplearning,
    'Random Forest': list_rf,
    'Decision Tree': list_tree,
    'KNN': list_knn,
}

# zip(*...) transposes the per-model lists into one row per metric.
df = pd.DataFrame(list(zip(*results_by_model.values())),
                  index=metric_names,
                  columns=list(results_by_model))
df

Unnamed: 0,Naive Bayes,Deep Learning,Random Forest,Decision Tree,KNN
True Neg,10832.0,10809.0,10483.0,8939.0,7520.0
True Pos,6579.0,10448.0,10105.0,8950.0,8680.0
False Pos,1668.0,1691.0,2017.0,3561.0,4980.0
False Neg,5921.0,2052.0,2395.0,3550.0,3820.0
Accuracy,0.69644,0.85028,0.82352,0.71556,0.648
Precision,0.798,0.861,0.834,0.715,0.635
Recall,0.526,0.836,0.808,0.716,0.694
F1 Score,0.634,0.848,0.821,0.716,0.664
used_time,68.975,76.541,99.018,243.844,269.229


In [14]:
# Export the comparison table to a CSV file, writing both the metric-name
# index and the classifier-name header row.
df.to_csv('20220330_model_selection.csv', index=True, header=True)