### Model Comparison 
We compare the candidate models and pick the best one using several metrics:


*   Accuracy, Precision and Recall 
*   F1-score 
*   Since we have a binary classification problem, we also use the ROC curve (AUC)



In [None]:
%matplotlib inline

import re 
from time import time

import pandas as pd

# Neural Net classifier 
import tensorflow as tf 
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# SVM classifier
from sklearn import svm

# GDBT classifier 
from sklearn.ensemble import GradientBoostingClassifier

# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Xgboost model 
!pip install xgboost 
import xgboost as xgb

# preprocessing module
import preprocess



In [None]:
# model evaluation 
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold


from statistics import mean

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 18.4 MB/s eta 0:00:01[K     |▋                               | 20 kB 14.0 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.9 MB/s eta 0:00:01[K     |█▏                              | 40 kB 9.4 MB/s eta 0:00:01[K     |█▌                              | 51 kB 7.9 MB/s eta 0:00:01[K     |█▊                              | 61 kB 7.9 MB/s eta 0:00:01[K     |██                              | 71 kB 7.7 MB/s eta 0:00:01[K     |██▍                             | 81 kB 8.6 MB/s eta 0:00:01[K     |██▋                             | 92 kB 8.0 MB/s eta 0:00:01[K     |███                             | 102 kB 7.5 MB/s eta 0:00:01[K     |███▎                            | 112 kB 7.5 MB/s eta 0:00:01[K     |███▌                            | 122 kB 7.5 MB/s eta 0:00:01[K     |███

In [None]:
# Load and prepare the advertising dataset.
# Each ad has a binary label: 0 for commercial and 1 for informational ads.
df = preprocess.get_data('./data/task1_dataset.csv', columns_to_drop=False)
df = df.iloc[:, 1:]  # drop the first column of the loaded frame

# Patterns are compiled once instead of on every row. The URL pattern is a raw
# string: the original non-raw literal contained the invalid escape sequences
# "\(" and "\)", which newer Python versions warn about. re.MULTILINE was
# dropped because it only changes how ^/$ anchors match and the pattern has none.
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
DIGITS_PATTERN = re.compile(r'[0-9]+')
SPACES_PATTERN = re.compile(' +')

# Remove links from the raw text before the project preprocessing step.
df["text"] = df["text"].apply(lambda text: URL_PATTERN.sub('', text))

clean_data = preprocess.preprocess_data(df['text'])

# Strip digit runs first, then collapse the runs of spaces they leave behind.
df['clean_data'] = clean_data
df['clean_data'] = df['clean_data'].apply(
    lambda x: SPACES_PATTERN.sub(' ', DIGITS_PATTERN.sub('', x))
)

df.head()

Unnamed: 0,label,text,clean_data
0,0,"""My Thistle subscription has changed my work f...",thistle subscription changed work home life sa...
1,0,"After I bought this toy for my child, he staye...",bought toy child stayed away ipadtablet tv pro...
2,0,A Black Friday purchase you actually won't reg...,black friday purchase actually wont regret sto...
3,0,"The Nutcracker, accompany you and your family ...",nutcracker accompany family warm holiday
4,0,ohora’s NEW COLLECTION - shop right away ⠀ It...,ohora’s new collection shop right away ⠀ getti...


In [None]:
# Build the two model inputs from the cleaned text:
#   * vect_data - result of preprocess.tokenized_tfidf (presumably TF-IDF features),
#                 fed to the cross_validate() calls for the classical models
#   * data      - (text, label) pairs in batches of 1024, used to adapt the
#                 text vectorization layer
cleaned_text = df['clean_data']
vect_data = preprocess.tokenized_tfidf(cleaned_text)

text_label_pairs = tf.data.Dataset.from_tensor_slices((cleaned_text, df['label']))
data = text_label_pairs.batch(1024)

In [None]:
# Instantiate every classical classifier with the best hyper-parameters found for it.
svm_model = svm.SVC(kernel='rbf', C=10)

nv_model = MultinomialNB()

# Gradient-boosted decision trees (scikit-learn implementation).
gdbt_params = {
    'n_estimators': 500,
    'learning_rate': 0.01,
    'max_depth': 13,
    'random_state': 0,
}
gdbt_model = GradientBoostingClassifier(**gdbt_params)

# Gradient-boosted trees (XGBoost implementation).
xgboost_params = {
    'objective': "binary:logistic",
    'colsample_bytree': 0.5,
    'subsample': 0.8,
    'gamma': 3,
    'learning_rate': 0.3,
    'max_depth': 20,
    'reg_lambda': 10,
    'scale_pos_weight': 3,
}
xgboost_model = xgb.XGBClassifier(**xgboost_params)

In [None]:
def create_model(vectorize_layer, units=10, vocab_size=10000, embedding_dim=50, maxlen=100, dropout=0.2):
    """Build and compile the neural-network binary text classifier.

    Args:
        vectorize_layer: layer placed first in the network, ahead of the embedding.
        units: width of the hidden dense (ReLU) layer.
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        maxlen: value passed as the embedding's ``input_length``.
        dropout: rate shared by the three dropout layers.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy loss)
        whose metrics are, in order: accuracy, precision, recall, AUC.
    """
    stack = [
        vectorize_layer,
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
        layers.Dropout(rate=dropout),
        layers.GlobalMaxPooling1D(),
        layers.Dropout(rate=dropout),
        layers.Dense(units, activation='relu'),
        layers.Dropout(rate=dropout),
        # Single sigmoid unit: probability of the positive class.
        layers.Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(stack)

    # The order of this list fixes the order of the values returned by evaluate().
    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ]
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=tracked_metrics,
    )
    return classifier

# Text vectorization layer handed to create_model() for every network.
# Its vocabulary is learned from the text component of the `data` pipeline.
vectorize_layer = layers.TextVectorization(output_sequence_length=100, max_tokens=10000)
text_only = data.map(lambda text, label: text)
vectorize_layer.adapt(text_only)

In [None]:
# Cross-validate each classical classifier on the vectorized text features.
folds = 10

# Metrics reported for every model. Each key becomes a `test_<key>` entry in the
# cross_validate() result, which the summary table reads.
#
# ROC AUC uses the built-in 'roc_auc' scorer, which ranks samples by the
# classifier's continuous output (decision_function / predict_proba).
# make_scorer(roc_auc_score) would instead pass hard 0/1 predictions, collapsing
# the ROC curve to a single operating point; that value is not comparable with
# the threshold-free AUC that Keras reports for the neural network.
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score),
           'roc_auc_score': 'roc_auc'}

# NOTE(review): the result named `svm` rebinds the imported `sklearn.svm` module,
# so the cell calling `svm.SVC(...)` cannot be re-run after this one without
# re-running the imports. The name is kept because the summary cell reads it.
svm     = cross_validate(svm_model, vect_data, df['label'], cv=folds, scoring=scoring)
nb      = cross_validate(nv_model, vect_data, df['label'], cv=folds, scoring=scoring)
gdbt    = cross_validate(gdbt_model, vect_data, df['label'], cv=folds, scoring=scoring)
xgboost = cross_validate(xgboost_model, vect_data, df['label'], cv=folds, scoring=scoring)

In [None]:
# Cross-validation for the TensorFlow network is run manually: a new model is
# built, trained and evaluated for every fold, and the per-fold scores are kept.
acc_per_fold = []
loss_per_fold = []
prec_per_fold = []
recall_per_fold = []
auc_per_fold = []

# Shuffled K-fold splitter; the fixed random_state makes the fold assignment
# reproducible between runs.
kfold = KFold(n_splits=folds, shuffle=True, random_state=0)


def _fold_dataset(indices):
    """Return the (text, label) rows at the given positions, in batches of 1024."""
    return tf.data.Dataset.from_tensor_slices(
        (df['clean_data'].iloc[indices], df['label'].iloc[indices])
    ).batch(1024)


for train, test in kfold.split(df['clean_data'], df['label']):

    train_data = _fold_dataset(train)
    test_data = _fold_dataset(test)
    ann_model = create_model(vectorize_layer)  # new model for every fold

    # batch_size is deliberately not passed: the datasets are already batched
    # (1024 rows), and Keras documents that batch_size must not be specified
    # for tf.data.Dataset inputs.
    history = ann_model.fit(train_data, epochs=20, verbose=False, validation_data=test_data)

    # evaluate() returns the loss followed by the compiled metrics in the order
    # set in create_model(): accuracy, precision, recall, AUC.
    scores = ann_model.evaluate(test_data)
    loss_per_fold.append(scores[0])
    acc_per_fold.append(scores[1])
    prec_per_fold.append(scores[2])
    recall_per_fold.append(scores[3])
    auc_per_fold.append(scores[4])



In [None]:
# Average the per-fold accuracy, precision, recall, F1 score and AUC of the
# neural network, stored under the same `test_<metric>` keys that
# cross_validate() produces for the other models.
def _f1(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    total = precision + recall
    return (2 * precision * recall) / total if total else 0.0


ann = {}
ann['test_accuracy'] = mean(acc_per_fold)
ann['test_precision'] = mean(prec_per_fold)
ann['test_recall'] = mean(recall_per_fold)
# F1 is computed for each fold and then averaged, matching how the
# `test_f1_score` values of the other models are averaged. Taking the harmonic
# mean of the *averaged* precision and recall gives a different number.
ann['test_f1_score'] = mean([_f1(p, r) for p, r in zip(prec_per_fold, recall_per_fold)])
ann['test_roc_auc_score'] = mean(auc_per_fold)
# Assemble one column of averaged scores per model, one row per metric.
metric_keys = ['test_accuracy', 'test_precision', 'test_recall', 'test_f1_score', 'test_roc_auc_score']

# cross_validate() results hold one score per fold, so they are averaged here.
cv_results = {
    'Support Vector': svm,
    'Naive Bayes': nb,
    'GDBT': gdbt,
    'XgBoost': xgboost,
}
score_columns = {
    model_name: [result[key].mean() for key in metric_keys]
    for model_name, result in cv_results.items()
}
# The neural-network entries are already averaged over the folds.
score_columns['ANN'] = [ann[key] for key in metric_keys]

models_scores_table = pd.DataFrame(
    score_columns,
    index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'],
)

# For each metric (row), record the name of the model with the highest score.
models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
models_scores_table

Unnamed: 0,Support Vector,Naive Bayes,GDBT,XgBoost,ANN,Best Score
Accuracy,0.907069,0.909413,0.845139,0.8436,0.908342,Naive Bayes
Precision,0.922169,0.911496,0.894729,0.788465,0.916122,Support Vector
Recall,0.890119,0.907854,0.783841,0.94106,0.899656,XgBoost
F1 Score,0.905813,0.909615,0.835529,0.858014,0.907814,Naive Bayes
AUC,0.90714,0.909418,0.845394,0.843194,0.970479,ANN
