# 🔐 Authentication

## 📚 Loading libraries

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shap
import sys
import tsfresh

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils.const import *
from utils.helperFunctions import *

import warnings
# Mute every warning category for the rest of this session.
warnings.filterwarnings(action="ignore")

# When the interpreter was started without explicit warning options,
# also export the setting so that child processes start muted too.
if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore")
    os.environ.update(PYTHONWARNINGS="ignore")

## 📍 Variables

In [None]:
# All label columns available in the dataset
ids = ['id1', 'id2', 'id3']
# Label column used as the classification target
id = 'id1'
# Every other label column (removed from the feature matrix later on)
ids_remove = [other for other in ids if other != id]

# Keep only the features judged relevant by the feature-selection step
filterFeatures = True

# Undersampling options
# Each class gets the same number of samples (not referenced by the cells shown here)
fairUndersampling = False
# Downsample the most frequent class
targetedUndersampling = True
# Downsample by specifying the number of samples for each label
customBalance = False

# Task selection:
# - True  -> authentication: binary classification, unbalanced
# - False -> identification: multiclass classification, balanced
authentication = False

## 🤖 Models

In [None]:
# Single registry: display name -> (unfitted estimator, grid-search parameter grid).
# Insertion order is the order in which the models are trained; the three
# parallel lists consumed by the training loop are derived from it below.
_model_registry = {
    'AdaBoost': (
        AdaBoostClassifier(random_state=SEED),
        {
            'n_estimators': [50, 100, 150, 200],
        },
    ),
    'Decision Tree': (
        DecisionTreeClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': np.arange(3, 20),
        },
    ),
    'Gaussian Naive Bayes': (
        GaussianNB(),
        {
            'var_smoothing': np.logspace(0, -9, num=100),
        },
    ),
    'Nearest Neighbors': (
        KNeighborsClassifier(),
        {
            'n_neighbors': list(range(1, 20)),
            'weights': ['uniform', 'distance'],
        },
    ),
    'Neural Network': (
        MLPClassifier(random_state=SEED),
        {
            'hidden_layer_sizes': [(50, ), (100, ), (200, )],
            'activation': ['tanh', 'relu'],
            'solver': ['adam', 'sgd'],
        },
    ),
    'Quadratic Discriminant Analysis': (
        QuadraticDiscriminantAnalysis(),
        {
            'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5],
        },
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'n_estimators': [100, 200, 300, 400, 500],
        },
    ),
    'Support Vector Machine': (
        SVC(random_state=SEED),
        {
            'kernel': ['linear', 'rbf'],
            'C': np.arange(1, 5, 1),
            'gamma': np.arange(0.2, 1, 0.2),
        },
    ),
}

# Parallel views of the registry, index-aligned with each other.
names = list(_model_registry)
classifiers = [estimator for estimator, _ in _model_registry.values()]
parameters = [search_space for _, search_space in _model_registry.values()]

## 📚 Datasets

In [None]:
# Read every parquet file listed in DATASETS (entries with any other
# extension are skipped) and stack them into a single DataFrame.
dff = []
for dataset in DATASETS:
    file = os.path.join(PROCESSED, dataset)
    if os.path.splitext(file)[1] == '.parquet':
        dff.append(pd.read_parquet(file))

# Fail with an explicit message instead of pd.concat's generic
# "No objects to concatenate" (or a later NameError on `df`).
if not dff:
    raise ValueError(f'No .parquet dataset found in DATASETS under {PROCESSED!r}')

# Concatenate once, after the loop: concatenating inside the loop repeats
# the work for every file and fails if the first entry is not a parquet file.
df = pd.concat(dff)

In [None]:
# Side-by-side bar charts showing how many rows carry each label value
# in the 'id1' and 'id2' columns.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
axs = axs.ravel()

panels = (
    ('id1', 'ID1 Distribution'),
    ('id2', 'ID2 Distribution'),
)
for axis, (column, heading) in zip(axs, panels):
    counts = df[column].value_counts().sort_index()
    counts.plot(kind='bar', title=heading, xlabel='IDs', ylabel='Occurences', ax=axis)

plt.tight_layout()

In [None]:
# Sampling-strategy dictionaries for custom undersampling; they start empty
# and are only created when that mode is enabled.
if customBalance:
    id1_dict, id2_dict = dict(), dict()

In [None]:
if targetedUndersampling:
    # Feature matrix for the sampler: drop the target column and the other ID columns.
    df_x = df.drop(columns=[id, *ids_remove])

    if customBalance:
        # Use the sampling-strategy dictionary that belongs to the selected
        # target column rather than always the one for 'id2'.
        custom_strategies = {'id1': id1_dict, 'id2': id2_dict}
        if id not in custom_strategies:
            raise ValueError(f'No custom sampling strategy defined for {id!r}')
        sampler = RandomUnderSampler(
            sampling_strategy=custom_strategies[id], random_state=SEED)
    else:
        # Default strategy of the sampler
        sampler = RandomUnderSampler(random_state=SEED)

    X_resampled, y_resampled = sampler.fit_resample(df_x, df[id])

    # Re-attach the target so that `df` again holds features + label
    # (the other ID columns are gone from this point on).
    X_resampled[id] = y_resampled
    df = X_resampled

    # Class distribution after undersampling, titled after the selected column.
    df[id].value_counts().sort_index().plot(
        kind='bar', title=f'{id.upper()} Distribution', xlabel='IDs', ylabel='Occurences')

In [None]:
beforeFeat = df.shape[1]

# Keep the label columns out of imputation and feature selection: otherwise
# the target is evaluated as a feature of itself, and whether the label
# columns remain in `df` depends on the selector's verdict.
label_columns = [column for column in ids if column in df.columns]
target = df[id]
feature_frame = df.drop(columns=label_columns)

# Impute the feature columns only, so label values are never touched.
feature_frame = tsfresh.utilities.dataframe_functions.impute(feature_frame)

if filterFeatures:
    feature_frame = tsfresh.select_features(feature_frame, target)

# Re-attach the labels by position (row order is unchanged), which stays
# correct even if the index contains duplicates.
df = feature_frame.assign(**{column: df[column].to_numpy() for column in label_columns})

if filterFeatures:
    afterFeat = df.shape[1]

    print(f'[🔥 FILTER]\n\tBefore: {beforeFeat}\n\tAfter: {afterFeat}')


## 💪 Training

In [None]:
# Loading labels
labels = df[id][:, np.newaxis]

if authentication:
    # Lists of datasets
    X_trains = []
    X_tests = []
    Y_trains = []
    Y_tests = []

    # Translating to authentication, i.e., taking only one label
    # Saving different dataset, one for each label
    for label in np.unique(labels):
        labels_auth = []
        for l in labels:
            if l == label:
                labels_auth.append(1)
            else:
                labels_auth.append(0)

        labels_auth = np.array(labels_auth)

        # Loading features
        features = df.drop(id, axis=1)

        # Train and test split
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels_auth, test_size=0.2, random_state=SEED)

        cols = []
        for col in X_train.columns:
            cols.append(col.replace('z2__', ''))

        X_train.columns = cols
        X_test.columns = cols

        X_trains.append(X_train)
        X_tests.append(X_test)
        Y_trains.append(Y_train)
        Y_tests.append(Y_test)
else:
    # Loading features
    features = df.drop(id, axis=1)
    if not targetedUndersampling:
        for id_remove in ids_remove:
            features = features.drop(id_remove, axis=1)

    # Train and test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.2, random_state=SEED)

    cols = []
    for col in X_train.columns:
        cols.append(col.replace('z2__', ''))

    X_train.columns = cols
    X_test.columns = cols

In [None]:
def _fit_and_score(clf, param, X_tr, Y_tr, X_te, Y_te, average):
    """Grid-search `clf` over `param` and score the best estimator found.

    Args:
        clf: estimator handed to GridSearchCV.
        param: parameter grid for the search.
        X_tr, Y_tr: training features and targets.
        X_te, Y_te: test features and targets.
        average: `average` argument forwarded to precision/recall/F1.

    Returns:
        (grid, Y_hat, scores) where `grid` is the fitted GridSearchCV,
        `Y_hat` the test predictions of its best estimator, and `scores`
        the tuple (train score, accuracy, precision, recall, f1).
    """
    grid = GridSearchCV(clf, param, n_jobs=-1, verbose=0)
    grid.fit(X_tr, Y_tr)

    best = grid.best_estimator_
    Y_hat = best.predict(X_te)
    scores = (
        best.score(X_tr, Y_tr),
        accuracy_score(Y_te, Y_hat),
        precision_score(Y_te, Y_hat, average=average),
        recall_score(Y_te, Y_hat, average=average),
        f1_score(Y_te, Y_hat, average=average),
    )
    return grid, Y_hat, scores


# One entry per classifier, in the order of `names`
train_scores = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
best_params = []

# Iterate over classifiers
for name, clf, param in zip(names, classifiers, parameters):
    if authentication:
        # One binary problem per label: score each, then report the mean.
        per_label_scores = []
        for i, (X_train, X_test, Y_train, Y_test) in enumerate(zip(X_trains, X_tests, Y_trains, Y_tests)):
            print(f'[🤖 MODEL] {name} ({i+1}/{len(X_trains)})', end='\r')
            grid, Y_pred, scores = _fit_and_score(
                clf, param, X_train, Y_train, X_test, Y_test, average='binary')
            per_label_scores.append(scores)

        print()
        score_train, accuracy, precision, recall, f1 = np.mean(per_label_scores, axis=0)
    else:
        print(f'[🤖 MODEL] {name}')
        grid, Y_pred, (score_train, accuracy, precision, recall, f1) = _fit_and_score(
            clf, param, X_train, Y_train, X_test, Y_test, average='macro')

    print(f'\t[💪 TRAIN]\t{round(score_train, 3)}')
    print(f'\t[📊 ACCURACY]\t{round(accuracy, 3)}')
    print(f'\t[📊 PRECISION]\t{round(precision, 3)}')
    print(f'\t[📊 RECALL]\t{round(recall, 3)}')
    print(f'\t[📊 F1 SCORE]\t{round(f1, 3)}\n')

    # Collect the results in both modes (in authentication mode these are
    # the per-label means; best_params holds the last label's search).
    train_scores.append(score_train)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    best_params.append(grid.best_params_)

    # Feature importance for Random Forest
    # (in authentication mode this uses the last label's split and model)
    if name == 'Random Forest':
        # Confusion Matrix
        conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
        # Explainable ML
        impurity = grid.best_estimator_.feature_importances_
        std = np.std([tree.feature_importances_ for tree in grid.best_estimator_.estimators_], axis=0)
        explainer = shap.TreeExplainer(grid.best_estimator_)
        shap_values = explainer.shap_values(X_test)

In [None]:
# Heat-map of the Random Forest confusion matrix with the raw counts overlaid.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)

# Matrix row -> y position, matrix column -> x position.
for (row, column), count in np.ndenumerate(conf_matrix):
    ax.text(x=column, y=row, s=count,
            va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix for Random Forest', fontsize=18)
plt.show()

## 🔝 Feature Importance

In [None]:
# Label each impurity-based importance with its feature name and keep the 20 largest.
all_importances = pd.Series(impurity, index=X_train.columns)
forest_impurity = all_importances.nlargest(20)

fig, ax = plt.subplots()
forest_impurity.plot(kind='bar', ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Bar-style SHAP summary of the Random Forest explanations computed on the test features.
shap.summary_plot(shap_values, X_test, plot_type="bar")