# 🔐 Authentication

## 📚 Loading libraries

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shap
import sys
import tsfresh

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils.const import *
from utils.helperFunctions import *

import warnings
# Silence every warning raised inside this process.
warnings.filterwarnings("ignore")

# Only when the interpreter was started without explicit warning options:
# install the "ignore" filter and export it through the environment.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses

## 📍 Variables

In [None]:
# Listing IDs
ids = ['id1', 'id2', 'id3']
# Choose what ID to process
# NOTE: `id` shadows the builtin of the same name; it is kept because the
# other cells of this notebook read the label column through this variable.
id = 'id1'
# Every ID column that is NOT the current target
ids_remove = [x for x in ids if x != id]

# Filter features and keep only relevant ones
filterFeatures = True

# Undersampling
# NOTE(review): only `targetedUndersampling` is read by the cells visible in
# this notebook; the other two switches look unused - confirm before relying
# on them.
fairUndersampling = False       # Each class same number
targetedUndersampling = True    # Downsample most frequent class
customBalance = False           # Downsample by specifying number of samples for each label

# Choose whether to separate Charge and Discharge cycling or not
separate = False

# If True, perform authentication. If False, perform identification
# - Authentication: binary classification, unbalanced
# - Identification: multiclass classification, balanced
authentication = True

# Results names and folders
# Each folder is created on its own: checking only RESULTS would skip FIGURES
# whenever RESULTS already exists, and the image folder below could then not
# be created.
os.makedirs(RESULTS, exist_ok=True)
os.makedirs(FIGURES, exist_ok=True)

# Prefix shared by every artefact written for this run, e.g. "ID1_AUTH"
saveBase = id.upper() + ('_AUTH' if authentication else '_IDENT')

imageFolder = os.path.join(FIGURES, saveBase)
os.makedirs(imageFolder, exist_ok=True)

## 🤖 Models

In [None]:
# One entry per model: (display name, unfitted estimator, grid-search space).
# Keeping the three pieces side by side guarantees that `names`,
# `classifiers` and `parameters` below always stay aligned.
_model_specs = [
    (
        'AdaBoost',
        AdaBoostClassifier(random_state=SEED),
        {
            'n_estimators': [50, 100, 150, 200]
        },
    ),
    (
        'Decision Tree',
        DecisionTreeClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': np.arange(3, 20)
        },
    ),
    (
        'Gaussian Naive Bayes',
        GaussianNB(),
        {
            'var_smoothing': np.logspace(0, -9, num=100)
        },
    ),
    (
        'Nearest Neighbors',
        KNeighborsClassifier(),
        {
            'n_neighbors': list(range(1, 20)),
            'weights': ['uniform', 'distance']
        },
    ),
    (
        'Neural Network',
        MLPClassifier(random_state=SEED),
        {
            'hidden_layer_sizes': [(50, ), (100, ), (200, )],
            'activation': ['tanh', 'relu'],
            'solver': ['adam', 'sgd']
        },
    ),
    (
        'Quadratic Discriminant Analysis',
        QuadraticDiscriminantAnalysis(),
        {
            'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]
        },
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'n_estimators': [100, 200, 300, 400, 500]
        },
    ),
    (
        'Support Vector Machine',
        SVC(random_state=SEED),
        {
            'kernel': ['linear', 'rbf'],
            'C': np.arange(1, 5, 1),
            'gamma': np.arange(0.2, 1, 0.2)
        },
    ),
]

# Parallel lists consumed by the training loop
names = [spec[0] for spec in _model_specs]
classifiers = [spec[1] for spec in _model_specs]
parameters = [spec[2] for spec in _model_specs]

## 📚 Datasets

In [None]:
# Load every parquet file found under PROCESSED/<dataset> for each dataset.
# - separate == False: all files end up in a single frame `df`
# - separate == True: files whose name starts with "charge." / "discharge."
#   are collected into `df_charge` / `df_discharge`; other files are ignored
#
# Directory listings are sorted because os.listdir returns entries in
# arbitrary order, and the row order of the concatenated frame feeds the
# seeded resampling and train/test split further down.
if not separate:
    dff = []
    for dataset in DATASETS:
        folder = os.path.join(PROCESSED, dataset)
        for filename in sorted(os.listdir(folder)):
            if filename.endswith('.parquet'):
                dff.append(pd.read_parquet(os.path.join(folder, filename)))

    # Concatenate once, after every dataset has been read
    df = pd.concat(dff)
else:
    dff_charge = []
    dff_discharge = []
    for dataset in DATASETS:
        folder = os.path.join(PROCESSED, dataset)
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith('.parquet'):
                continue
            # The part before the first dot tells the cycling phase
            phase = filename.split('.')[0]
            if phase == 'charge':
                dff_charge.append(pd.read_parquet(os.path.join(folder, filename)))
            elif phase == 'discharge':
                dff_discharge.append(pd.read_parquet(os.path.join(folder, filename)))

    df_charge = pd.concat(dff_charge)
    df_discharge = pd.concat(dff_discharge)

In [None]:
# Bar charts of how often each value of `id1` / `id2` occurs, before any
# balancing. Each panel is described as (frame, column, title) and drawn on
# the matching axes of the grid.
if not separate:
    fig, axs = plt.subplots(1, 2, figsize=(8, 4))
    panels = [
        (df, 'id1', 'ID1 Distribution'),
        (df, 'id2', 'ID2 Distribution'),
    ]
else:
    fig, axs = plt.subplots(2, 2, figsize=(8, 8))
    panels = [
        (df_charge, 'id1', 'ID1 Distribution on Charge'),
        (df_charge, 'id2', 'ID2 Distribution on Charge'),
        (df_discharge, 'id1', 'ID1 Distribution on Discharge'),
        (df_discharge, 'id2', 'ID2 Distribution on Discharge'),
    ]

axs = axs.ravel()
for axis, (frame, column, heading) in zip(axs, panels):
    counts = frame[column].value_counts().sort_index()
    counts.plot(kind='bar', title=heading, xlabel='IDs', ylabel='Occurences', ax=axis)

plt.tight_layout()
saveDistPath = os.path.join(imageFolder, saveBase) + '_unbalancedDistribution.pdf'
plt.savefig(saveDistPath)

In [None]:
def _undersample(frame):
    """Return *frame* randomly undersampled on the label column ``id``.

    The label column is split off, `RandomUnderSampler` (seeded with SEED)
    resamples features and labels together, and the resampled labels are put
    back as the last column of the returned frame.
    """
    balanced, target = RandomUnderSampler(random_state=SEED).fit_resample(
        frame.drop(columns=id), frame[id])
    balanced[id] = target
    return balanced


# The ID columns that are not the current target are dropped whether or not
# undersampling is enabled, so they can never reach the feature matrix built
# by the later cells.
if not separate:
    df = df.drop(columns=ids_remove)
else:
    df_charge = df_charge.drop(columns=ids_remove)
    df_discharge = df_discharge.drop(columns=ids_remove)

if targetedUndersampling:
    if not separate:
        df = _undersample(df)

        # Explicit figure: never draw on whatever axes happen to be current
        fig, ax = plt.subplots()
        df[id].value_counts().sort_index().plot(
            kind='bar', title='ID Distribution', xlabel='IDs', ylabel='Occurences', ax=ax)
    else:
        df_charge = _undersample(df_charge)
        df_discharge = _undersample(df_discharge)

        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        axs = axs.ravel()

        df_charge[id].value_counts().sort_index().plot(
            kind='bar', title='ID Distribution on Charge', xlabel='IDs', ylabel='Occurences', ax=axs[0])
        df_discharge[id].value_counts().sort_index().plot(
            kind='bar', title='ID Distribution on Discharge', xlabel='IDs', ylabel='Occurences', ax=axs[1])

    # Saved only when a balanced distribution was actually plotted
    saveBalPath = os.path.join(imageFolder, saveBase) + '_balancedDistribution.pdf'
    fig.savefig(saveBalPath)

In [None]:
def _impute_and_select(frame):
    """Impute *frame* and, if `filterFeatures` is set, keep only relevant features.

    `impute` is called on *frame* itself, exactly as before. For the
    selection step the label column ``id`` is separated from the features, so
    it is never tested against itself and can never be filtered out; it is
    appended again as the last column of the returned frame.
    """
    tsfresh.utilities.dataframe_functions.impute(frame)
    if not filterFeatures:
        return frame

    target = frame[id]
    selected = tsfresh.select_features(frame.drop(columns=id), target).copy()
    # Positional assignment: independent of duplicated index values
    selected[id] = target.to_numpy()
    return selected


# Column counts below include the label column, before and after filtering.
if not separate:
    beforeFeat = df.shape[1]
    df = _impute_and_select(df)

    if filterFeatures:
        afterFeat = df.shape[1]

        print(f'[🔥 FILTER]\n\tBefore: {beforeFeat}\n\tAfter: {afterFeat}')
else:
    beforeFeat_c = df_charge.shape[1]
    beforeFeat_d = df_discharge.shape[1]

    df_charge = _impute_and_select(df_charge)
    df_discharge = _impute_and_select(df_discharge)

    if filterFeatures:
        afterFeat_c = df_charge.shape[1]
        afterFeat_d = df_discharge.shape[1]

        print(f'[🔥 CHARGE]\n\tBefore: {beforeFeat_c}\n\tAfter: {afterFeat_c}')
        print()
        print(f'[🔥 DISCHARGE]\n\tBefore: {beforeFeat_d}\n\tAfter: {afterFeat_d}')

## 💪 Training

In [None]:
def _split_features(frame, target):
    """Split the feature columns of *frame* against *target* (80 % train, 20 % test).

    The label column ``id`` is dropped, every occurrence of ``dQ/dV__`` is
    removed from the feature names, and the SEED-ed split is returned in the
    order ``X_train, X_test, Y_train, Y_test``.
    """
    features = frame.drop(columns=id)
    features.columns = [col.replace('dQ/dV__', '') for col in features.columns]
    return train_test_split(features, target, test_size=0.2, random_state=SEED)


def _authentication_splits(frame, labels):
    """Build one one-vs-rest split per distinct value in *labels*.

    For each label the target is a 1-D integer array holding 1 where the row
    carries that label and 0 elsewhere. Returns four lists
    ``(X_trains, X_tests, Y_trains, Y_tests)`` with one entry per label, in
    the order given by ``np.unique``.
    """
    x_trains, x_tests, y_trains, y_tests = [], [], [], []
    flat = labels.ravel()
    for label in np.unique(flat):
        x_tr, x_te, y_tr, y_te = _split_features(frame, (flat == label).astype(int))
        x_trains.append(x_tr)
        x_tests.append(x_te)
        y_trains.append(y_tr)
        y_tests.append(y_te)
    return x_trains, x_tests, y_trains, y_tests


if not separate:
    # Loading labels as an (n, 1) column. Going through NumPy is required:
    # recent pandas no longer accepts `series[:, np.newaxis]`.
    labels = df[id].to_numpy()[:, np.newaxis]

    if authentication:
        # Translating to authentication, i.e., taking only one label
        # Saving different dataset, one for each label
        X_trains, X_tests, Y_trains, Y_tests = _authentication_splits(df, labels)
        # Keep the single-split names bound to the last split, as the
        # original loop left them
        X_train, X_test, Y_train, Y_test = (
            X_trains[-1], X_tests[-1], Y_trains[-1], Y_tests[-1])
    else:
        # Identification: one multiclass split
        X_train, X_test, Y_train, Y_test = _split_features(df, labels)
else:
    # Loading labels
    labels_c = df_charge[id].to_numpy()[:, np.newaxis]
    labels_d = df_discharge[id].to_numpy()[:, np.newaxis]

    if authentication:
        # Translating to authentication, i.e., taking only one label
        # Saving different dataset, one for each label
        X_trains_c, X_tests_c, Y_trains_c, Y_tests_c = _authentication_splits(df_charge, labels_c)
        X_trains_d, X_tests_d, Y_trains_d, Y_tests_d = _authentication_splits(df_discharge, labels_d)

        X_train_c, X_test_c, Y_train_c, Y_test_c = (
            X_trains_c[-1], X_tests_c[-1], Y_trains_c[-1], Y_tests_c[-1])
        X_train_d, X_test_d, Y_train_d, Y_test_d = (
            X_trains_d[-1], X_tests_d[-1], Y_trains_d[-1], Y_tests_d[-1])
    else:
        # Identification: one multiclass split per cycling phase
        X_train_c, X_test_c, Y_train_c, Y_test_c = _split_features(df_charge, labels_c)
        X_train_d, X_test_d, Y_train_d, Y_test_d = _split_features(df_discharge, labels_d)

In [None]:
# Rows of the per-model report: (printed title, key in the score dict)
_SCORE_ROWS = (
    ('💪 TRAIN', 'train'),
    ('📊 ACCURACY', 'accuracy'),
    ('📊 PRECISION', 'precision'),
    ('📊 RECALL', 'recall'),
    ('📊 F1 SCORE', 'f1'),
)


def _fit_and_score(clf, param, x_fit, y_fit, x_eval, y_eval, average='binary'):
    """Grid-search *clf* over *param* and score the best estimator.

    Returns ``(grid, y_pred, scores)``: the fitted `GridSearchCV`, the
    predictions of its best estimator on *x_eval*, and a dict with the keys
    'train' (score on the training data), 'accuracy', 'precision', 'recall'
    and 'f1' (computed on the evaluation data). *average* is forwarded to the
    precision/recall/F1 metrics.
    """
    grid = GridSearchCV(clf, param, n_jobs=-1, verbose=0)
    grid.fit(x_fit, y_fit)
    best = grid.best_estimator_
    y_pred = best.predict(x_eval)
    scores = {
        'train': best.score(x_fit, y_fit),
        'accuracy': accuracy_score(y_eval, y_pred),
        'precision': precision_score(y_eval, y_pred, average=average),
        'recall': recall_score(y_eval, y_pred, average=average),
        'f1': f1_score(y_eval, y_pred, average=average),
    }
    return grid, y_pred, scores


def _fit_on_splits(name, clf, param, splits):
    """Run `_fit_and_score` on every authentication split of one model.

    A fresh grid search is created for each split. Returns the grid and the
    predictions of the LAST split, plus the mean of every score over all
    splits rounded to three decimals.
    """
    per_split = []
    grid = y_pred = None
    for i, (x_fit, x_eval, y_fit, y_eval) in enumerate(splits):
        print(f'[🤖 MODEL] {name} ({i+1}/{len(splits)})', end='\r')
        grid, y_pred, scores = _fit_and_score(clf, param, x_fit, y_fit, x_eval, y_eval)
        per_split.append(scores)
    print()
    means = {key: round(np.mean([s[key] for s in per_split]), 3) for _, key in _SCORE_ROWS}
    return grid, y_pred, means


def _report(*tagged_scores):
    """Print the score rows of one or more ``(suffix, scores)`` pairs.

    With several pairs the rows are interleaved (TRAIN-C, TRAIN-D,
    ACCURACY-C, ...). A blank line closes the report.
    """
    for title, key in _SCORE_ROWS:
        for suffix, scores in tagged_scores:
            print(f'\t[{title}{suffix}]\t{round(scores[key], 3)}')
    print()


def _record(scores, train, accuracy, precision, recall, f1):
    """Append each entry of *scores* to the matching result list."""
    train.append(scores['train'])
    accuracy.append(scores['accuracy'])
    precision.append(scores['precision'])
    recall.append(scores['recall'])
    f1.append(scores['f1'])


def _explain_forest(grid, x_eval, y_eval, y_pred):
    """Collect the explainability artefacts of a fitted Random Forest search.

    Returns the confusion matrix, the impurity-based feature importances,
    their standard deviation across the trees, the SHAP tree explainer and
    the SHAP values computed on *x_eval*.
    """
    forest = grid.best_estimator_
    matrix = confusion_matrix(y_true=y_eval, y_pred=y_pred)
    importances = forest.feature_importances_
    spread = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    tree_explainer = shap.TreeExplainer(forest)
    return matrix, importances, spread, tree_explainer, tree_explainer.shap_values(x_eval)


if not separate:
    train_scores = []

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    best_params = []
else:
    train_scores_c = []
    train_scores_d = []

    accuracy_scores_c = []
    precision_scores_c = []
    recall_scores_c = []
    f1_scores_c = []
    accuracy_scores_d = []
    precision_scores_d = []
    recall_scores_d = []
    f1_scores_d = []

    best_params_c = []
    best_params_d = []

# Iterate over classifiers
for name, clf, param in zip(names, classifiers, parameters):
    if not separate:
        if authentication:
            # One binary problem per label; stored scores are rounded means
            splits = list(zip(X_trains, X_tests, Y_trains, Y_tests))
            grid, Y_pred, scores = _fit_on_splits(name, clf, param, splits)
            # The explainability step and the later cells use the last split
            X_train, X_test, Y_train, Y_test = splits[-1]
        else:
            print(f'[🤖 MODEL] {name}')
            grid, Y_pred, scores = _fit_and_score(
                clf, param, X_train, Y_train, X_test, Y_test, average='macro')
        _report(('', scores))

        # Saving results
        _record(scores, train_scores, accuracy_scores, precision_scores, recall_scores, f1_scores)
        best_params.append(grid.best_params_)

        # Feature importance for Random Forest
        if name == 'Random Forest':
            conf_matrix, impurity, std, explainer, shap_values = _explain_forest(
                grid, X_test, Y_test, Y_pred)
    else:
        if authentication:
            splits_c = list(zip(X_trains_c, X_tests_c, Y_trains_c, Y_tests_c))
            grid_c, Y_pred_c, scores_c = _fit_on_splits(name, clf, param, splits_c)
            X_train_c, X_test_c, Y_train_c, Y_test_c = splits_c[-1]
            _report(('-C', scores_c))

            splits_d = list(zip(X_trains_d, X_tests_d, Y_trains_d, Y_tests_d))
            grid_d, Y_pred_d, scores_d = _fit_on_splits(name, clf, param, splits_d)
            X_train_d, X_test_d, Y_train_d, Y_test_d = splits_d[-1]
            _report(('-D', scores_d))
        else:
            print(f'[🤖 MODEL] {name}')
            grid_c, Y_pred_c, scores_c = _fit_and_score(
                clf, param, X_train_c, Y_train_c, X_test_c, Y_test_c, average='macro')
            grid_d, Y_pred_d, scores_d = _fit_and_score(
                clf, param, X_train_d, Y_train_d, X_test_d, Y_test_d, average='macro')
            _report(('-C', scores_c), ('-D', scores_d))

        # Saving results
        _record(scores_c, train_scores_c, accuracy_scores_c,
                precision_scores_c, recall_scores_c, f1_scores_c)
        _record(scores_d, train_scores_d, accuracy_scores_d,
                precision_scores_d, recall_scores_d, f1_scores_d)
        best_params_c.append(grid_c.best_params_)
        best_params_d.append(grid_d.best_params_)

        # Feature importance for Random Forest
        if name == 'Random Forest':
            conf_matrix_c, impurity_c, std_c, explainer_c, shap_values_c = _explain_forest(
                grid_c, X_test_c, Y_test_c, Y_pred_c)
            conf_matrix_d, impurity_d, std_d, explainer_d, shap_values_d = _explain_forest(
                grid_d, X_test_d, Y_test_d, Y_pred_d)

In [None]:
# Confusion matrix of the Random Forest, drawn only for the combined
# (non-separated) dataset.
if not separate:
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)

    # Write each count in the middle of its cell (x = column, y = row)
    for (row, col), count in np.ndenumerate(conf_matrix):
        ax.text(x=col, y=row, s=count,
                va='center', ha='center', size='xx-large')

    ax.set_xlabel('Predictions', fontsize=18)
    ax.set_ylabel('Actuals', fontsize=18)
    ax.set_title('Confusion Matrix for Random Forest', fontsize=18)

    saveConfPath = os.path.join(imageFolder, saveBase) + '_confusionMatrix.pdf'
    plt.savefig(saveConfPath)

## 🔝 Feature Importance

In [None]:
# Bar chart of the 20 largest impurity-based importances of the Random
# Forest: one panel for the combined data, or one panel per cycling phase.
if not separate:
    forest_impurity = pd.Series(impurity, index=X_train.columns).nlargest(20)

    fig, ax = plt.subplots()
    importance_panels = [
        (ax, forest_impurity, "Feature importances using MDI"),
    ]
else:
    forest_impurity_c = pd.Series(impurity_c, index=X_train_c.columns).nlargest(20)
    forest_impurity_d = pd.Series(impurity_d, index=X_train_d.columns).nlargest(20)

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    axs = axs.ravel()
    importance_panels = [
        (axs[0], forest_impurity_c, "Feature importances using MDI on Charge"),
        (axs[1], forest_impurity_d, "Feature importances using MDI on Discharge"),
    ]

for axis, top_features, heading in importance_panels:
    top_features.plot.bar(ax=axis)
    axis.set_title(heading)
    axis.set_ylabel("Mean decrease in impurity")

fig.tight_layout()
saveMDIPath = os.path.join(imageFolder, saveBase) + '_MDI.pdf'
plt.savefig(saveMDIPath)

In [None]:
def _save_shap_bar(values, samples, suffix):
    """Draw the SHAP bar summary of *values* and save it as a PDF.

    The file is written to ``<imageFolder>/<saveBase><suffix>`` and its path
    is returned.
    """
    # show=False keeps the figure alive: by default summary_plot displays the
    # plot itself, and saving afterwards would write an empty page.
    shap.summary_plot(values, samples, plot_type="bar", show=False)
    path = os.path.join(imageFolder, saveBase) + suffix
    plt.savefig(path)
    # Display it now that it is on disk, then start the next plot from a
    # clean figure.
    plt.show()
    plt.close()
    return path


if not separate:
    saveShapPath = _save_shap_bar(shap_values, X_test, '_Shap.pdf')
else:
    saveShapPath = _save_shap_bar(shap_values_c, X_test_c, '_ShapCharge.pdf')
    saveShapPath = _save_shap_bar(shap_values_d, X_test_d, '_ShapDischarge.pdf')

## 💾 Saving Results

In [None]:
def _result_rows(accuracies, precisions, recalls, f1s):
    """Build one CSV row (as a dict) per model from the parallel score lists."""
    return [
        {
            'Model': model,
            'ID': id[-1],
            'Authentication': authentication,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1_value
        }
        for model, accuracy, precision, recall, f1_value
        in zip(names, accuracies, precisions, recalls, f1s)
    ]


# One CSV for the combined data, or one per cycling phase, written under
# RESULTS with the run prefix `saveBase`.
if not separate:
    df_list = _result_rows(accuracy_scores, precision_scores, recall_scores, f1_scores)

    df_csv = pd.DataFrame(df_list)
    savePath = os.path.join(RESULTS, saveBase) + '.csv'
    df_csv.to_csv(savePath)
else:
    df_list_c = _result_rows(accuracy_scores_c, precision_scores_c, recall_scores_c, f1_scores_c)
    df_list_d = _result_rows(accuracy_scores_d, precision_scores_d, recall_scores_d, f1_scores_d)

    df_csv_c = pd.DataFrame(df_list_c)
    df_csv_d = pd.DataFrame(df_list_d)

    savePath_c = os.path.join(RESULTS, saveBase) + '_Charge.csv'
    df_csv_c.to_csv(savePath_c)
    savePath_d = os.path.join(RESULTS, saveBase) + '_Discharge.csv'
    df_csv_d.to_csv(savePath_d)