# 🔐 Authentication

## 📚 Loading libraries

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shap
import sys
import tsfresh

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils.const import *
from utils.helperFunctions import *

import warnings
# Silence every warning raised inside this kernel.
warnings.filterwarnings(action="ignore")

# Only when the interpreter was started without explicit -W / PYTHONWARNINGS
# options: install the same blanket filter and export it through the
# environment so that child processes inherit it as well.
if not sys.warnoptions:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter(action="ignore")

## 📍 Variables

In [None]:
# --- Label columns ---------------------------------------------------------
# All ID (label) columns known to this notebook.
ids = ['id1', 'id2', 'id3']
# The ID column used as classification target for this run.
# NOTE: this name shadows the built-in id(); it is kept because later cells
# read it under exactly this name.
id = 'id1'
# The remaining ID columns (dropped from the data by the undersampling cell).
ids_remove = list(filter(lambda other: other != id, ids))

# --- Feature selection -----------------------------------------------------
# When True, keep only the features judged relevant for the target.
filterFeatures = True

# --- Class balancing -------------------------------------------------------
fairUndersampling = False       # Same number of samples for every class
targetedUndersampling = True    # Downsample the most frequent class
customBalance = False           # Downsample to an explicit per-label sample count

# --- Cycling ---------------------------------------------------------------
# True: handle Charge and Discharge data separately; False: one combined set.
separate = True

## 🤖 Models

In [None]:
# One row per model: (display name, unfitted estimator, grid-search space).
# Keeping the three pieces side by side makes it impossible for the name,
# the estimator and its grid to drift out of step.
model_table = [
    (
        'AdaBoost',
        AdaBoostClassifier(random_state=SEED),
        {
            'n_estimators': [50, 100, 150, 200]
        },
    ),
    (
        'Decision Tree',
        DecisionTreeClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': np.arange(3, 20)
        },
    ),
    (
        'Gaussian Naive Bayes',
        GaussianNB(),
        {
            'var_smoothing': np.logspace(0, -9, num=100)
        },
    ),
    (
        'Nearest Neighbors',
        KNeighborsClassifier(),
        {
            'n_neighbors': list(range(1, 20)),
            'weights': ['uniform', 'distance']
        },
    ),
    (
        'Neural Network',
        MLPClassifier(random_state=SEED),
        {
            'hidden_layer_sizes': [(50, ), (100, ), (200, )],
            'activation': ['tanh', 'relu'],
            'solver': ['adam', 'sgd']
        },
    ),
    (
        'Quadratic Discriminant Analysis',
        QuadraticDiscriminantAnalysis(),
        {
            'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]
        },
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=SEED),
        {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'n_estimators': [100, 200, 300, 400, 500]
        },
    ),
    (
        'Support Vector Machine',
        SVC(random_state=SEED),
        {
            'kernel': ['linear', 'rbf'],
            'C': np.arange(1, 5, 1),
            'gamma': np.arange(0.2, 1, 0.2)
        },
    ),
]

# Unzip the table into the three parallel lists consumed by the training loop.
names, classifiers, parameters = (list(column) for column in zip(*model_table))

## 📚 Datasets

In [None]:
def _read_parquet_frames(wanted_stem=None):
    """Read the parquet files found in every dataset directory.

    Walks ``PROCESSED/<dataset>`` for each entry of ``DATASETS`` and loads
    the files whose name ends in ``.parquet``.

    Args:
        wanted_stem: When given, only files whose name up to the first dot
            equals this value are read (e.g. ``'charge'`` matches
            ``charge.parquet``). ``None`` reads every parquet file.

    Returns:
        A list of DataFrames, ordered by dataset and then by file name.

    Raises:
        ValueError: If no matching parquet file exists.
    """
    frames = []
    for dataset in DATASETS:
        dataset_dir = os.path.join(PROCESSED, dataset)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order (and therefore every seeded step downstream) reproducible.
        for file_name in sorted(os.listdir(dataset_dir)):
            if not file_name.endswith('.parquet'):
                continue
            if wanted_stem is not None and file_name.split('.')[0] != wanted_stem:
                continue  # skip before reading: no need to load unused files
            frames.append(pd.read_parquet(os.path.join(dataset_dir, file_name)))

    if not frames:
        wanted = '*' if wanted_stem is None else wanted_stem
        raise ValueError(
            f'No {wanted}.parquet file found for datasets {list(DATASETS)} under {PROCESSED}')
    return frames


# Concatenate exactly once, after every file has been collected.
if not separate:
    df = pd.concat(_read_parquet_frames())
else:
    df_charge = pd.concat(_read_parquet_frames('charge'))
    df_discharge = pd.concat(_read_parquet_frames('discharge'))

In [None]:
# Describe each bar chart as (data frame, ID column, title); the grid shape
# follows from whether Charge and Discharge are handled separately.
if separate:
    fig, axs = plt.subplots(2, 2, figsize=(8, 8))
    panels = [
        (df_charge, 'id1', 'ID1 Distribution on Charge'),
        (df_charge, 'id2', 'ID2 Distribution on Charge'),
        (df_discharge, 'id1', 'ID1 Distribution on Discharge'),
        (df_discharge, 'id2', 'ID2 Distribution on Discharge'),
    ]
else:
    fig, axs = plt.subplots(1, 2, figsize=(8, 4))
    panels = [
        (df, 'id1', 'ID1 Distribution'),
        (df, 'id2', 'ID2 Distribution'),
    ]
axs = axs.ravel()

# One chart per panel: number of rows per ID value, ordered by ID value.
for panel_ax, (panel_frame, panel_column, panel_title) in zip(axs, panels):
    counts = panel_frame[panel_column].value_counts().sort_index()
    counts.plot(kind='bar', title=panel_title, xlabel='IDs', ylabel='Occurences', ax=panel_ax)

plt.tight_layout()

In [None]:
# Balance the classes of the chosen ID column by random undersampling.
# After this cell the data frames hold the resampled feature columns plus the
# chosen ID column; the other ID columns (ids_remove) are gone.
# NOTE(review): only `targetedUndersampling` is consulted here
# (`fairUndersampling` / `customBalance` are not), and the sampler is built
# with its default sampling strategy - confirm against the imbalanced-learn
# documentation that this matches the intended "downsample most frequent
# class" behaviour.
if targetedUndersampling:
    if separate:
        balanced = {}
        for cycle, frame in (('charge', df_charge), ('discharge', df_discharge)):
            # Features only: strip the target and then the other ID columns.
            predictors = frame.drop(id, axis=1).drop(ids_remove, axis=1)
            sampler = RandomUnderSampler(random_state=SEED)
            predictors_bal, target_bal = sampler.fit_resample(predictors, frame[id])
            # Put the (resampled) target back next to its features.
            predictors_bal[id] = target_bal
            balanced[cycle] = predictors_bal

        df_charge = balanced['charge']
        df_discharge = balanced['discharge']

        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        axs = axs.ravel()

        charge_counts = df_charge[id].value_counts().sort_index()
        discharge_counts = df_discharge[id].value_counts().sort_index()
        charge_counts.plot(
            kind='bar', title='ID Distribution on Charge', xlabel='IDs', ylabel='Occurences', ax=axs[0])
        discharge_counts.plot(
            kind='bar', title='ID Distribution on Discharge', xlabel='IDs', ylabel='Occurences', ax=axs[1])
    else:
        # Features only: strip the target and then the other ID columns.
        predictors = df.drop(id, axis=1).drop(ids_remove, axis=1)
        sampler = RandomUnderSampler(random_state=SEED)
        predictors_bal, target_bal = sampler.fit_resample(predictors, df[id])

        # Put the (resampled) target back next to its features.
        predictors_bal[id] = target_bal
        df = predictors_bal

        id_counts = df[id].value_counts().sort_index()
        id_counts.plot(kind='bar', title='ID Distribution', xlabel='IDs', ylabel='Occurences')


In [None]:
# Columns that identify a sample (the chosen target and the other ID columns).
# They are labels, not measurements, so they must never be offered to the
# feature selector as candidate features.
label_columns = [id, *ids_remove]


def _n_features(frame):
    """Return how many columns of ``frame`` are features (not ID columns)."""
    return len([col for col in frame.columns if col not in label_columns])


def _keep_relevant_features(frame):
    """Return ``frame`` reduced to the relevant features plus the target.

    The target column ``id`` is split off before the selection so that it is
    neither tested against itself nor counted among the hypotheses of the
    selection procedure, and is re-attached afterwards so that later cells
    can still read ``frame[id]``. Other ID columns still present in ``frame``
    are left out of the result.

    Args:
        frame: Imputed DataFrame holding feature columns and the ``id`` column.

    Returns:
        A new DataFrame with the selected feature columns and the ``id`` column.
    """
    target = frame[id]
    candidates = frame.drop(columns=label_columns, errors='ignore')
    # .copy() so that adding the target below never writes into a view.
    selected = tsfresh.select_features(candidates, target).copy()
    # Positional assignment: rows are in the same order, no index alignment.
    selected[id] = target.to_numpy()
    return selected


if not separate:
    beforeFeat = _n_features(df)
    # Replaces NaN/inf values in place.
    tsfresh.utilities.dataframe_functions.impute(df)

    if filterFeatures:
        df = _keep_relevant_features(df)
        afterFeat = _n_features(df)

        print(f'[🔥 FILTER]\n\tBefore: {beforeFeat}\n\tAfter: {afterFeat}')
else:
    beforeFeat_c = _n_features(df_charge)
    beforeFeat_d = _n_features(df_discharge)

    # Replaces NaN/inf values in place.
    tsfresh.utilities.dataframe_functions.impute(df_charge)
    tsfresh.utilities.dataframe_functions.impute(df_discharge)

    if filterFeatures:
        df_charge = _keep_relevant_features(df_charge)
        df_discharge = _keep_relevant_features(df_discharge)

        afterFeat_c = _n_features(df_charge)
        afterFeat_d = _n_features(df_discharge)

        print(f'[🔥 CHARGE]\n\tBefore: {beforeFeat_c}\n\tAfter: {afterFeat_c}')
        print()
        print(f'[🔥 DISCHARGE]\n\tBefore: {beforeFeat_d}\n\tAfter: {afterFeat_d}')

## 💪 Training

In [None]:
# Build the train/test partitions (80 % / 20 %, seeded) for the chosen target.
#
# * Labels are taken as a plain 1-D NumPy array. Indexing a Series with
#   ``[:, np.newaxis]`` is multi-dimensional indexing, which pandas no longer
#   supports, and estimators expect a 1-D target anyway.
# * Features exclude the target AND every other ID column. The other ID
#   columns are only removed by the undersampling cell, so when that step is
#   switched off they would otherwise be used as features;
#   ``errors='ignore'`` makes the drop safe when they are already gone.
# * The 'dQ/dV__' prefix is stripped from the column names of the splits.
if not separate:
    # Loading labels
    labels = df[id].to_numpy()

    # Loading features
    features = df.drop(columns=[id, *ids_remove], errors='ignore')

    # Train and test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, labels, test_size=0.2, random_state=SEED)

    cols = [col.replace('dQ/dV__', '') for col in X_train.columns]

    X_train.columns = cols
    X_test.columns = cols
else:
    # Loading labels
    labels_c = df_charge[id].to_numpy()
    labels_d = df_discharge[id].to_numpy()

    # Loading features
    features_c = df_charge.drop(columns=[id, *ids_remove], errors='ignore')
    features_d = df_discharge.drop(columns=[id, *ids_remove], errors='ignore')

    # Train and test split
    X_train_c, X_test_c, Y_train_c, Y_test_c = train_test_split(
        features_c, labels_c, test_size=0.2, random_state=SEED)
    X_train_d, X_test_d, Y_train_d, Y_test_d = train_test_split(
        features_d, labels_d, test_size=0.2, random_state=SEED)

    cols_c = [col.replace('dQ/dV__', '') for col in X_train_c.columns]
    cols_d = [col.replace('dQ/dV__', '') for col in X_train_d.columns]

    X_train_c.columns = cols_c
    X_test_c.columns = cols_c
    X_train_d.columns = cols_d
    X_test_d.columns = cols_d

In [None]:
# Result accumulators: one entry per classifier, in the order of `names`.
if separate:
    train_scores_c, train_scores_d = [], []
    test_scores_c, test_scores_d = [], []
    best_params_c, best_params_d = [], []
else:
    train_scores, test_scores, best_params = [], [], []

# Grid-search every model, then score the refitted best estimator on the
# training and on the held-out data.
for name, clf, param in zip(names, classifiers, parameters):
    print(f'[🤖 MODEL] {name}')
    is_forest = name == 'Random Forest'

    if separate:
        # Independent searches for the Charge (_c) and Discharge (_d) data.
        grid_c = GridSearchCV(clf, param, n_jobs=-1, verbose=0)
        grid_d = GridSearchCV(clf, param, n_jobs=-1, verbose=0)

        grid_c.fit(X_train_c, Y_train_c)
        grid_d.fit(X_train_d, Y_train_d)

        best_c = grid_c.best_estimator_
        best_d = grid_d.best_estimator_

        score_train_c = best_c.score(X_train_c, Y_train_c)
        score_train_d = best_d.score(X_train_d, Y_train_d)
        print(f'\t[👟 TRAIN-C]\t{round(score_train_c, 3)}')
        print(f'\t[👟 TRAIN-D]\t{round(score_train_d, 3)}')

        score_test_c = best_c.score(X_test_c, Y_test_c)
        score_test_d = best_d.score(X_test_d, Y_test_d)
        print(f'\t[🧪 TEST-C]\t{round(score_test_c, 3)}')
        print(f'\t[🧪 TEST-D]\t{round(score_test_d, 3)}\n')

        train_scores_c.append(score_train_c)
        train_scores_d.append(score_train_d)
        test_scores_c.append(score_test_c)
        test_scores_d.append(score_test_d)
        best_params_c.append(grid_c.best_params_)
        best_params_d.append(grid_d.best_params_)

        # Keep the Random Forest artefacts for the feature-importance cells:
        # impurity importances, their spread across trees, and SHAP values
        # computed on the test split.
        if is_forest:
            impurity_c = best_c.feature_importances_
            impurity_d = best_d.feature_importances_
            std_c = np.std([tree.feature_importances_ for tree in best_c.estimators_], axis=0)
            std_d = np.std([tree.feature_importances_ for tree in best_d.estimators_], axis=0)
            explainer_c = shap.TreeExplainer(best_c)
            explainer_d = shap.TreeExplainer(best_d)
            shap_values_c = explainer_c.shap_values(X_test_c)
            shap_values_d = explainer_d.shap_values(X_test_d)
        continue

    # Combined data set: a single search per model.
    grid = GridSearchCV(clf, param, n_jobs=-1, verbose=0)

    grid.fit(X_train, Y_train)
    best = grid.best_estimator_

    score_train = best.score(X_train, Y_train)
    print(f'\t[👟 TRAIN]\t{round(score_train, 3)}')

    score_test = best.score(X_test, Y_test)
    print(f'\t[🧪 TEST]\t{round(score_test, 3)}\n')

    train_scores.append(score_train)
    test_scores.append(score_test)
    best_params.append(grid.best_params_)

    # Keep the Random Forest artefacts for the feature-importance cells.
    if is_forest:
        impurity = best.feature_importances_
        std = np.std([tree.feature_importances_ for tree in best.estimators_], axis=0)
        explainer = shap.TreeExplainer(best)
        shap_values = explainer.shap_values(X_test)

## 🔝 Feature Importance

In [None]:
# Bar chart(s) of the 20 largest impurity-based (MDI) importances of the
# Random Forest kept by the training loop.
if separate:
    forest_impurity_c = pd.Series(impurity_c, index=X_train_c.columns).nlargest(20)
    forest_impurity_d = pd.Series(impurity_d, index=X_train_d.columns).nlargest(20)

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    axs = axs.ravel()

    mdi_panels = (
        (axs[0], forest_impurity_c, "Feature importances using MDI on Charge"),
        (axs[1], forest_impurity_d, "Feature importances using MDI on Discharge"),
    )
    # Draw both bar charts first, then label them.
    for mdi_ax, mdi_ranking, _ in mdi_panels:
        mdi_ranking.plot.bar(ax=mdi_ax)
    for mdi_ax, _, mdi_title in mdi_panels:
        mdi_ax.set_title(mdi_title)
        mdi_ax.set_ylabel("Mean decrease in impurity")
else:
    forest_impurity = pd.Series(impurity, index=X_train.columns).nlargest(20)

    fig, ax = plt.subplots()
    forest_impurity.plot.bar(ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")

fig.tight_layout()

In [None]:
# SHAP bar summaries for the Random Forest, computed on the test split(s):
# Charge first, then Discharge, or the single combined data set.
if separate:
    for shap_matrix, shap_samples in ((shap_values_c, X_test_c), (shap_values_d, X_test_d)):
        shap.summary_plot(shap_matrix, shap_samples, plot_type="bar")
else:
    shap.summary_plot(shap_values, X_test, plot_type="bar")