In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_validate, StratifiedKFold

# import_ipynb must be imported before any notebook-backed module such as `visualizations`.
import import_ipynb
from visualizations import *

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and 3.8 removed the old
# names, so use whichever spelling this installation actually ships.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

def plot_feature_importances(model, x_test, y_test, n_repeats=20, random_state=42, n_jobs=8):
    """Show a horizontal box plot of permutation importances, least important feature first.

    Parameters
    ----------
    model : fitted estimator
        Forwarded unchanged to ``permutation_importance``.
    x_test : pandas.DataFrame
        Features to permute; ``x_test.columns`` supplies the box labels, so a bare
        array is not accepted.
    y_test : array-like
        Targets aligned with ``x_test``.
    n_repeats : int, default 20
        Number of times each feature is shuffled.
    random_state : int or None, default 42
        Seed forwarded to ``permutation_importance``.
    n_jobs : int or None, default 8
        Parallelism forwarded to ``permutation_importance``.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    importances = permutation_importance(
        model, x_test, y_test,
        n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs,
    )
    # Ascending order of mean importance: the most important feature ends up as the top box.
    order = importances.importances_mean.argsort()

    fig, ax = plt.subplots(figsize=(12, 8))
    # `importances.importances` is indexed by feature first; transpose so each column is one box.
    ax.boxplot(importances.importances[order].T,
               vert=False, labels=x_test.columns[order])
    ax.set_title("Permutation Importances (test set)")
    fig.tight_layout()
    plt.show()

## Load dataset

In [None]:
# Dataset directory: defaults to the original author's location, but can be redirected with the
# CERT_DATASET_DIR environment variable so the notebook also runs on other machines.
dataset_dir = os.environ.get('CERT_DATASET_DIR', '/Users/adam/phd/projects/certificates/dataset')
train = pd.read_csv(os.path.join(dataset_dir, 'train.csv'), index_col='index')

# Split the frame into the prediction target and the feature matrix.
target_variable = 'sec_level_cat'
y_train = train[target_variable]
x_train = train.drop(columns=target_variable)

## Train classifier

In [None]:
# Seed the forest so that Restart & Run All rebuilds the same model (and therefore the same
# metrics and plots); 42 matches the seed used elsewhere in this notebook.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

## Evaluate accuracy, precision and recall

In [None]:
# WARNING: for models that need scaled inputs (e.g. a NN with data scaled to (0,1)), the scaler must be
# fitted on the training part only and merely applied (transform) to the validation part.
# Wrapping scaler and model in a Pipeline should handle this well: fit runs on the training data and
# transform on the test data (i.e. the specific fold).

# Some warnings are expected because of a division by 0 while precision is computed.
# The result is still handled correctly (the value is taken as 0).
scoring_names = ['accuracy', 'recall_macro', 'precision_macro']
score = cross_validate(rf, x_train, y_train, scoring=scoring_names, n_jobs=8, cv=10)

# One array of per-fold values for each requested metric.
acc = score['test_accuracy']
precision = score['test_precision_macro']
recall = score['test_recall_macro']

# Report mean +/- standard deviation over the folds.
print(f'Validation accuracy: {np.mean(acc):.2f} +/- {np.std(acc):.2f}')
print(f'Validation precision: {np.mean(precision):.2f} +/- {np.std(precision):.2f}')
print(f'Validation recall: {np.mean(recall):.2f} +/- {np.std(recall):.2f}')

## Plot feature importances

In [None]:
# No earlier cell defines x_test / y_test (only train.csv is loaded), so this cell raised
# NameError on a fresh kernel. Load the held-out split explicitly before plotting.
# NOTE(review): assumes a test.csv with the same schema sits next to train.csv — confirm.
dataset_dir = os.environ.get('CERT_DATASET_DIR', '/Users/adam/phd/projects/certificates/dataset')
test = pd.read_csv(os.path.join(dataset_dir, 'test.csv'), index_col='index')
y_test = test[target_variable]
x_test = test.drop(columns=target_variable)

plot_feature_importances(rf, x_test, y_test)

## Plot confusion matrix

In [None]:
# EAL label -> integer class id. Presumably the same encoding as the `sec_level_cat` column;
# TODO confirm against the dataset preparation step.
sec_level_dict = {'EAL1': 0, 'EAL1+': 1, 'EAL2': 2, 'EAL2+': 3, 'EAL3': 4, 'EAL3+': 5, 'EAL4': 6, 'EAL4+': 7, 'EAL5': 8, 'EAL5+': 9, 'EAL6': 10, 'EAL6+': 11, 'EAL7': 12, 'EAL7+': 13}
labels = list(sec_level_dict.keys())
range_labels = list(sec_level_dict.values())

# StratifiedKFold only accepts random_state together with shuffle=True (it raises ValueError
# otherwise on scikit-learn >= 0.24; older releases silently ignored the seed). Dropping the seed
# keeps the unshuffled, deterministic folds this cell effectively used before.
kf = StratifiedKFold(n_splits=10)
cm_dim = len(range_labels)
cm_sum = np.zeros((cm_dim, cm_dim))
for train_index, test_index in kf.split(x_train, y_train):
    # Fit an unfitted copy per fold: refitting `rf` itself would leave the notebook-level model
    # trained on only the last fold's subset once this cell has run.
    fold_model = clone(rf)
    fold_model.fit(x_train.iloc[train_index], y_train.iloc[train_index])
    cm = confusion_matrix(y_train.iloc[test_index], fold_model.predict(x_train.iloc[test_index]), labels=range_labels)
    cm_sum += cm

# Normalize each row (true class) to sum to 1. Rows of classes with 0 instances stay all-zero
# instead of going through a 0/0 division (RuntimeWarning + NaN) that has to be patched afterwards.
row_totals = cm_sum.sum(axis=1, keepdims=True)
cm_sum = np.divide(cm_sum, row_totals, out=np.zeros_like(cm_sum), where=row_totals != 0)

print_confusion_matrix(cm_sum, labels)