In [15]:
import pandas as pd
import numpy as np
import pickle

# Classifier names, in the order results are reported. Each name is compared
# against the fourth element (index 3) of a candidate's pipeline tuple.
CLASSIFIER_MODES = [
    'decisiontreeclassifier',
    'gaussiannb',
    'multinomialnb',
    'svc',
    'adaboostclassifier',
    'randomforestclassifier',
    'mlpclassifier']


def _mean_split_metric(result: dict, metric: str) -> float:
    """Average `metric` over the `split_{j}` entries of one grid-search result.

    NaN scores are converted to 0 before averaging, so they pull the mean
    down instead of propagating.
    """
    # NOTE(review): the number of splits is inferred from the dict size as
    # int(len / 2) - 1. This presumably assumes the dict holds 'param', one
    # validation and one train entry per split, plus extra bookkeeping keys;
    # confirm against the code that produces the results.
    n_splits = int(len(result) / 2) - 1
    scores = [result[f'split_{j}'][metric] for j in range(n_splits)]
    return np.mean(np.nan_to_num(scores))


def _strip_param_prefixes(result: dict) -> dict:
    """Return a shallow copy of `result` whose param names have no `step__` prefix.

    A copy is returned so the caller's (typically unpickled) data is left
    untouched and can be parsed again without carrying earlier edits.
    """
    cleaned = dict(result)
    if cleaned['param'] is not None:
        cleaned['param'] = {name.split('__')[-1]: value
                            for name, value in cleaned['param'].items()}
    return cleaned


def _best_per_classifier(scored: list) -> list:
    """Pick the highest-scoring entry for each name in CLASSIFIER_MODES.

    `scored` is a list of `((i, c, result), score)` pairs. Classifiers with
    no matching entry are skipped.
    """
    # sorted() is stable, so entries with equal scores keep their input order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    best = []
    for clf in CLASSIFIER_MODES:
        for (i, c, result), score in ranked:
            if c[3] == clf:
                best.append(((i, c, _strip_param_prefixes(result)), score))
                break
    return best


def parse_model_selection_result(ms_result: tuple, mode: str) -> list:
    """Parse the model selection result tuple and get the best models.

    Args:
        ms_result: Model selection result tuple. Its first element is an
            iterable of `(i, c, cv)` triples, where `c[3]` is the classifier
            name and `cv` maps keys to grid-search result dicts. Each result
            dict has a 'param' entry (dict or None) and `split_{j}` entries
            holding per-split metrics. The second element is ignored.
        mode: Metric used for ranking.
            'f1' scores only `cv['best_f1']` of every candidate.
            'balanced_accuracy' scores every result dict found in `cv`.

    Returns:
        List of `((i, c, result), mean_score)` with at most one entry per
        classifier, ordered as in CLASSIFIER_MODES. `result` is a copy whose
        'param' keys have their `step__` prefixes removed; `ms_result` itself
        is not modified.

    Raises:
        ValueError: If `mode` is neither 'f1' nor 'balanced_accuracy'.

    """
    # Validate first so a bad mode is reported before any data is touched.
    if mode not in ('f1', 'balanced_accuracy'):
        raise ValueError(f"Unknown mode: {mode}")

    candidates, _ = ms_result

    if mode == 'f1':
        # Only the pre-selected best-F1 result of each candidate is scored.
        scored = [((i, c, cv['best_f1']),
                   _mean_split_metric(cv['best_f1'], 'f1'))
                  for i, c, cv in candidates]
    else:
        # Every grid-search result of every candidate competes.
        scored = [((i, c, result),
                   _mean_split_metric(result, 'balanced_accuracy'))
                  for i, c, cv in candidates
                  for result in cv.values()]

    return _best_per_classifier(scored)

In [16]:
# Load the pickled model-selection output produced by an earlier run.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('./output/pval_filter_60_MVI/output_12to18_yesmental/results.pkl', 'rb') as results_file:
    model_selection_result = pickle.load(results_file)

In [17]:
# Rank candidates by mean F1 and keep the top one per classifier.
mode = "f1"
best_candidate_per_clf = parse_model_selection_result(
    ms_result=model_selection_result, mode=mode)
print(best_candidate_per_clf)

[((14, ('standard', 'knn', 'none', 'decisiontreeclassifier'), {'param': {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}, 'split_0': {'tn': 82.0, 'fp': 109.0, 'fn': 19.0, 'tp': 78.0, 'f1': 0.5492957746478873, 'balanced_accuracy': 0.6167215415339775}, 'split_0_train': {'tn': 317.0, 'fp': 447.0, 'fn': 91.0, 'tp': 296.0, 'f1': 0.5238938053097345, 'balanced_accuracy': 0.5898896735527686}, 'split_1': {'tn': 80.0, 'fp': 111.0, 'fn': 20.0, 'tp': 77.0, 'f1': 0.5403508771929825, 'balanced_accuracy': 0.6063313002644788}, 'split_1_train': {'tn': 319.0, 'fp': 445.0, 'fn': 90.0, 'tp': 297.0, 'f1': 0.5261293179805138, 'balanced_accuracy': 0.5924905637404115}, 'split_2': {'tn': 88.0, 'fp': 103.0, 'fn': 21.0, 'tp': 76.0, 'f1': 0.5507246376811594, 'balanced_accuracy': 0.6221190694661844}, 'split_2_train': {'tn': 311.0, 'fp': 453.0, 'fn': 89.0, 'tp': 298.0, 'f1': 0.523725834797891, 'balanced_accuracy': 0.5885469513102534}, 'spli

In [18]:
# Rank candidates by mean balanced accuracy and keep the top one per classifier.
mode = "balanced_accuracy"
best_candidate_per_clf = parse_model_selection_result(
    ms_result=model_selection_result, mode=mode)
print(best_candidate_per_clf)

[((168, ('robust', 'missforest', 'isolation_forest', 'decisiontreeclassifier'), {'param': {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 9, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'random'}, 'split_0': {'tn': 131.0, 'fp': 54.0, 'fn': 42.0, 'tp': 47.0, 'f1': 0.49473684210526314, 'balanced_accuracy': 0.6180989978742788}, 'split_0_train': {'tn': 491.0, 'fp': 248.0, 'fn': 131.0, 'tp': 223.0, 'f1': 0.5406060606060606, 'balanced_accuracy': 0.6471774347683157}, 'split_1': {'tn': 128.0, 'fp': 57.0, 'fn': 38.0, 'tp': 51.0, 'f1': 0.5177664974619289, 'balanced_accuracy': 0.6324627998785302}, 'split_1_train': {'tn': 496.0, 'fp': 243.0, 'fn': 137.0, 'tp': 217.0, 'f1': 0.5331695331695332, 'balanced_accuracy': 0.6420858084294703}, 'split_2': {'tn': 113.0, 'fp': 72.0, 'fn': 33.0, 'tp': 55.0, 'f1': 0.5116279069767442, 'balanced_accuracy': 0.6179054054054054}, 'split_2_train': {'tn': 536.0, 'fp': 203.0, 'fn': 146.0, 'tp': 209.0, 'f1': 0.544980443285528, 'balanced_accuracy':

In [19]:
# Tabulate the winners: column 0 is the (index, pipeline, result) tuple, column 1 the mean score.
pd.DataFrame(data=best_candidate_per_clf)

Unnamed: 0,0,1
0,"(168, (robust, missforest, isolation_forest, d...",0.624611
1,"(99, (minmax, iterative, none, gaussiannb), {'...",0.616634
2,"(79, (minmax, knn, none, multinomialnb), {'par...",0.617295
3,"(80, (minmax, knn, none, svc), {'param': {'C':...",0.624757
4,"(81, (minmax, knn, none, adaboostclassifier), ...",0.640268
5,"(54, (standard, missforest, lof, randomforestc...",0.63417
6,"(13, (standard, knn, lof, mlpclassifier), {'pa...",0.639745
