In [29]:
import pandas as pd
import numpy as np
import pickle

# Classifier names that the per-classifier summary is built for. The order of
# this list is the order of the rows returned by parse_model_selection_result,
# and each name is matched against the fourth element of a candidate's
# pipeline tuple.
CLASSIFIER_MODES = [
    'decisiontreeclassifier',
    'gaussiannb',
    'multinomialnb',
    'svc',
    'adaboostclassifier',
    'randomforestclassifier',
    'mlpclassifier',
]

def parse_model_selection_result(ms_result: tuple, mode: str,
                                 classifiers: list = None) -> list:
    """Parse the model selection result tuple and get the best models.

    Every candidate result is scored by the mean of its per-split metric
    (NaN split scores count as 0), and for each classifier the candidate with
    the highest mean is kept.

    Args:
        ms_result: Model selection result tuple. Its first element is an
            iterable of ``(index, pipeline, cv)`` triples and its second
            element is ignored. ``pipeline[3]`` is compared against the
            classifier names; ``cv`` maps a name to a result dict that holds
            a ``'param'`` entry and ``'split_<j>'`` entries.
        mode: Metric used for ranking. ``'f1'`` scores only the
            ``cv['best_f1']`` result of each candidate by its ``'f1'`` split
            values; ``'balanced_accuracy'`` scores every result stored in
            ``cv`` by its ``'balanced_accuracy'`` split values.
        classifiers: Classifier names to report, in output order. Defaults to
            the module-level ``CLASSIFIER_MODES``.

    Returns:
        List of ``((index, pipeline, result), mean_score)`` tuples, at most
        one per classifier, ordered like ``classifiers``. Classifiers without
        any candidate are skipped. In the returned ``result`` the ``'param'``
        keys are shortened to the part after the last ``'__'``; the dicts
        inside ``ms_result`` are left untouched.

    Raises:
        ValueError: If ``mode`` is neither ``'f1'`` nor
            ``'balanced_accuracy'``.

    """
    # Reject a bad mode before touching the data, so the error does not depend
    # on the shape of ``ms_result``.
    if mode not in ('f1', 'balanced_accuracy'):
        raise ValueError(f"Unknown mode: {mode}")
    if classifiers is None:
        classifiers = CLASSIFIER_MODES

    candidates, _ = ms_result

    if mode == 'f1':
        # Only the pre-selected best-F1 result of each candidate competes.
        scored = [((i, c, cv['best_f1']),
                   _mean_split_score(cv['best_f1'], 'f1'))
                  for i, c, cv in candidates]
    else:
        # Every result stored for a candidate competes on its own; 'best_f1'
        # does not have to be present in this mode.
        scored = [((i, c, result),
                   _mean_split_score(result, 'balanced_accuracy'))
                  for i, c, cv in candidates
                  for result in cv.values()]

    return _best_per_classifier(scored, classifiers)


def _mean_split_score(result: dict, metric: str) -> float:
    """Average ``metric`` over the splits of one result, treating NaN as 0."""
    # The split count is derived from the dict size exactly as before.
    # NOTE(review): presumably the result dict stores two entries per split
    # plus a few bookkeeping keys -- confirm against the code that writes it.
    n_splits = int(len(result) / 2) - 1
    scores = [result[f'split_{j}'][metric] for j in range(n_splits)]
    return np.mean(np.nan_to_num(scores))


def _with_short_param_names(result: dict) -> dict:
    """Return ``result`` with 'param' keys cut down to the text after '__'."""
    if result['param'] is None:
        return result
    short = {k.split('__')[-1]: v for k, v in result['param'].items()}
    # Shallow copy so the caller's model selection result is not modified.
    return {**result, 'param': short}


def _best_per_classifier(scored: list, classifiers: list) -> list:
    """Pick the highest-scoring ``((i, c, result), score)`` per classifier."""
    # sorted() is stable, so equally scored candidates keep their input order.
    ranked = sorted(scored, key=lambda item: item[1], reverse=True)

    best = []
    for clf in classifiers:
        for (i, c, result), score in ranked:
            if c[3] == clf:
                best.append(((i, c, _with_short_param_names(result)), score))
                break
    return best

In [30]:
# Location of the pickled model selection result analysed in this notebook.
RESULTS_PATH = './output/pval_filter_60_MVI/output_12to18_yesmental/results.pkl'

# NOTE(review): pickle.load can run arbitrary code while unpickling, so only
# point RESULTS_PATH at result files written by a trusted run of this project.
with open(RESULTS_PATH, 'rb') as f:
    model_selection_result = pickle.load(f)

In [31]:
# Rank the candidates by their mean per-split F1 and keep the best one for
# each classifier.
mode = "f1"
best_candidate_per_clf = parse_model_selection_result(
    ms_result=model_selection_result,
    mode=mode,
)

In [32]:
# One row per classifier: column 0 holds the (index, pipeline, result) tuple,
# column 1 the mean F1 it was ranked by.
pd.DataFrame(data=best_candidate_per_clf)

Unnamed: 0,0,1
0,"(14, (standard, knn, none, decisiontreeclassif...",0.529058
1,"(99, (minmax, iterative, none, gaussiannb), {'...",0.489107
2,"(79, (minmax, knn, none, multinomialnb), {'par...",0.510233
3,"(101, (minmax, iterative, none, svc), {'param'...",0.533829
4,"(81, (minmax, knn, none, adaboostclassifier), ...",0.544926
5,"(54, (standard, missforest, lof, randomforestc...",0.498169
6,"(146, (robust, knn, none, mlpclassifier), {'pa...",0.537369


In [33]:
# Rank the candidates by their mean per-split balanced accuracy and keep the
# best one for each classifier.
mode = "balanced_accuracy"
best_candidate_per_clf = parse_model_selection_result(
    ms_result=model_selection_result,
    mode=mode,
)

In [34]:
# One row per classifier: column 0 holds the (index, pipeline, result) tuple,
# column 1 the mean balanced accuracy it was ranked by.
pd.DataFrame(data=best_candidate_per_clf)

Unnamed: 0,0,1
0,"(168, (robust, missforest, isolation_forest, d...",0.624611
1,"(99, (minmax, iterative, none, gaussiannb), {'...",0.616634
2,"(79, (minmax, knn, none, multinomialnb), {'par...",0.617295
3,"(80, (minmax, knn, none, svc), {'param': {'C':...",0.624757
4,"(81, (minmax, knn, none, adaboostclassifier), ...",0.640268
5,"(54, (standard, missforest, lof, randomforestc...",0.63417
6,"(13, (standard, knn, lof, mlpclassifier), {'pa...",0.639745
