In [86]:
import sys

import os
import pickle
# import logging

import numpy as np
import pandas as pd
import click
from typing import Union
from sklearn.metrics import precision_score
import sklearn

from msap.modeling.model_evaluation.statistics import (
    get_embedded_data,
    get_selected_features,
    get_curve_metrics,
    get_curve_metrics_test,
    get_training_statistics,
    get_baseline_training_statistics,
    get_validation_statistics,
    get_baseline_validation_statistics,
    get_testing_statistics,
    get_baseline_testing_statistics,
    get_similarity_matrix)
from msap.explanatory_analysis import get_pairwise_correlation
from msap.utils import (
    ClassifierHandler,
    load_X_and_y,
    KFold_by_feature)
from msap.utils.plot import (
    plot_heatmap,
    plot_embedded_scatter,
    plot_rfe_line,
    plot_rfe_line_detailed,
    plot_curves,
    plot_confusion_matrix)
from msap.modeling.configs import (
    ModelSelectionConfig)


# Option lists for the analysis steps. Judging by their names these are
# presumably the accepted pairwise-correlation, embedding and curve methods
# -- confirm against the msap helpers imported above.
METHODS_PC = [
    'pearson',
    'spearman',
    'kendall',
]
METHODS_EMBEDDING = [
    'tsne',
    'pca',
]
METHODS_CURVE = [
    'pr',
    'roc',
]

# Classifier names. parse_model_selection_result() matches these against the
# fourth element of each candidate's combination and reports its results in
# this order.
CLASSIFIER_MODES = [
    'decisiontreeclassifier',
    'gaussiannb',
    'multinomialnb',
    'svc',
    'adaboostclassifier',
    'randomforestclassifier',
    'mlpclassifier',
]

In [87]:
def parse_model_selection_result(
        ms_result: tuple,
        classifier_modes: Union[list, None] = None) -> list:
    """Parse the model selection result tuple and get the best models.

    For every candidate, the F1 scores of its best grid-search entry are
    averaged over the cross-validation splits (NaN scores count as 0). The
    candidate with the highest mean F1 is then picked for each classifier.

    Args:
        ms_result: Model selection result tuple. Its first element is an
            iterable of `(index, combination, cv_result)` where
            `combination[3]` is the classifier name and `cv_result['best']`
            maps 'param' to a hyper-parameter dict (or None) and
            'split_<j>' to a dict holding that split's 'f1'.
        classifier_modes: Classifier names to report, in output order.
            Defaults to the module-level `CLASSIFIER_MODES`.

    Returns:
        List of `((index, combination, best_cv_result), mean_f1)` with at
        most one entry per classifier, ordered like `classifier_modes`.
        Classifiers without any candidate are omitted. In the returned
        `best_cv_result` the 'param' keys are reduced to the part after the
        last '__'; `ms_result` itself is left unmodified.

    """
    if classifier_modes is None:
        classifier_modes = CLASSIFIER_MODES

    candidates, _ = ms_result

    scored = []
    for i, c, cv in candidates:
        cv_best = cv['best']
        # Count the split entries explicitly instead of assuming that
        # 'param' is the only other key in the result dict.
        n_splits = sum(1 for k in cv_best if str(k).startswith('split_'))
        f1s = [cv_best[f'split_{j}']['f1'] for j in range(n_splits)]
        # A candidate without splits scores 0 (same policy as NaN scores);
        # np.mean([]) would yield NaN and make the ranking unreliable.
        f1_mean = np.mean(np.nan_to_num(f1s)) if f1s else np.float64(0.0)
        scored.append(((i, c, cv_best), f1_mean))

    # Highest mean F1 first; the sort is stable, so ties keep input order.
    scored.sort(key=lambda x: x[1], reverse=True)

    best_candidate_per_clf = []
    for clf in classifier_modes:
        for (i, c, cv_best), f1_mean in scored:
            if c[3] != clf:
                continue
            if cv_best['param'] is not None:
                # Build a new dict rather than rewriting cv_best in place,
                # so the caller's ms_result keeps its original param keys.
                cv_best = {
                    **cv_best,
                    'param': {k.split('__')[-1]: v
                              for k, v in cv_best['param'].items()}}

            best_candidate_per_clf.append(((i, c, cv_best), f1_mean))
            break  # first match is the best one for this classifier

    return best_candidate_per_clf

In [88]:
# Run configuration. Both input paths live inside the output directory of
# this run, so they are derived from it (the directory string ends in '/').
path_output_dir = './output/pval_filter_60_MVI/output_12to18_yesmental/'
path_input_model_selection_result = f'{path_output_dir}results.pkl'
path_input_preprocessed_data_dir = f'{path_output_dir}preprocessed'

# Name of the label column and the seed for this run.
feature_label = 'y12to18_Dep_YN_216m'
random_state = 42

In [89]:
# Create the output directory. makedirs(exist_ok=True) also creates missing
# parent directories (the path is nested several levels deep) and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(path_output_dir, exist_ok=True)

# NOTE: pickle.load can execute arbitrary code. Only load result files that
# were produced by a trusted model-selection run.
with open(path_input_model_selection_result, 'rb') as f:
    model_selection_result = pickle.load(f)

# Best candidate per classifier, each paired with its mean F1 score.
best_candidate_per_clf = parse_model_selection_result(
    model_selection_result)
if not best_candidate_per_clf:
    # max() on an empty list raises an unhelpful ValueError; say what is wrong.
    raise ValueError(
        'No candidate in the model selection result matches any known '
        'classifier.')

# Overall winner: the per-classifier best with the highest mean F1.
best_candidate = max(best_candidate_per_clf, key=lambda x: x[1])
_, best_combination, best_cv_result = best_candidate[0]
best_scale_mode, best_impute_mode, best_outlier_mode, best_clf \
    = best_combination

# Optional export of the per-classifier winners:
# pd.DataFrame(best_candidate_per_clf).to_csv(
#     f"{path_output_dir}/best_clfs.csv")

In [90]:
def parse_model_selection_grid_search_results_for_best(ms_result: tuple, best_combination: tuple) -> list:
    """Collect every grid-search result recorded for one combination.

    Unlike `parse_model_selection_result`, which keeps only the 'best' entry
    of each candidate, this returns all entries stored in the cv result of
    the candidates whose combination equals `best_combination`.

    Args:
        ms_result: Model selection result tuple. Its first element is an
            iterable of `(index, combination, cv_result)`; every value of
            `cv_result` maps 'param' to a hyper-parameter dict (or None)
            and 'split_<j>' to a dict holding that split's 'f1'.
        best_combination: Combination to select; compared with `!=`/`==`
            against each candidate's combination.

    Returns:
        List of `(((index, key), combination, result), mean_f1)`, one item
        per entry of the matching cv results, in iteration order (not
        sorted by score). `mean_f1` is the mean F1 over the splits with NaN
        counted as 0. In each returned `result` the 'param' keys are reduced
        to the part after the last '__'; `ms_result` itself is left
        unmodified.

    """
    candidates, _ = ms_result

    all_candidates_of_combination = []
    for i, c, cv in candidates:
        if c != best_combination:
            continue

        # Parse every grid search result stored for this candidate.
        for key, result in cv.items():
            # Count the split entries explicitly instead of assuming that
            # 'param' is the only other key in the result dict.
            n_splits = sum(1 for k in result if str(k).startswith('split_'))
            f1s = [result[f'split_{j}']['f1'] for j in range(n_splits)]
            # An entry without splits scores 0 (same policy as NaN scores)
            # instead of the NaN that np.mean([]) would produce.
            f1_mean = np.mean(np.nan_to_num(f1s)) if f1s else np.float64(0.0)

            if result['param'] is not None:
                # Keep only the parameter name (last part after '__').
                # A new dict is built so ms_result is not rewritten in place.
                result = {
                    **result,
                    'param': {k.split('__')[-1]: v
                              for k, v in result['param'].items()}}

            all_candidates_of_combination.append(
                (((i, key), c, result), f1_mean))

    return all_candidates_of_combination

In [91]:
# Every grid-search entry recorded for the winning combination, each paired
# with its mean F1 score.
all_grid_search_results_for_best_combination = (
    parse_model_selection_grid_search_results_for_best(
        ms_result=model_selection_result,
        best_combination=best_combination))

In [104]:
# Flatten the grid-search entries of the winning combination into parallel
# columns, one row per entry.
i_gridis, cs, params, splits, f1s = [], [], [], [], []
for ((cand_idx, grid_key), combo, grid_cv), mean_f1 in all_grid_search_results_for_best_combination:
    # Relies on 'param' being the only non-split key of each result dict.
    split_count = len(grid_cv) - 1
    i_gridis.append((cand_idx, grid_key))
    cs.append(combo)
    params.append(grid_cv['param'])
    splits.append([grid_cv[f'split_{k}'] for k in range(split_count)])
    f1s.append(mean_f1)

grids = dict(i_gridis=i_gridis, cs=cs, params=params, splits=splits, f1s=f1s)
df = pd.DataFrame(grids)

# Expand each hyper-parameter dict into one column per parameter and drop
# the raw 'params' column.
param_columns = df['params'].apply(pd.Series)
df = pd.concat([df, param_columns], axis=1).drop(columns='params')

# Optional export:
# df.to_csv(f"{path_output_dir}/best_clf_grid_search.csv")

Unnamed: 0,criterion,min_samples_leaf,min_samples_split,n_estimators,random_state
0,gini,1,2,25,42
1,gini,1,2,50,42
2,gini,1,2,75,42
3,gini,1,2,100,42
4,gini,1,2,125,42
...,...,...,...,...,...
996,entropy,9,10,425,42
997,entropy,9,10,450,42
998,entropy,9,10,475,42
999,entropy,9,10,500,42
