In [1]:
import json
from pathlib import Path
import pickle

from rich.progress import track

import pandas as pd

from sklearn.preprocessing import MaxAbsScaler

In [2]:
# Identifier of the experiment; it is embedded in the cross-validation
# directory name below and in the train/test dataset file names.
EXPERIMENT_NAME = "ablation-problem-class"

# Directory that parse_files() resolves the cross-validation folders against.
ANALYSIS_BASE = "./analysis/paper-data/"

# Feature-set switches: filter_dataframe_for_current_experiment() keeps only
# the rows whose `use_gradient` / `use_ewma` index levels equal these values.
INCLUDE_GRADIENTS = False
INCLUDE_EWMA = False

# Prefix of the per-seed cross-validation directories ("<prefix>-<seed>").
CROSS_VALIDATIONS_PATH = f"./cross-validations-{EXPERIMENT_NAME}"


In [3]:
def filter_dataframe_for_current_experiment(data_flattened):
    """
    Restrict the flattened results to the feature set of the current experiment.

    Keeps the rows whose third and fourth index levels equal
    ``INCLUDE_GRADIENTS`` and ``INCLUDE_EWMA`` respectively, removes those two
    levels from the index, and sorts what remains by ``percentage`` and
    ``model``. List values in ``hidden_layer_sizes`` are turned into tuples;
    every other value in that column is left untouched.

    Parameters
    ----------
    data_flattened : pd.DataFrame
        Frame with a four-level row index (the last two levels are the
        gradient and EWMA flags) and a ``hidden_layer_sizes`` column.

    Returns
    -------
    pd.DataFrame
        The filtered and sorted frame.
    """

    def float_or_list_to_tuple(value):
        # Presumably needed so the column is hashable for a later groupby
        # -- TODO confirm.
        return tuple(value) if isinstance(value, list) else value

    wanted_rows = (slice(None), slice(None), INCLUDE_GRADIENTS, INCLUDE_EWMA)
    df = (
        data_flattened.loc[wanted_rows, :]
        .droplevel([2, 3])
        .sort_values(by=['percentage', 'model'])
    )

    df["hidden_layer_sizes"] = df['hidden_layer_sizes'].apply(float_or_list_to_tuple)
    return df

In [4]:

def parse_files(cross_validations_path, seed):
    """
    Parse the cross-validation result files of one seed into a DataFrame.

    Every ``*.json`` file below ``ANALYSIS_BASE/<cross_validations_path>-<seed>``
    is expected to provide the keys ``original_target`` (path to a JSON file
    with the run description), ``f1_scores`` (one entry per fold) and
    ``model_path``. The run description is flattened with
    ``pd.json_normalize`` and repeated once per F1 score, so the result holds
    one row per fold.

    Parameters
    ----------
    cross_validations_path : str or Path
        Directory prefix; ``-<seed>`` is appended to it.
    seed : int
        Seed whose cross-validation directory should be read.

    Returns
    -------
    pd.DataFrame
        Rows for the current experiment as selected by
        ``filter_dataframe_for_current_experiment``. The top-level column
        groups ``k_fold`` and ``preprocessing`` are dropped and the remaining
        dotted column names are reduced to their last component.

    Raises
    ------
    FileNotFoundError
        If the directory contains no JSON files.
    """
    directory = Path(ANALYSIS_BASE).joinpath(f"{cross_validations_path}-{seed}")
    # Sorted so the row order does not depend on the file system's listing order.
    files = sorted(directory.glob("**/*.json"))
    if not files:
        raise FileNotFoundError(
            f"No cross-validation JSON files found below {directory}"
        )

    # Collect the per-file frames and concatenate once; growing a DataFrame
    # inside the loop copies all previous rows on every iteration.
    frames = []
    for file in track(
        files,
        description="Loading hyperparameters and "
        "performance data from file to DataFrame",
    ):
        with open(file) as f:
            data = json.load(f)

        original_target = data["original_target"]
        original_dict = json.loads(Path(original_target).read_text())

        f1_scores = pd.json_normalize(data, "f1_scores")
        normalized_data = pd.json_normalize(original_dict)
        # Repeat the run description once per fold so it lines up with f1_scores.
        normalized_data = pd.concat(
            [normalized_data] * len(f1_scores), ignore_index=True
        )
        normalized_data["metrics.f1_score"] = f1_scores
        normalized_data["paths.model_path"] = data["model_path"]
        frames.append(normalized_data)

    results = pd.concat(frames)

    results = results.set_index(
        ["percentage", "model", "use_gradient", "use_ewma"]
    )
    # "group.name" column labels become a two-level column index.
    results.columns = pd.MultiIndex.from_arrays(
        zip(*results.columns.str.split(".", expand=True))
    )
    results = results.sort_index(axis=1)
    results = results.drop(columns=['k_fold', 'preprocessing'])
    results = results.droplevel(0, axis=1)

    results = filter_dataframe_for_current_experiment(results)

    return results

In [5]:
def pick_best_hyperparameters_from_k_folds(k_folds_data, percentages=None):
    """
    Select, per (percentage, model), the hyperparameters with the best mean F1.

    The fold scores are grouped by percentage, model, every hyperparameter
    column and ``model_path``; the mean and standard deviation of ``f1_score``
    are computed per group, and the group with the highest mean is kept for
    each (percentage, model) pair.

    Parameters
    ----------
    k_folds_data : pd.DataFrame
        One row per fold, with ``percentage`` and ``model`` available as index
        levels or columns, plus the hyperparameter columns, ``model_path`` and
        ``f1_score``.
    percentages : iterable, optional
        Percentages to keep in the result. Defaults to ``[1, 5, 10, 15, 20]``.

    Returns
    -------
    pd.DataFrame
        Columns ``mean`` and ``std``, indexed by the grouping keys, restricted
        to the requested percentages and to the models RF, AdaBoost and DUM.
    """
    # The original only bound the list when `percentages` was None, so an
    # explicit argument raised UnboundLocalError and was never honoured.
    if percentages is None:
        percentages = [1, 5, 10, 15, 20]

    models = ['RF', 'AdaBoost', 'DUM']
    filter_models = True

    # This is a bit hacky: model_path is part of the keys only because it is
    # always the same within a group, so it has no effect on the grouping.
    # Would be better to `df.merge` though.
    group_keys = [
        'percentage', 'model', 'C', 'hidden_layer_sizes', 'kernel',
        'learning_rate', 'max_depth', 'n_estimators', 'alpha', 'model_path',
    ]
    # dropna=False keeps groups whose unused hyperparameters are NaN.
    summary = (
        k_folds_data.groupby(group_keys, dropna=False)['f1_score']
        .agg(['mean', 'std'])
    )

    # Only the mean decides the winner; taking idxmax on that column alone
    # avoids evaluating it on `std`, which is all-NaN for single-fold groups.
    best_index = summary['mean'].groupby(level=['percentage', 'model']).idxmax()
    best_hyperparams = summary.loc[best_index, :]

    keep = best_hyperparams.index.get_level_values('percentage').isin(percentages)
    best_models_per_percentage_and_type = best_hyperparams[keep]
    if filter_models:
        keep = best_models_per_percentage_and_type.index.get_level_values('model').isin(models)
        best_models_per_percentage_and_type = best_models_per_percentage_and_type[keep]
    return best_models_per_percentage_and_type



In [6]:
def load_classifiers_from_best_models(best_models_per_percentage_and_type):
    """
    Unpickle the model file referenced by every row of the best-models table.

    Parameters
    ----------
    best_models_per_percentage_and_type : pd.DataFrame
        Table that exposes ``percentage``, ``model`` and ``model_path`` after
        ``reset_index()`` (as index levels or as columns).

    Returns
    -------
    dict
        Maps ``(percentage, model)`` to the object unpickled from the row's
        ``model_path``. If a key occurs more than once, the last row wins.
    """
    flat = best_models_per_percentage_and_type.reset_index()
    selection = zip(flat['percentage'], flat['model'], flat['model_path'])

    classifiers = {}
    for percentage, model, model_path in selection:
        # pickle.load can execute arbitrary code; the paths are expected to
        # point at files written by this project's own training runs.
        with open(model_path, 'rb') as handle:
            loaded = pickle.load(handle)
        classifiers[(percentage, model)] = loaded
    return classifiers

In [7]:
# The train/test datasets depend only on the experiment, not on the seed, so
# they are loaded once up front (this also fails fast on a wrong path before
# any cross-validation file is parsed). Every seed entry below references the
# same two objects: copy them before mutating in place.
# NOTE: the train path was previously missing its f-string prefix and pointed
# at the literal file name "{EXPERIMENT_NAME}.pkl_train.pkl".
TEST_DATASET_PATH = Path(ANALYSIS_BASE) / f"{EXPERIMENT_NAME}.pkl_test.pkl"
TRAIN_DATASET_PATH = Path(ANALYSIS_BASE) / f"{EXPERIMENT_NAME}.pkl_train.pkl"

# pickle.load can execute arbitrary code; only load files produced by this project.
with open(TEST_DATASET_PATH, 'rb') as f:
    TEST_DATASET = pickle.load(f)
with open(TRAIN_DATASET_PATH, 'rb') as f:
    TRAIN_DATASET = pickle.load(f)

# One entry per seed (1..10): the parsed k-fold results, the best
# hyperparameters per (percentage, model), the corresponding unpickled
# classifiers and the shared train/test datasets.
data_for_all_seeds = []
for seed in range(1, 11):
    data = parse_files(Path(CROSS_VALIDATIONS_PATH), seed=seed)
    best_models = pick_best_hyperparameters_from_k_folds(data)

    current_seed = {
        'seed': seed,
        'k_fold': data,
        'best_models': best_models,
        'classifiers': load_classifiers_from_best_models(best_models),
        'train_data': TRAIN_DATASET,
        'test_data': TEST_DATASET,
    }
    data_for_all_seeds.append(current_seed)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

## Utility

In [None]:
def combine_seed_data_into_single_dataframe_opinionated(total_data):
    """
    Print the ``best_models`` entry of every seed record, in order.

    NOTE(review): despite its name, this function does not combine anything
    yet; it only prints each table and returns ``None``.

    Parameters
    ----------
    total_data : iterable of dict
        Seed records that each provide a ``'best_models'`` key.
    """
    for best_models in (record['best_models'] for record in total_data):
        print(best_models)

In [8]:
def preprocess(dataframe, scaler=None):
    """
    Scale the feature columns of a dataset and re-attach the ``mzn`` column.

    The ``mzn`` and ``dzn`` columns are removed from the features, as is every
    column whose name contains ``ewma`` or ``gradient``.

    * Without ``scaler``: columns holding a single distinct value are dropped
      and a new ``MaxAbsScaler`` is fitted on the remaining columns.
    * With ``scaler``: the features are aligned to ``scaler.feature_names_in_``
      (same columns, same order). Columns the scaler was not fitted on are
      discarded, so a column that was constant when the scaler was fitted but
      varies here no longer breaks ``transform``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataset containing the ``mzn`` and ``dzn`` columns plus the features.
    scaler : fitted scaler, optional
        Object exposing ``feature_names_in_`` and ``transform``. When omitted,
        a ``MaxAbsScaler`` is fitted on ``dataframe``.

    Returns
    -------
    (pd.DataFrame, scaler)
        The scaled features with the original ``mzn`` column appended, and the
        scaler that was used (the fitted one when none was passed in).

    Raises
    ------
    ValueError
        If ``dataframe`` lacks a feature the given scaler was fitted on.
    """
    mzn = dataframe['mzn']
    result = dataframe.drop(columns=['mzn', 'dzn'])

    derived_columns = result.columns[result.columns.str.contains('ewma|gradient')]
    result = result.drop(columns=derived_columns)

    if scaler is None:
        # Constant columns carry no information for the models.
        constant_columns = result.columns[result.nunique() == 1]
        result = result.drop(columns=constant_columns)
        scaler = MaxAbsScaler().fit(result)
    else:
        features_in = list(scaler.feature_names_in_)
        missing = [name for name in features_in if name not in result.columns]
        if missing:
            raise ValueError(
                f"Features the scaler was fitted on are missing from the data: {missing}"
            )
        # Use exactly the fitted features, in the fitted order.
        result = result.loc[:, features_in]

    result = pd.DataFrame(scaler.transform(result), columns=result.columns, index=result.index)

    result['mzn'] = mzn

    return result, scaler




In [None]:
def 

# Now here we can split off: <insert your analysis here>.