In [1]:
import json
from pathlib import Path
import pickle

from rich.progress import track

import pandas as pd

In [2]:
# Name of the experiment; it is embedded in the cross-validation directory
# name below and in the train/test dataset pickle file names.
EXPERIMENT_NAME = "base-experiment"
# Directory under which the per-seed cross-validation folders are looked up.
ANALYSIS_BASE = "./analysis/paper-data/"
# Values selected on the `use_ewma` / `use_gradient` index levels when the
# cross-validation results are filtered for the current experiment.
INCLUDE_EWMA = False
INCLUDE_GRADIENTS = False

# Prefix of the per-seed cross-validation directories ("-<seed>" is appended
# when the files are parsed).
CROSS_VALIDATIONS_PATH = f"./cross-validations-{EXPERIMENT_NAME}"


In [3]:
def filter_dataframe_for_current_experiment(data_flattened, include_gradients=None, include_ewma=None):
    """
    Keep only the rows that belong to one gradient/EWMA configuration.

    Parameters
    ----------
    data_flattened : pd.DataFrame
        Frame whose index has four levels; level 2 holds the gradient flag and
        level 3 holds the EWMA flag. The first two levels must be named
        'percentage' and 'model', and a 'hidden_layer_sizes' column must exist.
    include_gradients, include_ewma : bool, optional
        Values to select on index levels 2 and 3. When omitted, the
        module-level ``INCLUDE_GRADIENTS`` / ``INCLUDE_EWMA`` settings are used,
        which matches the previous behaviour.

    Returns
    -------
    pd.DataFrame
        The selected rows, indexed by the first two levels only, sorted by
        'percentage' then 'model', with list-valued 'hidden_layer_sizes'
        entries converted to tuples.
    """
    if include_gradients is None:
        include_gradients = INCLUDE_GRADIENTS
    if include_ewma is None:
        include_ewma = INCLUDE_EWMA

    def float_or_list_to_tuple(x):
        # Lists are unhashable and therefore cannot serve as group-by keys;
        # every other value (e.g. NaN) is passed through untouched.
        if isinstance(x, list):
            return tuple(x)
        return x

    # Work on an explicit copy so the index/column assignments below never
    # touch (or warn about touching) the caller's frame.
    selected = data_flattened.loc[
        (slice(None), slice(None), include_gradients, include_ewma), :
    ].copy()
    # Slicing with a tuple keeps all four levels; the two flag levels are
    # constant after the selection, so drop them.
    selected.index = selected.index.droplevel([2, 3])
    selected = selected.sort_values(by=['percentage', 'model'])

    selected["hidden_layer_sizes"] = selected['hidden_layer_sizes'].apply(float_or_list_to_tuple)
    return selected

In [4]:

def parse_files(cross_validations_path, seed):
    """
    Load every cross-validation result of one seed into a single DataFrame.

    Each ``*.json`` file found (recursively) under
    ``ANALYSIS_BASE/<cross_validations_path>-<seed>`` must contain the keys
    ``original_target`` (path to a JSON file that is flattened with
    ``pd.json_normalize``), ``f1_scores`` and ``model_path``. The flattened
    record is repeated once per F1 score so that every row carries one score.

    Parameters
    ----------
    cross_validations_path : str or Path
        Directory prefix; ``-<seed>`` is appended to it.
    seed : int
        Seed suffix of the directory to read.

    Returns
    -------
    pd.DataFrame
        Result of ``filter_dataframe_for_current_experiment`` applied to the
        combined frame, after the 'k_fold' and 'preprocessing' column groups
        and the first level of the dotted column names have been removed.

    Raises
    ------
    FileNotFoundError
        If the seed directory contains no JSON files.
    """
    seed_directory = Path(ANALYSIS_BASE).joinpath(f"{cross_validations_path}-{seed}")
    files = list(seed_directory.glob("**/*.json"))
    if not files:
        # Without this guard the failure would surface later as an opaque
        # KeyError from `set_index` on an empty frame.
        raise FileNotFoundError(f"No cross-validation JSON files found in {seed_directory}")

    # Collect the per-file frames and concatenate once; concatenating inside
    # the loop copies the accumulated data on every iteration.
    frames = []
    for file in track(
        files,
        description="Loading hyperparameters and "
        "performance data from file to DataFrame",
    ):
        # Only the read needs the handle; release it before further processing.
        with open(file) as f:
            data = json.load(f)

        original_target = data["original_target"]
        original_dict = json.loads(Path(original_target).read_text())

        f1_scores = pd.json_normalize(data, "f1_scores")
        normalized_data = pd.json_normalize(original_dict)
        # One row per F1 score, each carrying the same flattened settings.
        normalized_data = pd.concat(
            [normalized_data] * len(f1_scores), ignore_index=True
        )
        normalized_data["metrics.f1_score"] = f1_scores
        normalized_data["paths.model_path"] = data["model_path"]
        frames.append(normalized_data)

    results = pd.concat(frames)

    results = results.set_index(
        ["percentage", "model", "use_gradient", "use_ewma"]
    )
    # Split the dotted column names ("group.name") into a two-level header so
    # whole groups can be dropped by their prefix.
    results.columns = pd.MultiIndex.from_arrays(
        zip(*results.columns.str.split(".", expand=True))
    )
    results = results.sort_index(axis=1)
    results = results.drop(columns=['k_fold', 'preprocessing'])
    results = results.droplevel(0, axis=1)

    results = filter_dataframe_for_current_experiment(results)

    return results

In [5]:
def pick_best_hyperparameters_from_k_folds(k_folds_data, percentages=None, models=None, filter_models=True):
    """
    Select, per (percentage, model), the hyperparameter set with the best mean F1.

    Parameters
    ----------
    k_folds_data : pd.DataFrame
        One row per fold score. 'percentage', 'model', the hyperparameter
        names listed in ``group_keys`` below and 'model_path' must be available
        as columns or index levels, together with an 'f1_score' column.
    percentages : iterable, optional
        Percentages to keep. Defaults to ``[2, 5, 10, 15, 20, 25, 30, 35, 40]``.
    models : iterable, optional
        Model names to keep when ``filter_models`` is true. Defaults to
        ``['RF', 'DUM', 'ET', 'AdaBoost']``.
    filter_models : bool, default True
        Whether to restrict the result to ``models``.

    Returns
    -------
    pd.DataFrame
        Columns 'mean' and 'std' of the fold F1 scores, indexed by the full
        hyperparameter combination (including 'model_path') of each winner.
    """
    if percentages is None:
        percentages = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    if models is None:
        models = ['RF', 'DUM', 'ET', 'AdaBoost']

    # 'model_path' is part of the key only so that it survives the
    # aggregation: per the original author it is identical within a
    # hyperparameter combination, so it does not split any group.
    # A `df.merge` afterwards would be the cleaner alternative.
    group_keys = ['percentage', 'model', 'C', 'hidden_layer_sizes', 'kernel',
                  'learning_rate', 'max_depth', 'n_estimators', 'alpha', 'model_path']
    # dropna=False: hyperparameters that do not apply to a model are NaN and
    # those rows must not be discarded.
    summary = k_folds_data.groupby(group_keys, dropna=False)['f1_score'].agg(['mean', 'std'])

    # Only the mean decides the winner. Restricting idxmax to that column
    # avoids evaluating it on 'std', which is all-NaN for groups with a single
    # score and makes recent pandas versions warn or raise.
    best_index = summary['mean'].groupby(level=['percentage', 'model']).idxmax()
    best_hyperparams = summary.loc[best_index, :]

    selected = best_hyperparams[
        best_hyperparams.index.get_level_values('percentage').isin(percentages)
    ]
    if filter_models:
        selected = selected[selected.index.get_level_values('model').isin(models)]
    return selected



In [6]:
def load_classifiers_from_best_models(best_models_per_percentage_and_type):
    """
    Unpickle the model file of every row in the best-models table.

    The frame's index is flattened first, so 'percentage', 'model' and
    'model_path' may be index levels or columns.

    Returns a dict mapping ``(percentage, model)`` to the unpickled object.
    NOTE: ``pickle.load`` executes arbitrary code from the file; only use this
    on model files you produced yourself.
    """
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    flattened = best_models_per_percentage_and_type.reset_index()
    return {
        (record['percentage'], record['model']): _unpickle(record['model_path'])
        for _, record in flattened.iterrows()
    }

In [7]:
# Build one record per seed (1..10) holding the parsed k-fold results, the
# best hyperparameters, the unpickled classifiers and the train/test datasets.
data_for_all_seeds = []
for i in range(1, 11):

    current_seed = {
        'seed': i
    }

    data = parse_files(Path(CROSS_VALIDATIONS_PATH), seed=i)
    current_seed['k_fold'] = data

    current_seed['best_models'] = pick_best_hyperparameters_from_k_folds(data)
    current_seed['classifiers'] = load_classifiers_from_best_models(current_seed['best_models'])

    # The dataset pickles live in the same directory as the cross-validation
    # folders, so derive their paths from ANALYSIS_BASE instead of repeating
    # the literal directory.
    # NOTE: pickle.load runs arbitrary code; these must be trusted local files.
    TEST_DATASET_PATH = Path(ANALYSIS_BASE) / f'{EXPERIMENT_NAME}-{i}.pkl_test.pkl'
    with open(TEST_DATASET_PATH, 'rb') as f:
        TEST_DATASET = pickle.load(f)

    TRAIN_DATASET_PATH = Path(ANALYSIS_BASE) / f'{EXPERIMENT_NAME}-{i}.pkl_train.pkl'
    with open(TRAIN_DATASET_PATH, 'rb') as f:
        TRAIN_DATASET = pickle.load(f)

    current_seed['train_data'] = TRAIN_DATASET
    current_seed['test_data'] = TEST_DATASET

    data_for_all_seeds.append(current_seed)

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [12]:
# Show the column names of the entry stored under key 1 of the first seed's
# training data, to see which features are available.
print(data_for_all_seeds[0]['train_data'][1].columns)

Index(['conflicts', 'ewma_conflicts', 'decisions', 'search_iterations',
       'opennodes', 'ewma_opennodes', 'vars', 'back_jumps', 'ewma_back_jumps',
       'solutions', 'total_time', 'search_time', 'intVars', 'propagations',
       'sat_propagations', 'ewma_propagations', 'propagators', 'boolVars',
       'learnt', 'bin', 'tern', 'long', 'peak_depth', 'decision_level_engine',
       'ewma_decision_level_engine', 'decision_level_treesize', 'clause_mem',
       'prop_mem', 'log_of_unassn_var', 'frac_prop_vars', 'freq_backjumps',
       'frac_bool_vars', 'frac_long_clauses', 'log_of_frac_unassign_var',
       'log_of_fraction_of_failures_versus_unassigned', 'mzn', 'dzn',
       'solved_within_time_limit', 'has_gradients'],
      dtype='object')


# Utility

In [15]:
# from sklearn.preprocessing import MaxAbsScaler

def preprocess(dataframe, scaler=None):
    """
    Reduce a feature frame to the columns used for this experiment.

    Removes, in order:
      1. the 'mzn' and 'dzn' columns,
      2. every column whose name contains 'ewma' or 'gradient',
      3. every remaining column that holds exactly one distinct value.

    ``scaler`` is accepted but currently unused (scaling is disabled).
    The input frame is not modified; a new frame is returned.
    """
    without_identifiers = dataframe.drop(columns=['mzn', 'dzn'])

    dynamic_feature_names = without_identifiers.columns[
        without_identifiers.columns.str.contains('ewma|gradient')
    ]
    static_features = without_identifiers.drop(columns=dynamic_feature_names)

    # nunique() ignores NaN, so an all-NaN column counts 0 values and is kept.
    constant_feature_names = static_features.columns[static_features.nunique() == 1]
    return static_features.drop(columns=constant_feature_names)




In [16]:
def combine_seed_data_into_single_dataframe_opinionated(total_data):
    """
    Stack the 'best_models' frame of every seed into one DataFrame.

    Each entry of ``total_data`` must provide the keys 'best_models' (a
    DataFrame) and 'seed'. A 'seed' column carrying the entry's seed is added
    to a copy of each frame before concatenation; the inputs stay untouched.
    """
    return pd.concat(
        [entry['best_models'].assign(seed=entry['seed']) for entry in total_data]
    )

# Now here we can split off: are the models gaining anything from the dynamic features?

In [18]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import numpy as np

In [19]:
from collections import defaultdict
from sklearn.base import clone
def fit_and_test_model(test_data, train_data, instantiated_model):
    """
    Refit a fresh clone of a model on the training data and score it on the test data.

    Parameters
    ----------
    test_data, train_data : pd.DataFrame
        Must contain the label column 'solved_within_time_limit' plus the
        columns expected by ``preprocess``.
    instantiated_model : estimator
        Only its configuration is used; it is cloned, so the passed object is
        never fitted or modified.

    Returns
    -------
    float
        Binary F1 score of the clone's predictions on the test data.
    """
    test_x, test_y = test_data.drop(['solved_within_time_limit'], axis=1), test_data['solved_within_time_limit']
    train_x, train_y = train_data.drop(['solved_within_time_limit'], axis=1), train_data['solved_within_time_limit']

    train_x = preprocess(train_x)
    # `preprocess` drops constant columns, and which columns are constant
    # depends on the rows it is given. Preprocessing the test set on its own
    # could therefore produce a different feature set than the one the model
    # is fitted on. Since `preprocess` only removes columns, take exactly the
    # training features (in the same order) from the raw test frame.
    # NOTE(review): revisit if `preprocess` ever starts transforming values.
    test_x = test_x[train_x.columns]

    model = clone(instantiated_model)
    model.fit(train_x, train_y)

    # Predict on the test set using the features selected from the training data
    y_pred = model.predict(test_x)

    # Compute the F1 score
    return f1_score(test_y, y_pred, average='binary')



def run_best_models_on_increasing_amounts_of_data(all_data, start=100, step=50):
    """
    Evaluate every seed's best models on growing prefixes of the training data.

    For each seed and each ``(percentage, model)`` classifier, the model is
    refitted on the first ``length`` rows of that percentage's training data
    for ``length`` in ``range(start, len(train_data), step)`` and scored on the
    matching test data. The full training set length itself is not evaluated
    because the range end is exclusive.

    Parameters
    ----------
    all_data : iterable of dict
        Entries providing 'test_data', 'train_data' (both indexable by
        percentage) and 'classifiers' (dict keyed by ``(percentage, model)``).
    start : int, default 100
        Smallest training prefix length.
    step : int, default 50
        Increment between prefix lengths.

    Returns
    -------
    pd.DataFrame
        One row per (percentage, model, length) with the 'mean' and 'std' of
        the F1 scores collected across seeds.
    """
    scores_by_config = defaultdict(list)
    for seed in all_data:
        test_data = seed['test_data']
        train_data = seed['train_data']
        for (percentage, model), instantiated_model in seed['classifiers'].items():
            train_data_for_percentage = train_data[percentage]

            for length in range(start, len(train_data_for_percentage), step):
                # Named `score` rather than `f1_score` so the imported metric
                # function is not shadowed inside this function.
                score = fit_and_test_model(
                    test_data=test_data[percentage],
                    train_data=train_data_for_percentage[:length],
                    instantiated_model=instantiated_model,
                )
                scores_by_config[(percentage, model, length)].append(score)

    return pd.DataFrame.from_records([
        {
            'percentage': percentage,
            'model': model,
            'length': length,
            'mean': np.mean(scores),
            'std': np.std(scores),
        }
        for (percentage, model, length), scores in scores_by_config.items()
    ])
            

In [14]:
# Refit and score the best models of every seed on increasing amounts of
# training data. The cell defining `preprocess` must have been run first;
# the captured NameError below stems from executing this cell before it.
f1s = run_best_models_on_increasing_amounts_of_data(data_for_all_seeds)


NameError: name 'preprocess' is not defined

In [None]:
# Display the mean/std F1 table, one row per (percentage, model, length).
f1s

In [None]:
def plot_results(results, model='AdaBoost'):
    """
    Plot mean F1 (with std error bars and band) against training-set size.

    One curve is drawn per distinct value in ``results['percentage']``,
    restricted to the rows of the requested model.

    Parameters
    ----------
    results : pd.DataFrame
        Needs the columns 'model', 'percentage', 'length', 'mean' and 'std'.
    model : str, default 'AdaBoost'
        Model whose rows are plotted; also used in the title and y-axis label.
    """
    plt.figure(figsize=(12, 8))

    # +1 so the largest evaluated length also gets a tick (range end is exclusive).
    ticks = range(100, max(results['length']) + 1, 50)

    for percentage in results['percentage'].unique():
        results_for_model = results[(results['model'] == model) & (results['percentage'] == percentage)]

        f1_scores_mean = np.array(results_for_model['mean'])
        f1_scores_std = np.array(results_for_model['std'])
        lengths = results_for_model['length']
        # NOTE(review): the legend shows percentage/2 — confirm this scaling is intended.
        plt.errorbar(lengths, f1_scores_mean, yerr=f1_scores_std, fmt='o-', capsize=5, label=f'{percentage/2}')
        plt.fill_between(lengths,
                         f1_scores_mean - f1_scores_std,
                         f1_scores_mean + f1_scores_std,
                         alpha=0.1)

    plt.title(f'F1 Score of the best {model} model for each percentage problems when increasing the dataset, averaged over 10 testsets of <unseen> problems')
    plt.xlabel('Amount of datapoints')
    # The label previously always said "RF" regardless of the plotted model.
    plt.ylabel(f'Average F1 Score of the best {model} model')
    plt.grid(True)
    plt.ylim(0, 1)
    plt.xticks(ticks)
    plt.legend()
    plt.show()
# Draw the F1-versus-training-size curves from the table computed above.
plot_results(f1s)

## Follow-up question: are the failing problems the same?

In [22]:
from collections import defaultdict
from sklearn.base import clone

def return_correctly_predicted_instances_for_fitted_model(seed, test_data, train_data, instantiated_model):
    """
    Refit a clone of the model and report the test instances it gets wrong.

    Despite the function name, the returned set holds the *incorrectly*
    predicted instances.

    Parameters
    ----------
    seed : any
        Identifier that is prefixed to every returned string.
    test_data, train_data : pd.DataFrame
        Must contain 'solved_within_time_limit', 'mzn' and 'dzn' plus the
        feature columns expected by ``preprocess``.
    instantiated_model : estimator
        Cloned before fitting; the passed object is not modified.

    Returns
    -------
    set of str
        ``"<seed> - <mzn> - <dzn>"`` for every misclassified test row.
    """
    test_x, test_y = test_data.drop(['solved_within_time_limit'], axis=1), test_data['solved_within_time_limit']
    train_x, train_y = train_data.drop(['solved_within_time_limit'], axis=1), train_data['solved_within_time_limit']

    train_x_preprocessed = preprocess(train_x)
    # `preprocess` drops constant columns based on the rows it sees, so
    # preprocessing the test set separately could yield different features
    # than the model was fitted on. `preprocess` only removes columns, hence
    # the training features can be taken directly from the raw test frame.
    # NOTE(review): revisit if `preprocess` ever starts transforming values.
    test_x_preprocessed = test_x[train_x_preprocessed.columns]

    model = clone(instantiated_model)
    model.fit(train_x_preprocessed, train_y)

    # Predict on the test set using the features selected from the training data
    y_pred = model.predict(test_x_preprocessed)

    is_wrong = ~(test_y == y_pred)
    wrong_instances = test_x.loc[is_wrong, ['mzn', 'dzn']]
    # Access the identifiers by column label; positional access on a
    # label-indexed row (`a[0]`) is deprecated in pandas.
    return {
        f"{seed} - {mzn} - {dzn}"
        for mzn, dzn in zip(wrong_instances['mzn'], wrong_instances['dzn'])
    }
    



def find_incorrectly_predicted_answers(all_data):
    """
    Gather the misclassified test instances for every model and training size.

    For each seed (identified by its position in ``all_data``) and each
    ``(percentage, model)`` classifier, the model is refitted on the first
    ``length`` training rows for ``length`` in ``range(100, len(train), 50)``.
    The instances it misclassifies are merged across seeds.

    Returns a ``defaultdict(set)`` keyed by ``(percentage, model, length)``.
    """
    misclassified = defaultdict(set)
    for seed_position, seed_entry in enumerate(all_data):
        test_by_percentage = seed_entry['test_data']
        train_by_percentage = seed_entry['train_data']
        for (percentage, model), instantiated_model in seed_entry['classifiers'].items():
            available_rows = len(train_by_percentage[percentage])

            for length in range(100, available_rows, 50):
                wrong_for_run = return_correctly_predicted_instances_for_fitted_model(
                    seed=seed_position,
                    test_data=test_by_percentage[percentage],
                    train_data=train_by_percentage[percentage][:length],
                    instantiated_model=instantiated_model,
                )
                misclassified[(percentage, model, length)].update(wrong_for_run)
    return misclassified
            

In [None]:
# Collect the misclassified test instances of all seeds, keyed by
# (percentage, model, training length).
wrongly_corrected_answers = find_incorrectly_predicted_answers(data_for_all_seeds)

In [None]:
# Misclassified instances for the 'ET' model at percentage 2, fitted on the
# first 100 training rows.
first = wrongly_corrected_answers[(2, 'ET', 100)]
first

In [None]:
# Misclassified instances for the 'ET' model at percentage 40, fitted on the
# first 100 training rows.
second = wrongly_corrected_answers[(40, 'ET', 100)]
second

In [None]:
# Number of misclassified instances in each of the two sets.
len(first), len(second)

In [None]:
# Number of instances misclassified in both configurations.
len(first.intersection(second))