## Cross-Validation Scores

In [1]:
import pandas as pd
import os

# Datasets whose result files are read for the "ID-Based Training" runs.
dataset_names_id_based = [
    'air',
    'bakery',
    'm5',
    'wage',
    'yaz',
]

# Datasets whose result files are read for the "Full Data Training" runs.
dataset_names_full_data = [
    'air_random_10',
    'bakery_random_50',
    'm5_random_30',
    'wage',
    'yaz',
]

def process_dataset(name, training_type, model_type,
                    results_dir='/workspaces/Masterthesis-DRF/results/results_by_file'):
    """Load and stack the cross-validation score files of one dataset.

    Two CSV files are read from ``results_dir``: the DRF CV scores and the CV
    scores of ``model_type`` for dataset ``name``. For ``"levelset_models"``
    the model scores are first reshaped from one row per fold (one column per
    tau) into one row per tau with one ``split<fold>_test_score`` column per
    fold. The model scores and the DRF scores are then concatenated row-wise.

    Parameters
    ----------
    name : str
        Dataset identifier used in the file names.
    training_type : str
        ``"Full Data Training"`` selects the files prefixed with
        ``FULLDATASET_``; any other value selects the unprefixed files. The
        value is also stored in the ``training_description`` column.
    model_type : str
        Model family used in the file name; ``"levelset_models"`` triggers the
        reshape, any other value is concatenated as read. Stored in the
        ``model_type`` column.
    results_dir : str, optional
        Directory containing the result CSV files.

    Returns
    -------
    pandas.DataFrame or None
        The combined scores with timing/summary columns removed and the
        ``dataset``, ``training_description`` and ``model_type`` columns added,
        or ``None`` if either of the two input files does not exist.
    """
    prefix = "FULLDATASET_" if training_type == "Full Data Training" else ""
    cv_drf_scores_path = os.path.join(results_dir, f'{prefix}cv_drf_scores_{name}.csv')
    cv_scores_model_path = os.path.join(results_dir, f'{prefix}cv_scores_{model_type}_{name}.csv')

    # Both files are needed; a dataset with only one of them is skipped.
    if not (os.path.exists(cv_drf_scores_path) and os.path.exists(cv_scores_model_path)):
        return None

    drf_scores = pd.read_csv(cv_drf_scores_path)
    model_scores = pd.read_csv(cv_scores_model_path)

    if model_type == "levelset_models":
        id_columns = ['binSize', 'weightsByDistance', 'fold', 'model_name', 'cu', 'co', 'variable', 'dataset_name']
        tau_columns = ['0.9', '0.75', '0.5', '0.25', '0.1']
        index_columns = ['binSize', 'weightsByDistance', 'model_name', 'cu', 'co', 'variable', 'tau']

        # Make sure melt() finds every column it is asked for.
        # NOTE(review): a column created here is entirely missing-valued;
        # confirm pivot_table keeps the rows you expect in that case.
        for col in id_columns + tau_columns:
            if col not in model_scores.columns:
                model_scores[col] = None

        # Long format: one row per (configuration, fold, tau).
        long_scores = model_scores.melt(
            id_vars=id_columns,
            value_vars=tau_columns,
            var_name='tau',
            value_name='split_test_score'
        )

        # Wide format: one row per (configuration, tau), one column per fold.
        wide_scores = long_scores.pivot_table(
            index=index_columns,
            columns='fold',
            values='split_test_score'
        ).reset_index()

        # After reset_index the index columns come first, followed by one
        # column per fold label; only the fold columns are renamed.
        n_index = len(index_columns)
        wide_scores.columns = (
            list(wide_scores.columns[:n_index])
            + [f"split{fold}_test_score" for fold in wide_scores.columns[n_index:]]
        )
        model_scores = wide_scores

    combined_df = pd.concat([model_scores, drf_scores], join='outer', ignore_index=True)

    columns_to_drop = [
        'mean_score_time', 'std_score_time', 'std_test_score',
        'rank_test_score', 'mean_fit_time', 'std_fit_time', "mean_test_score"
    ]

    # errors='ignore': not every input file carries all of these columns.
    combined_df = combined_df.drop(columns=columns_to_drop, errors='ignore')
    combined_df['dataset'] = name
    combined_df['training_description'] = training_type
    combined_df['model_type'] = model_type

    return combined_df

# Collect one combined CV-score frame per (training regime, dataset, model family).
# Order is kept: all ID-based datasets first, then all full-data datasets.
processed_dfs = []
training_runs = [
    ("ID-Based Training", dataset_names_id_based),
    ("Full Data Training", dataset_names_full_data),
]

for training_type, dataset_names in training_runs:
    for name in dataset_names:
        for model_type in ['levelset_models', 'basic_models']:
            processed_df = process_dataset(name, training_type=training_type, model_type=model_type)
            # process_dataset returns None when a result file is missing.
            if processed_df is not None:
                processed_dfs.append(processed_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
final_combined_df = pd.concat(processed_dfs, ignore_index=True) if processed_dfs else pd.DataFrame()

# errors='ignore': without it this raises KeyError when no 'params' column
# exists, e.g. for the empty fallback frame above.
final_combined_df = final_combined_df.drop(columns=['params'], errors='ignore')

# Give the level-set hyperparameters the same 'param_' prefix as the other
# hyperparameter columns so they are picked up below.
final_combined_df = final_combined_df.rename(columns={
    'binSize': 'param_binSize',
    'weightsByDistance': 'param_weightsByDistance'
})

# Deduplicate before the dict-valued column is added: dicts are unhashable,
# so drop_duplicates would fail on them.
final_combined_df = final_combined_df.drop_duplicates()

# Fold all 'param_*' columns into one dict per row, keeping only the
# hyperparameters that are actually set for that row.
param_columns = [col for col in final_combined_df.columns if col.startswith('param_')]
if param_columns and not final_combined_df.empty:
    final_combined_df['hyperparameter'] = final_combined_df[param_columns].apply(
        lambda row: {col: row[col] for col in param_columns if pd.notna(row[col])}, axis=1
    )
else:
    # Nothing to fold: every (possibly zero) row gets an empty dict.
    final_combined_df['hyperparameter'] = [{} for _ in range(len(final_combined_df))]

# Step 3 (optional): remove the original 'param_' columns once they are no longer needed.
final_combined_df = final_combined_df.drop(columns=param_columns)


final_combined_df.to_csv("crossValidation_results_allDatasets.csv", index=False)

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
display(final_combined_df)

Unnamed: 0,model_name,cu,co,variable,tau,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,dataset,training_description,model_type,hyperparameter
0,LS_KDEx_LGBM,1.0,9.0,Location_1_max_CO,0.1,0.764092,0.651639,0.609375,0.798535,0.706927,air,ID-Based Training,levelset_models,"{'param_binSize': 20.0, 'param_weightsByDistance': False}"
1,LS_KDEx_LGBM,1.0,9.0,Location_1_max_NO2,0.1,0.987536,0.961502,0.777616,0.955837,0.871928,air,ID-Based Training,levelset_models,"{'param_binSize': 20.0, 'param_weightsByDistance': False}"
2,LS_KDEx_LGBM,1.0,9.0,Location_1_max_O3,0.1,0.813612,0.706799,0.683918,1.359494,1.006182,air,ID-Based Training,levelset_models,"{'param_binSize': 20.0, 'param_weightsByDistance': False}"
3,LS_KDEx_LGBM,1.0,9.0,Location_1_max_PM10,0.1,0.799842,0.893143,0.796496,0.834180,0.901941,air,ID-Based Training,levelset_models,"{'param_binSize': 20.0, 'param_weightsByDistance': False}"
4,LS_KDEx_LGBM,1.0,9.0,Location_1_max_PM2.5,0.1,0.840553,0.891214,0.850004,1.040120,0.926806,air,ID-Based Training,levelset_models,"{'param_binSize': 20.0, 'param_weightsByDistance': False}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429815,DRF,1.0,9.0,dummyID,0.1,-0.153080,-0.172690,-0.160152,-0.187802,-0.160530,yaz,Full Data Training,basic_models,"{'param_min_node_size': 128.0, 'param_num_features': 8.0, 'param_num_trees': 100.0}"
429816,DRF,1.0,9.0,dummyID,0.1,-0.137637,-0.161119,-0.154333,-0.175216,-0.151331,yaz,Full Data Training,basic_models,"{'param_min_node_size': 32.0, 'param_num_features': 64.0, 'param_num_trees': 50.0}"
429817,DRF,1.0,9.0,dummyID,0.1,-0.139674,-0.157154,-0.148969,-0.172815,-0.153373,yaz,Full Data Training,basic_models,"{'param_min_node_size': 32.0, 'param_num_features': 64.0, 'param_num_trees': 500.0}"
429818,DRF,1.0,9.0,dummyID,0.1,-0.132174,-0.158939,-0.150274,-0.166776,-0.147638,yaz,Full Data Training,basic_models,"{'param_min_node_size': 4.0, 'param_num_features': 8.0, 'param_num_trees': 250.0}"


In [2]:
# Show which distinct model names occur in the combined frame.
print(final_combined_df.loc[:, "model_name"].unique())

['LS_KDEx_LGBM' 'LS_KDEx_MLP' 'DRF' 'MLP' 'LGBM' 'RFW' 'KNNW' 'DTW' 'GKW']


## Model Results

In [37]:
import pandas as pd
import os

# Dataset names for ID-based and full-data training
dataset_names_id_based = ['air', 'bakery', 'm5', 'wage', 'yaz']
dataset_names_full_data = ['air_random_10', 'bakery_random_50', 'm5_random_30', 'wage', 'yaz']

# Directory that holds one result CSV per (model family, training regime, dataset)
RESULTS_DIR = '/workspaces/Masterthesis-DRF/results/results_by_file'

# One entry per group of files, in processing order:
# (file-name stem, label for messages, dataset names, file-name prefix, training description)
result_sources = [
    ('results_basic_Models', 'Basic Models', dataset_names_id_based, '', "ID-Based Training"),
    ('results_basic_Models', 'Basic Models', dataset_names_full_data, 'FULLDATASET_', "Full Data Training"),
    ('results_LevelsetModels', 'Levelset Models', dataset_names_id_based, '', "ID-Based Training"),
    ('results_LevelsetModels', 'Levelset Models', dataset_names_full_data, 'FULLDATASET_', "Full Data Training"),
]

# Initialize an empty list to store DataFrames
combined_dfs = []

for file_stem, model_label, dataset_names, file_prefix, training_description in result_sources:
    for name in dataset_names:
        file_path = f'{RESULTS_DIR}/{file_prefix}{file_stem}_{name}.csv'
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            # Missing files are reported and skipped so the remaining datasets still load.
            dataset_label = "FULLDATASET dataset" if file_prefix else "dataset"
            print(f"File not found for {dataset_label}: {name} ({model_label})")
            continue
        # Tag every row with its origin before stacking.
        df['dataset'] = name
        df['training_description'] = training_description
        combined_dfs.append(df)

# Combine all DataFrames into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no result file was found.
final_combined_df = pd.concat(combined_dfs, ignore_index=True) if combined_dfs else pd.DataFrame()
final_combined_df = final_combined_df.drop_duplicates()
# Save the final DataFrame to a CSV file
final_combined_df.to_csv("results_combined_allDatasets.csv", index=False)

# Display the final combined DataFrame
display(final_combined_df)


Unnamed: 0,Variable,cu,co,Model,Pinball Loss,Best Params,delta C,sl,dataset,training_description
0,Location_1_max_CO,9.0,1.0,SAA,0.088849,,,0.9,air,ID-Based Training
1,Location_1_max_CO,9.0,1.0,MLP,0.195847,"OrderedDict([('alpha', 0.0001), ('early_stopping', True), ('layer1', 6), ('layer2', 13), ('learning_rate_init', 0.0005), ('max_iter', 1000), ('solver', 'adam')])",-1.204268,0.9,air,ID-Based Training
2,Location_1_max_CO,9.0,1.0,LGBM,0.178693,"OrderedDict([('learning_rate', 0.01), ('max_depth', -1), ('min_data_in_leaf', 20), ('n_estimators', 100), ('num_leaves', 127)])",-1.011205,0.9,air,ID-Based Training
3,Location_1_max_CO,9.0,1.0,RFW,0.109210,"OrderedDict([('max_depth', 8), ('max_features', None), ('min_samples_split', 16), ('n_estimators', 50)])",-0.229163,0.9,air,ID-Based Training
4,Location_1_max_CO,9.0,1.0,KNNW,0.140335,"OrderedDict([('n_neighbors', 128)])",-0.579482,0.9,air,ID-Based Training
...,...,...,...,...,...,...,...,...,...,...
19080,fish,1.0,9.0,LS_KDEx_MLP,0.032095,"{'binSize': 1000, 'weightsByDistance': False}",-0.619814,0.1,yaz,Full Data Training
19081,koefte,1.0,9.0,LS_KDEx_MLP,0.020352,"{'binSize': 1000, 'weightsByDistance': False}",0.241680,0.1,yaz,Full Data Training
19082,lamb,1.0,9.0,LS_KDEx_MLP,0.022566,"{'binSize': 1000, 'weightsByDistance': False}",0.149625,0.1,yaz,Full Data Training
19083,shrimp,1.0,9.0,LS_KDEx_MLP,0.027005,"{'binSize': 1000, 'weightsByDistance': False}",-0.280237,0.1,yaz,Full Data Training
