In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns

In [None]:
# Folder that is walked recursively below to collect the grid-search *.jsonl logs.
log_folder_path = '/Users/jk1/temp/opsum_end/training/hyperopt/xgb_gridsearch/xgb_gs_20250513_154517'
# Target folder for the (currently commented-out) CSV exports; same path as log_folder_path.
output_dir = '/Users/jk1/temp/opsum_end/training/hyperopt/xgb_gridsearch/xgb_gs_20250513_154517'

In [None]:
# Gather every grid-search log (*.jsonl) found anywhere below log_folder_path.
# Frames are collected in a list and concatenated once: calling pd.concat inside
# the loop re-copies all rows gathered so far on every iteration and starts from
# an empty frame, which newer pandas versions warn about.
jsonl_frames = []
for root, dirs, files in os.walk(log_folder_path):
    for file in files:
        if not file.endswith('.jsonl'):
            continue
        temp_df = pd.read_json(os.path.join(root, file),
                               lines=True, dtype={'timestamp': 'object'}, convert_dates=False)
        # The record with index label 0 (first line of each file) is discarded.
        # NOTE(review): presumably a header/config record rather than a result - confirm.
        # errors='ignore' keeps a file without any records from raising KeyError.
        temp_df = temp_df.drop(index=0, errors='ignore')
        # add file name as column
        temp_df['file_name'] = file
        jsonl_frames.append(temp_df)

# Fall back to an empty frame when no log file was found (pd.concat rejects an empty list).
gs_df = pd.concat(jsonl_frames, ignore_index=True) if jsonl_frames else pd.DataFrame()

In [None]:
# Show the combined frame built from all *.jsonl log files.
gs_df

In [None]:
# Rank all logged configurations by 'median_val_scores' (highest first) and keep only the top row.
best_df = gs_df.sort_values(by='median_val_scores', ascending=False).iloc[:1]
best_df

In [None]:
# save best config
# best_df.to_csv(os.path.join(output_dir, 'xgb_best_config.csv'), index=False)

## Caveat

ROC AUC alone is not a reliable selection criterion here: the data are very imbalanced and XGBoost has a tendency to predict only the negative class (0).

In [None]:
# Folder that is walked recursively below to collect the per-run results *.csv files
# (same path as log_folder_path and output_dir).
full_results_dir = '/Users/jk1/temp/opsum_end/training/hyperopt/xgb_gridsearch/xgb_gs_20250513_154517'

In [None]:
# Load all csv files found below full_results_dir and join them in one df.
# Frames are collected in a list and concatenated once instead of calling
# pd.concat on a growing frame inside the loop (quadratic copying).
# NOTE(review): output_dir points at this same folder, so any CSV exported there
# (see the commented-out to_csv calls) would also be picked up here on a re-run.
csv_frames = []
for root, dirs, files in os.walk(full_results_dir):
    for file in files:
        if not file.endswith('.csv'):
            continue
        temp_df = pd.read_csv(os.path.join(root, file))
        # Timestamp column = file name without its first '_'-delimited token
        # and without the trailing '.csv' extension.
        temp_df['timestamp'] = file.partition('_')[2][:-len('.csv')]
        csv_frames.append(temp_df)

# Fall back to an empty frame when no csv file was found (pd.concat rejects an empty list).
gs_full_results_df = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()
            

In [None]:
# Show the combined frame built from all results *.csv files.
gs_full_results_df

In [None]:
# Pick the single row with the highest 'mcc_val'.
# NOTE(review): this ranks individual rows of gs_full_results_df; if each row is one
# CV fold, ranking per-timestamp medians may be what is intended - confirm.
mcc_best_model = gs_full_results_df.sort_values(by='mcc_val', ascending=False).iloc[:1]

In [None]:
# Show the row with the highest mcc_val.
mcc_best_model

In [None]:
# Pick the single row with the highest 'auc_val' and display it.
auroc_best_model = gs_full_results_df.sort_values(by='auc_val', ascending=False).iloc[:1]
auroc_best_model

In [None]:
# save best model config (selected by mcc_val)
# mcc_best_model.to_csv(os.path.join(output_dir, 'xgb_best_model.csv'), index=False)

## Evaluate hyperparameters

In [None]:
# List the available columns (metrics and hyperparameters) of the combined results frame.
gs_full_results_df.columns

In [None]:
# Hyperparameter columns that are carried through the aggregation and plotted below.
hyperparameter_columns = [
    'max_depth',
    'n_estimators',
    'learning_rate',
    'alpha',
    'reg_lambda',
    'moving_average',
]

# One row per timestamp: validation metrics are reduced to their median within the
# group, hyperparameters are taken from the group's first row.
# NOTE(review): assumes hyperparameters are constant within a timestamp group - confirm.
# Training-set metrics (mcc_train, auc_train, ...) are currently not aggregated.
median_over_folds_df = (
    gs_full_results_df
    .groupby("timestamp")
    .agg({
        **dict.fromkeys(
            ['mcc_val', 'auc_val', 'auprc_val', 'acc_val',
             'precision_val', 'sn_val', 'sp_val'],
            'median',
        ),
        **dict.fromkeys(hyperparameter_columns, 'first'),
    })
    .reset_index()
)


In [None]:
def plot_hyperparameter_vs_metric(hyperparameter, metric, df):
    """Plot one metric against each hyperparameter, one panel per hyperparameter.

    Hyperparameters with more than 10 distinct values are drawn as a scatter
    plot, all others as a box plot of the metric per hyperparameter value.

    Parameters
    ----------
    hyperparameter : str or sequence of str
        Column name(s) of ``df`` used as x-axis. A single name is treated as a
        one-element list.
    metric : str
        Column name of ``df`` used as y-axis in every panel.
    df : pandas.DataFrame
        Frame containing the hyperparameter and metric columns.

    Returns
    -------
    None
        The figure is created via ``plt.subplots`` and not returned.
    """
    # Use the arguments rather than notebook globals, so the function plots what it is given.
    names = [hyperparameter] if isinstance(hyperparameter, str) else list(hyperparameter)
    if not names:
        return  # nothing to plot

    n_cols = 3
    n_rows = -(-len(names) // n_cols)  # ceiling division; 6 names -> the former 2x3 grid
    # squeeze=False always yields a 2-D array of axes, also for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)

    for i, name in enumerate(names):
        ax = axes[i // n_cols, i % n_cols]
        # if number of unique values is > 10, use scatter plot
        if len(df[name].unique()) > 10:
            sns.scatterplot(data=df, x=name, y=metric, ax=ax)
        else:
            sns.boxplot(data=df, x=name, y=metric, ax=ax)
        ax.set_title(f'{metric} vs {name}')
        ax.set_xlabel(name)
        ax.set_ylabel(metric)

    # Hide panels of the last row that have no hyperparameter assigned.
    for unused_ax in axes.flat[len(names):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

In [None]:
# Median auc_val per timestamp, plotted against every hyperparameter.
target = 'auc_val'
plot_hyperparameter_vs_metric(
    hyperparameter=hyperparameter_columns,
    metric=target,
    df=median_over_folds_df,
)

In [None]:
# Median auprc_val per timestamp, plotted against every hyperparameter.
target = 'auprc_val'
plot_hyperparameter_vs_metric(
    hyperparameter=hyperparameter_columns,
    metric=target,
    df=median_over_folds_df,
)

In [None]:
# Median mcc_val per timestamp, plotted against every hyperparameter.
target = 'mcc_val'
plot_hyperparameter_vs_metric(
    hyperparameter=hyperparameter_columns,
    metric=target,
    df=median_over_folds_df,
)