In [1]:
# %%
# Imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import shutil
import torch
from sklearn import metrics as skmetrics
from argparse import ArgumentParser

# Imports from this project
sys.path.insert(0, "..")
from utils import paths
from utils.input_output import load_feature_keys, load_feature_properties, load_preprocessed_data
from utils.histograms import find_good_binning, get_hist, calc_pull
from utils.merge_pdfs import merge_pdfs
from model_B_classifier import DeepSetModel

# Evaluation Results from different models

In [6]:
# Load the feature-list definitions; each key is expected to look like "features_<model name>".
with open("../features_B_classifier.json", "r") as file:
    feature_lists = json.load(file)

# Keep only the models whose evaluation-plots file already exists in their model directory.
trained_model_names = [
    model_name
    for model_name in (key.replace("features_", "") for key in feature_lists.keys())
    if (paths.models_dir / model_name / paths.B_classifier_eval_plots_file.name).is_file()
]
trained_model_names

['B_classifier_all',
 'B_classifier_without_Tr_T_x',
 'B_classifier_without_Tr_ProbSS',
 'B_classifier_without_Tr_T_ACHI2DOCA',
 'B_classifier_without_Tr_T_ADOCA',
 'B_classifier_without_Tr_T_Charge']

In [15]:
# Collect the evaluation metrics of every trained model into one table:
# one row per model, one column per metric.
result_rows = []

for model_name in trained_model_names:
    # Point the project's path helper at this model before reading its results.
    paths.update_B_classifier_name(model_name)
    # NOTE(review): later cells read paths.B_classifier_eval_data_file instead of this
    # hard-coded file name; presumably both refer to the same file - confirm.
    with open(paths.B_classifier_dir/"eval_results.json", "r") as file:
        eval_results = json.load(file)

    # Every entry except the confusion matrix becomes a column of the table.
    row = {metric: value for metric, value in eval_results.items() if metric != "confusion_matrix_test"}

    # The diagonal of the test confusion matrix is stored as per-class efficiency.
    # assumes index 0 is Bd and index 1 is Bs, and that the matrix is normalized - TODO confirm
    if "confusion_matrix_test" in eval_results:
        row["efficiency_Bd_test"] = eval_results["confusion_matrix_test"][0][0]
        row["efficiency_Bs_test"] = eval_results["confusion_matrix_test"][1][1]

    result_rows.append(row)

# Build the frame once instead of enlarging an empty frame cell by cell.
df_results = pd.DataFrame(result_rows, index=trained_model_names)

# The metrics are plain numbers, so format them without a currency sign.
with pd.option_context("display.float_format", '{:,.3f}'.format):
    print(df_results.sort_values(by="accuracy_test", ascending=False))

                                     roc_auc_test  roc_auc_train  \
B_classifier_without_Tr_ProbSS             $0.931         $0.953   
B_classifier_all                           $0.934         $0.954   
B_classifier_without_Tr_T_x                $0.926         $0.946   
B_classifier_without_Tr_T_ADOCA            $0.913         $0.934   
B_classifier_without_Tr_T_ACHI2DOCA        $0.904         $0.930   
B_classifier_without_Tr_T_Charge           $0.891         $0.917   

                                     accuracy_test  accuracy_train  \
B_classifier_without_Tr_ProbSS              $0.857          $0.882   
B_classifier_all                            $0.854          $0.881   
B_classifier_without_Tr_T_x                 $0.846          $0.870   
B_classifier_without_Tr_T_ADOCA             $0.831          $0.857   
B_classifier_without_Tr_T_ACHI2DOCA         $0.829          $0.856   
B_classifier_without_Tr_T_Charge            $0.796          $0.820   

                                

# Generate Feature Lists

In [19]:
# Re-read the feature-list definitions from disk.
with open("../features_B_classifier.json", "r") as file:
    feature_lists = json.load(file)

# Both reference lists must be present before anything is derived from them.
assert "features_B_classifier_baseline" in feature_lists
assert "features_B_classifier_all" in feature_lists

# Keep the two reference lists under short names for the following cells.
baseline_features = feature_lists["features_B_classifier_baseline"]
all_features = feature_lists["features_B_classifier_all"]

In [17]:
# Add one derived feature list per extra feature: the baseline list plus that
# single feature. Walk `all_features` in its own order instead of iterating a
# set difference: the iteration order of a set of strings changes between
# interpreter runs, which would reshuffle the keys written to the JSON file.
for feature in all_features:
    if feature not in baseline_features:
        feature_lists[f"features_B_classifier_baseline_with_{feature}"] = baseline_features + [feature]

In [18]:
# Persist the (extended) feature-list dictionary back to the JSON file it was read from.
with open("../features_B_classifier.json", "w") as out_file:
    out_file.write(json.dumps(feature_lists, indent=2))

In [20]:
# Display the names (keys) of all feature lists currently held in `feature_lists`.
feature_lists.keys()

dict_keys(['features_B_classifier_baseline', 'features_B_classifier_baseline_with_Tr_T_Best_PAIR_M_fromiso', 'features_B_classifier_baseline_with_Tr_T_Ntr_incone', 'features_B_classifier_baseline_with_Tr_T_Charge', 'features_B_classifier_baseline_with_Tr_T_TRPCHI2', 'features_B_classifier_baseline_with_Tr_T_SumMinBDT_ult', 'features_B_classifier_baseline_with_Tr_T_MinBDT_sigtr', 'features_B_classifier_baseline_with_Tr_T_ACHI2DOCA', 'features_B_classifier_baseline_with_Tr_T_TRFITTCHI2', 'features_B_classifier_baseline_with_Tr_T_NbNonIsoTr_ult', 'features_B_classifier_baseline_with_Tr_T_SumBDT_sigtr', 'features_B_classifier_baseline_with_Tr_T_TRCHI2DOF', 'features_B_classifier_baseline_with_Tr_T_TrFITTCHI2NDOF', 'features_B_classifier_baseline_with_Tr_T_AALLSAMEBPV', 'features_B_classifier_baseline_with_Tr_T_ISMUON', 'features_B_classifier_baseline_with_Tr_T_ConIso_pt_ult', 'features_B_classifier_baseline_with_Tr_T_TRFITVELOCHI2NDOF', 'features_B_classifier_baseline_with_Tr_T_PIDmu', 'fe

# Threads

In [1]:
# Manually recorded training durations.
# presumably keys are thread counts and values are wall-clock times
# ("mm:ss" or "h:mm:ss") - TODO confirm
train_time_by_threads = dict((
    (50, "54:18"),
    (40, "49:18"),
    (20, "1:04:44"),
    (10, "1:16:31"),
    (5, "1:36:53"),
))
train_time_by_threads

{50: '54:18', 40: '49:18', 20: '1:04:44', 10: '1:16:31', 5: '1:36:53'}

# Baseline + 1 Feature

In [2]:
# Names of all model directories found below the models directory.
trained_models = [
    str(candidate.name)
    for candidate in paths.models_dir.glob("B_classifier*")
    if candidate.is_dir()
]

baseline_model_name = "B_classifier_baseline"

# Models trained on the baseline features plus exactly one additional feature ...
models_baseline_plus_one = list(
    filter(lambda name: "B_classifier_baseline_with_" in name, trained_models)
)
# ... and the name of that additional feature, in the same order as the models.
additional_features = [
    name.replace("B_classifier_baseline_with_", "") for name in models_baseline_plus_one
]

In [3]:
# Feature-importance table: one row per additional feature, indexed by "feature";
# the metric columns are filled in by the following cells.
df_fi = pd.DataFrame({"feature": additional_features})
df_fi = df_fi.set_index("feature")

In [4]:
# Read the evaluation results of the baseline model.
# update_B_classifier_name changes the state of the shared `paths` helper.
paths.update_B_classifier_name(baseline_model_name)
with open(paths.B_classifier_eval_data_file, "r") as file:
    eval_results = json.load(file)

# Reference values that the baseline-plus-one models are compared against.
# The key order matters: a later cell iterates this dict to build the metric list.
baseline_results = {
    "accuracy_test": eval_results["accuracy_test"],
    "accuracy_train": eval_results["accuracy_train"],
    # diagonal entries of the test confusion matrix
    "efficiency_Bd_test": eval_results["confusion_matrix_test"][0][0],
    "efficiency_Bs_test": eval_results["confusion_matrix_test"][1][1],
    "roc_auc_test": eval_results["roc_auc_test"],
    "roc_auc_train": eval_results["roc_auc_train"],
}

In [5]:
# Fill the table with the evaluation metrics of each baseline-plus-one model.
for feature, model_name in zip(additional_features, models_baseline_plus_one):
    paths.update_B_classifier_name(model_name)
    with open(paths.B_classifier_eval_data_file, "r") as file:
        eval_results = json.load(file)

    # Same set of metrics as stored for the baseline model.
    feature_metrics = {
        "accuracy_test": eval_results["accuracy_test"],
        "accuracy_train": eval_results["accuracy_train"],
        "efficiency_Bd_test": eval_results["confusion_matrix_test"][0][0],
        "efficiency_Bs_test": eval_results["confusion_matrix_test"][1][1],
        "roc_auc_test": eval_results["roc_auc_test"],
        "roc_auc_train": eval_results["roc_auc_train"],
    }
    for column, value in feature_metrics.items():
        df_fi.loc[feature, column] = value

In [6]:
# Importance of a feature = change of each metric relative to the baseline model.
importance_metrics = [f"diff_{metric}" for metric in baseline_results.keys()]

for metric, baseline_value in baseline_results.items():
    df_fi[f"diff_{metric}"] = df_fi[metric] - baseline_value

# Largest gain in test accuracy first.
df_fi.sort_values(by="diff_accuracy_test", inplace=True, ascending=False)

In [7]:
# Add the importance scores stored for the "B_classifier_all" model to the table.
# presumably the path below resolves to that model's feature-importance CSV after
# the name update - confirm against utils.paths
paths.update_B_classifier_name("B_classifier_all")
df_fi_all = pd.read_csv(paths.B_classifier_feature_importance_data_file)

importances_on_all = ["perm_accuracy","combined_mean","combined_max"]
# .loc raises a KeyError if one of the additional features is missing from the CSV;
# the assigned values are matched to df_fi rows by the "feature" index.
df_fi[importances_on_all] = df_fi_all.set_index("feature").loc[additional_features,importances_on_all]

# Extend the list of metrics to plot. Names already present are skipped so that
# re-running this cell does not append them a second time (duplicate subplots).
importance_metrics = importance_metrics + [name for name in importances_on_all if name not in importance_metrics]

In [8]:
# %%
# Plot the feature importances: one bar chart per importance metric, stacked
# vertically, with the features on a shared x-axis.
# squeeze=False always returns a 2-D array of axes, so the loop below also works
# when there is only a single metric (the default would return a bare Axes).
fig, axs = plt.subplots(len(importance_metrics), 1,
                        figsize=(len(additional_features)/1.5, len(importance_metrics)*5),
                        sharex=True, squeeze=False)
axs = axs[:, 0]  # single column -> 1-D array of axes, one per metric

for i, (ax, metric) in enumerate(zip(axs, importance_metrics)):
    ax.set_title(f"feature importance metric: {metric}")
    # Error bars are drawn only if df_fi has a matching "<metric>_std" column.
    # NOTE(review): none of the cells shown here create such columns - confirm intent.
    if f"{metric}_std" in df_fi.columns:
        err = df_fi[f"{metric}_std"]
    else:
        err = None
    ax.bar(df_fi.index, df_fi[metric], yerr=err, color=f"C{i}", alpha=0.8)
    ax.set_ylabel(metric)
    # sharex hides the tick labels of all but the last subplot; show them everywhere
    ax.tick_params(axis="x", labelbottom=True, labelrotation=60)

# Operate on the figure explicitly rather than on pyplot's implicit current figure.
fig.tight_layout()
fig.savefig(paths.plots_dir/"baseline_plus_one.pdf")
plt.close(fig)