In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import sys
import matplotlib.pyplot as plt
import shap
import plotly.express as px


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import RocCurveDisplay, roc_curve, auc, recall_score
from sklearn.metrics import precision_score
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix  
import glob
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE

In [3]:
def string_to_float(string):
    return float(string.strip('[]'))


In [4]:
#20 GeV verticali
file_list_p = ['/lustrehome/mbossa/Nuses/Analysis/protonMono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDprotonMono_20000-0.25_onAxis_20000-evt-0', '/lustrehome/mbossa/Nuses/Analysis/protonMono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDprotonMono_20000-0.25_onAxis_20000-evt-1', '/lustrehome/mbossa/Nuses/Analysis/protonMono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDprotonMono_20000-0.25_onAxis_20000-evt-3', '/lustrehome/mbossa/Nuses/Analysis/protonMono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDprotonMono_20000-0.25_onAxis_20000-evt-4']
file_list_pi_plus = ['/lustrehome/mbossa/Nuses/Analysis/pi+Mono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDpi+Mono_20000-0.25_onAxis_20000-evt-0 ', '/lustrehome/mbossa/Nuses/Analysis/pi+Mono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDpi+Mono_20000-0.25_onAxis_20000-evt-1', '/lustrehome/mbossa/Nuses/Analysis/pi+Mono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDpi+Mono_20000-0.25_onAxis_20000-evt-2', '/lustrehome/mbossa/Nuses/Analysis/pi+Mono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDpi+Mono_20000-0.25_onAxis_20000-evt-3', '/lustrehome/mbossa/Nuses/Analysis/pi+Mono_20GeV_onAxis_20000_evts/NUSES_wt_CaloHERDpi+Mono_20000-0.25_onAxis_20000-evt-4']

In [5]:
df_list_p = [pd.read_csv(file) for file in file_list_p]
combined_df_p = pd.concat(df_list_p, ignore_index=True)
combined_df_p.to_csv('combined_df_p.csv', index=False)

In [6]:
df_list_pi_plus = [pd.read_csv(file) for file in file_list_pi_plus]
combined_df_pi_plus = pd.concat(df_list_pi_plus, ignore_index=True)
combined_df_pi_plus.to_csv('combined_df_pi_plus.csv', index=False)

In [7]:
df_p = pd.read_csv('combined_df_p.csv')
df_pi_plus = pd.read_csv('combined_df_pi_plus.csv')

In [8]:
df_p['R1'] = df_p['R1'].apply(string_to_float)
df_p['R2'] = df_p['R2'].apply(string_to_float)
df_p['R4'] = df_p['R4'].apply(string_to_float)
df_p['R5'] = df_p['R5'].apply(string_to_float)
df_p['R6'] = df_p['R6'].apply(string_to_float)

df_pi_plus['R1'] = df_pi_plus['R1'].apply(string_to_float)
df_pi_plus['R2'] = df_pi_plus['R2'].apply(string_to_float)
df_pi_plus['R4'] = df_pi_plus['R4'].apply(string_to_float)
df_pi_plus['R5'] = df_pi_plus['R5'].apply(string_to_float)
df_pi_plus['R6'] = df_pi_plus['R6'].apply(string_to_float)

In [9]:
df_p['R3'] = df_p['R3'].apply(lambda x: eval(x) if isinstance(x, str) else np.nan)
df_pi_plus['R3'] = df_pi_plus['R3'].apply(lambda x: eval(x) if isinstance(x, str) else np.nan)

In [10]:
for i in range(25):
        df_p[f'R3_{i+1}'] = df_p['R3'].apply(lambda x: x[i] if isinstance(x, list) and i < len(x) else 0)
df2_p = df_p.drop(columns=['R3'])

In [11]:
for i in range(25):
        df_pi_plus[f'R3_{i+1}'] = df_pi_plus['R3'].apply(lambda x: x[i] if isinstance(x, list) and i < len(x) else 0)
df2_pi_plus = df_pi_plus.drop(columns=['R3'])

In [18]:
df2_p['y_true'] = 0
df2_pi_plus['y_true']= 1

In [19]:
combined_df = pd.concat([df2_p, df2_pi_plus])
shuffled_df = combined_df.sample(frac=1).reset_index(drop=True)
y_true = shuffled_df['y_true'].values

In [20]:
#xgb classifier
param_grid_xgb = {
    'max_depth': [3],#, 5, 6],6 e' il max
    'learning_rate': [0.1],# 0.01, 0.001],
    'subsample': [0.5],# 0.7, 1],
   # 'reg_alpha':[0],# 0.5, 1, 5],
    #'reg_lambda':[0],# 0.5, 1, 5]   
}

xgb_model = xgb.XGBClassifier(tree_method="hist",seed=0)
grid_search = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='accuracy',refit=True)
# Fit the GridSearchCV object to the training data
X_train, X_test, y_train, y_test = train_test_split(shuffled_df.iloc[:, 1:-1], shuffled_df.iloc[:, -1], test_size=0.3, random_state=0)
X_val, X_test1, Y_val, Y_test1 = train_test_split(X_test, y_test, test_size=0.1, random_state=0)

print(np.unique(y_train))  # Dovrebbe stampare [0 1]
print(np.unique(Y_val))    # Dovrebbe stampare [0 1]
print(np.unique(Y_test1))  #
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

[0 1]
[0 1]
[0 1]


In [21]:
y_pred_test = best_model.predict(X_test1)
accuracy_test = accuracy_score(Y_test1, y_pred_test)
recall_test = recall_score(Y_test1, y_pred_test)
precision_test = precision_score(Y_test1, y_pred_test)

# Stampa dei risultati sui dati di test
print("Accuracy on Test Set: %.2f%%" % (accuracy_test * 100.0))
print("Recall on Test Set: %.2f%%" % (recall_test * 100.0))
print("Precision on Test Set: %.2f%%" % (precision_test * 100.0))

# Predire sui dati di validazione
y_pred_val = best_model.predict(X_val)
accuracy_val = accuracy_score(Y_val, y_pred_val)
recall_val = recall_score(Y_val, y_pred_val)
precision_val = precision_score(Y_val, y_pred_val)

# Stampa dei risultati sui dati di validazione
print("Accuracy on Validation Set: %.2f%%" % (accuracy_val * 100.0))
print("Recall on Validation Set: %.2f%%" % (recall_val * 100.0))
print("Precision on Validation Set: %.2f%%" % (precision_val * 100.0))

Accuracy on Test Set: 61.13%
Recall on Test Set: 74.98%
Precision on Test Set: 62.64%
Accuracy on Validation Set: 59.34%
Recall on Validation Set: 72.94%
Precision on Validation Set: 61.24%
