# Photon ID Run 2 BDT classification - Hyperparameter optimization

https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_simple.py

In [8]:
import numpy as np
import pandas as pd
import pickle
import joblib

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import RocCurveDisplay, accuracy_score
from sklearn.metrics import precision_recall_curve, f1_score

In [2]:
# Every location used by this notebook lives under the same root directory.
_venv_root = "/home/chardong/y_identification/Venv/"

# Directory the input pickle is read from.
# (Alternative location kept for reference: "/eos/user/m/mdelmast/Data/EGamma/PhotonID/Run2/")
datadir = _venv_root + "save_pkl/"
# Presumably the output directory for plots -- not used in the visible cells.
savedir = _venv_root + "save_plots/Py8_yj_jj_train_skim30/"
# Path used to save pickle files.
save_path = _venv_root + "save_pkl/Fudge_Factor/"
# Presumably where trained BDT models are stored -- not used in the visible cells.
savedirmodel = _venv_root + "BDT_model/skim30/"

In [3]:
# Load the pickled training DataFrame from the RAW_data folder under `datadir`.
df = pd.read_pickle(f"{datadir}RAW_data/Py8_yj_jj_mc16ade_pd122_train_w_skim_30.pkl")

In [4]:
# Column groups used to slice the input DataFrame `df`.

# Shower-shape variables.
shower_shape_var = [
    'y_Reta',
    'y_Rphi',
    'y_weta2',
    'y_fracs1',
    'y_weta1',
    'y_wtots1',
    'y_Rhad',
    'y_Rhad1',
    'y_Eratio',
    'y_deltae',
]

# Conversion variables.
conv_var = ['y_convRadius', 'y_convType']

# Kinematic variables.
kinem_var = ['y_pt', 'y_eta', 'y_phi']

# Truth-level variables: carried along with the features, but kept separate
# from `discriminating_var`.
truth_var = ['y_truth_pt', 'y_truth_eta']

# Full list of input features.
discriminating_var = shower_shape_var + kinem_var + conv_var

# Every remaining column of `df` (neither a feature nor a truth variable).
# Filtering `df.columns` in order keeps the result deterministic; the previous
# `list(set(...) - set(...))` produced a column order that could change from
# one interpreter session to the next.
_feature_and_truth_cols = set(discriminating_var + truth_var)
Y_var = [col for col in df.columns if col not in _feature_and_truth_cols]

In [5]:
# X holds the input features plus the truth-level variables; Y holds every
# other column (including "truth_label" and "weight").
X = df[discriminating_var+truth_var]
Y = df[Y_var]

# Two successive 80/20 splits with a fixed seed: first (train+val)/test,
# then train/val on the remaining 80%.
x_train_val, x_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2, random_state=42)

# Per-event weights, extracted before the auxiliary columns are dropped.
weight_train = y_train["weight"]
weight_val   = y_val  ["weight"]
weight_test  = y_test ["weight"]

# All Y columns except the label. An order-preserving filter is used instead
# of a set difference so that the column order of `othervars_*` is the same
# on every run.
Y_var_drop = [col for col in Y_var if col != "truth_label"]

# Keep the non-label columns aside for each subset.
othervars_train = y_train[Y_var_drop]
othervars_val   = y_val  [Y_var_drop]
othervars_test  = y_test [Y_var_drop]

# Reduce the targets to what is left once the non-label columns are dropped.
y_train = y_train.drop(Y_var_drop, axis=1)
y_test  = y_test .drop(Y_var_drop, axis=1)
y_val   = y_val  .drop(Y_var_drop, axis=1)

# Keep the truth-level variables aside ...
truth_train = x_train[truth_var]
truth_val   = x_val  [truth_var]
truth_test  = x_test [truth_var]

# ... and remove them from the feature matrices handed to the classifier.
x_train = x_train.drop(truth_var, axis=1)
x_test  = x_test .drop(truth_var, axis=1)
x_val   = x_val  .drop(truth_var, axis=1)

# Flatten the train/validation targets to 1-D arrays (y_test is left as a
# DataFrame, as in the original cell).
y_train = np.ravel(y_train)
y_val   = np.ravel(y_val)

In [6]:
# Plages d'hyperparamètres
learning_rates = [0.01, 0.05, 0.1]
num_leaves_options = [31, 63, 127]
max_depth_options = [-1, -5, -10]
n_estimators_options = [100, 500, 1000]

In [None]:
# Exhaustive grid search over the hyperparameter ranges: every combination is
# trained on the training set and scored on the validation set by the maximum
# weighted F1 found along its precision-recall curve.
from itertools import product

best_f1 = 0
best_params = {}

# product() iterates in the same order as the equivalent nested loops
# (the last range varies fastest).
hyperparameter_grid = product(
    learning_rates,
    num_leaves_options,
    max_depth_options,
    n_estimators_options,
)

for lr, num_leaves, max_depth, n_estimators in hyperparameter_grid:
    # Build the model with the current hyperparameters
    model = lgb.LGBMClassifier(
        learning_rate=lr,
        num_leaves=num_leaves,
        max_depth=max_depth,
        n_estimators=n_estimators,
    )

    # Train the model
    model.fit(x_train, y_train, sample_weight=weight_train)

    # Predicted probability of the positive class on the validation set
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]

    # Weighted precision-recall curve; the small epsilon guards against a
    # zero denominator when precision and recall are both 0.
    precision, recall, _ = precision_recall_curve(y_val, y_val_pred_proba, sample_weight=weight_val)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
    max_f1 = np.max(f1)  # Best F1 score over all thresholds for this configuration

    # Strict '>' keeps the first configuration encountered in case of a tie.
    if max_f1 > best_f1:
        best_f1 = max_f1
        best_params = {
            'learning_rate': lr,
            'num_leaves': num_leaves,
            'max_depth': max_depth,
            'n_estimators': n_estimators,
        }

    print(f'Params: lr={lr}, leaves={num_leaves}, depth={max_depth}, estimators={n_estimators}')
    print(f'F1 Score: {max_f1}\n')


[LightGBM] [Info] Number of positive: 7479469, number of negative: 3702534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.410837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3576
[LightGBM] [Info] Number of data points in the train set: 11182003, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500230 -> initscore=0.000918
[LightGBM] [Info] Start training from score 0.000918
Params: lr=0.01, leaves=31, depth=-1, estimators=100
F1 Score: 0.9074035143522032

[LightGBM] [Info] Number of positive: 7479469, number of negative: 3702534
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.311148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3576
[LightGBM] [Info] N

In [None]:
# Report the best validation F1 score and the hyperparameters that produced it.
print(f'Best F1 Score: {best_f1}\nBest Parameters: {best_params}')

In [None]:
# Predicting on the validation set.
# The previous version of this cell computed sigmoid(np.dot(x_val, weights) + bias),
# but `weights`, `bias` and `sigmoid` are not defined anywhere in the visible
# notebook cells (they look like leftovers from a logistic-regression template).
# Instead, refit LightGBM with the best hyperparameters found by the grid search:
# `model` left over from the search loop is only the *last* configuration tried,
# not necessarily the best one.
best_model = lgb.LGBMClassifier(**best_params)
best_model.fit(x_train, y_train, sample_weight=weight_train)

# Probability of the positive class, turned into hard labels at a 0.5 threshold
y_val_proba = best_model.predict_proba(x_val)[:, 1]
y_val_pred = np.where(y_val_proba > 0.5, 1, 0)

# Calculate (unweighted) accuracy on the validation set
accuracy = np.mean(y_val_pred == y_val)
print(f'Validation Accuracy: {accuracy}')