# Photon ID Run 2 BDT classification - Hyperparameter optimation

https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_simple.py

In [13]:
import numpy as np
import pandas as pd
import pickle
import joblib

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})
import seaborn as sns

import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import RocCurveDisplay, accuracy_score

In [14]:
datadir = "/home/chardong/y_identification/Venv/save_pkl/"
savedir = "/home/chardong/y_identification/Venv/save_plots/Py8_yj_jj_train_skim30/"
# Chemin pour enregistrer les fichiers pickle
save_path = '/home/chardong/y_identification/Venv/save_pkl/Fudge_Factor/'
#datadir = "/eos/user/m/mdelmast/Data/EGamma/PhotonID/Run2/"
savedirmodel = "/home/chardong/y_identification/Venv/BDT_model/skim30/"

In [15]:
df = pd.read_pickle(datadir+"RAW_data/Py8_yj_jj_mc16ade_pd122_train_w_skim_30.pkl")

In [16]:
shower_shape_var = ['y_Reta',
                    'y_Rphi',
                    'y_weta2',
                    'y_fracs1',
                    'y_weta1',
                    'y_wtots1',
                    'y_Rhad',
                    'y_Rhad1',
                    'y_Eratio', 
                    'y_deltae']

conv_var = [ 'y_convRadius', 'y_convType']

kinem_var = ['y_pt', 'y_eta', 'y_phi']

truth_var = ['y_truth_pt', 'y_truth_eta' ]

discriminating_var = shower_shape_var + kinem_var + conv_var 

Y_var = list(set(df.columns)-set(discriminating_var+truth_var))

In [17]:
X = df[discriminating_var+truth_var]
Y = df[Y_var]

x_train_val, x_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2, random_state=42)

weight_train = y_train["weight"]
weight_val   = y_val  ["weight"]
weight_test  = y_test ["weight"]

Y_var_drop = list(set(Y_var)-{"truth_label"})

othervars_train = y_train[Y_var_drop]
othervars_val   = y_val  [Y_var_drop]
othervars_test  = y_test [Y_var_drop]

y_train = y_train.drop(Y_var_drop, axis=1)
y_test  = y_test .drop(Y_var_drop, axis=1)
y_val   = y_val  .drop(Y_var_drop, axis=1)

truth_train = x_train[truth_var]
truth_val   = x_val  [truth_var]
truth_test  = x_test [truth_var]

x_train = x_train.drop(truth_var, axis=1)
x_test  = x_test .drop(truth_var, axis=1)
x_val   = x_val  .drop(truth_var, axis=1)

y_train = np.ravel(y_train)
y_val   = np.ravel(y_val)

In [18]:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py

def classification_metric(testy, probs):
    from sklearn.metrics import precision_recall_curve
    precision, recall, thresholds = precision_recall_curve(testy, probs[:,1])
    # convert to f score
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    return fscore[ix]

def objective(trial):
    
    model = lgb.LGBMClassifier(learning_rate  = 0.05,  
                               num_leaves     = trial.suggest_int("num_leaves", 2, 256), # tries various value trying to get maximum accuracy
                               max_depth      = -5,
                               objective      = 'xentropy',
                               n_estimators   = 10,
                               force_col_wise = True,
                               verbosity      = -1)

    cb = [lgb.early_stopping(stopping_rounds=10),lgb.log_evaluation(30)]

    model.fit(x_train, y_train,
          sample_weight = weight_train,
          eval_set = [(x_train, y_train), (x_val, y_val)],  
          eval_names = ['Train', 'Validation'],
          eval_sample_weight = [weight_train, weight_val],
          callbacks = cb)
    
    y_pred_prob_test = model.predict_proba(x_test)
    return classification_metric(y_test, y_pred_prob_test)

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-07-18 16:21:03,525] A new study created in memory with name: no-name-8ed721f0-ceb4-4812-9e28-8b28efe3100d


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	Train's cross_entropy: 0.468612	Validation's cross_entropy: 0.46487


[I 2024-07-18 16:21:49,165] Trial 0 finished with value: 0.9708853207827065 and parameters: {'num_leaves': 29}. Best is trial 0 with value: 0.9708853207827065.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	Train's cross_entropy: 0.458198	Validation's cross_entropy: 0.455849


[I 2024-07-18 16:22:33,894] Trial 1 finished with value: 0.9721366725599038 and parameters: {'num_leaves': 240}. Best is trial 1 with value: 0.9721366725599038.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	Train's cross_entropy: 0.458267	Validation's cross_entropy: 0.455896


[I 2024-07-18 16:23:15,619] Trial 2 finished with value: 0.9723457111303252 and parameters: {'num_leaves': 235}. Best is trial 2 with value: 0.9723457111303252.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	Train's cross_entropy: 0.458382	Validation's cross_entropy: 0.455961


[I 2024-07-18 16:23:57,033] Trial 3 finished with value: 0.9723002492733159 and parameters: {'num_leaves': 230}. Best is trial 2 with value: 0.9723457111303252.


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[10]	Train's cross_entropy: 0.459632	Validation's cross_entropy: 0.456885


[I 2024-07-18 16:24:37,533] Trial 4 finished with value: 0.9718374566420876 and parameters: {'num_leaves': 170}. Best is trial 2 with value: 0.9723457111303252.


Number of finished trials: 5
Best trial:
  Value: 0.9723457111303252
  Params: 
    num_leaves: 235


In [11]:
# Feature importance: Numbers of times the feature is used in a model
lgb.plot_importance(model, importance_type='split', figsize=(8,6))
plt.title('Feature importance: split')
#plt.savefig(savedir+'feature_split_lr_0.09_35_skim30.pdf')
plt.show()

NameError: name 'model' is not defined