$$\textrm{Joaquin Peñuela Parra, Cristian Fernando Rodriguez Cruz}$$
$$\textrm{University of Los Andes}$$
$$\textrm{High Energy Physics Group: Phenomenology of Particles}$$

This code was written to run inside a Docker container. If you do not have Docker set up on hep-server2, please refer to: https://github.com/Phenomenology-group-uniandes/Tutoriales_Generales

In [1]:
import os, sys

def add_parent_lib_path(name="Leptoquarks_Searches_2023"):
    """Make the directory that contains the `name` folder importable.

    The text of ``sys.path[0]`` that precedes the first occurrence of `name`
    is appended to ``sys.path``. If `name` does not occur in ``sys.path[0]``
    the whole entry is the candidate, which is already on the path.

    The entry is only appended when it is not already present, so re-running
    the cell does not keep growing ``sys.path`` with duplicates.

    Parameters
    ----------
    name : str
        Folder name searched for inside ``sys.path[0]``.
    """
    parent_dir = sys.path[0].split(name)[0]
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)


add_parent_lib_path()

In [2]:
import os
import time
import Uniandes_Framework
from ROOT import *
from Uniandes_Framework.ml_tools.xgb_classifier import XGB_Classifier

Welcome to JupyROOT 6.22/06


In [3]:
# Number of CPUs handed to every XGB_Classifier through its `ncpu` argument.
nucleos_cpu = 12

In [4]:
# Preliminaries: channels, background composition, signal samples and the
# hyper-parameter grid used by the training cell.

# Every final state paired with every topology (hadronic_* first).
channels = [
    f"{final_state}_{topology}"
    for final_state in ("hadronic", "semileptonic")
    for topology in ("dLQ", "sLQ", "non-resonant")
]

# Background samples per topology; the keys are matched as substrings of the
# channel name. Each entry extends the previous one by one sample.
bkg_by_channel = {}
bkg_by_channel["dLQ"] = ["ttbar", "z_jets"]
bkg_by_channel["sLQ"] = bkg_by_channel["dLQ"] + ["stop"]
bkg_by_channel["non-resonant"] = bkg_by_channel["sLQ"] + ["diboson"]

# Mass points, kept as strings because they are embedded in sample names.
Masses = [str(mass_point) for mass_point in range(1000, 2501, 250)]

# Signal sample prefixes: the three "_wo_RHC" variants first, then the plain ones.
signals = [
    f"{process}{suffix}"
    for suffix in ("_wo_RHC", "")
    for process in ("LQ_LQ", "Tau_LQ", "Tau_Tau")
]

# Hyper-parameter grid passed to XGB_Classifier.
# n_estimators values 750 and 1000 are intentionally left out of the grid.
parameters = {
    "n_estimators": [100, 125, 250, 500],
    "max_depth": list(range(3, 10, 2)),
    "learning_rate": [0.1],
}

# Backgrounds that are stored as a single CSV file per channel.
bkg_names = ["ttbar", "z_jets", "stop"]

In [5]:
#%%capture
# Train and store one XGB_Classifier per (channel, mass) pair.
#
# For every channel:
#   1. a classifier is built on the reference mass point and asked for its most
#      important features, which are written to Most_Important_Features.txt;
#   2. for every mass in `Masses` a new classifier is built, restricted to those
#      features, evaluated and saved as M<mass>_XGB.joblib;
#   3. a text report is accumulated in Output_console.txt.
# All output goes to ./XGB_models/<channel>/ (relative to the working directory).

_CSV_ROOT = os.path.join(os.sep, "disco4", "pheno_csv_files", "Leptoquarks_Searches")
_MODELS_DIR = os.path.join(os.getcwd(), 'XGB_models')
_FEATURE_SELECTION_MASS = '1750'   # mass point used for the per-channel feature ranking
_MANDATORY_FEATURE = "sT(GeV)"     # always kept, even when it is not ranked as important
_DIBOSON_SAMPLES = ['ww', 'wz', 'zz']


def _sample_csv(sample, channel):
    """Return the CSV path of `sample` for `channel` below the CSV root."""
    return os.path.join(_CSV_ROOT, sample, f"{sample}_{channel}.csv")


def _signal_files(mass, channel):
    """Map every '<signal>_<mass>' sample name to a one-element list with its CSV path."""
    return {f"{signal}_{mass}": [_sample_csv(f"{signal}_{mass}", channel)] for signal in signals}


def _best_params(model):
    """Collect the hyper-parameters exposed as attributes of `model`."""
    return {'learning_rate': model.learning_rate, 'max_depth': model.max_depth, 'n_estimators': model.n_estimators}


def _write_lines(path, lines, mode):
    """Write `lines` to `path`, one per line, opening the file with `mode`."""
    with open(path, mode) as report:
        report.writelines(line + "\n" for line in lines)


# exist_ok=True tolerates an existing directory but, unlike a bare
# `except: pass`, still reports real failures such as permission errors.
os.makedirs(_MODELS_DIR, exist_ok=True)

for channel in channels:
    channel_dir = os.path.join(_MODELS_DIR, channel)
    os.makedirs(channel_dir, exist_ok=True)
    console_file = os.path.join(channel_dir, 'Output_console.txt')

    initial_time = time.time()

    # --- Feature selection on the reference mass point -------------------
    mass = _FEATURE_SELECTION_MASS
    signal_dict = _signal_files(mass, channel)

    bkg_dict = {bkg: [_sample_csv(bkg, channel)] for bkg in bkg_names}
    # The diboson background is the union of three separate samples.
    bkg_dict['diboson'] = [_sample_csv(sample, channel) for sample in _DIBOSON_SAMPLES]

    # The topology key (e.g. "dLQ") is matched as a substring of the channel name.
    matching_bkgs = [bkgs for key, bkgs in bkg_by_channel.items() if key in channel]
    if not matching_bkgs:
        raise ValueError(f"No background list in bkg_by_channel matches channel {channel!r}")
    bkg_list = matching_bkgs[0]
    bkgs_dict = {bkg: bkg_dict[bkg] for bkg in bkg_list}

    model = XGB_Classifier(
        ncpu=nucleos_cpu,
        cv=10,
        parameters=parameters,
        signal_dictionary=signal_dict,
        bkg_dictionary=bkgs_dict,
        balance=True,
    )

    # Work on a copy so that the list returned by the model is not modified.
    best_features = list(model.get_most_important_features())
    if _MANDATORY_FEATURE not in best_features:
        best_features.append(_MANDATORY_FEATURE)

    _write_lines(os.path.join(channel_dir, 'Most_Important_Features.txt'), best_features, "w")

    best_params = _best_params(model)
    output_model_channel = [f'For the {model.model_name} model',
                            f'The Best Parameters are {best_params}',
                            'The most important variables are:',
                            str(best_features)]
    # "w": start a fresh report for this channel; later writes append to it.
    _write_lines(console_file, output_model_channel, "w")

    # --- One model per mass point ----------------------------------------
    for mass in Masses:
        signal_dict = _signal_files(mass, channel)

        model = XGB_Classifier(
            ncpu=nucleos_cpu,
            cv=5,
            parameters=parameters,
            signal_dictionary=signal_dict,
            bkg_dictionary=bkgs_dict,
            balance=True,
        )

        # NOTE(review): n_pca is forwarded as-is; presumably the number of
        # PCA components -- confirm against XGB_Classifier.filter_by_features.
        model.filter_by_features(best_features, n_pca=10)
        metrics = model.get_metrics()

        model_file = f'M{mass}_XGB.joblib'
        model.save_model(path_to_save=channel_dir, file_name=model_file)

        best_params = _best_params(model)
        model_path = os.path.join(channel_dir, model_file)
        # NOTE(review): metrics[1] is reported as train and metrics[0] as test
        # accuracy, as in the original message -- confirm against get_metrics.
        output_model_mass = ['=='*80,
                             f'Mass: {mass}, channel: {channel}',
                             f'For the {model.model_name} model',
                             f'The Best Parameters are {best_params}',
                             f'The train accuracy is {metrics[1]} and the test accuracy is {metrics[0]}',
                             'The most important variables are:',
                             str(model.importances_df),
                             f'trainLab size: {len(model.trainLab)}, trainPred size: {len(model.trainPred)}',
                             f'signal_dict = {signal_dict}',
                             f'bkgs_dict = {bkgs_dict}',
                             f'The model was saved in: {model_path}']
        _write_lines(console_file, output_model_mass, "a")

    # --- Elapsed wall-clock time for the whole channel -------------------
    final_time = time.time()
    with open(console_file, "a") as report:
        report.write(f'The channel {channel} takes {(final_time - initial_time)/3600} hours.')

Fitting 10 folds for each of 16 candidates, totalling 160 fits
For the Gradient_Boosting model
the Best Parameters are {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 250}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
For the Gradient_Boosting model
the Best Parameters are {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
The model was saved in: 
/disco4/SIMULACIONES/Cristian/Github/Leptoquarks_Searches_2023/04_ML_Classification/XGB_models/hadronic_dLQ/M1000_XGB.joblib
Fitting 5 folds for each of 16 candidates, totalling 80 fits
For the Gradient_Boosting model
the Best Parameters are {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 125}
The model was saved in: 
/disco4/SIMULACIONES/Cristian/Github/Leptoquarks_Searches_2023/04_ML_Classification/XGB_models/hadronic_dLQ/M1250_XGB.joblib
Fitting 5 folds for each of 16 candidates, totalling 80 fits
For the Gradient_Boosting model
the Best Parameters are {'learning_rate': 0.1, 'max_depth': 7, 'n_estimato

In [6]:
# Create an empty marker file in the working directory; presumably a flag that
# the training cell above has finished -- TODO confirm (file name kept as-is).
!touch XBG_models_acabo.txt