# Analyzing the Data

### TODOS

- check for missing values
  - how to fill them in? 
- check which features to use and if we want to use a reduction method
  - [Reduction: PCA vs LDA](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_lda.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-lda-py) (LDA was mentioned in lecture)
  - [Get values from LDA](https://stackoverflow.com/questions/13973096/how-do-i-get-the-components-for-lda-in-scikit-learn)
  - [LDA step by step](https://machinelearningmastery.com/linear-discriminant-analysis-for-dimensionality-reduction-in-python/)
- After that we implement the classifier 
  - Combine a reduction with the classifier in a [pipeline](https://stackoverflow.com/questions/32860849/classification-pca-and-logistic-regression-using-sklearn) 

- [tuning pipelines](https://www.kaggle.com/code/mathurutkarsh/pipelines-and-hyperparameter-tuning-in-sklearn)
  

- To use hyperparameter tuning its best to use our own AMS score as the deciding scorer
  - [See here on how to do that](https://scikit-learn.org/stable/modules/model_evaluation.html#defining-your-scoring-strategy-from-metric-functions) 
  - 

# Data preparation

In [1]:
#import uproot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ROOT;
#import lumiere as lm
#lm.loadstyle(True);

from sklearn.metrics import roc_auc_score, roc_curve

def ams_score(x, y, w, cut):
# Calculate Average Mean Significane as defined in ATLAS paper
#    -  approximative formula for large statistics with regularisation
# x: array of truth values (1 if signal)
# y: array of classifier result
# w: array of event weights
# cut
    t = y > cut 
    s = np.sum((x[t] == 1)*w[t])
    b = np.sum((x[t] == 0)*w[t])
    return s/np.sqrt(b+10.0)

def find_best_ams_score(x, y, w):
# find best value of AMS by scanning cut values; 
# x: array of truth values (1 if signal)
# y: array of classifier results
# w: array of event weights
#  returns 
#   ntuple of best value of AMS and the corresponding cut value
#   list with corresponding pairs (ams, cut) 
# ----------------------------------------------------------
    ymin=min(y) # classifiers may not be in range [0.,1.]
    ymax=max(y)
    nprobe=200    # number of (equally spaced) scan points to probe classifier 
    amsvec= [(ams_score(x, y, w, cut), cut) for cut in np.linspace(ymin, ymax, nprobe)] 
    maxams=sorted(amsvec, key=lambda lst: lst[0] )[-1]
    return maxams, amsvec




def printScore(model):

    try:
        pred_clf = model.predict_proba(x_val)[:, 1]
    except:
        pred_clf = model.predict(x_val)
        pred_clf = pred_clf.reshape((pred_clf.shape[0],))

    auc = roc_auc_score(y_val, pred_clf, sample_weight=w_val)
    print('AUC:', auc)
    bs = find_best_ams_score(y_val, pred_clf, w_val)
    print('AMS:', bs[0][0])
    print('AMS total:', bs[0][0]*np.sqrt(50))

Welcome to JupyROOT 6.28/10


## Read-in & to Pandas

In [2]:
input_columns = ['DER_deltaeta_jet_jet', 'DER_deltar_tau_lep', 'DER_lep_eta_centrality', 'DER_mass_MMC', 'DER_mass_jet_jet', 
                 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_met_phi_centrality', 'DER_prodeta_jet_jet', 'DER_pt_h', 
                 'DER_pt_ratio_lep_tau', 'DER_pt_tot', 'DER_sum_pt', 'PRI_jet_all_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 
                 'PRI_jet_leading_pt', 'PRI_jet_num', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_subleading_pt', 
                 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_lep_pt', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_tau_eta', 'PRI_tau_phi', 
                 'PRI_tau_pt', 'transverse_lepton_jet_mass']
print(len(input_columns))

31


In [3]:
RDF = ROOT.ROOT.RDataFrame

signal_tree_name = 'signal'
background_tree_name = 'background'
test_tree_name = 'validation'
file_name = 'atlas-higgs-challenge-2014-v2_part.root'

rdf_signal = RDF(signal_tree_name, file_name)
rdf_bkg = RDF(background_tree_name, file_name)
rdf_test = RDF(test_tree_name, file_name)

reconstruct_transverse_lepton_jet_mass = '''

float lep_px = PRI_lep_pt * TMath::Cos(PRI_lep_phi);
float lep_py = PRI_lep_pt * TMath::Sin(PRI_lep_phi);
float jet_px = PRI_jet_leading_pt * TMath::Cos(PRI_jet_leading_phi);
float jet_py = PRI_jet_leading_pt * TMath::Sin(PRI_jet_leading_phi);

//calculate angle between jet and lepton
float cos_theta = (lep_px*jet_px + lep_py*jet_py) / PRI_lep_pt / PRI_jet_leading_pt;

return PRI_lep_pt * PRI_jet_leading_pt * (1 - cos_theta);
'''

#insertion
rdf_signal = rdf_signal.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_bkg = rdf_bkg.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_test = rdf_test.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)

# label classification to int values
rdf_test = rdf_test.Define('IntLabel', '''
const char ch = Label[0];
const char s = 's';
if(ch == s){
    return 1;
}
else{
    return 0;
}
''')


df_signal = pd.DataFrame(rdf_signal.AsNumpy())
df_bg = pd.DataFrame(rdf_bkg.AsNumpy())
df_test = pd.DataFrame(rdf_test.AsNumpy())


## concatination, shuffle and split

In [4]:
from sklearn.utils import shuffle;
from sklearn.model_selection import train_test_split;

#input feature arrays
vars_signal = df_signal[input_columns].to_numpy()
vars_bg = df_bg[input_columns].to_numpy()
vars_test = df_test[input_columns].to_numpy()

inputs = np.concatenate([vars_signal, vars_bg])

#weights
weight_signal = df_signal['Weight'].to_numpy()
weight_bg = df_bg['Weight'].to_numpy()
weights = np.concatenate([weight_signal, weight_bg])
weights = weights.reshape((weights.shape[0],))

weights_test = df_test['Weight'].to_numpy()


# target classifictionation (1:signal / 0: background)
y_signal = np.ones((vars_signal.shape[0], ))
y_bg = np.zeros((vars_bg.shape[0], ))

targets = np.concatenate([y_signal, y_bg])

# for test dataset there is already a classification; convert to int
truths_test = df_test.IntLabel.to_numpy()


# shuffle 
inputs, targets, weights = shuffle(inputs, targets, weights)


# not for gridcv

# training and validation split  (80, 20)
#x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(inputs, targets, weights, test_size=0.2)
x_train, y_train = inputs, targets

## Pipeline approach

In [5]:
# custom AMS scorer
def BuildScorer(validation_x, validation_y, validation_weight):

    def AMS_scorer(estimator, X, y):
        predictions = estimator.predict_proba(validation_x)[:, 1]
        score = find_best_ams_score(validation_y, predictions, validation_weight)
        return score[0][0] 
    
    return AMS_scorer

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler;
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV


scaler = StandardScaler()
pca = PCA()
clf = GradientBoostingClassifier(random_state=0, verbose=0)

pipe = Pipeline([ ('scaler', scaler), ('pca', pca), ('clf', clf)])


param_grid = {'pca__n_components': [22, 20, 18, 16],
                  'clf__n_estimators': [190, 200, 225, 250, 275],
                  'clf__min_samples_leaf': [50, 75, 100, 125, 150],
                  'clf__max_depth': [7, 8, 9], 
                  'clf__learning_rate': [0.1, 0.075, 0.025, 0.01]
                }


In [7]:
grid_search = HalvingGridSearchCV(estimator=pipe, param_grid=param_grid, scoring='roc_auc', verbose=2, cv=4, random_state=0)

grid_search.fit(x_train, y_train)

grid_results = pd.DataFrame(grid_search.cv_results_)

grid_results.to_csv('second_search.csv', index=False)


n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 7
min_resources_: 68
max_resources_: 50000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1200
n_resources: 68
Fitting 4 folds for each of 1200 candidates, totalling 4800 fits
[CV] END clf__learning_rate=0.1, clf__max_depth=7, clf__min_samples_leaf=50, clf__n_estimators=190, pca__n_components=22; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=7, clf__min_samples_leaf=50, clf__n_estimators=190, pca__n_components=22; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=7, clf__min_samples_leaf=50, clf__n_estimators=190, pca__n_components=22; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=7, clf__min_samples_leaf=50, clf__n_estimators=190, pca__n_components=22; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=7, clf__min_samples_leaf=50, clf__n_estimators=190, pca__n_components=20; total time=   0.1s
[CV] END clf__learning_rate=0

In [8]:
grid_results = pd.DataFrame(grid_search.cv_results_)
grid_results

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__learning_rate,param_clf__max_depth,param_clf__min_samples_leaf,param_clf__n_estimators,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,0,68,0.097102,0.001939,0.002369,0.000232,0.1,7,50,190,...,0.500000,0.500000,0.000000,371,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000
1,0,68,0.094934,0.001103,0.002145,0.000057,0.1,7,50,190,...,0.500000,0.500000,0.000000,371,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000
2,0,68,0.093934,0.000743,0.002115,0.000057,0.1,7,50,190,...,0.500000,0.500000,0.000000,371,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000
3,0,68,0.096138,0.000866,0.002397,0.000270,0.1,7,50,190,...,0.500000,0.500000,0.000000,371,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000
4,0,68,0.102335,0.003465,0.002340,0.000341,0.1,7,50,200,...,0.500000,0.500000,0.000000,371,0.500000,0.500000,0.500000,0.500000,0.500000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796,5,16524,87.480378,2.778500,0.201347,0.021460,0.025,9,50,250,...,0.873604,0.872131,0.005677,6,0.974320,0.975122,0.976131,0.970922,0.974124,0.001957
1797,5,16524,70.210589,0.304681,0.234715,0.022085,0.025,9,75,225,...,0.873601,0.872277,0.005128,5,0.958328,0.957986,0.957801,0.957722,0.957959,0.000233
1798,5,16524,78.390443,0.636633,0.202328,0.027534,0.025,9,75,250,...,0.872777,0.872307,0.004908,4,0.963447,0.960167,0.961996,0.959805,0.961354,0.001466
1799,6,49572,265.265800,1.342903,0.302329,0.031543,0.025,9,75,250,...,0.883819,0.882270,0.001559,2,0.945155,0.945551,0.944981,0.944841,0.945132,0.000266


## Validation

In [9]:
grid_results = pd.read_csv('halving_results.csv')

In [10]:
best_one = grid_results.sort_values('mean_test_score', ascending=False).head(1)

### is data missing?

In [11]:
print(df_signal.isnull().sum().sum() + df_bg.isnull().sum().sum())

0


In [13]:
grid_search.best_params_

{'clf__learning_rate': 0.025,
 'clf__max_depth': 9,
 'clf__min_samples_leaf': 75,
 'clf__n_estimators': 275,
 'pca__n_components': 22}

In [14]:
grid_search.best_score_

0.8826922863214798