# Analyzing the Data

# Data preparation

In [1]:
#import uproot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ROOT;
#import lumiere as lm
#lm.loadstyle(True);

from sklearn.metrics import roc_auc_score, roc_curve

def ams_score(x, y, w, cut):
# Calculate Average Mean Significane as defined in ATLAS paper
#    -  approximative formula for large statistics with regularisation
# x: array of truth values (1 if signal)
# y: array of classifier result
# w: array of event weights
# cut
    t = y > cut 
    s = np.sum((x[t] == 1)*w[t])
    b = np.sum((x[t] == 0)*w[t])
    return s/np.sqrt(b+10.0)

def find_best_ams_score(x, y, w):
# find best value of AMS by scanning cut values; 
# x: array of truth values (1 if signal)
# y: array of classifier results
# w: array of event weights
#  returns 
#   ntuple of best value of AMS and the corresponding cut value
#   list with corresponding pairs (ams, cut) 
# ----------------------------------------------------------
    ymin=min(y) # classifiers may not be in range [0.,1.]
    ymax=max(y)
    nprobe=200    # number of (equally spaced) scan points to probe classifier 
    amsvec= [(ams_score(x, y, w, cut), cut) for cut in np.linspace(ymin, ymax, nprobe)] 
    maxams=sorted(amsvec, key=lambda lst: lst[0] )[-1]
    return maxams, amsvec




def printScore(model):

    try:
        pred_clf = model.predict_proba(x_val)[:, 1]
    except:
        pred_clf = model.predict(x_val)
        pred_clf = pred_clf.reshape((pred_clf.shape[0],))

    auc = roc_auc_score(y_val, pred_clf, sample_weight=w_val)
    print('AUC:', auc)
    bs = find_best_ams_score(y_val, pred_clf, w_val)
    print('AMS:', bs[0][0])
    print('AMS total:', bs[0][0]*np.sqrt(50))

Welcome to JupyROOT 6.28/10


## Read-in & to Pandas

In [2]:
input_columns = ['DER_deltaeta_jet_jet', 'DER_deltar_tau_lep', 'DER_lep_eta_centrality', 'DER_mass_MMC', 'DER_mass_jet_jet', 
                 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_met_phi_centrality', 'DER_prodeta_jet_jet', 'DER_pt_h', 
                 'DER_pt_ratio_lep_tau', 'DER_pt_tot', 'DER_sum_pt', 'PRI_jet_all_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 
                 'PRI_jet_leading_pt', 'PRI_jet_num', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_subleading_pt', 
                 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_lep_pt', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_tau_eta', 'PRI_tau_phi', 
                 'PRI_tau_pt', 'transverse_lepton_jet_mass']
print(len(input_columns))

31


In [3]:
RDF = ROOT.ROOT.RDataFrame

signal_tree_name = 'signal'
background_tree_name = 'background'
test_tree_name = 'validation'
file_name = 'atlas-higgs-challenge-2014-v2_part.root'

rdf_signal = RDF(signal_tree_name, file_name)
rdf_bkg = RDF(background_tree_name, file_name)
rdf_test = RDF(test_tree_name, file_name)

reconstruct_transverse_lepton_jet_mass = '''

float lep_px = PRI_lep_pt * TMath::Cos(PRI_lep_phi);
float lep_py = PRI_lep_pt * TMath::Sin(PRI_lep_phi);
float jet_px = PRI_jet_leading_pt * TMath::Cos(PRI_jet_leading_phi);
float jet_py = PRI_jet_leading_pt * TMath::Sin(PRI_jet_leading_phi);

//calculate angle between jet and lepton
float cos_theta = (lep_px*jet_px + lep_py*jet_py) / PRI_lep_pt / PRI_jet_leading_pt;

return PRI_lep_pt * PRI_jet_leading_pt * (1 - cos_theta);
'''

#insertion
rdf_signal = rdf_signal.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_bkg = rdf_bkg.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_test = rdf_test.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)

# label classification to int values
rdf_test = rdf_test.Define('IntLabel', '''
const char ch = Label[0];
const char s = 's';
if(ch == s){
    return 1;
}
else{
    return 0;
}
''')


df_signal = pd.DataFrame(rdf_signal.AsNumpy())
df_bg = pd.DataFrame(rdf_bkg.AsNumpy())
df_test = pd.DataFrame(rdf_test.AsNumpy())


## concatination, shuffle and split

In [4]:
from sklearn.utils import shuffle;
from sklearn.model_selection import train_test_split;

#input feature arrays
vars_signal = df_signal[input_columns].to_numpy()
vars_bg = df_bg[input_columns].to_numpy()
vars_test = df_test[input_columns].to_numpy()

inputs = np.concatenate([vars_signal, vars_bg])

#weights
weight_signal = df_signal['Weight'].to_numpy()
weight_bg = df_bg['Weight'].to_numpy()
weights = np.concatenate([weight_signal, weight_bg])
weights = weights.reshape((weights.shape[0],))

weights_test = df_test['Weight'].to_numpy()


# target classifictionation (1:signal / 0: background)
y_signal = np.ones((vars_signal.shape[0], ))
y_bg = np.zeros((vars_bg.shape[0], ))

targets = np.concatenate([y_signal, y_bg])

# for test dataset there is already a classification; convert to int
truths_test = df_test.IntLabel.to_numpy()


# shuffle 
inputs, targets, weights = shuffle(inputs, targets, weights)


# not for gridcv

# training and validation split  (80, 20)
x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(inputs, targets, weights, test_size=0.2)
#x_train, y_train = inputs, targets

## Pipeline approach

In [5]:
# custom AMS scorer
def BuildScorer(validation_x, validation_y, validation_weight):

    def AMS_scorer(estimator, X, y):
        predictions = estimator.predict_proba(validation_x)[:, 1]
        score = find_best_ams_score(validation_y, predictions, validation_weight)
        return score[0][0] 
    
    return AMS_scorer

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler;
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV


scaler = StandardScaler()
pca = PCA()
clf = GradientBoostingClassifier(random_state=0, verbose=0)

pipe = Pipeline([ ('scaler', scaler), ('pca', pca), ('clf', clf)])


param_grid = {'pca__n_components': [20, 12],
                  'clf__n_estimators': [100, 150, 200, 400],
                  'clf__min_samples_leaf': [100, 200, 300],
                  'clf__max_depth': [5, 8, 10], 
                  'clf__learning_rate': [1, 0.5, 0.1, 0.05]
                }


## Validation

In [7]:
grid_results = pd.read_csv('halving_results.csv')

In [8]:
best_one = grid_results.sort_values('mean_test_score', ascending=False).head(1)

## StandardScaling 

In [9]:
from sklearn.preprocessing import StandardScaler;
 
scaler = StandardScaler()
scaler.fit(x_train) #set up only on train data
 
# tranformation applied to all
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(vars_test)

## SMOTE

In [10]:
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE


# Apply SMOTE to the training data
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)


Note: you may need to restart the kernel to use updated packages.


## Dimensionality reduction

### PCA

In [11]:
from sklearn.decomposition import PCA

x_train_pre = x_train

pca = PCA(n_components=22)
pca.fit(x_train)

x_train = pca.transform(x_train)
x_val = pca.transform(x_val)
x_test = pca.transform(x_test)

## Classifier training

In [12]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=500, 
                            criterion='gini', 
                             verbose=1
                           )

clf.fit(x_train, y_train)

printScore(clf)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.8min
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  4.5min
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  6.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.0s


AUC: 0.917172514925354
AMS: 0.32198372011577125
AMS total: 2.2767687192553323


In [13]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=800, 
                            criterion='gini', 
                             verbose=1
                           )

clf.fit(x_train, y_train)

printScore(clf)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   17.7s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.2min
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  2.7min
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:  4.8min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    1.0s


AUC: 0.9176905630228844
AMS: 0.3248469934580271
AMS total: 2.2970151192223303


[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    1.8s


## halving grid

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV


clf = RandomForestClassifier()


param_grid = {
    'n_estimators': [500], 
    'max_features': ['auto', 'sqrt', 'log2'],  
    'max_depth': [None, 10, 30, 50], 
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10], 
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy', 'log_loss'], 
    'max_leaf_nodes': [None, 10, 30, 50], 
    'min_impurity_decrease': [0.0, 0.01, 0.1]
    }


grid_search = HalvingGridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', verbose=3, cv=3, random_state=0)
grid_search.fit(x_train, y_train)
grid_results = pd.DataFrame(grid_search.cv_results_)
grid_results.to_csv('storage/forest_random_search.csv', index=False)


n_iterations: 8
n_required_iterations: 9
n_possible_iterations: 8
min_resources_: 12
max_resources_: 52774
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 13824
n_resources: 12
Fitting 3 folds for each of 13824 candidates, totalling 41472 fits
[CV 1/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=(train=nan, test=nan) total time=   0.0s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=(train=nan, test=nan) total time=   0.0s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=500;, score=(train=nan, test=nan) total time=   0.0s
[CV 1/3] END bootstrap=Tru

In [None]:
grid_results