In [None]:
# %pip install graphviz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import re
import os
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV, train_test_split

%matplotlib inline

In [None]:
# Read the full event table from disk, then derive the working table that every
# later cell uses: the same rows with the 'jj_m' column removed.
data = pd.read_csv('all_outputs.csv')
data_no_mass = data.drop(labels='jj_m', axis='columns')
# data_no_mass.head()

## Looking at the raw data:

In [None]:
# Report the dimensions of the working table, then list each of its columns.
n_rows, n_cols = data_no_mass.shape
print('Size of data: {}'.format((n_rows, n_cols)))
print('Number of events: {}'.format(n_rows))
print('Number of columns: {}'.format(n_cols))

print('\nList of features in dataset:')
for column_name in data_no_mass.columns:
    print(column_name)

In [None]:
# Count the rows of each class once (label == 1 is signal, label == 0 is
# background) and reuse the counts for the signal fraction.
n_signal_all = int((data_no_mass.label == 1).sum())
n_background_all = int((data_no_mass.label == 0).sum())

print('Number of signal events: {}'.format(n_signal_all))
print('Number of background events: {}'.format(n_background_all))
print('Fraction signal: {}'.format(n_signal_all / float(n_signal_all + n_background_all)))

## Formatting the data for use with XGBoost

In [None]:
# Shuffle all rows (frac=1) before the train/test split. The fixed random_state
# makes the shuffle -- and therefore the split and every result derived from
# it -- identical after a kernel restart; change the seed to draw a new split.
shuf_data = data_no_mass.sample(frac=1, random_state=42)

In [None]:
# Store the label column as a pandas categorical; later cells read its integer
# codes through `.label.cat.codes` when building the DMatrix objects.
shuf_data['label'] = shuf_data['label'].astype('category')

Taking the first 80% of the shuffled data for the training set and the remaining 20% for the test set (no separate evaluation set is created).

In [None]:
# Positional split of the shuffled rows: the first 80% become the training set
# and everything after that is held out as the test set.
no_events = len(shuf_data)
no_training = int(0.8 * no_events)

training_set = shuf_data.iloc[:no_training]
test_set = shuf_data.iloc[no_training:]

In [None]:
# Sizes of the two splits.
print('Number of training samples: {}'.format(len(training_set)))
print('Number of testing samples: {}'.format(len(test_set)))

# Class balance inside the training split (label == 1 is signal, 0 is background).
n_train_signal = int((training_set.label == 1).sum())
n_train_background = int((training_set.label == 0).sum())

print('\nNumber of signal events in training set: {}'.format(n_train_signal))
print('Number of background events in training set: {}'.format(n_train_background))
print('Fraction signal: {}'.format(n_train_signal / float(n_train_signal + n_train_background)))

In [None]:
# training_set.label.cat.codes
# print(training_set.label.cat.codes)

## Creating DMatrix

In [None]:
# Every column except the final one is a feature; the final column is the label.
feature_names = data_no_mass.columns[:-1]
# print(feature_names)

# Settings shared by both matrices: -999.0 is passed as the "missing" marker
# and the feature columns are named explicitly.
dmatrix_options = dict(missing=-999.0, feature_names=feature_names)

# Labels are handed to XGBoost as the integer codes of the categorical column.
train = xgb.DMatrix(
    data=training_set[feature_names],
    label=training_set.label.cat.codes,
    **dmatrix_options
)
test = xgb.DMatrix(
    data=test_set[feature_names],
    label=test_set.label.cat.codes,
    **dmatrix_options
)

In [None]:
# Row counts as seen by XGBoost, as a cross-check against the DataFrame splits.
print('Number of training samples: {}'.format(train.num_row()))
print('Number of testing samples: {}'.format(test.num_row()))

# Rows whose stored label is non-zero are counted as signal.
n_signal_in_train = int(np.count_nonzero(train.get_label()))
print('\nNumber of signal events in training set: {}'.format(n_signal_in_train))

## Hyperparameters

## Evaluating Hyperparameters

In [None]:
# Candidate grids, one per booster hyperparameter. Each is a half-open
# np.arange, so the stop value sits one step past the largest intended value.
eta_range = np.arange(start=0.01, stop=0.91, step=0.01)
max_depth_range = np.arange(start=2, stop=11, step=1)
min_child_range = np.arange(start=0, stop=5.01, step=0.01)
subsample_range = np.arange(start=0.01, stop=1.01, step=0.01)
colsample_range = np.arange(start=0.1, stop=1.01, step=0.01)
lambda_range = np.arange(start=0.1, stop=10.1, step=0.1)
gamma_range = np.arange(start=0, stop=5.01, step=0.01)

# One (initially zero) RMSE slot per gamma candidate.
rmse_outputs = np.zeros_like(gamma_range)

# print(gamma_range)

In [None]:
# regex pattern to find floats in evaluator string
# float_pattern = r'\d+\.\d+'

# for i in range(len(gamma_range)):
#     print(i)
#     param = {}

#     # Booster parameters
#     param['eta']              = 0.1 # learning rate
#     param['max_depth']        = 6  # maximum depth of a tree
#     param['subsample']        = 1 # fraction of events to train tree on
#     param['colsample_bytree'] = 0.8 # fraction of features to train tree on
#     param['gamma'] = gamma_range[i]  # Minimum loss reduction

#     # L2 regularization
#     param['lambda'] = 3

#     # Learning task parameters
#     param['objective']   = 'binary:logistic' # objective function
#     param['eval_metric'] = 'error'           # evaluation metric for cross validation
#     param = list(param.items()) + [('eval_metric', 'logloss')] + [('eval_metric', 'rmse')]

#     num_trees = 250  # number of trees to make

#     eval_booster = xgb.train(param, train, num_boost_round=num_trees)
#     eval_res = eval_booster.eval(evaluation)

#     result = float(re.findall(float_pattern, eval_res)[0])
#     rmse_outputs[i] = result

# print(rmse_outputs)
# os.system("say beep")

In [None]:
# plt.plot(gamma_range, rmse_outputs, 'o-', color='midnightblue')
# plt.title('RMSE vs Gamma')
# plt.xlabel('Gamma')
# plt.ylabel('RMSE')

## Random Hyperparameter Search

In [None]:
# params = {
#     'eta':eta_range,
#     'max_depth':max_depth_range, 
#     'gamma':gamma_range,  
#     'subsample':subsample_range,
#     'min_child_weight':min_child_range,
#     'colsample_bytree':colsample_range, 
#     'objective': ['binary:logistic'],
#     'eval_metric': ['rmse'],
#     'lambda':lambda_range,
# }

In [None]:
# cla = XGBClassifier()

In [None]:
# random_search = RandomizedSearchCV(cla, param_distributions=params,n_iter=20, cv=None)

In [None]:
# #train test split with randomization performed (although randomization is not necessary)
# hyperparam_training_set = shuf_data[:no_training].label.astype(int)
# X_train, y_train  = training_set.iloc[:,:-1], training_set.label

In [None]:
# random_search.fit(X_train, y_train)
# os.system("say bing bong")

In [None]:
# hyper = random_search.best_params_
# hyper

## Training

In [None]:
# Booster settings, L2 regularisation and learning-task settings in one literal.
param = {
    # Booster parameters
    'eta': 0.16,                               # learning rate
    'min_child_weight': 3.45,
    'max_depth': 6,                            # maximum depth of a tree
    'subsample': 0.46,                         # fraction of events to train tree on
    'colsample_bytree': 0.8599999999999995,    # fraction of features to train tree on
    'gamma': 1.42,
    # L2 regularization
    'lambda': 5.0,
    # Learning task parameters
    'objective': 'binary:logistic',            # objective function
    'eval_metric': 'error',                    # evaluation metric for cross validation
}

# A dict can hold only one 'eval_metric' key, so switch to a list of
# (name, value) pairs in order to request two additional metrics.
param = [*param.items(), ('eval_metric', 'logloss'), ('eval_metric', 'rmse')]

num_trees = 600  # number of trees to make

In [None]:
# Train up to `num_trees` boosting rounds, monitoring the 'test' watchlist and
# stopping once it has not improved for 10 consecutive rounds.
# NOTE(review): the held-out test DMatrix is also the early-stopping watchlist,
# so scores reported on it afterwards are not fully independent of training --
# consider carving out a separate validation split.
booster = xgb.train(
    params=param,
    dtrain=train,
    num_boost_round=num_trees,
    evals=[(test, 'test')],
    early_stopping_rounds=10,
)
# Presumably an audible "training finished" cue via the macOS `say` command --
# TODO confirm; on other systems the shell command will simply fail.
os.system("say bing bong")

In [None]:
# Evaluate the trained booster on the test DMatrix and print the result.
print(booster.eval(test))

In [None]:
# Score every event in the test DMatrix with the trained booster; the plots
# below treat these values as scores in [0, 1].
# NOTE(review): training used early stopping -- confirm whether this xgboost
# version predicts with the best iteration or with all trained trees.
predictions = booster.predict(test)

In [None]:
# Fetch the attributes stored on the trained booster and show them as the
# cell's output.
att = booster.attributes()
att

In [None]:
# booster_df = booster.trees_to_dataframe()

In [None]:
# Keep the booster's recorded best iteration (training ran with early stopping).
best_tree = booster.best_iteration

In [None]:
# Show the best-iteration index captured from the booster.
print(best_tree)

In [None]:
# selected_feature_df = booster_df[booster_df['Feature'] == 'jj_m']

In [None]:
# selected_feature_df[selected_feature_df['Tree'] == 999]

In [None]:
# Binning and truth mask shared by both figures below.
score_bins = np.linspace(0, 1, 50)
is_signal = test.get_label().astype(bool)

# Figure 1: score distribution of all test events, signal and background together.
plt.figure()
plt.hist(predictions, bins=score_bins, histtype='step', color='darkgreen', label='All events')
plt.xlabel('Prediction from BDT', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)

# Figure 2: the same scores, split by true class and overlaid.
plt.figure()
for selection, colour, name in (
    (is_signal, 'midnightblue', 'signal'),
    (~is_signal, 'firebrick', 'background'),
):
    plt.hist(predictions[selection], bins=score_bins, histtype='step', color=colour, label=name)
plt.xlabel('Prediction from BDT', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)

In [None]:
# Scan a grid of score cuts and plot signal efficiency against purity.
# (This is an efficiency-vs-purity curve, not a ROC curve: a ROC curve would
# plot signal efficiency against the background acceptance rate.)

# The truth mask and per-class scores do not depend on the cut, so compute
# them once instead of inside the loop.
truth_is_signal = test.get_label().astype(bool)
signal_scores = predictions[truth_is_signal]
background_scores = predictions[~truth_is_signal]

# choose score cuts:
cuts = np.linspace(0,1,500)
nsignal = np.zeros(len(cuts))
nbackground = np.zeros(len(cuts))
for i,cut in enumerate(cuts):
    nsignal[i] = np.count_nonzero(signal_scores > cut)
    nbackground[i] = np.count_nonzero(background_scores > cut)

# Efficiency: fraction of true signal events that pass each cut. Guard against
# a test set without any signal rather than dividing by zero.
n_true_signal = len(signal_scores)
efficiency = nsignal / n_true_signal if n_true_signal else np.zeros_like(nsignal)

# Purity: fraction of the events passing each cut that are signal. At tight
# cuts nothing passes, which would be 0/0; leave those points as NaN (matplotlib
# skips them) without triggering a RuntimeWarning.
nselected = nsignal + nbackground
purity = np.divide(nsignal, nselected, out=np.full_like(nsignal, np.nan), where=nselected > 0)

# plot efficiency vs. purity
plt.figure()
plt.plot(efficiency, purity, 'o-', color='blueviolet', label='BDT score cut scan')
# make the plot readable
plt.xlabel('Efficiency',fontsize=12)
plt.ylabel('Purity',fontsize=12)
# The curve carries a label so the legend has an entry to show.
plt.legend(frameon=False)

In [None]:
# Draw XGBoost's built-in feature-importance chart for the trained booster,
# with the background grid switched off.
xgb.plot_importance(booster,grid=False)

In [None]:
# import matplotlib

# xgb.plot_tree(booster, num_trees=227, rankdir='LR')
# fig = matplotlib.pyplot.gcf()
# fig.set_size_inches(150, 100)
# fig.savefig('tree.png')

In [None]:
# Overlay the jet_e_2 distributions of signal and background training events.
jet_e_2_bins = np.linspace(0, 150, 150)

plt.figure()
for class_label, colour, name in (
    (1, 'midnightblue', 'signal'),
    (0, 'firebrick', 'background'),
):
    plt.hist(training_set.loc[training_set.label == class_label, 'jet_e_2'],
             bins=jet_e_2_bins, histtype='step', color=colour, label=name)

plt.xlabel('jet_e_2', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)

In [None]:
# Scatter reco_q_1 against reco_q_2 for the training events, background drawn
# first so that signal points are painted on top.
plt.figure()
plt.plot(training_set.reco_q_1[training_set.label == 0],training_set.reco_q_2[training_set.label == 0],
         'o',markersize=1.5,color='firebrick',markeredgewidth=0,alpha=0.8,label='background')
plt.plot(training_set.reco_q_1[training_set.label == 1],training_set.reco_q_2[training_set.label == 1],
         'o',markersize=1.5,color='mediumblue',markeredgewidth=0,alpha=0.8,label='signal')

plt.xlim(0,1)
plt.ylim(0,1)
# Axis labels now name the columns actually plotted (they previously read
# 'reco_g_1' / 'reco_g_2', which does not match the reco_q_* data above).
plt.xlabel('reco_q_1',fontsize=12)
plt.ylabel('reco_q_2',fontsize=12)
plt.legend(frameon=False,numpoints=1,markerscale=2)

In [None]:
# Scatter jet_e_1 against jet_e_2 for the training events; background is drawn
# first so the signal points end up on top.
plt.figure()
for class_label, colour, name in (
    (0, 'firebrick', 'background'),
    (1, 'mediumblue', 'signal'),
):
    in_class = training_set.label == class_label
    plt.plot(training_set.jet_e_1[in_class], training_set.jet_e_2[in_class],
             'o', markersize=1, color=colour, markeredgewidth=0, alpha=0.8, label=name)

plt.xlim(0, 100)
plt.ylim(0, 100)
plt.xlabel('jet_e_1', fontsize=12)
plt.ylabel('jet_e_2', fontsize=12)
plt.legend(frameon=False, numpoints=1, markerscale=1)

In [None]:
# Scatter jet_nchad_1 against jet_nchad_2 for the training events, background
# drawn first so that signal points are painted on top.
plt.figure()
plt.plot(training_set.jet_nchad_1[training_set.label == 0],training_set.jet_nchad_2[training_set.label == 0],
         'o',markersize=2,color='firebrick',markeredgewidth=0,alpha=0.8,label='background')
plt.plot(training_set.jet_nchad_1[training_set.label == 1],training_set.jet_nchad_2[training_set.label == 1],
         'o',markersize=2,color='mediumblue',markeredgewidth=0,alpha=0.8,label='signal')

plt.xlim(0, 60)
plt.ylim(0, 50)
# Axis labels now name the plotted columns; they previously carried the
# 'reco_g_1' / 'reco_g_2' labels copied over from an earlier plot.
plt.xlabel('jet_nchad_1',fontsize=12)
plt.ylabel('jet_nchad_2',fontsize=12)
plt.legend(frameon=False,numpoints=1,markerscale=1)