In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import uproot
import awkward as ak
import scipy
from tqdm import tqdm
import collections
import shap
%matplotlib inline

In [6]:
file = uproot.open('ki_data/sig/user.ebusch.509962.root')
file['PostSel'].keys()

# open one file check whats inside, choose variables
# open all files together put in dict/df
# QCD small is only background file

['all_jets_pt',
 'all_jets_eta',
 'all_jets_phi',
 'all_jets_E',
 'year',
 'runNumber',
 'eventNumber',
 'mcEventWeight',
 'weight',
 'SumW',
 'n_jets',
 'jet1_pt',
 'jet1_eta',
 'jet1_phi',
 'jet1_E',
 'jet2_pt',
 'jet2_eta',
 'jet2_phi',
 'jet2_E',
 'maxphi_minphi',
 'dphi_min',
 'dphi_max',
 'pt_balance_12',
 'mjj_12',
 'mT_jj',
 'dR_12',
 'deta_12',
 'deltaY_12',
 'hT',
 'rT',
 'aplanarity',
 'sphericity',
 'sphericity_T',
 'met_met',
 'met_phi']

In [11]:
import numpy as np
import uproot
from sklearn.model_selection import train_test_split

ftrs = [
 "jet1_pt",
 "jet1_phi",
 "jet2_pt",
 "jet2_phi",
 "pt_balance_12",
 "mjj_12",
 "mT_jj",
 "dR_12",
 "deta_12",
 "deltaY_12",
 "hT",
 "rT",
 "aplanarity",
 "sphericity",
 "sphericity_T",
 "met_met",
 "met_phi"]

sig = uproot.concatenate({"ki_data/sig/*.root" : "PostSel"}, ftrs)
bkg = uproot.concatenate({"ki_data/bkg/*.root" : "PostSel"}, ftrs)

# Convert to NumPy arrays
sig = sig.to_numpy()
bkg = bkg.to_numpy()

data = np.concatenate((sig, bkg), axis=0)

y_sig = np.ones(sig.shape[0])  # Set label for "sig" to 1
y_bkg = np.zeros(bkg.shape[0])  # Set label for "bkg" to 0

# Concatenate labels
y = np.concatenate((y_sig, y_bkg), axis=0)

# Split data and labels into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

In [None]:

for ftr in ftrs:
    plt.hist(sig[ftr], bins=50, density=True, alpha=0.4, label='bkgd')
    plt.hist(bkg[ftr], bins=50, density=True, alpha=0.4, label='sig')
    plt.legend()
    plt.title(ftr)
    plt.show()

In [12]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

#_train = X_train.astype('float32')
#X_test = X_test.astype('float32')

X_train = np.array(X_train.tolist()).view(np.float32).reshape(X_train.shape + (-1,))
X_test = np.array(X_test.tolist()).view(np.float32).reshape(X_test.shape + (-1,))

clf = xgb.XGBClassifier(verbosity = 2)
#setattr(clf, 'verbosity', 2)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


"""from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# Convert input features to regular numpy arrays with float32 data type
X_train = np.array(X_train.tolist()).view(np.float32).reshape(X_train.shape + (-1,))
X_test = np.array(X_test.tolist()).view(np.float32).reshape(X_test.shape + (-1,))

# Create and fit the classifier
clf = GradientBoostingClassifier(verbose=1)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
"""

[09:45:00] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 106 extra nodes, 0 pruned nodes, max_depth=6
[09:45:01] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[09:45:03] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[09:45:04] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 118 extra nodes, 0 pruned nodes, max_depth=6
[09:45:05] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 122 e

'from sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nimport numpy as np\n\n# Convert input features to regular numpy arrays with float32 data type\nX_train = np.array(X_train.tolist()).view(np.float32).reshape(X_train.shape + (-1,))\nX_test = np.array(X_test.tolist()).view(np.float32).reshape(X_test.shape + (-1,))\n\n# Create and fit the classifier\nclf = GradientBoostingClassifier(verbose=1)\nclf.fit(X_train, y_train)\n\n# Make predictions\ny_pred = clf.predict(X_test)\n'

In [14]:
from sklearn.metrics import accuracy_score

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8497994259773335


In [22]:
#https://xgboost.readthedocs.io/en/latest/parameter.html

import xgboost as xgb
from sklearn.metrics import accuracy_score


dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)




import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import XGBoostPruningCallback
from optuna.samplers import TPESampler


def objective(trial):
    param = {
        "silent": 1,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-10, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-10, 1.0),
    }
    
    if param["booster"] == "gbtree":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 100)
        param["min_child_weight"] = trial.suggest_int("min_child_samples", 5, 300)
        param["eta"] = trial.suggest_loguniform("eta", 1e-8, 1)
        param["gamma"] = trial.suggest_loguniform("gamma", 1e-12, 1.0)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        param["max_leaves"] = trial.suggest_int("max_leaves", 0, 500)

    #pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    
    
    #pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    bst = xgb.train(param, dtrain, evals=[(dtest, "validation")], callbacks=[pruning_callback])
    preds = bst.predict(dtest)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_test, pred_labels)
    return accuracy

In [23]:
sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="epe-svj", direction="maximize", sampler = sampler, pruner=optuna.pruners.HyperbandPruner())

study.optimize(objective, n_trials=10)
print(study.best_trial)

[32m[I 2023-05-04 09:53:05,511][0m A new study created in memory with name: epe-svj[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.



Parameters: { "silent" } are not used.

[0]	validation-auc:0.80470
[1]	validation-auc:0.80470
[2]	validation-auc:0.80470
[3]	validation-auc:0.80470
[4]	validation-auc:0.80470
[5]	validation-auc:0.80470
[6]	validation-auc:0.80470
[7]	validation-auc:0.80470
[8]	validation-auc:0.80470
[9]	validation-auc:0.80470


[32m[I 2023-05-04 09:53:08,803][0m Trial 0 finished with value: 0.7603705800482349 and parameters: {'booster': 'gbtree', 'lambda': 1.4798580082624078e-06, 'alpha': 0.0015967794487583625, 'max_depth': 1, 'min_child_samples': 94, 'eta': 1.492956405760975e-07, 'gamma': 1.2825170199331593e-11, 'grow_policy': 'lossguide', 'max_leaves': 198}. Best is trial 0 with value: 0.7603705800482349.[0m


Parameters: { "silent" } are not used.




suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.



[0]	validation-auc:0.91612
[1]	validation-auc:0.91835
[2]	validation-auc:0.91947
[3]	validation-auc:0.92013
[4]	validation-auc:0.92055
[5]	validation-auc:0.92091
[6]	validation-auc:0.92114
[7]	validation-auc:0.92137
[8]	validation-auc:0.92152
[9]	validation-auc:0.92174


[32m[I 2023-05-04 09:54:19,633][0m Trial 1 finished with value: 0.8443126678090457 and parameters: {'booster': 'gbtree', 'lambda': 2.44437222373131e-05, 'alpha': 1.5557691091700663e-06, 'max_depth': 69, 'min_child_samples': 65, 'eta': 0.10591061529966875, 'gamma': 2.1313320630522686e-12, 'grow_policy': 'depthwise', 'max_leaves': 279}. Best is trial 1 with value: 0.8443126678090457.[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` inst

Parameters: { "silent" } are not used.

[0]	validation-auc:0.91494


[32m[I 2023-05-04 09:54:30,113][0m Trial 2 pruned. Trial was pruned at iteration 1.[0m


Parameters: { "silent" } are not used.

[0]	validation-auc:0.91482


[32m[I 2023-05-04 09:54:45,818][0m Trial 3 pruned. Trial was pruned at iteration 1.[0m


Parameters: { "silent" } are not used.

[0]	validation-auc:0.85908
[1]	validation-auc:0.87775
[2]	validation-auc:0.88590
[3]	validation-auc:0.89451
[4]	validation-auc:0.89613
[5]	validation-auc:0.89793
[6]	validation-auc:0.90035
[7]	validation-auc:0.90425
[8]	validation-auc:0.90499
[9]	validation-auc:0.90589


[32m[I 2023-05-04 09:54:50,537][0m Trial 4 finished with value: 0.8258777674737691 and parameters: {'booster': 'gbtree', 'lambda': 0.0007328401869202116, 'alpha': 0.022195080216558992, 'max_depth': 2, 'min_child_samples': 227, 'eta': 0.8144955413180945, 'gamma': 0.0009505782051189749, 'grow_policy': 'lossguide', 'max_leaves': 51}. Best is trial 1 with value: 0.8443126678090457.[0m


Parameters: { "silent" } are not used.




suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.



[0]	validation-auc:0.85344
[1]	validation-auc:0.90135


[32m[I 2023-05-04 09:55:01,843][0m Trial 5 pruned. Trial was pruned at iteration 1.[0m


Parameters: { "silent" } are not used.

[0]	validation-auc:0.91560
[1]	validation-auc:0.91551
[2]	validation-auc:0.91570
[3]	validation-auc:0.91580
[4]	validation-auc:0.91584
[5]	validation-auc:0.91588
[6]	validation-auc:0.91597
[7]	validation-auc:0.91599
[8]	validation-auc:0.91628
[9]	validation-auc:0.91638


[32m[I 2023-05-04 09:56:18,191][0m Trial 6 finished with value: 0.8387211831363008 and parameters: {'booster': 'gbtree', 'lambda': 8.236289299581944e-06, 'alpha': 3.4168463613805656e-10, 'max_depth': 58, 'min_child_samples': 48, 'eta': 0.0005181366289597021, 'gamma': 0.0002495171034693965, 'grow_policy': 'lossguide', 'max_leaves': 347}. Best is trial 1 with value: 0.8443126678090457.[0m


Parameters: { "silent" } are not used.




suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.



[0]	validation-auc:0.91608
[1]	validation-auc:0.91608


[32m[I 2023-05-04 09:56:29,451][0m Trial 7 pruned. Trial was pruned at iteration 1.[0m


Parameters: { "silent" } are not used.

[0]	validation-auc:0.91578
[1]	validation-auc:0.91866
[2]	validation-auc:0.91985
[3]	validation-auc:0.92052
[4]	validation-auc:0.92098
[5]	validation-auc:0.92144
[6]	validation-auc:0.92180
[7]	validation-auc:0.92206
[8]	validation-auc:0.92233
[9]	validation-auc:0.92251


[32m[I 2023-05-04 09:57:43,775][0m Trial 8 finished with value: 0.8451489620692743 and parameters: {'booster': 'gbtree', 'lambda': 2.470378349559229e-09, 'alpha': 0.011855309247426364, 'max_depth': 40, 'min_child_samples': 53, 'eta': 0.26306837563252955, 'gamma': 1.4900138899841883e-08, 'grow_policy': 'depthwise', 'max_leaves': 442}. Best is trial 8 with value: 0.8451489620692743.[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` inste

Parameters: { "silent" } are not used.

[0]	validation-auc:0.91640
[1]	validation-auc:0.91855
[2]	validation-auc:0.91969


[32m[I 2023-05-04 09:58:09,153][0m Trial 9 pruned. Trial was pruned at iteration 3.[0m


FrozenTrial(number=8, state=1, values=[0.8451489620692743], datetime_start=datetime.datetime(2023, 5, 4, 9, 56, 29, 451877), datetime_complete=datetime.datetime(2023, 5, 4, 9, 57, 43, 775459), params={'booster': 'gbtree', 'lambda': 2.470378349559229e-09, 'alpha': 0.011855309247426364, 'max_depth': 40, 'min_child_samples': 53, 'eta': 0.26306837563252955, 'gamma': 1.4900138899841883e-08, 'grow_policy': 'depthwise', 'max_leaves': 442}, user_attrs={}, system_attrs={'completed_rung_0': 0.9186601662020313, 'completed_rung_1': 0.9205207628949609, 'completed_rung_2': 0.9225129476623958}, intermediate_values={0: 0.9157824917190222, 1: 0.9186601662020313, 2: 0.9198514097669911, 3: 0.9205207628949609, 4: 0.9209848193633342, 5: 0.9214358959902553, 6: 0.9217985612123785, 7: 0.9220630099654963, 8: 0.9223306186510543, 9: 0.9225129476623958}, distributions={'booster': CategoricalDistribution(choices=('gbtree',)), 'lambda': FloatDistribution(high=1.0, log=True, low=1e-10, step=None), 'alpha': FloatDist

In [24]:
print(f"\tBest Accuracy: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best Accuracy: 0.84515
	Best params:
		booster: gbtree
		lambda: 2.470378349559229e-09
		alpha: 0.011855309247426364
		max_depth: 40
		min_child_samples: 53
		eta: 0.26306837563252955
		gamma: 1.4900138899841883e-08
		grow_policy: depthwise
		max_leaves: 442


In [25]:
optuna.visualization.plot_param_importances(study)

In [26]:
optuna.visualization.plot_optimization_history(study)