In [None]:
import os
import csv
import numpy as np
import time

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import precision_score, confusion_matrix, accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler

from hyperopt import hp, tpe, fmin, Trials, space_eval

import warnings
warnings.filterwarnings('ignore')

# First run to find the most important features

In [None]:
scaler = StandardScaler()

Data_train = pd.read_csv('AppML_InitialProject_train.csv')

X = Data_train.drop(['p_Truth_isElectron', 'p_Truth_Energy'], axis=1)
y = Data_train['p_Truth_isElectron']

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

xgb_clf = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.1,
                            max_depth=8, eval_metric='logloss', n_estimators=300,
                            seed=42, use_label_encoder=False, n_jobs = -1)

start_time = time.time()
xgb_clf.fit(X_train, y_train)
end_time = time.time()

elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

# Selecting the most important features

In [None]:
###Selecting the 20 most important features###
importances = xgb_clf.feature_importances_
sorted_indices = importances.argsort()[::-1]
top_20_indices = sorted_indices[:20]
top_20_features = X.columns[top_20_indices]

X_train_20 = X_train[top_20_features]
X_val_20 = X_val[top_20_features]

# Second run with **hyperparameter optimization** and **cross validation**

# Also third run with the best parameters

In [None]:
###Hyperparameter tuning with Bayes search and cross-validation###

X_20 = X[top_20_features]

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', np.arange(3, 11, dtype=int)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.uniform('gamma', 0, 1),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 1000, 100, dtype=int)),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10)
}

def objective(params):
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42, use_label_encoder=False, **params, n_jobs = -1)
    scores = cross_val_score(model, X_20, y, cv=5, scoring='neg_log_loss')
    return -np.mean(scores)

##Hyperparameter optimization##
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)
best_params = space_eval(space, best)

best_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', seed=42, use_label_encoder=False, **best_params, n_jobs = -1)
start_time = time.time()
best_model.fit(X_train_20, y_train)
end_time = time.time()

y_pred_proba = best_model.predict_proba(X_val_20)[:, 1]

##Evaluation##
accuracy = accuracy_score(y_val, y_pred_proba.round())
conf_matrix = confusion_matrix(y_val, y_pred_proba.round())
logloss = log_loss(y_val, y_pred_proba)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)
print("Confusion matrix:", conf_matrix)
print("LogLoss:", logloss)

elapsed_time = end_time - start_time
print("Elapsed time:", elapsed_time, "seconds")

100%|██████████| 10/10 [25:15<00:00, 151.54s/trial, best loss: 0.07089634871367259]
Best Hyperparameters: {'colsample_bytree': 0.7838522963083432, 'gamma': 0.7018822402097628, 'learning_rate': 0.05306630350590696, 'max_depth': 8, 'min_child_weight': 9.827462508018204, 'n_estimators': 700, 'subsample': 0.6858906365814332}
Accuracy: 0.9733777777777778
Confusion matrix: [[35196   470]
 [  728  8606]]
LogLoss: 0.07175243804333949
Elapsed time: 39.316006660461426 seconds


# Testing on the test set

In [5]:
Data_test = pd.read_csv('AppML_InitialProject_test_classification.csv')

X_test = pd.DataFrame(scaler.fit_transform(Data_test), columns=Data_test.columns)
X_test_20 = X_test[top_20_features]
y_pred_prob = best_model.predict_proba(X_test_20)[:, 1]

print(y_pred_prob)

[9.9953282e-01 9.1819185e-01 3.0162913e-01 ... 1.1910958e-02 3.1416648e-04
 3.2807007e-03]


# Saving (set to False)

In [None]:
folder_name = 'solutions'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

Write = False
if Write:

    top_20_features_list = top_20_features.tolist()
    variables = top_20_features
    
    csv_file_path = os.path.join(folder_name, 'Classification_XGBoost_VariableList.csv')

    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        for variable in variables:
            writer.writerow([variable])
    
    data = y_pred_prob
    
    csv_file_path = os.path.join(folder_name, 'Classification_XGBoost.csv')
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        for index, item in enumerate(data, start=0):
            writer.writerow([index, item])