In [None]:
import os
import csv
import numpy as np
import time

import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, tpe, fmin, Trials, space_eval

import warnings
warnings.filterwarnings('ignore')

# First run

In [None]:
# --- First run: load, standardise, split and fit a baseline model -----------
scaler = StandardScaler()

# Read the training sample and keep only rows whose p_Truth_isElectron flag is 1.
Data_train = pd.read_csv('AppML_InitialProject_train.csv').loc[
    lambda frame: frame['p_Truth_isElectron'] == 1
]

# Remember the mean and standard deviation of the *unscaled* target so that
# standardised predictions can be mapped back to the original scale.
y_mean, y_std = (
    np.copy(moment(Data_train['p_Truth_Energy'])) for moment in (np.mean, np.std)
)

# Standardise every column (target included) and rebuild a DataFrame that
# carries the original column names.
Data_train = pd.DataFrame(
    scaler.fit_transform(Data_train),
    columns=Data_train.columns,
)

# Target variable for regression, and the feature matrix without the truth columns.
y = Data_train['p_Truth_Energy']
X = Data_train.drop(columns=['p_Truth_isElectron', 'p_Truth_Energy'])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='mae',
    learning_rate=0.1,
    max_depth=8,
    n_estimators=300,
    seed=42,
    n_jobs=-1,
)

# Time only the fit itself.
start_time = time.time()
xgb_reg.fit(X_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed time:", elapsed_time, "seconds")

# Selecting the most important features using **feature_importances_**

In [None]:
# Rank the features by the importance scores of the first-run model,
# most important first.
importances = xgb_reg.feature_importances_
sorted_indices = np.argsort(importances)[::-1]

# NOTE(review): the variables are called "top_20", but 25 features are kept
# here — confirm which number is intended before changing either one.
top_20_indices = sorted_indices[:25]
top_20_features = X.columns[top_20_indices]

# Restrict both splits to the selected columns.
X_train_20, X_val_20 = X_train[top_20_features], X_val[top_20_features]

# Second run using the most important features, **cross-validation** and **Bayesian hyperparameter optimization**

In [None]:
# Hyperparameter search space explored by hyperopt.
#
# `subsample` and `colsample_bytree` are fractions that XGBoost documents as
# valid on (0, 1]. Sampling them from 0.0 includes the invalid end point and a
# region where each tree sees almost no rows/columns, which wastes the few
# trials available. The lower bound is therefore 0.1.
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    # hp.choice returns an index into the list; use space_eval to recover the value.
    'max_depth': hp.choice('max_depth', np.arange(3, 17, dtype=int)),
    'subsample': hp.uniform('subsample', 0.1, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 1.0),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 1000, 100, dtype=int)),
    'min_child_weight': hp.uniform('min_child_weight', 0, 10)
}

def objective(params):
    """Return the loss hyperopt minimises for one hyperparameter candidate.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``xgb.XGBRegressor`` drawn from the search space.

    Returns
    -------
    float
        Mean absolute error averaged over a 5-fold cross-validation
        (the sign of sklearn's ``neg_mean_absolute_error`` is flipped so that
        lower is better).

    Notes
    -----
    The cross-validation runs on the *training* split. Tuning on
    ``X_val_20``/``y_val`` would leak the validation split into the
    hyperparameter choice, and the MAE later reported on that same split
    would no longer be an unbiased hold-out estimate.
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', seed=42, **params, n_jobs=-1, eval_metric='mae')
    scores = cross_val_score(model, X_train_20, y_train, cv=5, scoring='neg_mean_absolute_error')
    return -np.mean(scores)

# Run the TPE search; `trials` keeps the history of every evaluation.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

# `best` holds hp.choice entries as indices; space_eval maps them back to
# the actual parameter values.
best_params = space_eval(space, best)

best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='mae',
    seed=42,
    n_jobs=-1,
    **best_params,
)

# Refit on the training split with the selected features, timing only the fit.
start_time = time.time()
best_model.fit(X_train_20, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

# Score the refitted model on the validation split.
y_pred = best_model.predict(X_val_20)
mae = mean_absolute_error(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print("Mean Absolute Error (MAE):", mae)
print("Elapsed time:", elapsed_time, "seconds")

100%|██████████| 10/10 [02:28<00:00, 14.86s/trial, best loss: 0.15149056150933735]
Best Hyperparameters: {'colsample_bytree': 0.9206778001753837, 'gamma': 0.1308725613479863, 'learning_rate': 0.12712692444465476, 'max_depth': 13, 'min_child_weight': 8.09842442235597, 'n_estimators': 600, 'subsample': 0.9598991115569738}
Mean Absolute Error (MAE): 0.136768140574989
Elapsed time: 24.175472259521484 seconds


# Testing on the test set

In [None]:
Data_test = pd.read_csv('AppML_InitialProject_test_regression.csv')

# Standardise the test features with the statistics learned from the TRAINING
# sample. Calling scaler.fit_transform on the test set would re-fit the scaler,
# i.e. centre and scale the test features with their own mean and spread, so the
# model would receive inputs on a different scale than the one it was trained
# on (and the fitted training scaler would be overwritten).
# The scaler was fitted on every column of Data_train, so its per-column
# statistics are labelled with those column names.
train_mean = pd.Series(scaler.mean_, index=Data_train.columns)
train_scale = pd.Series(scaler.scale_, index=Data_train.columns)

# Only columns that also exist in the training sample have training statistics.
shared_columns = [col for col in Data_test.columns if col in train_mean.index]
X_test = (Data_test[shared_columns] - train_mean[shared_columns]) / train_scale[shared_columns]
X_test_20 = X_test[top_20_features]

# Predictions are in standardised target units.
y_pred_test = best_model.predict(X_test_20)
print("Predicted Energies:", y_pred_test)

# Undo the target standardisation using the moments of the unscaled training target.
Rescaled_y_pred_test = y_std*y_pred_test + y_mean
print('Rescaled Predicted Energies:', Rescaled_y_pred_test)

Predicted Energies: [-0.7943728   0.6129045   0.3662371  ... -0.6280874  -0.73131466
 -0.69969213]
Rescaled Predicted Energies: [19423.332 98539.59  84672.1   ... 28771.793 22968.422 24746.223]


In [None]:
# Output directory for the solution files; created on first use.
folder_name = 'solutions'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Set to True to (over)write the solution files.
Write = False
if Write:
    top_20_features_list = top_20_features.tolist()
    variables = top_20_features

    # One selected feature name per row.
    csv_file_path = os.path.join(folder_name, 'Regression_XGBoost_VariableList.csv')
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([variable] for variable in variables)

    # One "index, rescaled prediction" pair per row, with indices starting at 0.
    data = Rescaled_y_pred_test
    csv_file_path = os.path.join(folder_name, 'Regression_XGBoost.csv')
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows([index, item] for index, item in enumerate(data))