# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pickle

# ----------------------------
# User-defined Configuration
# ----------------------------

# Base name of the training-data CSV; ".csv" is appended when the file is read.
filename = "trainDataFile"    # leave the ".csv" extension off

# Base name for the saved model; ".pkl" is appended when the model is written.
model_name = "ExampleModelName"    # leave the ".pkl" extension off

# Seed used for the row shuffle and the XGBoost regressor (optional).
# Set to None to get a different random shuffle on every run.
random_state = 42


# ----------------------------
# Data Loading and Preprocessing
# ----------------------------

# Define predictor and target columns.
# The training CSV must provide the feature columns "xy1" ... "xy150" and the
# regression target column "preLoad".
PREDICTORS = [f"xy{i}" for i in range(1, 151)]
TARGET = "preLoad"

# Read the CSV file.
data = pd.read_csv(f"{filename}.csv")

# Fail early with a readable message when expected columns are absent, rather
# than surfacing an opaque indexing error further down.
_missing_columns = [col for col in [*PREDICTORS, TARGET] if col not in data.columns]
if _missing_columns:
    raise KeyError(
        f"'{filename}.csv' is missing required column(s): {_missing_columns}"
    )

# Forward-fill missing values while the rows are still in file order
# (the fill must happen before shuffling, since it copies from the row above).
data = data.ffill()

# A forward-fill has nothing to copy from at the top of a column, so leading
# gaps survive it. Rows whose target is still missing cannot be used for
# training, so drop them here instead of failing later inside the grid search.
# When no such rows exist this is a no-op and the data is unchanged.
_rows_before = len(data)
data = data.dropna(subset=[TARGET])
_rows_dropped = _rows_before - len(data)
if _rows_dropped:
    print(f"Dropped {_rows_dropped} row(s) with no target value available after forward-fill.")

if data.empty:
    raise ValueError(f"'{filename}.csv' contains no usable training rows.")

# Shuffle the rows (reproducibly when random_state is set) and renumber them.
data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Scale the predictor columns.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# every CV fold sees statistics computed from the full dataset.
scaler = StandardScaler()
data[PREDICTORS] = scaler.fit_transform(data[PREDICTORS])

# Define Predictors and Targets.
X = data[PREDICTORS]
y = data[TARGET]

# ----------------------------
# XGBoost Model and Grid Search
# ----------------------------

# Base regressor; the grid search below varies three of its hyperparameters.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=random_state,
)

# Candidate values per hyperparameter: 4 x 4 x 4 = 64 combinations.
# NOTE(review): max_depth=None presumably falls back to XGBoost's default
# depth -- confirm against the installed XGBoost version.
param_grid = dict(
    n_estimators=[100, 200, 400, 500],
    max_depth=[None, 10, 20, 30],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
)

# Exhaustive search scored by negative MSE with 3-fold cross-validation.
# NOTE(review): the search runs with n_jobs=-1 while the regressor keeps its
# own default threading; XGBoost's docs warn about thread contention when both
# layers parallelise -- consider balancing the two.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

print("Starting Grid Search for XGBoost hyperparameters...")
grid_search_xgb.fit(X, y)

# Report the winning combination; the score is the negated MSE, so values
# closer to zero are better.
print("\nBest parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best CV MSE (negated):", grid_search_xgb.best_score_)

# Keep a handle on the estimator selected by the search.
best_xgb = grid_search_xgb.best_estimator_

# ----------------------------
# Save the Best Model
# ----------------------------

# Bundle everything needed at prediction time into one dictionary: the tuned
# model, the fitted scaler, and the predictor/target column names.
model_data = dict(
    model=best_xgb,
    scaler=scaler,
    predictors=PREDICTORS,
    target=TARGET,
)

# Pickle the bundle next to the script as "<model_name>.pkl".
output_path = f"{model_name}.pkl"
with open(output_path, "wb") as pkl_file:
    pickle.dump(model_data, pkl_file)

print(f"\nFinal XGBoost model saved to '{model_name}.pkl'.")
