In [10]:
import pandas as pd

pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_colwidth", None)  # Do not truncate column text
pd.set_option("display.expand_frame_repr", False)  # Avoid line wrapping

In [11]:
data = pd.read_csv("ML_ready.csv")
print(data.head())
print(data.tail())
print(data.shape)
print(data.columns)
print(data.dtypes)
print(data.describe())

  Name_idx        Date Name     Close     Volume  Close Dow Jones  Close 5-Year Treasury Yield  Close S&P 500  Close 2-Year Treasury Yield  Close NASDAQ  Close 10-Year Treasury Yield  price Crude Oil  Close_Gold_Price    VIX  pct_change_t+1  pct_change_t-0  pct_change_t-1  pct_change_t-2  pct_change_t-3  pct_change_t-4  pct_change_t-5  pct_change_t-6  pct_change_t-7  pct_change_t-8  pct_change_t-9  pct_change_t-10  pct_change_t-11  pct_change_t-12  pct_change_t-13  pct_change_t-14  pct_change_t-15  pct_change_t-16  pct_change_t-17  pct_change_t-18  pct_change_t-19  pct_change_t-20  pct_change_t-21  pct_change_t-22  pct_change_t-23  pct_change_t-24  pct_change_t-25  pct_change_t-26  pct_change_t-27  pct_change_t-28  pct_change_t-29  pct_change_t-30  pct_change_t-31  pct_change_t-32  pct_change_t-33  pct_change_t-34  pct_change_t-35  pct_change_t-36  pct_change_t-37  pct_change_t-38  pct_change_t-39  pct_change_t-40  pct_change_t-41  pct_change_t-42  pct_change_t-43  pct_change_t-44  pct

In [12]:
X = data.drop(['pct_change_t+1','Name_idx','Name'], axis=1)
y = data['pct_change_t+1']

# X_train is all data before 2015
X_train = X[X['Date'] < '2015-01-01']
y_train = y[X['Date'] < '2015-01-01']

# X_test is all data after 2015
X_test = X[X['Date'] >= '2015-01-01']
y_test = y[X['Date'] >= '2015-01-01']

#drop Date column
X_train = X_train.drop(['Date'], axis=1)
X_test = X_test.drop(['Date'], axis=1)

#print shapes
print(X_train.shape)
print(y_train.shape)



(2142991, 71)
(2142991,)


In [13]:
#min max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#get the mb size of the data
size_in_bytes = X_train_scaled.nbytes
size_in_MB = size_in_bytes / (1024 * 1024)  # Convert to MB

print(f"Array Size: {size_in_MB:.2f} MB")



Array Size: 1160.83 MB


In [16]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from scipy.stats import uniform, randint
import time
import sys
from tqdm.notebook import tqdm  # imported in case you want to add progress bars later

# Define parameter grid using distributions
param_dist = {
    # Continuous parameters
    'learning_rate': uniform(0.01, 0.29),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 1),
    'reg_alpha': uniform(0, 10),
    'reg_lambda': uniform(0, 10),
    
    # Integer parameters
    'n_estimators': randint(50, 300),
    'max_depth': list(range(3, 73)) + [None],
    'min_child_weight': randint(1, 10),
    
    # Categorical parameters
    'booster': ['gbtree'],
    'tree_method': ['auto', 'hist']
}

# Create the XGBoost model – n_jobs=2 here maintains parallelism during training
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                             random_state=42,
                             n_jobs=-1,
                             verbosity=1)

# Set up k-fold cross-validation
cv = KFold(n_splits=4, shuffle=True, random_state=42)

print("Starting hyperparameter optimization with RandomizedSearchCV...", flush=True)
start_time = time.time()

try:
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist,
        n_iter=100,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=3,  # high verbosity to output progress from each parallel job
        random_state=42,
        n_jobs=1,
        return_train_score=True,
        refit=True,
        error_score='raise'  # raises errors immediately for debugging
    )

    random_search.fit(X_train_scaled, y_train)
    
    best_params = random_search.best_params_
    print(f"\nOptimization completed in {(time.time() - start_time)/60:.2f} minutes", flush=True)
    print("Best parameters:", flush=True)
    print(best_params, flush=True)
    best_model = random_search.best_estimator_
    
except Exception as e:
    print(f"An error occurred during optimization: {str(e)}", flush=True)
    print("Retrying with n_jobs=2 and increased verbosity...", flush=True)
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist,
        n_iter=100,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=3,
        random_state=42,
        n_jobs=1,  # still using parallelism
        return_train_score=True,
        refit=True
    )
    random_search.fit(X_train_scaled, y_train)
    best_params = random_search.best_params_
    print("Best parameters:", flush=True)
    print(best_params, flush=True)
    best_model = random_search.best_estimator_

# Optionally, print out detailed results from each iteration
print("\nDetailed candidate results:", flush=True)
results = random_search.cv_results_
for i in range(len(results['mean_test_score'])):
    print(f"Iteration {i+1}: Params: {results['params'][i]}, Mean Test Score: {results['mean_test_score'][i]}", flush=True)

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    results = {
        'MSE': mean_squared_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'MAPE': mean_absolute_percentage_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred)
    }
    return results

print("\nTraining metrics:", flush=True)
train_metrics = evaluate_model(best_model, X_train_scaled, y_train)
print(train_metrics, flush=True)

print("\nTest metrics:", flush=True)
test_metrics = evaluate_model(best_model, X_test_scaled, y_test)
print(test_metrics, flush=True)


Starting hyperparameter optimization with RandomizedSearchCV...
Fitting 4 folds for each of 100 candidates, totalling 400 fits
[CV 1/4] END booster=gbtree, colsample_bytree=0.6872700594236812, gamma=0.9507143064099162, learning_rate=0.22227824312530747, max_depth=63, min_child_weight=5, n_estimators=152, reg_alpha=4.458327528535912, reg_lambda=0.9997491581800289, subsample=0.7296244459829335, tree_method=auto;, score=(train=-0.001, test=-0.001) total time=  12.2s
[CV 2/4] END booster=gbtree, colsample_bytree=0.6872700594236812, gamma=0.9507143064099162, learning_rate=0.22227824312530747, max_depth=63, min_child_weight=5, n_estimators=152, reg_alpha=4.458327528535912, reg_lambda=0.9997491581800289, subsample=0.7296244459829335, tree_method=auto;, score=(train=-0.001, test=-0.001) total time=  17.9s
[CV 3/4] END booster=gbtree, colsample_bytree=0.6872700594236812, gamma=0.9507143064099162, learning_rate=0.22227824312530747, max_depth=63, min_child_weight=5, n_estimators=152, reg_alpha=4.

In [19]:
#train an XG Boost with these parameters: Best parameters: {'booster': 'gbtree', 'colsample_bytree': 0.5681857377933849, 'gamma': 0.014544665667881929, 'learning_rate': 0.11167039205391312, 'max_depth': 75, 'min_child_weight': 3, 'n_estimators': 300, 'reg_alpha': 7.263970326502283, 'reg_lambda': 5.474463068991115, 'subsample': 0.7254552237080523, 'tree_method': 'auto'}
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                random_state=42,
                                n_jobs=-1,
                                verbosity=1,
                                booster='gbtree',
                                colsample_bytree=0.5681857377933849,
                                gamma=0.014544665667881929,
                                learning_rate=0.11167039205391312,
                                max_depth=75,
                                n_estimators=500,
                                reg_alpha=7.263970326502283,
                                reg_lambda=5.474463068991115,
                                subsample=0.7254552237080523,
                                tree_method='hist')

xgb_model.fit(X_train_scaled, y_train)

# Evaluate the model
train_metrics = evaluate_model(xgb_model, X_train_scaled, y_train)
print(train_metrics)

test_metrics = evaluate_model(xgb_model, X_test_scaled, y_test)
print(test_metrics)

{'MSE': 0.0005970056532523312, 'RMSE': 0.024433699131575046, 'MAE': 0.014441434919014182, 'MAPE': 288767340873.0447, 'R²': 0.391273016301828}
{'MSE': 0.0004427236126481175, 'RMSE': 0.02104099837574533, 'MAE': 0.014074819042456638, 'MAPE': 92119397267.18881, 'R²': -0.06017701106973794}
