# Imports

In [None]:
!pip install --quiet optuna
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from google.colab import drive
drive.mount('/content/drive')

# Data

In [None]:
# Load data
# Read the merged price/feature table from Drive and derive three views:
#   columns - copy that still carries 'Date' as an ordinary column
#   price   - the same rows indexed by 'Date'
#   RawData - independent copy of `price` used by the modelling cells
path1 = "/content/drive/MyDrive/Thesis/Data/Merging CleanPrice & Features.csv"
loaded_frame = pd.read_csv(path1, sep=',')
loaded_frame['Date'] = pd.to_datetime(loaded_frame['Date'])

columns = loaded_frame.copy()
price = loaded_frame.set_index('Date')
RawData = price.copy()

# Dataframes and global settings

In [None]:
# Placeholders
# Forecast horizons and the matching result-column names.
horizons = [1, 3, 6, 12]
hnames = ['h1', 'h3', 'h6', 'h12']

# One row per (state, date) pair, one NaN-filled column per horizon.
state_date_index = pd.MultiIndex.from_product(
    [columns['GEO_Name'].unique(), columns['Date'].unique()],
    names=['State', 'OriginalIndex'],
)
placeholder = pd.DataFrame(np.nan, index=state_date_index, columns=hnames)

# Placeholder df's
# Independent copies so that filling one frame never touches the others.
Actuals, Forecasts, Forecast_errors = (placeholder.copy() for _ in range(3))

In [None]:
# Global starting points
# Number of Optuna trials per study (passed as n_trials to study.optimize).
Trials = 10
# early_stopping_rounds handed to every XGBRegressor that is built.
Stopping = 25
# Base seed: used directly by the decision tree, and offset by the driver
# loop's counters inside train_model_for_horizon.
Random_Seed = 42
# Default n_jobs for XGBoost and for Optuna's study.optimize.
jobs = 1

# Initial settings
# Number of rows at the end of each window that are held out for validation.
validation_size = 24
# Row count of the first window (training rows plus the validation rows).
initial_train_size = 330
# Size of the first test slice and the number of driver-loop steps between
# successive re-selections / re-trainings.
retrain_period = 24

# Functions

In [None]:
def feature_selection(df, state, h, importance_threshold=0.95):
    """Pick features for one state/horizon from the initial training window.

    A decision tree is fitted on the first
    ``initial_train_size - validation_size`` rows of the lagged design matrix.
    Features are ranked by tree importance, and the leading features whose
    cumulative importance stays at or below ``importance_threshold`` are kept.

    Args:
        df: Frame holding all states; must contain 'GEO_Name', 'Year',
            'Month', 'Index_SA', 'Ln(Index_SA)' and the four
            'Log_Return_h*' columns.
        state: Value of 'GEO_Name' to filter on.
        h: Forecast horizon; also the number of rows every predictor is
            shifted by.
        importance_threshold: Cumulative-importance cut-off (default 0.95,
            the value previously hard-coded).

    Returns:
        Tuple ``(selected_features, Combined, selected_features_df)``: the
        list of selected column names, the full lagged design matrix
        (predictors, '*_lag' predictors, lagged target and target), and the
        importance rows of the selected features.

    Reads the module-level ``initial_train_size``, ``validation_size`` and
    ``Random_Seed``.
    """
    target_col = f'Log_Return_h{h}'
    State_Data = df[df['GEO_Name'] == state]

    # Target shifted by h rows, exposed to the model as '<target>_lag'.
    Log_return_lag = State_Data[[target_col]].shift(h)
    Log_return_lag = Log_return_lag.rename(lambda x: f'{x}_lag', axis='columns')
    Log_return = State_Data[target_col]

    # Drop every return/index column, then shift the remaining predictors by h.
    State_Data = State_Data.drop(
        ['Log_Return_h1', 'Log_Return_h3', 'Log_Return_h6', 'Log_Return_h12', 'Index_SA', 'Ln(Index_SA)'],
        axis=1,
    ).shift(h)
    # NOTE(review): Lag is shifted by h on top of the already h-shifted
    # State_Data, so '*_lag' predictors trail the target by 2*h rows.
    # Confirm this is intended.
    Lag = State_Data.drop(['Year', 'Month', 'GEO_Name'], axis=1).shift(h)
    Lag = Lag.rename(lambda x: f'{x}_lag', axis='columns')
    Combined = pd.concat([State_Data, Lag], axis=1).dropna()
    Combined = pd.concat([Combined, Log_return_lag], axis=1).dropna()
    Combined = pd.concat([Combined, Log_return], axis=1).dropna()

    # Filter data
    train = Combined.iloc[:initial_train_size-validation_size, :]
    y = train[[target_col]]
    X = train.drop([target_col, 'GEO_Name', 'Year', 'Month'], axis=1)

    # Train decision tree
    tree_model = DecisionTreeRegressor(random_state=Random_Seed)
    tree_model.fit(X, y)

    # Determine important features
    feature_importances = pd.DataFrame({
        'feature': X.columns,
        'importance': tree_model.feature_importances_,
        'GEO_Name': state,
        'Count': 1,
        'Horizon': f'h{h}'
    }).sort_values(by='importance', ascending=False)
    feature_importances['cumulative_importance'] = feature_importances['importance'].cumsum()

    keep_mask = feature_importances['cumulative_importance'] <= importance_threshold
    # If the single most important feature already exceeds the threshold the
    # mask is all False; keep that top feature so callers never receive an
    # empty feature set (which would leave the model with no input columns).
    if len(keep_mask) > 0 and not keep_mask.any():
        keep_mask.iloc[0] = True

    selected_features_df = feature_importances[keep_mask]
    selected_features = selected_features_df['feature'].tolist()

    print(selected_features)

    return selected_features, Combined, selected_features_df

In [None]:
def feature_selection_loop(df, state, h, t, importance_threshold=0.95):
    """Pick features for one state/horizon from the rolling window at step t.

    Same procedure as the initial selection, but the decision tree is fitted
    on rows ``[t - initial_train_size + retrain_period - 1,
    t - validation_size + retrain_period - 1)`` of the lagged design matrix.

    Args:
        df: Frame holding all states; must contain 'GEO_Name', 'Year',
            'Month', 'Index_SA', 'Ln(Index_SA)' and the four
            'Log_Return_h*' columns.
        state: Value of 'GEO_Name' to filter on.
        h: Forecast horizon; also the number of rows every predictor is
            shifted by.
        t: Current position of the rolling loop (row offset into the design
            matrix).
        importance_threshold: Cumulative-importance cut-off (default 0.95,
            the value previously hard-coded).

    Returns:
        Tuple ``(selected_features, Combined, selected_features_df)``: the
        list of selected column names, the full lagged design matrix, and
        the importance rows of the selected features.

    Reads the module-level ``initial_train_size``, ``validation_size``,
    ``retrain_period`` and ``Random_Seed``. The 'Count' column is filled from
    the module-level ``i`` (the driver loop's counter), which must therefore
    exist when this function is called.
    """
    target_col = f'Log_Return_h{h}'
    State_Data = df[df['GEO_Name'] == state]

    # Target shifted by h rows, exposed to the model as '<target>_lag'.
    Log_return_lag = State_Data[[target_col]].shift(h)
    Log_return_lag = Log_return_lag.rename(lambda x: f'{x}_lag', axis='columns')
    Log_return = State_Data[target_col]

    # Drop every return/index column, then shift the remaining predictors by h.
    State_Data = State_Data.drop(
        ['Log_Return_h1', 'Log_Return_h3', 'Log_Return_h6', 'Log_Return_h12', 'Index_SA', 'Ln(Index_SA)'],
        axis=1,
    ).shift(h)
    # NOTE(review): Lag is shifted by h on top of the already h-shifted
    # State_Data, so '*_lag' predictors trail the target by 2*h rows.
    # Confirm this is intended.
    Lag = State_Data.drop(['Year', 'Month', 'GEO_Name'], axis=1).shift(h)
    Lag = Lag.rename(lambda x: f'{x}_lag', axis='columns')
    Combined = pd.concat([State_Data, Lag], axis=1).dropna()
    Combined = pd.concat([Combined, Log_return_lag], axis=1).dropna()
    Combined = pd.concat([Combined, Log_return], axis=1).dropna()

    # Filter data
    window_start = t - initial_train_size + retrain_period - 1
    window_end = t - validation_size + retrain_period - 1
    train = Combined.iloc[window_start:window_end, :]
    y = train[[target_col]]
    X = train.drop([target_col, 'GEO_Name', 'Year', 'Month'], axis=1)

    # Train decision tree
    tree_model = DecisionTreeRegressor(random_state=Random_Seed)
    tree_model.fit(X, y)

    # Determine important features
    feature_importances = pd.DataFrame({
        'feature': X.columns,
        'importance': tree_model.feature_importances_,
        'GEO_Name': state,
        'Count': i,  # module-level loop counter, not a parameter
        'Horizon': f'h{h}'
    }).sort_values(by='importance', ascending=False)
    feature_importances['cumulative_importance'] = feature_importances['importance'].cumsum()

    keep_mask = feature_importances['cumulative_importance'] <= importance_threshold
    # If the single most important feature already exceeds the threshold the
    # mask is all False; keep that top feature so callers never receive an
    # empty feature set (which would leave the model with no input columns).
    if len(keep_mask) > 0 and not keep_mask.any():
        keep_mask.iloc[0] = True

    selected_features_df = feature_importances[keep_mask]
    selected_features = selected_features_df['feature'].tolist()

    print(selected_features)

    return selected_features, Combined, selected_features_df

In [None]:
def train_model_for_horizon(train, val, y_key, jobs=jobs):
    """Tune an XGBoost regressor with Optuna and refit it on train + val.

    Args:
        train: Training frame containing the target column and the features.
        val: Validation frame with the same columns as ``train``.
        y_key: Name of the target column.
        jobs: n_jobs for XGBoost and for Optuna's ``study.optimize``.

    Returns:
        Tuple ``(best_model, feature_importances)``: the regressor refitted
        on the concatenated train and validation rows, and a frame of its
        feature importances sorted in descending order.

    Reads the module-level ``Random_Seed``, ``Stopping``, ``Trials`` and the
    driver-loop variables ``i``, ``w``, ``q``, ``state`` and ``h``; all of
    them must exist when this function is called.
    """
    # Seed offset by the driver-loop counters.
    seed = Random_Seed + i + w + q

    # The splits do not change between trials, so build them once.
    X_train, y_train = train.drop(columns=[y_key]), train[y_key]
    X_val, y_val = val.drop(columns=[y_key]), val[y_key]

    def objective(trial):
        # Hyperparameters to optimize (suggested in a fixed order).
        search_point = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 7),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1, log=True),
        }

        candidate = xgb.XGBRegressor(
            n_jobs=jobs,
            random_state=seed,
            tree_method='hist',
            early_stopping_rounds=Stopping,
            **search_point)

        # Fit on the training rows, early-stopping on the validation rows.
        candidate.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # The study maximizes, so return the negated validation MSE.
        return -mean_squared_error(y_val, candidate.predict(X_val))

    # Optuna study for hyperparameter optimization
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective, n_trials=Trials, n_jobs=jobs)

    # Rebuild the regressor from the best trial's parameters.
    best_model = xgb.XGBRegressor(
        n_jobs=jobs,
        random_state=seed,
        tree_method='hist',
        early_stopping_rounds=Stopping,
        **study.best_params)

    # Final fit on train + validation; the early-stopping eval set is the
    # very same data the model is fitted on.
    combined_train_val = pd.concat([train, val])
    X_full = combined_train_val.drop(columns=[y_key])
    y_full = combined_train_val[y_key]
    best_model.fit(X_full, y_full, eval_set=[(X_full, y_full)], verbose=False)

    importance_rows = {
        'feature': X_full.columns,
        'importance': best_model.feature_importances_,
        'GEO_Name': state,
        'Count': i,
        'Horizon': f'h{h}'
    }
    feature_importances = pd.DataFrame(importance_rows).sort_values(by='importance', ascending=False)

    return best_model, feature_importances

# Predictions

In [None]:
# df's
# Distinct states to iterate over, plus accumulators for the decision-tree
# selections and the XGBoost importances collected from every fit.
states = RawData['GEO_Name'].unique()
all_selected_features_df = pd.DataFrame()
Model_Importance = pd.DataFrame()

# Counters read as globals by train_model_for_horizon to offset its seed:
# w advances once per (state, horizon) pair, q once per state.
w = 0
q = 0

for state in states:
    # NOTE(review): StateData is not referenced again in this cell.
    StateData = RawData[RawData['GEO_Name'] == state]

    q += 1

    for h, hname in zip(horizons, hnames):

        w += 1

        print(f"\nProcessing: {state}")
        print(f'Horizon: {h} \n')

        # Feature selection on the initial training window.
        selected_feature_list, Combined, selected_features_df = feature_selection(RawData, state, h)
        all_selected_features_df = pd.concat([all_selected_features_df, selected_features_df], ignore_index=True)

        # Defining Target & Feature
        Target = Combined[[f'Log_Return_h{h}']]
        Feature = Combined[selected_feature_list]

        Data = pd.concat([Target, Feature], axis=1).dropna()

        y_key = f'Log_Return_h{h}'

        # Initial training and validation
        train_initial = Data.iloc[:initial_train_size-validation_size, :]
        val_initial = Data.iloc[initial_train_size-validation_size:initial_train_size, :]

        # i is read as a global by train_model_for_horizon (seed offset and
        # 'Count' column), so it must be set before the call.
        i = 0

        model, feature_importances = train_model_for_horizon(train_initial, val_initial, y_key, jobs=jobs)

        Model_Importance = pd.concat([Model_Importance, feature_importances], ignore_index=True)

        # First test slice: up to retrain_period rows starting h rows after
        # the initial window.
        test = Data.iloc[initial_train_size+h:initial_train_size+retrain_period+h, :]
        forecast = model.predict(test.drop(columns=[y_key]))

        # NOTE(review): iloc truncates silently, so if fewer than
        # retrain_period rows remain, test.index[i] raises IndexError here.
        for i in range(retrain_period):
          Forecasts.loc[(state, test.index[i]), hname] = forecast[i]
          Actuals.loc[(state, test.index[i]), hname] = test[y_key].values[i]

        # The range bounds are evaluated once; rebinding Data inside the
        # loop does not change the number of iterations.
        for t in range(initial_train_size, len(Data), 1):

            # Cross validation (Nested Validation)
            # i enters this loop as retrain_period - 1 (left over from the
            # loop above). With retrain_period = 24 the first pass skips this
            # branch, and re-selection plus re-training happen on the second
            # pass and every retrain_period passes after that.
            if i % retrain_period == 0:

                selected_feature_list, Combined, selected_features_df = feature_selection_loop(RawData, state, h, t)
                all_selected_features_df = pd.concat([all_selected_features_df, selected_features_df], ignore_index=True)

                # Defining Target & Feature
                Target = Combined[[f'Log_Return_h{h}']]
                Feature = Combined[selected_feature_list]
                Data = pd.concat([Target, Feature], axis=1).dropna()
                # Same row window as used inside feature_selection_loop,
                # followed by the validation_size rows right after it.
                train = Data.iloc[t - initial_train_size + retrain_period-1:t - validation_size + retrain_period-1, :]
                val = Data.iloc[t-validation_size + retrain_period-1:t + retrain_period-1, :]

                # NOTE(review): jobs is hard-coded to 1 here, whereas the
                # initial fit passes the global `jobs`.
                model, feature_importances = train_model_for_horizon(train, val, y_key, jobs=1)

                Model_Importance = pd.concat([Model_Importance, feature_importances], ignore_index=True)

                print(f'\n Length {t}')

            i += 1

            # Single-row test slice; an empty slice means the data is
            # exhausted for this state/horizon.
            test = Data.iloc[t+retrain_period+h-1:t+retrain_period+h, :]
            if test.empty:
              break

            forecast = model.predict(test.drop(columns=[y_key]))

            # Appending predictions
            Forecasts.loc[(state, test.index[0]), hname] = forecast[0]
            Actuals.loc[(state, test.index[0]), hname] = test[y_key].values[0]

# Move 'State' out of the MultiIndex into a regular column.
Forecasts = Forecasts.reset_index(level='State')
Actuals = Actuals.reset_index(level='State')

# Feature Selection
# Mean decision-tree importance per (feature, state, horizon): summed
# importance divided by the number of distinct 'Count' values recorded for
# that (state, horizon).
importance_summary = (
    all_selected_features_df
    .groupby(['feature', 'GEO_Name', 'Horizon'])['importance']
    .sum()
    .reset_index()
)
count_summary = (
    all_selected_features_df
    .groupby(['GEO_Name', 'Horizon'])['Count']
    .nunique()
    .reset_index()
    .rename(columns={'Count': 'Unique_Counts'})
)
summary = importance_summary.merge(count_summary, on=['GEO_Name', 'Horizon'], how='left')
summary['mean_importance'] = summary['importance'] / summary['Unique_Counts']
summary = summary.drop(columns=['Unique_Counts', 'importance'])
summary_sorted = summary.sort_values(
    by=['GEO_Name', 'Horizon', 'mean_importance'],
    ascending=[True, True, False],
).reset_index(drop=True)

# Feature Importance
# Same aggregation applied to the XGBoost importances.
Importance_Model = (
    Model_Importance
    .groupby(['feature', 'GEO_Name', 'Horizon'])['importance']
    .sum()
    .reset_index()
)
count_summary_model = (
    Model_Importance
    .groupby(['GEO_Name', 'Horizon'])['Count']
    .nunique()
    .reset_index()
    .rename(columns={'Count': 'Unique_Counts'})
)
summary_model = Importance_Model.merge(count_summary_model, on=['GEO_Name', 'Horizon'], how='left')
summary_model['mean_importance'] = summary_model['importance'] / summary_model['Unique_Counts']
summary_model = summary_model.drop(columns=['Unique_Counts', 'importance'])
summary_model_sorted = summary_model.sort_values(
    by=['GEO_Name', 'Horizon', 'mean_importance'],
    ascending=[True, True, False],
).reset_index(drop=True)

# Saving CSV of Predictions, Actuals, Feature Selection and Importances
# Each frame is written to its own file on the mounted Drive.
for output_frame, output_path in (
    (Forecasts, "/content/drive/MyDrive/Thesis/Models/Predictions/XGBoostPredictions.csv"),
    (Actuals, "/content/drive/MyDrive/Thesis/Models/Predictions/XGBoostActuals.csv"),
    (all_selected_features_df, "/content/drive/MyDrive/Thesis/Models/Predictions/XGBoostFeatureImportance.csv"),
    (Model_Importance, "/content/drive/MyDrive/Thesis/Models/Predictions/XGBoostFeatureImportanceModel.csv"),
):
    output_frame.to_csv(output_path)

In [None]:
# END