# Imports

In [None]:
import sys
import os
from google.colab import drive
from google.colab import files
from dotenv import load_dotenv
import json
import warnings
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
%matplotlib inline

# Bootstrap

In [None]:
# Seed NumPy's global RNG so any NumPy-based randomness is repeatable.
np.random.seed(31071967)

# Find and load the .env file from the current or parent directories
load_dotenv()

drive.mount('/content/drive')

# Fail early with a readable message: without this check a missing variable
# would be formatted into the path as the literal text "None".
project_path = os.getenv('PROJECT_PATH')
if project_path is None:
    raise OSError("PROJECT_PATH is not set; define it in the environment or in the .env file")

# The context manager closes the file on exit, so no explicit close() is needed.
with open(f"{project_path}/src/config.json", 'r') as f:
    project_config = json.load(f)

# Drop the annotation-only keys so only real settings remain in the config.
project_config.pop('_comment', None)
project_config.pop('_note', None)

# Download TKL data from YF

In [None]:
# Readable column name -> Yahoo Finance symbol.
# "y" is the series to predict; its symbol comes from the project config.
# NOTE(review): "^TNX" is believed to be the CBOE 10-year Treasury yield index;
# confirm it is the intended proxy for "InflationExp".
tickers_yf = {
    "y": str(project_config['TKL']),
    "NASDAQ": "^IXIC",
    "SP500": "^GSPC",
    "Gold": "GC=F",
    "Oil": "CL=F",
    "RealEstate": "VNQ",
    "InflationExp": "^TNX",
}

# Column layout used for the working frames: date first, then the target,
# then the explanatory series.
desired_order = [
    "Date", "y",
    "NASDAQ", "SP500",
    "Oil", "Gold",
    "RealEstate", "InflationExp",
]

In [None]:
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr

from datetime import date, timedelta

# Download window: HISTORY_DEPTH calendar days ending yesterday.
end_date = date.today() - timedelta(days=1)
start_date = end_date - timedelta(days=int(project_config["HISTORY_DEPTH"]))

# ---- DOWNLOAD FROM YAHOO FINANCE ----
# Only the (auto-adjusted) closing prices are kept, one column per symbol.
ts_yf = yf.download(
    tickers=list(tickers_yf.values()),
    start=start_date,
    end=end_date,
    auto_adjust=True
)["Close"]

# Rename the symbol columns to the readable names used in the rest of the notebook.
rename_map = {v: k for k, v in tickers_yf.items()}
ts_yf = ts_yf.rename(columns=rename_map)

# Fill gaps: carry the last known value forward, then back-fill anything that
# is still missing at the very start of the window.
# (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
ts_yf = ts_yf.ffill().bfill()

# Turn the date index into a regular "Date" column.
ts_yf = ts_yf.reset_index()

print(f"\n\nDataset for y={project_config['TKL']}")
display(ts_yf.head(1))
display(ts_yf.tail(1))
ts_yf.info()

# Turn time-series into a scaled supervised ML table

In [None]:
# Two working copies of the downloaded table: `df` is scaled below,
# `df_orig` keeps the raw prices.
df = ts_yf.copy()
df_orig = ts_yf.copy()

# The date column is set aside so it is not passed through a scaler.
date_col = df["Date"]

# Target column(s) and feature columns (everything except target and date).
y_col = ['y']
X_cols = df.drop(columns=y_col+['Date']).columns

# Separate scalers so the target can later be un-scaled on its own.
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# Fit and apply each scaler to its own column group, keeping labels and index.
scaled_parts = [
    pd.DataFrame(scaler.fit_transform(df[cols]), columns=cols, index=df.index)
    for scaler, cols in ((X_scaler, X_cols), (y_scaler, y_col))
]

# Reassemble date + scaled features + scaled target, then put both frames
# into the agreed column order.
df = pd.concat([date_col, *scaled_parts], axis=1)[desired_order]
df_orig = df_orig[desired_order]

del scaled_parts

display(df.tail(1), df_orig.tail(1))

In [1]:
def generate_ts_features(df, lags=(2,3,4,5,6,10,22,66), windows=(5,10,22,66)):
  """Build lag / rolling-window features and the next-step target.

  For every column except 'Date' and 'index' the result gains:
    * ``<col>_lag_<n>``  - the value ``n`` rows earlier, for each ``n`` in ``lags``
    * ``<col>_min_<w>``, ``_max_<w>``, ``_mean_<w>``, ``_std_<w>`` - rolling
      statistics over ``w`` rows, for each ``w`` in ``windows``
    * ``<col>_diff_<w>`` and ``<col>_pct_<w>`` - absolute and relative change
      versus ``w`` rows earlier

  A ``y_next`` column holds the following row's ``y``; on the last row, which
  has no successor, it is set to that row's own ``y`` as a placeholder.
  Remaining gaps are forward-filled and then back-filled.

  Args:
    df: DataFrame with a 'y' column; it is not modified.
    lags: iterable of row offsets for the lag features.
    windows: iterable of window sizes for the rolling / diff / pct features.

  Returns:
    A new DataFrame with the original columns followed by the generated ones.

  Raises:
    KeyError: if ``df`` has no 'y' column.
  """
  import pandas as pd

  # Columns to derive features from (the date / index columns are skipped).
  base_cols = [c for c in df.columns if c not in ('Date', 'index')]

  # Collect every new column first and attach them in one concat: this leaves
  # the caller's frame untouched and avoids repeated single-column inserts.
  features = {}
  for base_col in base_cols:
    series = df[base_col]

    for lag in lags:
      features[f"{base_col}_lag_{lag}"] = series.shift(lag)

    # Rolling-window statistics and window-length changes for this column.
    for window in windows:
      rolling = series.rolling(window=window)
      features[f"{base_col}_min_{window}"] = rolling.min()
      features[f"{base_col}_max_{window}"] = rolling.max()
      features[f"{base_col}_mean_{window}"] = rolling.mean()
      features[f"{base_col}_std_{window}"] = rolling.std()
      features[f"{base_col}_diff_{window}"] = series.diff(window)
      features[f"{base_col}_pct_{window}"] = series.pct_change(window)

  # y_next = the following row's y (tomorrow's close).
  features['y_next'] = df['y'].shift(-1)

  new_cols = pd.concat(features, axis=1)
  # If a generated name already exists in the input, the new values replace it.
  kept = df.drop(columns=[c for c in new_cols.columns if c in df.columns])
  out = pd.concat([kept, new_cols], axis=1)

  # The last row has no "tomorrow": use today's y as a placeholder target.
  if len(out):
    out.loc[out.index[-1], 'y_next'] = out.loc[out.index[-1], 'y']

  # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
  return out.ffill().bfill()

# Build the feature table twice: once on the scaled data, once on raw prices.
df = generate_ts_features(df)
df_orig = generate_ts_features(df_orig)

# pct_change divides by an earlier value, so a zero there (e.g. the minimum of
# a min-max-scaled column) yields +/-inf. Turn those into NaN and fill them
# from neighbouring rows. Both frames get the same treatment because columns
# of df_orig are exported next to the scaled ones later on.
# (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
df = df.replace([np.inf, -np.inf], np.nan).ffill().bfill()
df_orig = df_orig.replace([np.inf, -np.inf], np.nan).ffill().bfill()

display(df.tail(1))
display(df_orig.tail(1))

NameError: name 'df' is not defined

# Split

In [None]:
# Model inputs are every column except the date and the target itself.
feature_frame = df.drop(columns=['Date','y_next'])
target_series = df['y_next']

# Training uses every row except the final two; the final row alone is the
# row to predict.
# NOTE(review): the second-to-last row is in neither set although its y_next
# is the last observed 'y' - confirm that excluding it is intended.
train_rows = slice(None, -2)
predict_rows = slice(-1, None)

X_train = feature_frame.iloc[train_rows]
X_test = feature_frame.iloc[predict_rows]
y_train = target_series.iloc[train_rows]
y_test = target_series.iloc[predict_rows]

del feature_frame, target_series, train_rows, predict_rows

X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Train, Predict, Evaluate

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Gradient-boosted regression trees trained with a squared-error objective.
xgb = XGBRegressor(
    objective="reg:squarederror",
    # ensemble size and step size
    n_estimators=300,
    learning_rate=0.05,
    # tree complexity
    max_depth=5,
    # row / column sampling fractions per tree
    subsample=0.8,
    colsample_bytree=0.8,
)

# Fit on the training rows; the fitted estimator is the cell's output.
xgb.fit(X_train, y_train)

In [None]:
# Predict for the held-out row(s) in X_test (the last row of the feature table).
pred = xgb.predict(X_test)
# NOTE(review): y_test is the last row's 'y_next', which generate_ts_features
# fills with that same row's 'y' as a placeholder. This MAE therefore measures
# the distance to the latest known (scaled) close, not to a realised next-day
# value - confirm this is the intended sanity check.
mae = mean_absolute_error(y_test, pred)
print("MAE:", mae)

In [None]:
# Features of the most recent row; the model's output for it is the scaled
# estimate of the next close.
last_row = X_test
scaled_tomorrow_prediction = xgb.predict(last_row)[0]

# Undo the scaling with the scaler that was fitted on 'y'. 'y_next' is just
# 'y' shifted by one row, so it lives on exactly that scale. Re-deriving
# min/max from df_orig['y_next'] is not equivalent: that column omits the
# first 'y' value, so the range is wrong whenever the first close is the
# minimum or maximum of the history.
unscaled_tomorrow_prediction = float(
    y_scaler.inverse_transform([[float(scaled_tomorrow_prediction)]])[0, 0]
)

# Store the prediction in the placeholder target cell of both frames.
df.loc[df.index[-1], 'y_next'] = scaled_tomorrow_prediction
df_orig.loc[df_orig.index[-1], 'y_next'] = unscaled_tomorrow_prediction

# "Date" is the date of the last available row, i.e. the day the prediction
# was made from.
print(f"Stock: {project_config['TKL']}")
print(f"Date: { df['Date'].tail(1).iloc[0].date()}")
print(f"Close Prediction: ${unscaled_tomorrow_prediction:.2f}")

In [None]:
# Show the final row of the scaled table, then of the raw-price table.
display(df.tail(1), df_orig.tail(1))

# Feature selection

In [None]:
import matplotlib.pyplot as plt
import xgboost # Import the xgboost module

# Plot feature importance based on "weight" (number of times a feature appears in a tree).
# The axes are created here and handed to plot_importance: without `ax` it
# creates its own figure, leaving an empty 10x6 figure behind and ignoring
# the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
xgboost.plot_importance(xgb, ax=ax, max_num_features=20)
ax.set_title("Feature Importance (Weight)")
plt.show()

# 1. Get feature importance by weight
importance_dict = xgb.get_booster().get_score(importance_type='weight')

# 2. Convert to DataFrame
df_importance = pd.DataFrame(list(importance_dict.items()), columns=['Feature', 'Score'])

# 3. Sort high -> low
df_importance = df_importance.sort_values(by='Score', ascending=False)

# 4. Compute threshold = 5% of the top feature's score
top_score = df_importance['Score'].iloc[0]
threshold = top_score * 0.05

# 5. Select only features at or above the threshold
df_top = df_importance[df_importance['Score'] >= threshold]

# 6. Keep at most the 10 strongest of them, as a list of names
top_weight_features_list = df_top['Feature'].tolist()[:10]

print("Weight Threshold:", threshold)
print("Selected Weight Features:", top_weight_features_list)

In [None]:
# OPTIONAL: Plot based on "gain" (average gain of splits which use the feature)
# "Gain" is often more accurate for finding what actually drives the prediction.
# The axes are created here and handed to plot_importance: without `ax` it
# creates its own figure, leaving an empty 10x6 figure behind and ignoring
# the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
xgboost.plot_importance(xgb, ax=ax, importance_type='gain', max_num_features=20)
ax.set_title("Feature Importance (Gain)")
plt.show()

# 1. Build importance DataFrame
importance_dict = xgb.get_booster().get_score(importance_type='gain')
df_importance = pd.DataFrame(list(importance_dict.items()), columns=['Feature', 'Score'])

# 2. Sort high to low
df_importance = df_importance.sort_values(by='Score', ascending=False)

# 3. Compute threshold = 5% of the top feature's score
top_score = df_importance['Score'].iloc[0]
threshold = top_score * 0.05

# 4. Select features with Score >= threshold
df_top = df_importance[df_importance['Score'] >= threshold]

# 5. Extract at most the 10 strongest feature names
top_gain_features_list = df_top['Feature'].tolist()[:10]

print("Threshold:", threshold)
print("Selected features:", top_gain_features_list)

In [None]:
# Union of the gain-based and weight-based selections without duplicates.
# dict.fromkeys keeps first-seen order, so the resulting column order is the
# same on every run (iteration order of a set of strings is not).
top_features = list(dict.fromkeys(top_gain_features_list + top_weight_features_list))
top_features

# Final df for this TKL

In [None]:
# Scaled date/target/selected-feature columns. The explicit copy makes
# final_df an independent frame, so adding columns below never writes
# through a slice of df (no chained-assignment ambiguity).
final_df = df[['Date','y_next'] + top_features].copy()

# Attach the raw (unscaled) value of every selected column and of the target
# under a "<name>_orig" column.
for col in top_features + ['y_next']:
  final_df[f"{col}_orig"] = df_orig[col]

display(final_df.tail(1))
final_df.info()

# CSV

In [None]:
# Export only when the config flag is the string '1'.
if project_config["dataprep.override_csv"] == '1':

  # Output path: project root + data directory + "<ticker>.df.csv",
  # joined by plain concatenation.
  df_csv_path = f"{os.getenv('PROJECT_PATH')}{project_config['data_directory']}{project_config['TKL']}.df.csv"

  # Write every row except the final two.
  final_df.iloc[:-2].to_csv(df_csv_path)