# Imports

In [25]:
import sys
import os
from google.colab import drive
from google.colab import files
from dotenv import load_dotenv
import json
import warnings
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Suppress every Python warning for the rest of the session.
# Note: this also hides deprecation notices emitted by libraries used below.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline, in the cell output.
%matplotlib inline

# Bootstrap

In [26]:
# Seed NumPy's global RNG so np.random-based steps are repeatable across runs.
np.random.seed(31071967)

# Find and load the .env file from the current or parent directories
load_dotenv()

# Make Google Drive available under /content/drive (Colab only).
drive.mount('/content/drive')

# PROJECT_PATH is read from the environment (possibly populated by the .env file above).
# Fail early with an explicit message instead of trying to open "None/src/config.json".
project_path = os.getenv('PROJECT_PATH')
if not project_path:
    raise OSError("PROJECT_PATH is not set; define it in the environment or in a .env file")

# The context manager closes the file on exit; no explicit close() is needed.
with open(f"{project_path}/src/config.json", 'r') as f:
    project_config = json.load(f)

# Drop annotation-only keys so that only real settings remain in the config dict.
project_config.pop('_comment', None)
project_config.pop('_note', None)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Download the TKL ticker and reference market series from Yahoo Finance (YF)

In [27]:
# Friendly column name -> Yahoo Finance symbol.
# "y" is the series to predict; its symbol comes from the project config (TKL).
# NOTE(review): ^TNX is presumably the 10-year Treasury yield used as an
# inflation-expectation proxy — confirm.
tickers_yf = dict(
    y=f"{project_config['TKL']}",
    NASDAQ="^IXIC",
    SP500="^GSPC",
    Gold="GC=F",
    Oil="CL=F",
    RealEstate="VNQ",
    InflationExp="^TNX",
)

# Column layout applied to the assembled frames: Date first, then the target,
# then the remaining series.
desired_order = [
    "Date", "y",
    "NASDAQ", "SP500",
    "Oil", "Gold",
    "RealEstate", "InflationExp",
]

In [28]:
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr  # not referenced in this cell

from datetime import date, timedelta

# Download window: ends yesterday and reaches back HISTORY_DEPTH days (from the config).
end_date = date.today() - timedelta(days=1)
start_date = end_date - timedelta(days=int(project_config["HISTORY_DEPTH"]))

# Special case: for TNYA the history is pinned to start on 30 July 2022.
if project_config['TKL'] == 'TNYA':
    start_date = pd.to_datetime("30.07.2022", format="%d.%m.%Y")

# ---- DOWNLOAD FROM YAHOO FINANCE ----
# Only the "Close" prices are kept (one column per ticker symbol).
# NOTE(review): yfinance documents `end` as exclusive, so the row for end_date
# itself is presumably not returned — confirm that this is intended.
ts_yf = yf.download(
    tickers=list(tickers_yf.values()),
    start=start_date,
    end=end_date,
    auto_adjust=True
)["Close"]

# Rename columns from Yahoo symbols to the readable names used as keys in tickers_yf.
rename_map = {symbol: name for name, symbol in tickers_yf.items()}
ts_yf = ts_yf.rename(columns=rename_map)

# Fill gaps (dates on which some series have no value): carry the last known value
# forward, then back-fill whatever is still missing at the start of the window.
# .ffill()/.bfill() replace the deprecated fillna(method=...) form.
ts_yf = ts_yf.ffill().bfill()

# Turn the date index into a regular "Date" column.
ts_yf = ts_yf.reset_index()

print(f"\n\nDataset for y={project_config['TKL']}")
display(ts_yf.head(1))
display(ts_yf.tail(1))
ts_yf.info()

[*********************100%***********************]  7 of 7 completed



Dataset for y=NVDA





Ticker,Date,Oil,Gold,y,RealEstate,SP500,NASDAQ,InflationExp
0,2006-10-23,58.810001,579.700012,0.48256,33.903469,1377.02002,2355.560059,4.826


Ticker,Date,Oil,Gold,y,RealEstate,SP500,NASDAQ,InflationExp
4823,2025-12-19,56.66,4361.399902,180.990005,88.589996,6834.5,23307.619141,4.151


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4824 entries, 0 to 4823
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          4824 non-null   datetime64[ns]
 1   Oil           4824 non-null   float64       
 2   Gold          4824 non-null   float64       
 3   y             4824 non-null   float64       
 4   RealEstate    4824 non-null   float64       
 5   SP500         4824 non-null   float64       
 6   NASDAQ        4824 non-null   float64       
 7   InflationExp  4824 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 301.6 KB


In [29]:
from sklearn.preprocessing import MinMaxScaler


def _add_next_day_target(frame, target_col="y"):
    """Return a copy of ``frame`` with a ``y_next`` column and gaps filled.

    ``y_next`` holds the following row's ``target_col`` value. The last row has
    no following row, so its ``y_next`` is set to its own ``target_col`` value
    as a placeholder. Missing and infinite values are then filled forward and
    backward, in the same order as the original inline code.
    """
    out = frame.copy()  # explicit copy: never write into a slice of the caller's frame
    out["y_next"] = out[target_col].shift(-1)
    last = out.index[-1]
    out.loc[last, "y_next"] = out.loc[last, target_col]

    # .ffill()/.bfill() replace the deprecated fillna(method=...) form.
    out = out.ffill().bfill()
    out = out.replace([np.inf, -np.inf], np.nan)
    return out.ffill().bfill()


df = ts_yf.copy()
df_orig = ts_yf.copy()

# Keep Date
date_col = df["Date"]

# Targets
y_col = ['y']

# Features
X_cols = df.drop(columns=y_col+['Date']).columns

# Initialize scalers (separate ones, so the target scaling can be handled on its own)
# NOTE(review): both scalers are fitted on the complete history, not on a training
# split only — confirm this is acceptable for the downstream evaluation.
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# Scale
df_X_scaled = pd.DataFrame(X_scaler.fit_transform(df[X_cols]),
                           columns=X_cols, index=df.index)

df_y_scaled = pd.DataFrame(y_scaler.fit_transform(df[y_col]),
                           columns=y_col, index=df.index)

# Rebuild dataframe
df = pd.concat([date_col, df_X_scaled, df_y_scaled], axis=1)

# df holds the scaled values, df_orig the unscaled ones; both get the same
# column order and the same y_next construction.
df = _add_next_day_target(df[desired_order])
df_orig = _add_next_day_target(df_orig[desired_order])

display(df.tail(1))
display(df_orig.tail(1))

Unnamed: 0,Date,y,NASDAQ,SP500,Oil,Gold,RealEstate,InflationExp,y_next
4823,2025-12-19,0.874145,0.971315,0.989316,0.515471,1.0,0.87179,0.769004,0.874145


Ticker,Date,y,NASDAQ,SP500,Oil,Gold,RealEstate,InflationExp,y_next
4823,2025-12-19,180.990005,23307.619141,6834.5,56.66,4361.399902,88.589996,4.151,180.990005


In [30]:
from pathlib import Path
import ast

# Directory that holds the saved model and the pickled frames.
pickles_path = Path(f"{os.getenv('PROJECT_PATH')}{project_config['pickles_directory']}")
folder = pickles_path

# Look for a saved model of this ticker. The file name is parsed below as
#   <TKL>.<tag>.<model name>.<feature list literal>.keras
# Sorting makes the choice deterministic when several files match.
model_files = sorted(folder.glob(f"{project_config['TKL']}.model*.keras"))

if model_files:
    model_path = model_files[0]

    fname = model_path.name  # extract filename only
    base = fname.removesuffix(".keras")
    tkl_name, tag, model_and_features = base.split(".", maxsplit=2)
    best_model_name, features_str = model_and_features.rsplit(".", maxsplit=1)
    # literal_eval only parses Python literals; it does not execute code.
    best_model_features = ast.literal_eval(features_str)

    print(f"TKL: {tkl_name}")
    print(f"Model: {best_model_name}")
    print(f"Features: {best_model_features}")

else:
    # No saved model found: fall back to a default model name and feature list.
    tkl_name = project_config['TKL']
    best_model_name = 'GRU'
    best_model_features = ['y']

# Output locations for the pickled frames. They are always computed here; the
# earlier glob-based lookups were overwritten by these lines anyway and raised
# IndexError when a model file existed but the pickles did not.
# NOTE: plain string concatenation — relies on pickles_directory ending with a path separator.
df_path = f"{os.getenv('PROJECT_PATH')}{project_config['pickles_directory']}{project_config['TKL']}.df.pkl"
df_orig_path = f"{os.getenv('PROJECT_PATH')}{project_config['pickles_directory']}{project_config['TKL']}.df_orig.pkl"

# Keep only Date, the target and the selected features. The last row is kept as
# well; its y_next is the placeholder set when the target column was built.
# NOTE: this cell rebinds df / df_orig, so re-running it without re-running the
# previous cell fails (df_orig no longer has a 'y_next' column after the rename).
df = df[['Date', 'y_next'] + best_model_features]
df_orig = df_orig[['Date', 'y_next'] + best_model_features]
# Suffix every unscaled column except Date with "_orig".
df_orig = df_orig.rename(columns=lambda c: c if c == "Date" else f"{c}_orig")

df.to_pickle(df_path)
df_orig.to_pickle(df_orig_path)

TKL: NVDA
Model: GRU
Features: ['y']


In [32]:
# Show the final row of the scaled frame, then of the unscaled one.
for frame in (df, df_orig):
    display(frame.tail(1))

Unnamed: 0,Date,y_next,y
4823,2025-12-19,0.874145,0.874145


Ticker,Date,y_next_orig,y_orig
4823,2025-12-19,180.990005,180.990005
