# Imports

In [1]:
import sys
import os
from google.colab import drive
from google.colab import files
from dotenv import load_dotenv
import json
import warnings
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Suppress every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Bootstrap

In [2]:
# Seed NumPy's global RNG so NumPy-based randomness is repeatable between runs.
np.random.seed(31071967)

# Find and load the .env file from the current or parent directories
load_dotenv()

# Mount Google Drive at /content/drive.
# NOTE(review): presumably PROJECT_PATH points inside the mounted drive — confirm.
drive.mount('/content/drive')

# Fail early with a clear message instead of trying to open "None/src/config.json".
project_path = os.getenv('PROJECT_PATH')
if not project_path:
    raise RuntimeError(
        "PROJECT_PATH is not set; define it in the environment or in the .env file."
    )

# The context manager closes the file on exit, so no explicit close() is needed.
with open(f"{project_path}/src/config.json", 'r', encoding='utf-8') as f:
    project_config = json.load(f)

# Remove the '_comment' and '_note' entries (if present) so only real settings remain.
project_config.pop('_comment', None)
project_config.pop('_note', None)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Download ticker (TKL) price history from Yahoo Finance (YF)

In [3]:
# Yahoo Finance symbols to download, keyed by the column name each series gets
# after renaming. The single series "y" is the ticker configured as TKL.
tickers_yf = dict(y=str(project_config['TKL']))

# Presumably the intended column order of the dataset; it is not referenced in
# the cells visible here.
desired_order = ["Date", "y"]

In [4]:
import yfinance as yf
import pandas as pd
from pandas_datareader import data as pdr

from datetime import date, timedelta
# Download window: it ends yesterday and reaches back HISTORY_DEPTH days.
end_date = date.today() - timedelta(days=1)
history_depth_days = int(project_config["HISTORY_DEPTH"])
start_date = end_date - timedelta(days=history_depth_days)

# TNYA is pinned to a fixed start date instead of the configured depth.
if project_config['TKL'] == 'TNYA':
    start_date = pd.to_datetime("30.07.2022", format="%d.%m.%Y")

# ---- DOWNLOAD FROM YAHOO FINANCE ----
# Only the "Close" prices of the requested tickers are kept.
ts_yf = yf.download(
    tickers=list(tickers_yf.values()),
    start=start_date,
    end=end_date,
    auto_adjust=True
)["Close"]

# Stop here with a clear message rather than failing later on an empty frame.
if ts_yf.empty:
    raise RuntimeError(
        f"No price data returned for {list(tickers_yf.values())} "
        f"between {start_date} and {end_date}."
    )

# rename columns from ticker symbols to readable names
rename_map = {v: k for k, v in tickers_yf.items()}
ts_yf = ts_yf.rename(columns=rename_map)

# Fill gaps: carry the last known price forward, then back-fill any leading
# gaps. ffill()/bfill() replace the deprecated fillna(method=...) form.
ts_yf = ts_yf.ffill().bfill()

# Turn the date index into a regular column.
ts_yf = ts_yf.reset_index()

# Quick sanity check: first row, last row and dtypes.
print(f"\n\nDataset for y={project_config['TKL']}")
display(ts_yf.head(1))
display(ts_yf.tail(1))
ts_yf.info()

# Work on a copy so the downloaded frame stays untouched by later cells.
df = ts_yf.copy()

[*********************100%***********************]  1 of 1 completed



Dataset for y=NVDA





Ticker,Date,y
0,2006-10-23,0.482561


Ticker,Date,y
4820,2025-12-19,180.990005


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4821 entries, 0 to 4820
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    4821 non-null   datetime64[ns]
 1   y       4821 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 75.5 KB


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import random # Needed for shuffling

# -----------------------
# Parameters
# -----------------------
# Window geometry, counted in rows of `df`:
#   STEP_DAYS   - stride between consecutive window starts
#   WINDOW_DAYS - number of rows drawn in each chart
#   FUTURE_DAYS - number of rows after the chart used to derive its label
STEP_DAYS   = int(project_config['FASTAI.STEP_DAYS'])
WINDOW_DAYS = int(project_config['FASTAI.WINDOW_DAYS'])
FUTURE_DAYS = int(project_config['FASTAI.FUTURE_DAYS'])

if min(STEP_DAYS, WINDOW_DAYS, FUTURE_DAYS) <= 0:
    raise ValueError(
        "FASTAI.STEP_DAYS, FASTAI.WINDOW_DAYS and FASTAI.FUTURE_DAYS must all be positive."
    )

# The thresholds are compared against a fractional return
# ((future_max - last_price) / last_price), so they are parsed as floats:
# int() would truncate a value such as 0.05 to 0. Integer-valued settings
# compare exactly as before.
BUY_TH      = float(project_config['FASTAI.BUY_TH'])
SELL_TH     = float(project_config['FASTAI.SELL_TH'])

# Root folder that receives every generated chart for the configured ticker.
OUT_ROOT_DIR = f"{os.getenv('PROJECT_PATH')}{project_config['images_directory']}{project_config['TKL']}_graphs/"

# Start from a clean slate: drop any previous output tree, then recreate the root.
if os.path.exists(OUT_ROOT_DIR):
    shutil.rmtree(OUT_ROOT_DIR)
os.makedirs(OUT_ROOT_DIR, exist_ok=True)

LABELS = ["BUY", "SELL", "KEEP"]

# One sub-folder per label under each of "train" and "valid", plus a flat
# "pred" folder that has no label sub-folders.
output_dirs = [
    os.path.join(OUT_ROOT_DIR, split, label)
    for split in ("train", "valid")
    for label in LABELS
]
output_dirs.append(os.path.join(OUT_ROOT_DIR, "pred"))

for directory in output_dirs:
    os.makedirs(directory, exist_ok=True)

# -----------------------
# Prepare data
# -----------------------
# Work on a date-sorted copy with a clean 0..n-1 index so positional slicing
# below follows chronological order.
df = df.copy()
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# All possible start indices. A window starting at s uses rows
# [s, s + WINDOW_DAYS) for the chart and the next FUTURE_DAYS rows for its
# label, so the last valid start is len(df) - WINDOW_DAYS - FUTURE_DAYS;
# the +1 makes range()'s exclusive stop include it.
all_indices = list(range(0, len(df) - WINDOW_DAYS - FUTURE_DAYS + 1, STEP_DAYS))

if not all_indices:
    raise ValueError(
        f"Not enough data: {len(df)} rows cannot hold a window of {WINDOW_DAYS} rows "
        f"plus {FUTURE_DAYS} future rows."
    )

# 1. Identify the most recent graph for 'pred'
# (the latest window that still has a full FUTURE_DAYS look-ahead).
pred_idx_val = all_indices[-1]

# 2. Take all other indices and shuffle them randomly
# NOTE(review): when STEP_DAYS < WINDOW_DAYS neighbouring windows share rows,
# so a random split puts overlapping charts in both train and valid — confirm
# this is acceptable for validation.
history_indices = all_indices[:-1]
random.seed(42) # For reproducibility
random.shuffle(history_indices)

# 3. Split the shuffled history: 80% train, 20% valid
train_count = int(len(history_indices) * 0.8)
train_indices = set(history_indices[:train_count])
valid_indices = set(history_indices[train_count:])

# One record per generated chart: its split, its label and its file name.
results = []

# -----------------------
# Main loop
# -----------------------
# For every window start: label the window from its look-ahead rows, pick the
# destination folder for its split, and render the price line as a bare image.
for start_idx in all_indices:
    window_stop = start_idx + WINDOW_DAYS
    graph_df = df.iloc[start_idx:window_stop]
    future_df = df.iloc[window_stop:window_stop + FUTURE_DAYS]

    # First and last dates of the charted window (used in the file name).
    start_date = graph_df['Date'].iloc[0].date()
    end_date = graph_df['Date'].iloc[-1].date()

    # Return from the window's last price to the highest price in the look-ahead.
    last_price = graph_df['y'].iloc[-1]
    future_max = future_df['y'].max()
    ret = (future_max - last_price) / last_price

    # Recommendation: BUY above BUY_TH, SELL below SELL_TH, otherwise KEEP.
    rec = "BUY" if ret > BUY_TH else ("SELL" if ret < SELL_TH else "KEEP")

    fname = f"{project_config['TKL']}_{start_date}_{end_date}_{rec}.png"

    # The pred image goes into a flat folder; train/valid images go into a
    # sub-folder named after their label.
    if start_idx == pred_idx_val:
        split_name = "pred"
        save_path = os.path.join(OUT_ROOT_DIR, split_name, fname)
    else:
        split_name = "train" if start_idx in train_indices else "valid"
        save_path = os.path.join(OUT_ROOT_DIR, split_name, rec, fname)

    # Draw only the price line: no axes and no padding around the image.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(graph_df['y'].values)
    ax.set_axis_off()
    fig.tight_layout()

    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    # Close each figure so memory does not grow across iterations.
    plt.close(fig)

    results.append(dict(split=split_name, recommendation=rec, file=fname))

print(f"Done! Processed {len(train_indices)} train, {len(valid_indices)} valid, and 1 pred image.")

Done! Processed 725 train, 182 valid, and 1 pred image.
