# Import Libs

In [48]:
import pandas as pd
import numpy as np
import datetime as dt
import logging
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

from torch import optim
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from tqdm import tqdm

from numpy.lib.stride_tricks import sliding_window_view
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.optim import AdamW
from utils import SklearnWrapper, verify_scaling
from utils.paths import CHECKPOINTS_DIR
from pypfopt import risk_models, expected_returns, plotting

In [2]:
from config import *
from entities import *
from strategies import *
from datasets import *
from engine import Engine
from models import DiffusionTransformer
from frameworks import Diffusion

# Setup

## Config

In [3]:
# Configure root logging at DEBUG, then raise matplotlib's logger to WARNING
# so its own debug messages do not appear in the notebook output.
logging.basicConfig(level=logging.DEBUG)
matplotlib_logger = logging.getLogger('matplotlib')
matplotlib_logger.setLevel(logging.WARNING)

In [4]:
# Training configuration: only batch size, epoch count and learning rate are
# set here; all other fields are left to TrainConfig / OptimizerConfig.
optimizer_cfg = OptimizerConfig(lr=2e-4)
cfg = TrainConfig(batch_size=32, epochs=20, optimizer=optimizer_cfg)

print(cfg)

TrainConfig(batch_size=32, epochs=20, device='cuda', optimizer=OptimizerConfig(lr=0.0002, weight_decay=1e-06, betas=(0.9, 0.999), eps=1e-08), scheduler=SchedulerConfig(use_scheduler=False, type='cosine', eta_min=1e-06), ddpm=DDPMConfig(noise_steps=1000, beta_start=0.0001, beta_end=0.02, schedule='cosine', d_model=128, n_heads=4, n_layers=4, max_t=128))


# Data

## Load assets (T, A)

In [5]:
# Ticker universe that makes up the basket.
symbols = [
    'AAPL', 'TSLA', 'MSFT', 'NVDA', 'GOOGL', 'AMZN', 'GOOG',
    'META', 'AVGO', 'ORCL', 'CRM', 'ADBE', 'AMD', 'CSCO',
]
freq = "1d"  # bar frequency handed to the asset loader

# Build the basket, then load every listed symbol at the chosen frequency.
basket = Basket(symbols=symbols)
basket.load_all_assets(freq=freq)

DEBUG:entities.basket:Initialized Asset Basket: ['AAPL', 'TSLA', 'MSFT', 'NVDA', 'GOOGL', 'AMZN', 'GOOG', 'META', 'AVGO', 'ORCL', 'CRM', 'ADBE', 'AMD', 'CSCO'] with 0 assets which loaded.
INFO:entities.basket:Starting batch load for 14 symbols...
DEBUG:entities.basket:Attempting to load AAPL...
DEBUG:entities.asset:Initialized Asset: AAPL with 2724 rows.
INFO:entities.basket:Successfully loaded AAPL (2724 rows).
DEBUG:entities.basket:Attempting to load TSLA...
DEBUG:entities.asset:Initialized Asset: TSLA with 2760 rows.
INFO:entities.basket:Successfully loaded TSLA (2760 rows).
DEBUG:entities.basket:Attempting to load MSFT...
DEBUG:entities.asset:Initialized Asset: MSFT with 2724 rows.
INFO:entities.basket:Successfully loaded MSFT (2724 rows).
DEBUG:entities.basket:Attempting to load NVDA...
DEBUG:entities.asset:Initialized Asset: NVDA with 2724 rows.
INFO:entities.basket:Successfully loaded NVDA (2724 rows).
DEBUG:entities.basket:Attempting to load GOOGL...
DEBUG:entities.asset:Initia

## Feature (C)

### Align Data (Align C)
$\text{Align}(FT) \to FT_{\text{aligned}}$
1. Keep only the dates shared by every asset, i.e. the intersection of their date indices: $FT_{\text{date\ A}} \cap FT_{\text{date\ B}}$

2. Other plan

In [6]:
# Feature(s) that the returns-conversion cell further down operates on.
targets = ["Close"]
# Show which feature names the basket currently exposes.
print(basket.get_unique_features())

['Close', 'High', 'Low', 'Open', 'Volume']


In [7]:
# Shape before alignment, for comparison with the shape printed after it.
print(f"Basket data shape: {basket.data.shape}")

# Align all assets with the intersection strategy. In the recorded run this
# reduced the basket from 2760 to 1464 rows and updated the assets in place.
strategy = IntersectionStrategy()
basket.align(strategy)

print(f"Basket data shape: {basket.data.shape}")

INFO:strategies.concrete:Aligned: 14 orig -> 14 clean assets -> 1464 rows
DEBUG:entities.basket:Aligned data shape: (1464, 70)
INFO:entities.basket:Assets updated in-place to aligned index (Length: 1464)


Basket data shape: (2760, 70)
Basket data shape: (1464, 70)


### Select Features to Convert to Returns
$ \|FT\| \to FT_{\text{returns}} $

In [8]:
# Convert the target feature(s) to log returns on the basket itself.
# With keep=False the recorded output lists "Close (Log_Returns)" in place of "Close".
# NOTE(review): this appears to mutate `basket`, so re-running the cell on an
# already converted basket may fail or convert twice — confirm.
basket.to_returns(features=targets, log=True, keep=False)

print(basket.get_unique_features())
basket.data.head(5)

DEBUG:entities.asset:AAPL converted to Returns (log=True)
DEBUG:entities.asset:TSLA converted to Returns (log=True)
DEBUG:entities.asset:MSFT converted to Returns (log=True)
DEBUG:entities.asset:NVDA converted to Returns (log=True)
DEBUG:entities.asset:GOOGL converted to Returns (log=True)
DEBUG:entities.asset:AMZN converted to Returns (log=True)
DEBUG:entities.asset:GOOG converted to Returns (log=True)
DEBUG:entities.asset:META converted to Returns (log=True)
DEBUG:entities.asset:AVGO converted to Returns (log=True)
DEBUG:entities.asset:ORCL converted to Returns (log=True)
DEBUG:entities.asset:CRM converted to Returns (log=True)
DEBUG:entities.asset:ADBE converted to Returns (log=True)
DEBUG:entities.asset:AMD converted to Returns (log=True)
DEBUG:entities.asset:CSCO converted to Returns (log=True)


['Close (Log_Returns)', 'High', 'Low', 'Open', 'Volume']


Unnamed: 0_level_0,AAPL,AAPL,AAPL,AAPL,AAPL,TSLA,TSLA,TSLA,TSLA,TSLA,...,AMD,AMD,AMD,AMD,AMD,CSCO,CSCO,CSCO,CSCO,CSCO
Unnamed: 0_level_1,High,Low,Open,Volume,Close (Log_Returns),High,Low,Open,Volume,Close (Log_Returns),...,High,Low,Open,Volume,Close (Log_Returns),High,Low,Open,Volume,Close (Log_Returns)
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-01-03,72.594055,71.608685,71.765667,146322800,-0.00977,30.266666,29.128,29.366667,266677500,0.029203,...,49.389999,47.540001,48.029999,73127400,-0.010236,40.453623,39.899004,40.260347,15577400,-0.01645
2020-01-06,72.444344,70.703034,70.95421,118387200,0.007937,30.104,29.333332,29.364668,151995000,0.019072,...,48.860001,47.860001,48.02,47934900,-0.00433,40.184715,39.504044,39.613288,22183600,0.003563
2020-01-07,72.671341,71.845369,72.415337,108872000,-0.004715,31.441999,30.224001,30.76,268231500,0.038067,...,49.389999,48.040001,49.349998,58061400,-0.002897,40.100678,39.57967,40.100678,16501900,-0.006507
2020-01-08,73.52631,71.768094,71.768094,132079200,0.015958,33.232666,31.215334,31.58,467164500,0.048033,...,48.299999,47.139999,47.849998,53767000,-0.008743,40.159507,39.335982,39.470435,25175900,0.000632
2020-01-09,74.972955,73.951358,74.202527,170108400,0.021018,33.253334,31.524668,33.139999,426606000,-0.022189,...,49.959999,48.389999,48.939999,76512800,0.023555,40.235133,39.554462,40.159503,18203600,-0.004218


## Tensor (T, A, C)


### Select Feature to tensor
* std, mean, max require dim = Time (T)
* cov, corr require dim = Assets, Time (A, T)

In [50]:
# Rebind `targets` to the renamed return column and export it as one tensor.
targets = ["Close (Log_Returns)"]
basket_tensor = basket.to_tensor(features=targets)
# Recorded run: torch.Size([1463, 14, 1]); presumably (time, asset, feature)
# as in the section title — TODO confirm the axis order in Basket.to_tensor.
print(f"Basket Tensor Shape: {basket_tensor.shape}")

DEBUG:entities.asset:Asset: AAPL is using cuda device.
DEBUG:entities.asset:Asset: TSLA is using cuda device.
DEBUG:entities.asset:Asset: MSFT is using cuda device.
DEBUG:entities.asset:Asset: NVDA is using cuda device.
DEBUG:entities.asset:Asset: GOOGL is using cuda device.
DEBUG:entities.asset:Asset: AMZN is using cuda device.
DEBUG:entities.asset:Asset: GOOG is using cuda device.
DEBUG:entities.asset:Asset: META is using cuda device.
DEBUG:entities.asset:Asset: AVGO is using cuda device.
DEBUG:entities.asset:Asset: ORCL is using cuda device.
DEBUG:entities.asset:Asset: CRM is using cuda device.
DEBUG:entities.asset:Asset: ADBE is using cuda device.
DEBUG:entities.asset:Asset: AMD is using cuda device.
DEBUG:entities.asset:Asset: CSCO is using cuda device.


Basket Tensor Shape: torch.Size([1463, 14, 1])


### Define Ratios Dataset

In [51]:
# Derive the train / val / test sizes from the split ratios. Train and val are
# truncated with int(); the test share takes whatever remains, so the three
# counts always add up to the dataset length.
market_dataset = TensorDataset(basket_tensor)

ratios = [0.8, 0.1, 0.1]
train_ratio, val_ratio = ratios[0], ratios[1]

total_count = len(market_dataset)
train_count = int(total_count * train_ratio)
val_count = int(total_count * val_ratio)
test_count = total_count - (train_count + val_count)

print(f"Market ds - Total: {total_count}\nTrain:\t{train_count}\nVal:\t{val_count}\nTest:\t{test_count}")

Market ds - Total: 1463
Train:	1170
Val:	146
Test:	147


### Scale Dataset

In [57]:
# Fit the scaler on the training rows only, then apply it to the full series,
# so values after the training range may fall outside [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
# scaler = StandardScaler()

basket_np = basket_tensor.cpu().numpy()

# All assets/features are flattened into one column, i.e. one global min/max.
train_part = basket_np[:train_count]
scaler.fit(train_part.reshape(-1, 1))

basket_scaled_np = scaler.transform(basket_np.reshape(-1, 1))
basket_scaled_tensor = torch.from_numpy(
    basket_scaled_np.reshape(basket_tensor.shape)
).float()

verify_scaling(basket_scaled_tensor)
print(f"train_ds_scaled shape: {basket_scaled_tensor.shape}")

--- Global Stats ---
Max: 1.2813
Min: -1.0000
Mean: 0.1431
Std: 0.0979
train_ds_scaled shape: torch.Size([1463, 14, 1])


### Sliding Window Tensor [N, T, A, C]
N = Num of windows

In [58]:
# Cut the scaled series into overlapping windows of length 64 along axis 0.
# sliding_window_view appends the window axis last (recorded: [1400, 14, 1, 64]).
# NOTE(review): this rebinds `basket_scaled_tensor` to its windowed version, so
# the cell cannot be re-run without re-running the scaling cell first.
basket_scaled_tensor = torch.from_numpy(sliding_window_view(basket_scaled_tensor.cpu(), window_shape=64, axis=0))
print(f"Org windowing basket shape: {basket_scaled_tensor.shape}")

# Move the window axis next to the batch axis: [N, A, C, T] -> [N, T, A, C].
basket_scaled_tensor = basket_scaled_tensor.permute(0,3,1,2)
print(f"New windowing basket shape: {basket_scaled_tensor.shape}")

verify_scaling(basket_scaled_tensor)

Org windowing basket shape: torch.Size([1400, 14, 1, 64])
New windowing basket shape: torch.Size([1400, 64, 14, 1])
--- Global Stats ---
Max: 1.2813
Min: -1.0000
Mean: 0.1430
Std: 0.0971


### Split Dataset
There are two versions of the split (the sequential one is active; the random one is kept commented out):
1. Sequence
2. Random Split

In [65]:
# Sequential (chronological) split of the windowed tensor into train / val / test.
end_train = train_count
end_val = train_count + val_count

train_ds = TensorDataset(basket_scaled_tensor[:end_train])
# Validation must start where training ends. Slicing from 0 would put every
# training window into the validation set and make the validation loss meaningless.
val_ds = TensorDataset(basket_scaled_tensor[end_train:end_val])
test_ds = TensorDataset(basket_scaled_tensor[end_val:])

# NOTE(review): train_count / val_count were computed from the number of rows
# (1463 in the recorded run), not the number of windows (1400), so the test set
# is smaller than its nominal share. Windows are built with stride 1, so windows
# on either side of a split boundary still share time steps — confirm this is acceptable.

In [65]:
# full_ds = TensorDataset(basket_scaled_tensor)

# train_ds, val_ds, test_ds = random_split(
#     full_ds, 
#     [train_count, val_count, test_count],
#     generator=torch.Generator().manual_seed(42) 
# )

### Dataloader

In [73]:
# Only the training loader is shuffled; val/test keep their chronological order.
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False)

# Each batch is a sequence holding a single tensor (X only). If more inputs are
# needed later, add them to the TensorDataset and they will show up here.
batch = next(iter(train_loader))
X_train = batch[0]
print(f"Num of Batchs in train dataloader: {len(train_loader)}")
# `batchs` was never defined in this notebook and only worked through leftover
# kernel state. len(batch) is the number of tensors per batch (1 in the recorded
# output), not the number of samples — that is X_train.shape[0].
print(f"Batch size: {len(batch)}")
print(f"X train shape: {X_train.shape}")

Num of Batchs in train dataloader: 37
Batch size: 1
X train shape: torch.Size([32, 64, 14, 1])


## Model

In [74]:
# Read the model dimensions off one training batch shaped [B, T, A, FT];
# the batch axis is skipped.
window_size, n_assets, n_features = X_train.shape[1:]

# Assets and features are flattened into one channel axis: C = A * FT.
total_inp_dim = n_assets * n_features

print(f"Running on: {cfg.device}")
print(f"Input Dimension (Channels): {total_inp_dim}")

Running on: cuda
Input Dimension (Channels): 14


In [78]:
# Diffusion process built entirely from the DDPM section of the config.
diffusion = Diffusion(
    noise_steps=cfg.ddpm.noise_steps,
    beta_start=cfg.ddpm.beta_start,
    beta_end=cfg.ddpm.beta_end,
    # The noise schedule belongs to cfg.ddpm. cfg.scheduler.type describes the
    # learning-rate scheduler and only happened to hold the same value ('cosine').
    schedule=cfg.ddpm.schedule,
    device=cfg.device
)

DEBUG:frameworks.ddpm:Diffusion is using cosine schedule.


In [75]:
# Denoising network, sized from the DDPM config and the batch-derived
# dimensions, then moved to the configured device.
model = DiffusionTransformer(
    features_in=total_inp_dim,  # flattened channel count (assets * features)
    d_model=cfg.ddpm.d_model,  # from the DDPM config
    nhead=cfg.ddpm.n_heads,
    num_layers=cfg.ddpm.n_layers,
    max_len=window_size  # window length T taken from the training batch
).to(cfg.device)



In [76]:
# AdamW with every hyper-parameter taken from cfg.optimizer, so the betas and
# eps shown in the printed config are the ones actually used, not AdamW defaults.
optimizer = AdamW(
    model.parameters(),
    lr=cfg.optimizer.lr,
    weight_decay=cfg.optimizer.weight_decay,
    betas=cfg.optimizer.betas,
    eps=cfg.optimizer.eps,
)

In [79]:
# Bundle model, diffusion process, loaders and optimizer into the training engine.
# NOTE(review): `scaler` here is the sklearn MinMaxScaler fitted in the scaling
# cell. The testing cells later call scaler.decode(...), which MinMaxScaler does
# not provide — check whether Engine expects the imported SklearnWrapper instead.
engine = Engine(
    model=model,
    diffusion=diffusion,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    optimizer=optimizer,
    device=cfg.device,
    scaler=scaler
)

## Training

In [80]:
# Train for the configured number of epochs; CHECKPOINTS_DIR is passed as the save directory.
engine.fit(epochs=cfg.epochs, save_dir=CHECKPOINTS_DIR)

INFO:engine.trainer:Engine started Training for 20 epochs on cuda...
Epoch 1/20: 100%|██████████| 37/37 [00:01<00:00, 21.47it/s, loss=0.1590]
Epoch 2/20: 100%|██████████| 37/37 [00:01<00:00, 26.45it/s, loss=0.2875]
Epoch 3/20: 100%|██████████| 37/37 [00:01<00:00, 26.37it/s, loss=0.0762]
Epoch 4/20: 100%|██████████| 37/37 [00:01<00:00, 26.36it/s, loss=0.0823]
Epoch 5/20: 100%|██████████| 37/37 [00:01<00:00, 26.43it/s, loss=0.0863]
Epoch 6/20: 100%|██████████| 37/37 [00:01<00:00, 26.39it/s, loss=0.0600]
Epoch 7/20: 100%|██████████| 37/37 [00:01<00:00, 26.39it/s, loss=0.0954]
Epoch 8/20: 100%|██████████| 37/37 [00:01<00:00, 26.37it/s, loss=0.1047]
Epoch 9/20: 100%|██████████| 37/37 [00:01<00:00, 26.37it/s, loss=0.1166]
Epoch 10/20: 100%|██████████| 37/37 [00:01<00:00, 26.38it/s, loss=0.0634]
Epoch 11/20: 100%|██████████| 37/37 [00:01<00:00, 26.42it/s, loss=0.0747]
Epoch 12/20: 100%|██████████| 37/37 [00:01<00:00, 26.38it/s, loss=0.0422]
Epoch 13/20: 100%|██████████| 37/37 [00:01<00:00, 26

## Testing

In [None]:
# A batch from a TensorDataset loader is a sequence of tensors (see the train
# loader cell, which uses batch[0]), so take element 0 before asking for .shape.
X_test_batch = next(iter(test_loader))[0]
print(f"X_test_smp: {X_test_batch.shape}")

# pick 1 window: the first window of the batch, shaped [T, A, C]
X_test_smp = X_test_batch[0]
verify_scaling(X_test_smp)
print(f"X_test_smp: {X_test_smp.shape}")

In [None]:
# Hold out the last `steps_to_sim` time steps of the window as ground truth and
# use everything before them as conditioning context. Both slices follow
# `steps_to_sim`, so changing it cannot desynchronise context and target.
steps_to_sim = 8
X_test_smp_context = X_test_smp[:-steps_to_sim, :, :]
y_test_smp = X_test_smp[-steps_to_sim:, :, :]
print(f"X_test_smp_context: {X_test_smp_context.shape}")
print(f"y_test_smp: {y_test_smp.shape}")

In [None]:
# Simulate `steps_to_sim` steps after the context. With extend=True the result
# presumably includes the context as well (TODO confirm in Engine.simulate),
# hence the slice that keeps only the simulated tail.
sim = engine.simulate(X_test_smp_context, steps=steps_to_sim, extend=True)
# Slice by steps_to_sim instead of a hard-coded 8 so it matches the request.
sim = sim[-steps_to_sim:, :, :]
verify_scaling(sim)
print(f"Type of simulation: {type(sim)}")
print(f"simulation: {sim.shape}")

In [None]:
def _to_original_scale(values):
    """Undo the MinMax scaling and return the same container type that came in.

    `scaler` is the sklearn MinMaxScaler fitted in the scaling cell. It has no
    `decode` method; the inverse mapping is `inverse_transform`.
    """
    is_tensor = torch.is_tensor(values)
    arr = values.detach().cpu().numpy() if is_tensor else np.asarray(values)
    # The scaler was fitted on one flattened column, so flatten, invert, and
    # then restore the original shape.
    restored = scaler.inverse_transform(arr.reshape(-1, 1)).reshape(arr.shape)
    return torch.from_numpy(restored).float() if is_tensor else restored


# NOTE: this cell rebinds its inputs; re-running it without re-running the
# cells above would apply the inverse transform twice.
X_test_smp = _to_original_scale(X_test_smp)
y_test_smp = _to_original_scale(y_test_smp)
sim = _to_original_scale(sim)

verify_scaling(X_test_smp)
verify_scaling(y_test_smp)
verify_scaling(sim)

In [None]:
# Drop the trailing feature axis and wrap each array in a DataFrame
# (rows = time steps, columns = assets by position — column labels are integers).
X_test_smp_df = pd.DataFrame(X_test_smp.detach().cpu().numpy().squeeze(-1))
y_test_smp_df = pd.DataFrame(y_test_smp.detach().cpu().numpy().squeeze(-1))
# `sim` is used without detach/cpu/numpy, i.e. it is expected to be array-like already.
sim_df = pd.DataFrame(sim.squeeze(-1))

# NOTE(review): type(<df>.shape) is always tuple; type(<df>) may have been intended.
print(f"X_test_smp_df: {type(X_test_smp_df.shape)}, {X_test_smp_df.shape}")
print(f"y_test_smp_df: {type(y_test_smp_df.shape)}, {y_test_smp_df.shape}")
print(f"sim_df: {type(sim_df.shape)}, {sim_df.shape}")

In [None]:
# sim_df and y_test_smp_df hold log returns, not prices. Without
# returns_data=True PyPortfolioOpt would first take percentage changes of the
# returns themselves. compounding=False annualises as mean * frequency, the
# additive rule that suits log returns.
mu_ai = expected_returns.mean_historical_return(
    sim_df, returns_data=True, compounding=False
)

mu_real = expected_returns.mean_historical_return(
    y_test_smp_df, returns_data=True, compounding=False
)

# Side-by-side comparison per asset (index = asset position in the basket).
comparison = pd.DataFrame({
    'AI Prediction': mu_ai,
    'Actual (Real)': mu_real,
    'Diff (Error)': mu_ai - mu_real 
})

print(comparison)

In [None]:
# sim_df already contains returns; returns_data=True stops sample_cov from
# treating them as prices and differencing them again.
sim_cov = risk_models.sample_cov(sim_df, returns_data=True)
plotting.plot_covariance(sim_cov, plot_correlation=True)

In [None]:
# y_test_smp_df already contains returns; returns_data=True stops sample_cov
# from treating them as prices and differencing them again.
y_test_smp_cov = risk_models.sample_cov(y_test_smp_df, returns_data=True)
plotting.plot_covariance(y_test_smp_cov, plot_correlation=True)

In [None]:
# `cov` is not defined anywhere in this notebook. Use the covariance computed in
# the previous cell; swap in `sim_cov` to inspect the simulated correlations instead.
corr_matrix = risk_models.cov_to_corr(y_test_smp_cov)
corr_matrix

# Data Stats (Portfolio optimization) by PyPortfolioOpt lib
ref: https://medium.com/qunt-i-love-u/python-library-%E0%B8%97%E0%B8%B5%E0%B9%88%E0%B9%80%E0%B8%81%E0%B8%B4%E0%B8%94%E0%B8%A1%E0%B8%B2%E0%B9%80%E0%B8%9E%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%97%E0%B8%B3-portfolio-optimization-b07398e2e228

In [None]:
# "Close" was replaced by "Close (Log_Returns)" when the basket was converted
# to returns with keep=False, so selecting "Close" here raises a KeyError.
key = "Close (Log_Returns)"
gt_log_returns = basket.data.xs(key=key, level=1, axis=1)

# The PyPortfolioOpt calls in the following cells expect price series by
# default. Rebuild a per-asset price index (price relative to the close that
# precedes the first return) from the cumulative log returns.
gt_sampling = np.exp(gt_log_returns.cumsum())
gt_sampling.head(5)

In [None]:
# Sample covariance of the ground-truth series, drawn as a correlation heat map.
# sample_cov is called with its defaults, i.e. `gt_sampling` is treated as prices.
gt_sampling_cov = risk_models.sample_cov(gt_sampling)
plotting.plot_covariance(gt_sampling_cov, plot_correlation=True)
plt.show()

### Exponential covariance

In [None]:
# Exponential covariance estimate of the same series.
gt_sampling_exp_cov = risk_models.exp_cov(gt_sampling)
# First plot: plain sample covariance (from the previous cell) as a reference.
plotting.plot_covariance(gt_sampling_cov, plot_correlation=True)
# Second plot: the exponential covariance estimate, for comparison.
plotting.plot_covariance(gt_sampling_exp_cov, plot_correlation=True)
plt.show()

### Compare the sample covariance with the Ledoit-Wolf shrinkage estimate

In [None]:
# Ledoit-Wolf shrinkage estimate of the covariance of the same series.
gt_sampling_cov_shrinkage = risk_models.CovarianceShrinkage(gt_sampling).ledoit_wolf()
# First plot: plain sample covariance (computed in an earlier cell) as a reference.
plotting.plot_covariance(gt_sampling_cov, plot_correlation=True)
# Second plot: the shrunk estimate, for comparison.
plotting.plot_covariance(gt_sampling_cov_shrinkage, plot_correlation=True)
plt.show()