<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/training_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup

##Kernel setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [3]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


##General setup

In [21]:
%%capture
!cd npmf && git pull

import os
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from operator import itemgetter
import pickle
import multiprocessing

import numpy as np
import math
from numpy.ma.core import outerproduct
import pandas as pd
from pandas.tseries.offsets import Day, BDay
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

import wandb as wb

import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler, minmax_scale

from npmf.utils.colors import main, main2, main3
from npmf.utils.wandb import get_dataset, put_dataset
from npmf.utils.eikon import column_mapping
from npmf.utils.dataset import TimeDeltaDataset
from npmf.utils.tests import pickle_df
from npmf.utils.dtypes import fundamental_types

In [5]:
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=[main, main2, main3, "black"])
mpl.rcParams['figure.figsize'] = (6, 4)  # (6, 4) is default and used in the paper

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [7]:
np.random.seed(69)

# Create a Neural network class

# Get some data

In [8]:
stock_df = get_dataset("stock-oil-final:latest", project="master-test")
fundamentals_df = get_dataset("fundamentals-oil-final:latest", project="master-test")
meta_df = get_dataset("meta-oil-final:latest", project="master-test")
macro_df = get_dataset("macro-oil-final:latest", project="master-test")

stock_df = stock_df.drop_duplicates(subset=["ticker", "date"])

[34m[1mwandb[0m: Currently logged in as: [33mankile[0m ([33mkrankile[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact stock-oil-final:latest, 77.63MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Define a dataset which can iterate through time 

We want to train our neural network like the person experiences the world. I.e. we have a window of time and look at recent financial reports and macro variables to predict future market capitalization. We want to train multiple epochs over one time window and validation period, in this manner we will not have any "learned future" effect which could occur if one were to have epochs run over the all time windows. 

In [23]:
current_time = pd.to_datetime("2010-03-01")
n_reports = 4
training_window = 240
forecast_window = 20

delta_set = TimeDeltaDataset(current_time, forecast_window, training_window, n_reports, stock_df, fundamentals_df, meta_df, macro_df)

# Run the loop! (Like Odd-Geir Lademo)

![picture](https://drive.google.com/uc?id=1Y55gFQSi4Baovmi0kUQGhbgGOBTI03E7)


In [9]:
class MultivariateNetwork(nn.Module):
    def __init__(self, lag_len, meta_cont_len, meta_cat_len, macro_len, hidden_dim, out_len, **params):
        super().__init__()

        self.lags = nn.Sequential(
            nn.Linear(lag_len, hidden_dim),
            nn.ReLU(),
        )

        self.meta_cont = nn.Sequential(
            nn.Linear(meta_cont_len, hidden_dim),
            nn.ReLU(),
        )

        self.meta_cat = [nn.Embedding(l, hidden_dim) for l in meta_cat_len]

        self.macro = nn.Sequential(
            nn.Linear(macro_len, hidden_dim),
            nn.ReLU(),
        )

        self.predict = nn.Sequential(
            nn.Linear(3*hidden_dim + 9*hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_len),
        )


    def forward(self, lags, meta_cont, meta_cat, macro):

        lags = self.lags(lags)
        meta_cont = self.meta_cont(meta_cont)

        meta_cat = torch.cat([emb(meta_cat[:, i]) for i, emb in enumerate(self.meta_cat)], dim=1)

        print("macro.shape", macro.shape)
        macro = self.macro(macro)

        x = torch.cat((lags, meta_cont, meta_cat, macro), dim=1)
        x = self.predict(x)

        return x

In [10]:
def mape_loss(target, y_pred):
    return ((y_pred - target).abs() / (target.abs() + 1e-8)).mean(axis=1).mean()

In [11]:
from dataclasses import dataclass, asdict
from typing import Callable, List


@dataclass
class RunParams:
    n_reports: int
    training_w: int
    forecast_w: int
    epochs: int
    loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]

    lag_len: int
    meta_cont_len: int
    meta_cat_len: List[int]
    macro_len: int
    out_len: int
    hidden_dim: int

In [22]:
def train(model, optimizer, data_train, data_val, device, params: RunParams):
    train_losses = []
    val_losses = []
    it = tqdm(range(params.epochs), disable=True)
    for epoch in it: 
        for run_type, dataloader in {"train": data_train, "val": data_val}.items():
            model.train(run_type == "train")
            
            for stocks_and_fundamentals, meta_cont, meta_cat, macro, forecast in dataloader:
                print(stocks_and_fundamentals.shape, meta_cont.shape, meta_cat.shape, macro.shape, forecast.shape,)
                print(torch.isnan(stocks_and_fundamentals).sum(), torch.isnan(meta_cont).sum(), torch.isnan(meta_cat).sum(), torch.isnan(macro).sum(), torch.isnan(forecast).sum(),)
                
                stocks_and_fundamentals = stocks_and_fundamentals.to(torch.float).to(device)
                meta_cont = meta_cont.to(torch.float).to(device)
                meta_cat = meta_cat.to(torch.long).to(device)
                macro = macro.to(torch.float).to(device)
                forecast = forecast.to(torch.float).to(device)

                pred = model(stocks_and_fundamentals, meta_cont, meta_cat, macro)

                # TODO: Remember to mask this
                loss = params.loss_fn(forecast, pred)

                if run_type == "train":
                    train_losses.append(loss.item())
                    loss.backward()

                    optimizer.step()
                else:
                    val_losses.append(loss.item())

        it.set_postfix({"train_loss": np.mean(train_losses), "val_loss": np.mean(val_losses)})

    return train_losses, val_losses

In [24]:
def main():
    params = RunParams(
        n_reports=4,
        training_w=240,
        forecast_w=20,
        epochs=100,
        loss_fn=mape_loss,
        lag_len=302,
        meta_cont_len=1,
        meta_cat_len=np.array([89, 5, 70, 185, 1, 3, 5, 10, 44]) + 1, 
        macro_len=1920,
        out_len=20,
        hidden_dim=32,
    )

    cpus = multiprocessing.cpu_count()
    cpus = 0

    model = MultivariateNetwork(**asdict(params))
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    periods = iter(pd.date_range(start="2000-12-31", end="2018-10-31", freq="M"))
    tra = None
    val = TimeDeltaDataset(next(periods), params.training_w, params.forecast_w, params.n_reports, stock_df, fundamentals_df, meta_df, macro_df)

    for period in tqdm(periods):
        print("Working on period:", period)
        tra = val
        # TODO Refactor this class to only require the top-level params once
        val = TimeDeltaDataset(period, params.training_w, params.forecast_w, params.n_reports, stock_df, fundamentals_df, meta_df, macro_df)

        tra_loader = DataLoader(tra, batch_size=128, shuffle=True, drop_last=True, num_workers=cpus)
        val_loader = DataLoader(val, batch_size=128, shuffle=False, num_workers=cpus)

        train(model, optimizer, tra_loader, val_loader, device, params)

main()

0it [00:00, ?it/s]

Working on period: 2001-01-31 00:00:00
torch.Size([128, 302]) torch.Size([128, 1]) torch.Size([128, 9]) torch.Size([128, 1920]) torch.Size([128, 20])
tensor(0) tensor(0) tensor(0) tensor(0) tensor(60)
macro.shape torch.Size([128, 1920])
Pred output from model
torch.Size([128, 20])
tensor([[-0.1324,  0.0259, -0.0149,  ...,  0.1153,  0.0445,  0.0367],
        [-0.0891,  0.0140,  0.0106,  ...,  0.0816,  0.0557,  0.1731],
        [ 0.0034, -0.0679, -0.0570,  ...,  0.0598,  0.0842,  0.0587],
        ...,
        [-0.0249, -0.0512, -0.0425,  ...,  0.1100,  0.0223,  0.0935],
        [-0.0375,  0.0630,  0.0785,  ...,  0.0606, -0.0171,  0.0626],
        [-0.1050,  0.0836, -0.0229,  ..., -0.0650,  0.0569,  0.1138]],
       grad_fn=<AddmmBackward0>)
torch.Size([128, 302]) torch.Size([128, 1]) torch.Size([128, 9]) torch.Size([128, 1920]) torch.Size([128, 20])
tensor(0) tensor(0) tensor(0) tensor(0) tensor(20)
macro.shape torch.Size([128, 1920])
Pred output from model
torch.Size([128, 20])
tensor([

0it [00:01, ?it/s]

torch.Size([128, 302]) torch.Size([128, 1]) torch.Size([128, 9]) torch.Size([128, 1912]) torch.Size([128, 20])
tensor(0) tensor(0) tensor(0) tensor(0) tensor(0)
macro.shape torch.Size([128, 1912])





RuntimeError: ignored