<a href="https://colab.research.google.com/github/Krankile/npmf/blob/none-counting/notebooks/training_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [1]:
# Reload imported modules before every cell execution, so edits to the cloned
# npmf package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [3]:
# Authenticate with Weights & Biases; prompts for an API key when none is stored.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## General setup

In [85]:
# Switch the cloned repo to the branch this notebook targets and pull its latest commits.
!cd npmf && git checkout none-counting && git pull

import math
import multiprocessing
import os
import pickle
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from operator import itemgetter
from typing import Callable, List, Tuple

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from npmf.utils.colors import main, main2, main3
from npmf.utils.dataset import TimeDeltaDataset
from npmf.utils.dtypes import fundamental_types
from npmf.utils.eikon import column_mapping
from npmf.utils.tests.utils import pickle_df
from npmf.utils.wandb import get_dataset, put_dataset
from npmf.utils.training import EarlyStop, to_device

from numpy.ma.core import outerproduct
from pandas.tseries.offsets import BDay, Day
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, trange

import wandb as wb

Already on 'none-counting'
Your branch is up to date with 'origin/none-counting'.
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 12 (delta 9), reused 12 (delta 9), pack-reused 0[K
Unpacking objects: 100% (12/12), done.
From https://github.com/Krankile/npmf
   e00a245..f9111c1  none-counting -> origin/none-counting
   c9b6c78..7456eca  main          -> origin/main
Updating e00a245..f9111c1
Fast-forward
 utils/dataset.py | 2 [32m+[m[31m-[m
 1 file changed, 1 insertion(+), 1 deletion(-)


In [5]:
# Plot styling shared by every figure in the notebook: project colour cycle
# (with black as the fourth colour) and the figure size.
mpl.rcParams.update(
    {
        "axes.prop_cycle": mpl.cycler(color=[main, main2, main3, "black"]),
        "figure.figsize": (6, 4),  # (6, 4) is default and used in the paper
    }
)

In [47]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

Using cuda device


In [7]:
# Seed every RNG the notebook draws from. NumPy alone is not enough: model
# weight initialisation and DataLoader(shuffle=True) use torch's generator.
SEED = 69
torch.manual_seed(SEED)
np.random.seed(SEED)

# Create a Neural network class

# Get some data

In [8]:
%%capture
stock_df = get_dataset("stock-oil-final:latest", project="master-test")
fundamentals_df = get_dataset("fundamentals-oil-final:latest", project="master-test")
meta_df = get_dataset("meta-oil-final:latest", project="master-test")
macro_df = get_dataset("macro-oil-final:latest", project="master-test")

stock_df = stock_df.drop_duplicates(subset=["ticker", "date"])

# Run the loop! (Like Odd-Geir Lademo)

![picture](https://drive.google.com/uc?id=1Y55gFQSi4Baovmi0kUQGhbgGOBTI03E7)


In [72]:
class MultivariateNetwork(nn.Module):
    """Feed-forward network that fuses four input groups into one forecast.

    The lag, continuous-metadata and macro inputs are each projected to
    ``hidden_dim`` by a linear layer followed by ReLU. Every categorical
    metadata column gets its own embedding table of width ``hidden_dim``. All
    encodings are concatenated and passed through a three-layer MLP head.

    Args:
        lag_len: Width of the ``lags`` input.
        meta_cont_len: Width of the ``meta_cont`` input.
        meta_cat_len: One entry per categorical column, giving the number of
            rows in that column's embedding table. Any sized iterable of
            integers works (list, tuple, NumPy array).
        macro_len: Width of the ``macro`` input.
        hidden_dim: Width of every per-group encoding.
        out_len: Width of the network output.
        **params: Extra keyword arguments are accepted and ignored, so a full
            ``asdict(RunParams)`` can be splatted into the constructor.
    """

    def __init__(self, lag_len, meta_cont_len, meta_cat_len, macro_len, hidden_dim, out_len, **params):
        super().__init__()

        self.lags = nn.Sequential(
            nn.Linear(lag_len, hidden_dim),
            nn.ReLU(),
        )

        self.meta_cont = nn.Sequential(
            nn.Linear(meta_cont_len, hidden_dim),
            nn.ReLU(),
        )

        # int() so NumPy integer sizes (e.g. from np.array(...) + 1) are accepted.
        self.meta_cat = torch.nn.ModuleList([nn.Embedding(int(n), hidden_dim) for n in meta_cat_len])

        self.macro = nn.Sequential(
            nn.Linear(macro_len, hidden_dim),
            nn.ReLU(),
        )

        # Three dense branches (lags, meta_cont, macro) plus one embedding per
        # categorical column; derived from meta_cat_len instead of assuming 9.
        n_branches = 3 + len(self.meta_cat)

        self.predict = nn.Sequential(
            nn.Linear(n_branches * hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_len),
        )

    def forward(self, lags, meta_cont, meta_cat, macro):
        """Encode each input group, concatenate along dim 1 and predict.

        Args:
            lags: Float tensor whose last dimension is ``lag_len``.
            meta_cont: Float tensor whose last dimension is ``meta_cont_len``.
            meta_cat: Integer index tensor; column ``i`` is looked up in the
                ``i``-th embedding table.
            macro: Float tensor whose last dimension is ``macro_len``.

        Returns:
            Tensor whose last dimension is ``out_len``.
        """
        lags = self.lags(lags)
        meta_cont = self.meta_cont(meta_cont)
        meta_cat = torch.cat([emb(meta_cat[:, i]) for i, emb in enumerate(self.meta_cat)], dim=1)
        macro = self.macro(macro)

        x = torch.cat((lags, meta_cont, meta_cat, macro), dim=1)
        x = self.predict(x)

        return x

In [66]:
def mape_loss(target, y_pred):
    """Mean absolute percentage error that ignores NaN targets.

    For every row the absolute percentage error is averaged over the
    positions where ``target`` is not NaN; the result is the mean of those
    row averages. Rows without any valid target are left out of the mean
    instead of turning the whole loss into NaN, and ``target`` is never
    modified.

    Args:
        target: 2-D tensor of ground-truth values; NaN marks a missing value.
        y_pred: Tensor of predictions with the same shape as ``target``.

    Returns:
        Scalar tensor. It is 0 when no row has a valid target.
    """
    mask = ~target.isnan()
    # Replace NaNs in a copy so the caller's tensor keeps its missing markers.
    filled = torch.where(mask, target, torch.zeros_like(target))

    ape = (y_pred - filled).abs() / (filled.abs() + 1e-8)

    n_valid = mask.sum(dim=1)
    row_has_target = n_valid > 0
    # clamp(min=1) avoids 0/0 on empty rows; their numerator is already 0.
    row_mape = (ape * mask).sum(dim=1) / n_valid.clamp(min=1)

    # Average only over the rows that actually contributed a value.
    return (row_mape * row_has_target).sum() / row_has_target.sum().clamp(min=1)

In [67]:
@dataclass
class RunParams:
    """Hyperparameters for one call to ``train``.

    The whole object is expanded with ``asdict`` into the
    ``MultivariateNetwork`` constructor, so the network-shape fields below
    must keep the names that constructor expects.
    """

    # Forwarded to TimeDeltaDataset together with the period; their exact
    # meaning is defined by that class.
    n_reports: int
    training_w: int
    forecast_w: int
    # Upper bound on the number of epochs train_one_era runs per era.
    max_epochs: int
    # Called as loss_fn(target, y_pred); the result is used with .item() and .backward().
    loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]

    # Network shape, consumed by MultivariateNetwork.
    lag_len: int
    meta_cont_len: int
    # One embedding-table size per categorical metadata column.
    meta_cat_len: List[int]
    macro_len: int
    out_len: int
    hidden_dim: int
    # Batch size for both the training and the validation DataLoader.
    batch_size: int

    # Passed to EarlyStop(patience, min_delta).
    patience: int
    min_delta: float

In [76]:
def train_one_era(model, optimizer, data_train, data_val, stopper, losses, device, params: RunParams, pbar):
    """Train and validate ``model`` on one era until early stop or ``max_epochs``.

    Each epoch runs one pass over ``data_train`` (with optimizer steps) and
    one pass over ``data_val`` (without gradients). After every epoch the
    stopper is called with that epoch's list of validation batch losses; a
    truthy result ends the era.

    Args:
        model: Network called as ``model(lags, meta_cont, meta_cat, macro)``.
        optimizer: Optimizer over ``model``'s parameters.
        data_train: Iterable of 5-tuples ending in the target.
        data_val: Iterable with the same batch layout as ``data_train``.
        stopper: Callable early-stopping object exposing ``triggers`` and
            ``patience`` for the progress bar.
        losses: Dict with ``"train"``, ``"val"`` and ``"epoch_lens"`` lists.
            It is updated in place: the mean batch loss of each epoch goes to
            ``"train"``/``"val"``, and the number of epochs this era ran goes
            to ``"epoch_lens"``, whether or not early stopping fired.
        device: Device handed to ``to_device`` for moving batches.
        params: Run configuration; ``max_epochs`` and ``loss_fn`` are used.
        pbar: Progress bar whose postfix is updated as the era progresses.

    Returns:
        ``(train_losses, val_losses)``: per-batch losses for the whole era.
    """
    train_losses = []
    val_losses = []
    postfix = dict()
    epochs_run = 0

    for epoch in range(params.max_epochs):
        epochs_run = epoch + 1
        epoch_losses = dict(train=[], val=[])
        postfix = {**postfix, "epoch": epoch}
        pbar.set_postfix(postfix)

        for run_type, dataloader in {"train": data_train, "val": data_val}.items():
            is_train = run_type == "train"
            model.train(is_train)

            # Validation needs no autograd graph; skipping it saves memory and time.
            with torch.set_grad_enabled(is_train):
                for stocks_and_fundamentals, meta_cont, meta_cat, macro, target in to_device(dataloader, device):
                    y_pred = model(stocks_and_fundamentals, meta_cont, meta_cat, macro)

                    loss = params.loss_fn(target, y_pred)
                    batch_loss = loss.item()
                    epoch_losses[run_type].append(batch_loss)

                    if is_train:
                        train_losses.append(batch_loss)
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    else:
                        val_losses.append(batch_loss)

            losses[run_type].append(np.mean(epoch_losses[run_type]))

        # Running means over every batch seen so far in this era.
        postfix = {**postfix, "train_loss": np.mean(train_losses), "val_loss": np.mean(val_losses)}
        pbar.set_postfix(postfix)

        if stopper(epoch_losses["val"]):
            break

        postfix = {**postfix, "triggers": f"{stopper.triggers}/{stopper.patience}"}
        pbar.set_postfix(postfix)

    # Record the era length on every exit path so epoch_lens has one entry per era.
    losses["epoch_lens"].append(epochs_run)

    return train_losses, val_losses

In [91]:
def train(params: RunParams) -> Tuple[nn.Module, dict]:
    """Train one model walk-forward over month-end periods.

    A dataset is built for every month end from 2000-12-31 to 2018-10-31.
    In each era the previous period's dataset is used for training and the
    current period's for validation; the model and optimizer persist across
    eras, while the early stopper is reset at the start of each one.

    Relies on the notebook globals ``device``, ``stock_df``,
    ``fundamentals_df``, ``meta_df`` and ``macro_df``.

    Args:
        params: Run configuration. All of it is expanded into the
            ``MultivariateNetwork`` constructor; the window, batch-size and
            early-stopping fields are also used here.

    Returns:
        ``(model, losses)``, where ``losses`` holds the ``"train"``, ``"val"``
        and ``"epoch_lens"`` lists filled in by ``train_one_era``.
    """
    # Batches are loaded in the main process (no DataLoader worker processes).
    num_workers = 0

    model = MultivariateNetwork(**asdict(params))
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

    stopper = EarlyStop(params.patience, params.min_delta)

    date_range = pd.date_range(start="2000-12-31", end="2018-10-31", freq="M")
    n_ranges = len(date_range)
    periods = iter(date_range)
    # The first period only ever serves as training data for the second one.
    period = next(periods)

    losses = dict(train=[], val=[], epoch_lens=[])

    val = TimeDeltaDataset(period, params.training_w, params.forecast_w, params.n_reports, stock_df, fundamentals_df, meta_df, macro_df)
    # NOTE(review): these per-period NA percentages are collected but not
    # returned or logged anywhere yet.
    na_percentages = {df_key: [] for df_key in val.na_percentage}

    pbar = tqdm(periods, total=(n_ranges-1), desc=f"Period {period.date()}", leave=True, position=0)
    for period in pbar:
        pbar.set_description(f"Period {period.date()}")
        # Last era's validation set becomes this era's training set.
        tra = val
        # TODO Refactor this class to only require the top-level params once
        val = TimeDeltaDataset(period, params.training_w, params.forecast_w, params.n_reports, stock_df, fundamentals_df, meta_df, macro_df)

        for df_key in na_percentages:
            na_percentages[df_key].append(val.na_percentage[df_key])

        data_train = DataLoader(tra, params.batch_size, shuffle=True, drop_last=False, num_workers=num_workers)
        data_val = DataLoader(val, params.batch_size, shuffle=False, num_workers=num_workers)

        stopper.reset()
        train_one_era(
            model=model,
            optimizer=optimizer,
            data_train=data_train,
            data_val=data_val,
            stopper=stopper,
            losses=losses,
            device=device,
            params=params,
            pbar=pbar,
        )

    return model, losses

In [92]:
# Configuration for this run, grouped in the same order as the RunParams fields.
params = RunParams(
    # Dataset windows
    n_reports=4,
    training_w=240,
    forecast_w=20,
    # Optimisation
    max_epochs=5,
    loss_fn=mape_loss,
    # Network shape
    lag_len=302,
    meta_cont_len=1,
    # One embedding-table size per categorical column.
    # NOTE(review): the +1 presumably reserves an extra index per column - confirm.
    meta_cat_len=np.array([89, 5, 70, 185, 1, 3, 5, 10, 44]) + 1,
    macro_len=1920,
    out_len=20,
    hidden_dim=32,
    batch_size=64,
    # Early stopping
    patience=2,
    min_delta=0.01,
)

model, losses = train(params)

Period 2002-09-30:   9%|▉         | 20/214 [00:34<05:34,  1.72s/it, epoch=1, train_loss=0.767, val_loss=0.766, triggers=0/2]


KeyboardInterrupt: ignored