<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/training_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all loaded
# modules before each cell runs, so edits to imported modules (e.g. the cloned
# npmf package) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
# Install the Weights & Biases client and clone the project repository that
# provides the `npmf` package imported further down. %%capture hides the
# (long) shell output of both commands.
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [3]:
# Authenticate this machine against Weights & Biases; the later wb.init / wb.agent calls use this login.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mankile[0m ([33mkrankile[0m). Use [1m`wandb login --relogin`[0m to force relogin


## General setup

In [4]:
%%capture
!cd npmf && git pull

import math
import multiprocessing
import os
import pickle
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from operator import itemgetter
from typing import Callable, List, Tuple
from functools import partial
from glob import glob


from more_itertools import chunked

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from npmf.utils.colors import main, main2, main3
from npmf.utils.dataset import TimeDeltaDataset, EraDataset
from npmf.utils.dtypes import fundamental_types
from npmf.utils.eikon import column_mapping
from npmf.utils.tests.utils import pickle_df
from npmf.utils.wandb import get_datasets, put_dataset, put_nn_model
from npmf.utils.training import EarlyStop, to_device, TqdmPostFix, loss_fns
from npmf.utils.models import models

from numpy.ma.core import outerproduct
from pandas.tseries.offsets import BDay, Day
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from torch import nn
from torch.utils.data import DataLoader, Dataset

import wandb as wb

In [5]:
# Make every NumPy floating-point error category raise instead of warning,
# so numerical problems surface immediately.
np.seterr(all="raise")

# Plot styling: the project's colour cycle followed by black, and a fixed figure size.
mpl.rcParams.update(
    {
        "axes.prop_cycle": mpl.cycler(color=[main, main2, main3, "black"]),
        "figure.figsize": (6, 4),  # (6, 4) is default and used in the paper
    }
)

In [6]:
# Pick the GPU when PyTorch can see one; the training functions below read this global.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [7]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Jun  1 19:46:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    12W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
# Directory holding pre-processed era datasets; stays None unless the artifact
# download cell further down assigns it.
pre_proc_data_dir = None

# Seed every random number generator the notebook uses. NumPy alone is not
# enough: model initialisation and dropout draw from PyTorch's generator.
SEED = 69
torch.manual_seed(SEED)
np.random.seed(SEED)


# Get some data

In [9]:
%%capture
reload_data = True

if reload_data or not "stock_df" in vars():
    names = ["stock-data:final", "fundamental-data:final", "meta-data:final", "macro-data:final"]

    stock_df, fundamental_df, meta_df, macro_df = get_datasets(names=names, project="master")

    stock_df = stock_df.drop(columns=["close_price", "currency"]).astype({"market_cap": np.float32})
    fundamental_df = fundamental_df.drop(columns="period_end_date").astype(fundamental_types)
    macro_df.iloc[:, 1:] = macro_df.iloc[:, 1:].astype(np.float32)

In [10]:
%%capture
reload_proc_data = True

if reload_proc_data or not "pre_proc_data_dir" in vars():
    with wb.init(job_type="get-data", project="master", entity="krankile") as run:
        art = run.use_artifact("era-datasets:v0")
        pre_proc_data_dir = art.download()

## Define a class to handle information across eras

In [11]:
class EraController:
    """Iterator over monthly "eras" that serves one DataLoader per era.

    Two queues of ``queue_length`` month-end loaders are maintained:

    * the *infront* queue starts at ``start_date`` and slides forward by one
      month on every ``next()`` call;
    * the *end* queue starts at ``end_metric_start_date`` and never moves.

    Every ``next()`` call pops the oldest infront loader and returns it as the
    training loader, together with the new head of the infront queue as the
    validation loader. Iteration stops once the last infront date equals the
    last end date.

    Attributes:
        date: Month-end date of the era most recently handed out for training.
        total: Number of month-ends from the first infront date to the first
            end date (inclusive); used as the progress-bar total.
        loader_to_na_dict: Maps each loader still queued to the
            ``na_percentage`` of its dataset. Entries of loaders that have been
            popped for training are removed so their datasets can be freed.
    """

    def __init__(
        self,
        start_date,
        end_metric_start_date,
        queue_length,
        stock_df,
        fundamental_df,
        meta_df,
        macro_df,
        conf,
    ):
        self.conf = conf
        self.stock_df, self.fundamental_df, self.meta_df, self.macro_df = (
            stock_df,
            fundamental_df,
            meta_df,
            macro_df,
        )

        # Optional lookup "file name -> path" for pre-processed, pickled datasets.
        # date_to_loader looks files up by str(date), so the file names are
        # expected to equal that string.
        self.path_dict = None
        if "pre_proc_data_dir" in self.conf and self.conf.pre_proc_data_dir is not None:
            self.path_dict = {
                os.path.basename(path): path
                for path in sorted(glob(self.conf.pre_proc_data_dir + "/*"))
            }

        self.dates_have_overlapped = False
        self.loader_to_na_dict = {}

        self.infront_dates = pd.date_range(
            start=start_date, periods=queue_length, freq="M"
        )
        self.end_dates = pd.date_range(
            start=end_metric_start_date, periods=queue_length, freq="M"
        )

        self.infront_loaders = self.dates_to_loader(self.infront_dates)
        self.end_loaders = self.dates_to_loader(self.end_dates)

        self.total = len(
            pd.date_range(start=self.infront_dates[0], end=self.end_dates[0], freq="M")
        )

        self.date = self.infront_dates[0]

    def date_to_loader(self, date):
        """Build the DataLoader for the era at ``date`` and record its NA statistics.

        The dataset is unpickled from the pre-processed directory when one was
        configured, otherwise it is built from the raw data frames. The loader
        serves the whole dataset as a single, unshuffled batch.
        """
        if self.path_dict is not None:
            # NOTE(security): pickle.load can execute arbitrary code; only point
            # pre_proc_data_dir at artifacts you trust.
            with open(self.path_dict[str(date)], "rb") as f:
                dataset_infront = pickle.load(f)
        else:
            dataset_infront = EraDataset(
                date,
                self.conf.training_w,
                self.conf.forecast_w,
                self.conf.n_reports,
                self.stock_df,
                self.fundamental_df,
                self.meta_df,
                self.macro_df,
            )

        loader = DataLoader(
            dataset_infront,
            batch_size=len(dataset_infront),
            shuffle=False,
            num_workers=self.conf.cpus,
        )
        self.loader_to_na_dict[loader] = dataset_infront.na_percentage
        return loader

    def dates_to_loader(self, dates):
        """Return one loader per date, in the order of ``dates``."""
        return [self.date_to_loader(date) for date in dates]

    def get_next_month(self, date):
        """Return the month-end of the month following ``date``.

        Adding one ``MonthEnd`` to a date inside a month only rolls it forward
        to the end of that same month, so two offsets are needed unless
        ``date`` already is a month-end.
        """
        month_factor = 1 if date.day == date.days_in_month else 2
        next_month_end = date + pd.tseries.offsets.MonthEnd() * month_factor
        return next_month_end

    def validation_loaders(self):
        """Return the current ``(infront_loaders, end_loaders)`` queues."""
        return self.infront_loaders, self.end_loaders

    def __iter__(self):
        return self

    def __next__(self):
        """Advance one era and return ``(train_loader, validation_loader)``."""
        if self.infront_dates[-1] == self.end_dates[-1]:
            raise StopIteration

        self.date = self.infront_dates[0]
        dataloader_train = self.infront_loaders.pop(0)
        dataloader_val = self.infront_loaders[0]

        # The popped loader never re-enters the queue. Drop its bookkeeping entry
        # so the dict does not keep every dataset ever created alive.
        self.loader_to_na_dict.pop(dataloader_train, None)

        self.infront_dates = self.infront_dates[1:]
        next_date = self.get_next_month(self.infront_dates[-1])

        if (
            (self.infront_dates[0] <= self.end_dates[-1])
            and (next_date >= self.end_dates[0])
            and not self.dates_have_overlapped
        ):  # If overlap
            self.dates_have_overlapped = True

            self.infront_dates = self.infront_dates.append(
                pd.DatetimeIndex([self.end_dates[0]])
            )  # At first overlap, fix infront_dates to end_date start
            self.infront_loaders.append(self.end_loaders[0])

        else:
            self.infront_dates = self.infront_dates.append(
                pd.DatetimeIndex([next_date])
            )
            self.infront_loaders.append(self.date_to_loader(next_date))

        return dataloader_train, dataloader_val

# Run the loop! (Like Odd-Geir Lademo)

In [12]:
def get_epoch_loss(model, optimizer, dataloader, loss_fns, device, run_type, conf):
    """Run one pass over ``dataloader`` and collect per-batch losses.

    Args:
        model: Network called as ``model(data, meta_cont, meta_cat)``.
        optimizer: Optimizer stepped after every batch when training.
        dataloader: Yields ``(data, meta_cont, meta_cat, target)`` batches.
        loss_fns: Mapping from loss name to ``loss(target, prediction)``.
        device: Device the batches are moved to.
        run_type: ``"train"`` enables backpropagation and optimizer steps; any
            other value only evaluates.
        conf: Configuration object; ``conf.loss_fn`` selects the loss.

    Returns:
        ``(model_losses, naive_losses)``: two lists of floats, one entry per
        batch. The naive loss scores a constant prediction of ones.
    """
    # TODO: check if it's necessary to calculate naive loss every epoch
    is_training = run_type == "train"
    loss_fn = loss_fns[conf.loss_fn]

    model_losses = []
    naive_losses = []
    for data, meta_cont, meta_cat, target in to_device(dataloader, device):

        optimizer.zero_grad()

        # The baseline has no trainable inputs, so it never needs a graph.
        with torch.no_grad():
            naive_loss = loss_fn(target.clone(), torch.ones(target.shape, device=device))

        # Only build the autograd graph when it will be backpropagated; during
        # evaluation it would just hold on to activations.
        with torch.set_grad_enabled(is_training):
            y_pred: torch.Tensor = model(data, meta_cont, meta_cat)
            loss = loss_fn(target, y_pred)

        model_losses.append(loss.item())
        naive_losses.append(naive_loss.item())

        if is_training:
            loss.backward()
            optimizer.step()

    return model_losses, naive_losses

In [13]:
def eras_ahead_loss(model, data_loaders, optimizer, conf):
    """Evaluate ``model`` on every loader and concatenate the per-batch losses.

    The model is switched to evaluation mode and gradients are disabled for the
    duration of the call, so the result does not depend on the mode the caller
    left the model in. The previous mode is restored afterwards.

    Args:
        model: Network to evaluate.
        data_loaders: Iterable of loaders, evaluated in order.
        optimizer: Forwarded to ``get_epoch_loss``; never stepped here.
        conf: Configuration object forwarded to ``get_epoch_loss``.

    Returns:
        ``(model_losses, naive_losses)`` as two NumPy arrays holding the
        per-batch losses of all loaders.
    """
    model_infront = []
    naive_infront = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for loader in data_loaders:
                # run_type=None -> evaluation only, no optimizer step
                model_loss, naive_loss = get_epoch_loss(model, optimizer, loader, loss_fns, device, None, conf)

                model_infront += model_loss
                naive_infront += naive_loss
    finally:
        model.train(was_training)

    return np.array(model_infront), np.array(naive_infront)

In [14]:
def train_one_era(run, model, optimizer, data_train, data_val, stopper, losses, device, config, pbar):
    """Train on one era until early stopping triggers or ``max_epochs`` is reached.

    Each epoch runs a training pass over ``data_train`` followed by an
    evaluation pass over ``data_val``. The mean loss of every pass is logged to
    ``run`` and appended to ``losses``.

    Args:
        run: Object with a ``log(dict)`` method (the active wandb run).
        model: Network to train.
        optimizer: Optimizer used for the training passes.
        data_train: Loader for the training pass.
        data_val: Loader for the validation pass.
        stopper: Callable ``stopper(val_losses, pbar)`` returning True to stop.
        losses: Dict with list entries ``"train"``, ``"val"`` and
            ``"epoch_lens"``; mutated in place. ``"epoch_lens"`` receives the
            number of epochs run for this era, whether or not training stopped
            early, so it stays aligned with the flat loss lists.
        device: Device the batches are moved to.
        config: Configuration object; ``config.max_epochs`` bounds the loop.
        pbar: Progress bar exposing ``update_postfix(dict)``.

    Returns:
        ``(train_losses, val_losses)``: per-batch losses of the last epoch run
        (two empty lists when ``max_epochs`` is 0).
    """
    # Defined before the loop so that max_epochs == 0 returns empty lists.
    epoch_losses = dict(train=[], val=[])
    loaders = {"train": data_train, "val": data_val}
    epochs_run = 0

    for epoch in range(config.max_epochs):
        epochs_run = epoch + 1
        epoch_losses = dict(train=[], val=[])

        pbar.update_postfix({"epoch": epoch})
        for run_type, dataloader in loaders.items():
            model.train(run_type == "train")

            epoch_model_loss, naive_losses = get_epoch_loss(model, optimizer, dataloader, loss_fns, device, run_type, config)
            epoch_losses[run_type] += epoch_model_loss

            epoch_loss = np.mean(epoch_losses[run_type])
            losses[run_type].append(epoch_loss)

            run.log({f"epoch_{run_type}": epoch_loss, "epoch": epoch})

        # naive_losses holds the baseline of the pass that ran last, i.e. validation
        pbar.update_postfix({"train_loss": np.mean(epoch_losses["train"]), "val_loss": np.mean(epoch_losses["val"]), "naive": np.mean(naive_losses)})

        if stopper(epoch_losses["val"], pbar):
            break

    losses["epoch_lens"].append(epochs_run)

    return epoch_losses["train"], epoch_losses["val"]

In [15]:
def train(config, project=None, entity=None, enablewb=True) -> Tuple[nn.Module, dict]:
    """Train one model era by era inside a Weights & Biases run.

    Reads the notebook globals ``device``, ``stock_df``, ``fundamental_df``,
    ``meta_df`` and ``macro_df``.

    Args:
        config: Default configuration passed to ``wb.init``; the model and all
            hyper-parameters are read back from the run's config.
        project: wandb project name (may be None).
        entity: wandb entity name (may be None).
        enablewb: When False the run is created in wandb's ``"offline"`` mode.

    Returns:
        ``(model, losses)`` where ``losses`` is a dict with the per-epoch mean
        ``"train"`` and ``"val"`` losses and the ``"epoch_lens"`` list filled by
        ``train_one_era``.
    """
    mode = "online" if enablewb else "offline"
    with wb.init(config=config, project=project, entity=entity, job_type="training", mode=mode) as run:

        conf = run.config
        print(conf)

        # The model class is looked up by name; the whole config is passed as keyword arguments.
        model = models[conf.model](**conf).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=conf.learning_rate)

        stopper = EarlyStop(conf.patience, conf.min_delta)
        losses = dict(train=[], val=[], epoch_lens=[])

        # Use the configured queue length; fall back to the previous hard-coded
        # value for configs that do not define it.
        queue_length = conf.queue_length if "queue_length" in conf else 6

        eras = EraController(
            start_date=conf.start_date,
            end_metric_start_date=conf.end_date,
            queue_length=queue_length,
            stock_df=stock_df,
            fundamental_df=fundamental_df,
            meta_df=meta_df,
            macro_df=macro_df,
            conf=conf,
        )
        pbar = TqdmPostFix(eras, total=eras.total)
        for i, (data_train, data_val) in enumerate(pbar):
            # NOTE(review): intended to release cached GPU memory between eras; effect not verified.
            torch.cuda.empty_cache()

            pbar.set_description(f"Era {eras.date} [{i+1}/{eras.total}]")

            train_losses, val_losses = train_one_era(
                run=run,
                model=model,
                optimizer=optimizer,
                data_train=data_train,
                data_val=data_val,
                stopper=stopper.reset(),
                losses=losses,
                device=device,
                config=conf,
                pbar=pbar,
            )

            loaders_infront, loaders_end = eras.validation_loaders()
            model_infront, naive_infront = eras_ahead_loss(model, loaders_infront, optimizer, conf)
            model_end, naive_end = eras_ahead_loss(model, loaders_end, optimizer, conf)

            # Mean relative difference to the naive baseline, averaged over both loader queues.
            metric_loss = 0.5*(np.mean(model_infront/naive_infront-1) +  np.mean(model_end/naive_end-1))

            run.log({"era_train": np.mean(train_losses), "era_val" : np.mean(val_losses),"model_infront": np.mean(model_infront),
                     "naive_infront": np.mean(naive_infront), "model_end": np.mean(model_end), "naive_end": np.mean(naive_end),
                     "metric_loss": metric_loss, **eras.loader_to_na_dict[data_val], "time": eras.date.timestamp(), "era": i})

        if conf.save_model:
            put_nn_model(model, run)

    return model, losses

In [16]:
def get_params_from_data(stock_df, fundamental_df, meta_df, macro_df, params_human):
    """Derive the model-size parameters that depend on the loaded data frames.

    Args:
        stock_df: Stock data frame (not read; kept for a uniform signature).
        fundamental_df: Fundamentals frame; the columns from ``"revenue"`` to
            the end determine the number of fundamental features.
        meta_df: Meta frame; every column after the first, except
            ``"founding_year"``, is treated as categorical.
        macro_df: Macro frame; every column after the first is one feature.
        params_human: Hand-written parameters. ``"forecast_w"`` is required;
            ``"n_reports"`` is honoured when present (default 4), so merging
            the result into the config no longer overrides the user's choice.

    Returns:
        Dict with ``n_reports``, ``meta_cont_lens``, ``meta_cat_lens`` (one
        ``(cardinality + 1, ceil(sqrt(cardinality + 1)))`` tuple per
        categorical column), ``out_len`` and ``input_size``.
    """
    meta_cont_len = 1  # presumably the single continuous meta feature, founding_year (confirm)

    categorical_cols = [col for col in meta_df.columns[1:] if col != "founding_year"]
    # +1 on top of the number of distinct values (presumably an extra index for unknown categories; confirm)
    meta_cat_len = [int(len(meta_df[col].unique())) + 1 for col in categorical_cols]

    stock_feats = 1
    macro_feats = macro_df.shape[1] - 1
    funda_feats = (fundamental_df.loc[:, "revenue":].shape[1] - 1) + 2

    n_features = stock_feats + macro_feats + funda_feats

    data_given_params = dict(
        n_reports=params_human.get("n_reports", 4),
        meta_cont_lens=(meta_cont_len, 1),
        # plain ints keep the config free of NumPy scalars
        meta_cat_lens=[(n, int(math.ceil(n ** 0.5))) for n in meta_cat_len],
        out_len=params_human["forecast_w"],
        input_size=n_features,
    )
    return data_given_params

In [17]:
# Parameters chosen by hand (not tuned by the sweep).
params_human = {
    "cpus": 1,  # DataLoader worker processes
    "n_reports": 4,
    "training_w": 240,
    "forecast_w": 20,
    "loss_fn": "mape",  # key into loss_fns
    "start_date": "2000-12-31",  # first training era
    "end_date": "2018-10-31",  # first era of the fixed end queue
    "model": "TcnV1",  # key into models
    "save_model": True,
    "batch_size": 256,
    "pre_proc_data_dir": pre_proc_data_dir,
    "clamp": None,
    "dtype": "float32",
    "queue_length": 6,
}

# Defaults for the hyper-parameters a wandb sweep may override.
params_wb = {
    # optimisation / early stopping
    "max_epochs": 5,
    "patience": 10,
    "min_delta": 0.0001,
    "learning_rate": 0.001,
    # network size
    "hd": 128,
    "dropout": 0.5,
    "num_layers": 7,
    "channels": 256,
    "kernel_size": 3,
    "meta_hd": 32,
}

# Sizes that follow from the loaded data frames.
params_from_data = get_params_from_data(stock_df, fundamental_df, meta_df, macro_df, params_human)

# Later dicts win on duplicate keys.
config = {}
config.update(params_human)
config.update(params_wb)
config.update(params_from_data)

In [None]:
enablewb = False
# Sweep to attach to; set to a falsy value to do a single stand-alone run instead.
sweepid = "krankile/master/nuxe3sw7"

if not sweepid:
    model, losses = train(config=config, project="master-test", entity="krankile", enablewb=enablewb)

else:
    count = 500 # number of runs to execute
    # The agent calls train() once per sweep run; config and enablewb are bound up front.
    wb.agent(
        sweepid,
        partial(train, config=config, enablewb=enablewb),
        count=count,
    )

[34m[1mwandb[0m: Agent Starting Run: ekdhpgzw with config:
[34m[1mwandb[0m: 	channels: 64
[34m[1mwandb[0m: 	dropout: 0.7241806217479805
[34m[1mwandb[0m: 	hd: 512
[34m[1mwandb[0m: 	kernel_size: 7
[34m[1mwandb[0m: 	learning_rate: 0.026080648168121223
[34m[1mwandb[0m: 	max_epochs: 711
[34m[1mwandb[0m: 	meta_hd: 64
[34m[1mwandb[0m: 	min_delta: 0.01624266609493051
[34m[1mwandb[0m: 	num_layers: 7
[34m[1mwandb[0m: 	patience: 5


{'cpus': 1, 'n_reports': 4, 'training_w': 240, 'forecast_w': 20, 'loss_fn': 'mape', 'start_date': '2000-12-31', 'end_date': '2018-10-31', 'model': 'TcnV1', 'save_model': True, 'batch_size': 256, 'pre_proc_data_dir': './artifacts/era-datasets:v0', 'clamp': None, 'dtype': 'float32', 'queue_length': 6, 'max_epochs': 711, 'patience': 5, 'min_delta': 0.01624266609493051, 'learning_rate': 0.026080648168121223, 'hd': 512, 'dropout': 0.7241806217479805, 'num_layers': 7, 'channels': 64, 'kernel_size': 7, 'meta_hd': 64, 'meta_cont_lens': [1, 1], 'meta_cat_lens': [[110, 11], [6, 3], [91, 10], [285, 17], [3, 2], [5, 3], [7, 3], [14, 4], [58, 8]], 'out_len': 20, 'input_size': 37}


Era 2000-12-31 00:00:00 [1/215]:   0%|          | 0/215 [00:06<?, ?it/s, epoch=6, train_loss=0.947, val_loss=0.899, naive=0.0679, triggers=5/5, best_loss=0.299]