<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/training_loop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup

##Kernel setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [3]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


##General setup

In [24]:
%%capture
!cd npmf && git pull

import os
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from operator import itemgetter

import numpy as np
import math
from numpy.ma.core import outerproduct
import pandas as pd
from pandas.tseries.offsets import Day, BDay
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm


import wandb as wb

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from sklearn.preprocessing import MinMaxScaler, minmax_scale

from npmf.utils.colors import main, main2, main3
from npmf.utils.wandb import get_dataset, put_dataset
from npmf.utils.eikon import column_mapping

In [5]:
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=[main, main2, main3, "black"])
mpl.rcParams['figure.figsize'] = (6, 4)  # (6, 4) is default and used in the paper

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [7]:
np.random.seed(69)

# Create a Neural network class

In [8]:
class MultivariateNetwork(nn.Module):
    def __init__(self, lag_len, cat_len, out_len, hidden_dim):
        super().__init__()

        self.pre = nn.Sequential(
            nn.Linear(lag_len, hidden_dim),
            nn.ReLU(),
        )

        self.predict = nn.Sequential(
            nn.Linear(hidden_dim + cat_len, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_len),
        )


    def forward(self, lags, cats):

        x = self.pre(lags)
        x = torch.cat((x, cats), dim=1)
        x = self.predict(x)

        return x

# Get some data

In [9]:
stock_df = get_dataset("stock-oil-final:latest", project="master-test")
meta_df = get_dataset("meta-oil-final:latest", project="master-test")
fundamentals_df = get_dataset("fundamentals-oil-final:latest", project="master-test")
macro_df = get_dataset("macro-oil-final:latest", project="master-test")

stock_df = stock_df.drop_duplicates(subset=["ticker", "date"])

[34m[1mwandb[0m: Currently logged in as: [33mankile[0m ([33mkrankile[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact stock-oil-final:latest, 77.63MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Define a dataset which can iterate through time 

We want to train our neural network like the person experiences the world. I.e. we have a window of time and look at recent financial reports and macro variables to predict future market capitalization. We want to train multiple epochs over one time window and validation period, in this manner we will not have any "learned future" effect which could occur if one were to have epochs run over the all time windows. 

In [41]:
d = pd.DataFrame([[1, np.nan, np.nan], [np.nan, np.nan, np.nan]], index=["a", "b"], columns=[7, 8, 9])
d.ffill(axis=1)

Unnamed: 0,7,8,9
a,1.0,1.0,1.0
b,,,


In [35]:
class TimeDeltaDataset(Dataset):
    def _get_last_q_fundamentals(self, target, fundamental_df, q):
        for key, df in tqdm(fundamental_df.groupby(by="ticker"), desc="1/3 Get fundamentals"):

            padding = pd.DataFrame(
                np.empty((q, fundamental_df.loc[:, "revenue":].shape[1])),
                columns=df.loc[:, "revenue":].columns,
            )
            padding[:] = np.nan
            padded_df = pd.concat([padding, df.loc[:, "revenue":]], axis=0)

            target = pd.concat([target, padded_df.iloc[-q:, :]], axis=0)
        return target

    def _get_global_local_column(self, stock_df):
        
        last_market_cap_col = stock_df[["ticker", "market_cap"]].dropna(subset=["market_cap"]).drop_duplicates(subset=["ticker"], keep="last").set_index("ticker")
        
        min_max_scaler = MinMaxScaler()

        # Add column to learn relative values
        apple_market_cap = 2.687 * (10**12)  # ish as of may 2022 (USD)

        relative_to_global_market_column = last_market_cap_col / apple_market_cap

        relative_to_current_market_column = min_max_scaler.fit_transform(
            last_market_cap_col.to_numpy().reshape((-1, 1))
        )
        relative_to_current_market_column = pd.Series(
            relative_to_current_market_column[:, 0], index=last_market_cap_col.index
        )

        return (
            relative_to_global_market_column,
            relative_to_current_market_column,
            last_market_cap_col,
        )

    def OLD_get_stocks_in_timeframe(self, stock_df, stock_dates, min_max_scaler=True, desc=""):
        # TODO this needs a speedup when getting forecasts

        to_be_transposed = pd.DataFrame(index=stock_dates)

        for i, (ticker, df) in enumerate(tqdm(stock_df.groupby(by="ticker"), desc=desc)):

            if min_max_scaler:
                min_max_scaler = MinMaxScaler()

                df.market_cap = min_max_scaler.fit_transform(
                    df.market_cap.to_numpy().reshape((-1, 1))
                )

            ticker_df = (
                pd.DataFrame(df[["date", "market_cap"]])
                .set_index("date", drop=True)
                .rename(columns={"market_cap": ticker})
            )

            to_be_transposed = to_be_transposed.join(ticker_df)
            print(to_be_transposed)
            if i % 50 and i != 0:
                to_be_transposed = to_be_transposed.loc[
                    ~to_be_transposed.index.duplicated(), :
                ]

        return to_be_transposed.T.loc[:, ~to_be_transposed.T.columns.duplicated()]


    def _get_stocks_in_timeframe(self, stock_df, stock_dates, scale=True, desc=""):

        out = pd.DataFrame(data=0, columns=stock_dates, index=stock_df.ticker.unique(), dtype=np.float64)
        stock_df = stock_df.pivot(index="ticker", columns="date", values="market_cap")
        out = out.add(stock_df).ffill()

        # out.loc[out.isna()] = 0

        print(out)
        print(out.isna().sum().sum(), out.count().sum())
    
        # Perform MinMaxScaling on the full dataset
        if scale:
            out = pd.DataFrame(
                data=minmax_scale(out.values),
                index=out.index,
                columns=out.columns,
            )
        return out

    def _get_macro_normalized(self, legal_macro_df, macro_df):
        df = legal_macro_df.copy()
        for column in [
            c for c in legal_macro_df.columns if (c != "date") and ("_fx" not in c)
        ]:
            df[column] = legal_macro_df[column] / (
                int(math.ceil(macro_df[column].max() / 100.0)) * 100
            )
        return df

    def __init__(
        self,
        current_time,
        forecast_window,
        number_of_trading_days,
        n_quarterly_reports,
        stock_df,
        fundamental_df,
        meta_df,
        macro_df,
    ):
        back_in_time_buffer = timedelta(
            number_of_trading_days + number_of_trading_days * 5
        )
        forward_in_time_buffer = timedelta(forecast_window + forecast_window * 5)

        # Define relevant dates
        historic_dates = pd.date_range(
            start=current_time - back_in_time_buffer, end=current_time, freq="B"
        )[-number_of_trading_days:]
        forecast_dates = pd.date_range(
            start=current_time + timedelta(1),
            end=current_time + forward_in_time_buffer,
            freq="B",
        )[:forecast_window]

        legal_stock_df = stock_df.copy().loc[stock_df.date.isin(historic_dates), :]
        legal_fundamental_df = fundamental_df[
            fundamental_df.announce_date < current_time
        ]
        legal_meta_df = meta_df.set_index("ticker")
        legal_macro_df = macro_df.loc[
            macro_df.date.isin(historic_dates), :
        ]  # TODO change to current_time - stock__macro_days_lookback_days

        # Important dimensions
        n_companies_with_fundamentals = len(legal_fundamental_df.ticker.unique())
        m_fundamentals = legal_fundamental_df.loc[:, "revenue":].shape[1]

        # Get last q fundamentals and return NA rows if they are still missing
        fundamental_df_all_quarters = pd.DataFrame(
            data=np.empty((0, m_fundamentals)),
            columns=fundamental_df.loc[:, "revenue":].columns,
        )
        fundamental_df_all_quarters = self._get_last_q_fundamentals(
            fundamental_df_all_quarters, legal_fundamental_df, n_quarterly_reports
        )
        fundamentals = fundamental_df_all_quarters.to_numpy().reshape(
            (n_companies_with_fundamentals, n_quarterly_reports * m_fundamentals)
        )

        # Construct columns for relative information
        (
            relative_to_global_market_column,
            relative_to_current_market_column,
            last_market_cap_col,
        ) = self._get_global_local_column(stock_df)

        # Create dataframe
        fund_columns = []
        for i in range(n_quarterly_reports):
            fund_columns.extend(
                fundamental_df.loc[0, "revenue":]
                .index.to_series()
                .map(lambda title: f"{title}_q=-{n_quarterly_reports-i}")
            )
        columns = ["global_relative"] + ["peers_relative"] + fund_columns
        fundamental_df = pd.DataFrame(
            index=legal_fundamental_df.ticker.unique(), columns=columns
        )

        # Load data
        fundamental_df["peers_relative"] = relative_to_current_market_column.loc[
            legal_fundamental_df.ticker.unique()
        ]
        fundamental_df["global_relative"] = relative_to_global_market_column.loc[
            legal_fundamental_df.ticker.unique()
        ]

        formated_stocks = self._get_stocks_in_timeframe(legal_stock_df, historic_dates, desc="2/3 Get stock data")

        fundamental_df.loc[
            :, f"revenue_q={-n_quarterly_reports}":"net_income_p_q=-1"
        ] = fundamentals
        for q in range(n_quarterly_reports, 0, -1):
            fundamental_df.loc[:, f"revenue_q={-q}":f"fcf_q={-q}"] = fundamental_df.loc[
                :, f"revenue_q={-q}":f"fcf_q={-q}"
            ].div(last_market_cap_col, axis=0)
            fundamental_df.loc[
                :, f"total_assets_q={-q}":f"total_current_liabilities_q={-q}"
            ] = fundamental_df.loc[
                :, f"total_assets_q={-q}":f"total_current_liabilities_q={-q}"
            ].div(
                fundamental_df.loc[:, f"total_assets_q={-q}"], axis=0
            )
            fundamental_df = fundamental_df.drop(columns=f"total_assets_q={-q}")

        self.stocks_and_fundamentals = formated_stocks.join(fundamental_df)

        # Get forecasts
        forecasts = stock_df[stock_df.date.isin(forecast_dates)]

        forecasts_unormalized = self._get_stocks_in_timeframe(
            forecasts, forecast_dates, min_max_scaler=False, desc="3/3 Get actual values"
        )
        forecasts_normalized = forecasts_unormalized.div(last_market_cap_col, axis=0)
        self.forecast = forecasts_normalized.loc[self.stocks_and_fundamentals.index, :]

        # Join meta and stock-fundamentals
        legal_meta_df = legal_meta_df.loc[self.stocks_and_fundamentals.index, :]
        legal_meta_df.loc[:, "exchange_code":"state_province_hq"] = legal_meta_df.loc[
            :, "exchange_code":"state_province_hq"
        ].astype("category")
        legal_meta_df.loc[:, "economic_sector":"activity"] = legal_meta_df.loc[
            :, "economic_sector":"activity"
        ].astype("category")

        legal_meta_df["founding_year"] = legal_meta_df["founding_year"] / 2000
        self.meta_df = legal_meta_df

        # Ready macro data
        self.macro_df = self._get_macro_normalized(legal_macro_df, macro_df).iloc[:, 1:]

    def __len__(self):
        return self.stocks_and_fundamentals.shape[0]

    def __getitem__(self, idx):

        return (
            self.stocks_and_fundamentals.iloc[idx, :],
            self.meta_df.iloc[idx, :],
            self.macro_df.T,
        ), self.forecast.iloc[idx, :]


In [36]:
current_time = pd.to_datetime("2010-03-01")
number_of_quarterly_reports = 4
number_of_trading_days = 240
forecast_window = 20

delta_set = TimeDeltaDataset(current_time, forecast_window, number_of_trading_days, number_of_quarterly_reports, stock_df, fundamentals_df, meta_df, macro_df)

1/3 Get fundamentals: 100%|██████████| 493/493 [00:04<00:00, 105.84it/s]


                    2009-03-31           2009-04-01        2009-04-02  \
000096.SZ     410320654.480527     418759419.950541  414890254.609306   
000159.SZ     509453203.191909     535076403.576288  535068573.953761   
000440.KQ      10799802.812796      10826818.265007   11147476.043625   
000552.SZ     209032197.163722     226179824.112146  223834064.969271   
000554.SZ     443296099.167264     450266648.326724  447445934.328358   
...                        ...                  ...               ...   
XOM.N      332308261487.398987  337822333961.419983    342799638318.5   
XOP.CD            3642914.3809       3384585.492063    3713010.073571   
YPFD.BA       9857994814.25683    9542566755.357861    3713010.073571   
ZENZ.L          1128059.014833       1128685.714286    1149764.734417   
ZEST.OQ         1128059.014833       1128685.714286    1149764.734417   

                   2009-04-03           2009-04-06           2009-04-07  \
000096.SZ    416417189.763399                 <N

TypeError: ignored

In [None]:
stock_df.pivot(index="ticker", columns="date", values="market_cap")

In [None]:
it = iter(delta_set)
(stock, meta, macro), actuals = next(it)
(stock, meta, macro), actuals = next(it)

In [None]:
stock

2009-02-02 00:00:00                   0.0
2009-02-03 00:00:00              0.059504
2009-02-04 00:00:00              0.060508
2009-02-05 00:00:00              0.045324
2009-02-06 00:00:00              0.104092
                                   ...   
short_term_debt_p_assets_q=-1        <NA>
gross_profit_p_q=-1             -0.145236
ebitda_p_q=-1                   -0.214431
ebit_p_q=-1                     -0.251064
net_income_p_q=-1                     NaN
Name: 000159.SZ, Length: 302, dtype: object

# Run the loop! (Like Odd-Geir Lademo)

![picture](https://drive.google.com/uc?id=1Y55gFQSi4Baovmi0kUQGhbgGOBTI03E7)


In [None]:
def train_multivar(model, optimizer, loss_fn, data_train, data_val, one_hot_encoding, batch_number, forecast_window, epochs, device):

    # print(model)
    train_losses = []
    val_losses = []
    it = tqdm(range(epochs), disable=True)
    for epoch in it: 
        for run_type in ["train", "val"]:
            model.train(run_type == "train")

            if run_type == "train":
                optimizer.zero_grad()

                mu, sigma = 0, 0.1
                noise = np.random.normal(mu, sigma, data_train.shape)
                noise[:,-forecast_window:] = 0

                data_train = data_train + noise
                data_encoded = pd.concat([one_hot_encoding, data_train], axis=1, join="inner")

            else:
                data_encoded = pd.concat([one_hot_encoding, data_val], axis=1, join="inner")

            data_shuffled = torch.tensor(data_encoded.sample(frac=1).values, dtype=torch.float32)

            for batch in torch.split(data_shuffled, batch_number, dim=0):

                inputs = batch[:,:-forecast_window].to(device)
                actuals = batch[:,-forecast_window:].to(device)

                pred = model(inputs[:, 3:], inputs[:, 0:3])
                loss = loss_fn(pred, actuals)

                if run_type == "train":
                    train_losses.append(loss.item())
                    loss.backward()

                    optimizer.step()
                else:
                    val_losses.append(loss.item())

        it.set_postfix({"train_loss": np.mean(train_losses), "val_loss": np.mean(val_losses)})

    return train_losses, val_losses

IndentationError: ignored

In [None]:
def run_multivar(hidden_dim):
    #Training loop params
    forecast_window = 16
    amount_of_time_series = 999
    length_of_time_series = 500
    epochs = 200
    batch_size = 111
    


    #_, TS_signal, one_hot_encoding  = time_series_df(amount_of_time_series,length_of_time_series,periods, horisontal_shift, vertical_shift, forecast_window)

    df_train = TS_signal.iloc[-int(amount_of_time_series/3):,:]
    df_val = TS_signal.iloc[:-int(amount_of_time_series/3),:]

    loss_fn = nn.L1Loss()
    
    model = MultivariateNetwork(length_of_time_series, one_hot_encoding.shape[1], forecast_window, hidden_dim)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    return train_multivar(model, optimizer, loss_fn, df_train, df_val, one_hot_encoding, batch_size, forecast_window, epochs, device)