<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/preprocess_nn_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the cloned `npmf` sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%capture
# Install the Weights & Biases client and fetch the project sources that the
# `npmf.*` imports below rely on. `%%capture` hides the command output.
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [3]:
# Authenticate with Weights & Biases; the API key can be copied from
# https://wandb.ai/authorize
!wandb login 

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## General setup

In [4]:
%%capture
# Bring the cloned repository up to date before importing from it.
!cd npmf && git pull

import math
import multiprocessing
import os
import pickle
import random
from collections import Counter, defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from operator import itemgetter
from typing import Callable, List, Tuple
from functools import partial
from glob import glob
from enum import Enum
from pathlib import Path

from more_itertools import chunked

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from npmf.utils import Problem
from npmf.utils.colors import main, main2, main3
from npmf.utils.dataset import EraDataset, EraController
from npmf.utils.dtypes import fundamental_types
from npmf.utils.eikon import column_mapping
from npmf.utils.tests.utils import pickle_df
from npmf.utils.wandb import get_datasets, put_dataset, put_nn_model, get_processed_data
from npmf.utils.training import EarlyStop, to_device, TqdmPostFix, loss_fns, get_naive_pred
from npmf.utils.models import models

from numpy.ma.core import outerproduct
from pandas.tseries.offsets import BDay, Day
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from torch import nn
from torch.utils.data import DataLoader, Dataset, ConcatDataset

import wandb as wb

In [5]:
# Turn every NumPy floating-point condition (overflow, divide, invalid, ...)
# into an exception instead of a warning.
np.seterr(all="raise")

# Plot defaults: the project colour cycle followed by black, and the figure
# size used in the paper.
mpl.rcParams.update({
    "axes.prop_cycle": mpl.cycler(color=[main, main2, main3, "black"]),
    "figure.figsize": (6, 4),  # (6, 4) is default and used in the paper
})

In [6]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")

Using cpu device


In [7]:
# Seed every random number generator this notebook imports so that a fresh
# "Restart & Run All" is repeatable; originally only NumPy was seeded.
# NOTE(review): whether the npmf dataset code draws from `random` or `torch`
# is not visible from this notebook - seeding them is harmless either way.
random.seed(69)
torch.manual_seed(69)
np.random.seed(69)  # kept last: it returns None, so the cell shows no output


# Get some data

In [8]:
%%capture
# Set to False to reuse the frames already living in the kernel instead of
# fetching them again.
reload_data = True

if reload_data or "stock_df" not in globals():
    # Dataset identifiers, in the same order as the frames unpacked below.
    names = ["stock-data:final", "fundamental-data:final", "meta-data:final", "macro-data:final"]

    stock_df, fundamental_df, meta_df, macro_df = get_datasets(names=names, project="master")

    stock_df = stock_df.drop(columns=["close_price", "currency"]).astype({"market_cap": np.float32})
    fundamental_df = fundamental_df.drop(columns="period_end_date").astype(fundamental_types)

    # Cast every column after the first to float32. Assigning the cast values
    # back through `.iloc[:, 1:] = ...` makes pandas write into the existing
    # columns, which can leave their dtype unchanged, so build the frame with
    # `astype` instead.
    macro_df = macro_df.astype({col: np.float32 for col in macro_df.columns[1:]})

In [9]:
def generate_data(config, project="master"):
    """Build one pickled ``EraDataset`` per month and log them as a W&B artifact.

    Reads the notebook-level frames ``stock_df``, ``fundamental_df``,
    ``meta_df`` and ``macro_df``.

    Args:
        config: Run configuration. Must provide ``start_date`` and
            ``end_date``; every entry is also forwarded to ``EraDataset`` as a
            keyword argument.
        project: W&B project the run and the artifact are logged to.
    """
    out_dir = "era_datasets"
    # exist_ok replaces the separate exists() check and makes re-runs a no-op.
    os.makedirs(out_dir, exist_ok=True)

    # Pass job_type by keyword: the original relied on it being the first
    # positional parameter of wandb.init, which does not hold for every wandb
    # release (and the install cell does not pin a version).
    with wb.init(job_type="dataprocessing", project=project, config=config) as run:
        conf = run.config

        # freq="M" produces one month-end timestamp per month in the range.
        dates = pd.date_range(start=conf.start_date, end=conf.end_date, freq="M")

        for date in TqdmPostFix(dates):
            data = EraDataset(
                current_time=date,
                stock_df=stock_df,
                fundamental_df=fundamental_df,
                meta_df=meta_df,
                macro_df=macro_df,
                **conf,
            )

            # The file name is the string form of the timestamp, unchanged from
            # the original so existing readers of the artifact keep working.
            with open(f"{out_dir}/{date}", "wb") as f:
                pickle.dump(data, f)

        artname = "era-dataset-target-minmax"

        art = wb.Artifact(artname, "dataset", metadata=conf.as_dict())

        # NOTE(review): the whole directory is added, so files left over from
        # an earlier run end up in the artifact as well.
        art.add_dir(f"./{out_dir}")

        run.log_artifact(art)

In [10]:
def get_params_from_data(stock_df, fundamental_df, meta_df, macro_df, params):
    """Derive the data-dependent model parameters from the loaded frames.

    Args:
        stock_df: Not used; kept so existing callers keep working.
        fundamental_df: Fundamentals frame; must contain a ``revenue`` column.
        meta_df: Meta frame. Every column after the first, except
            ``founding_year``, is counted as a categorical feature.
        macro_df: Macro frame. Every column after the first counts as a feature.
        params: Mapping with ``forecast_problem`` and ``forecast_w``; an
            optional ``feature_subset`` overrides the computed feature count.

    Returns:
        dict with the keys ``meta_cont_lens``, ``meta_cat_lens``, ``out_len``
        and ``input_size``.
    """
    meta_cont_len = 1

    # Number of distinct values per categorical meta column, plus one.
    # NOTE(review): the extra slot is presumably reserved for unknown
    # categories - confirm against the model code.
    categorical_cols = [col for col in meta_df.columns[1:] if col != "founding_year"]
    meta_cat_len = [len(meta_df[col].unique()) + 1 for col in categorical_cols]

    stock_feats = 1
    macro_feats = macro_df.shape[1] - 1
    funda_feats = (fundamental_df.loc[:, "revenue":].shape[1] - 1) + 2

    n_features = stock_feats + macro_feats + funda_feats

    if params.get("feature_subset") is not None:
        n_features = len(params["feature_subset"])

    # Compare against the enum member rather than the notebook-level
    # `forecast_problem` variable: that variable is defined in a later cell and
    # is also what `params` is built from, so the old comparison was always
    # true and the `forecast_w` branch could never be taken.
    is_volatility = params["forecast_problem"] == Problem.volatility.name

    data_given_params = dict(
        meta_cont_lens=(meta_cont_len, 1),
        # (cardinality, fourth root of the cardinality rounded up), as plain ints.
        meta_cat_lens=[(n, int(math.ceil(n ** 0.25))) for n in meta_cat_len],
        out_len=1 if is_volatility else params["forecast_w"],
        input_size=n_features,
    )

    return data_given_params

In [13]:
# Forecasting task for this run; its name is what ends up in the config.
forecast_problem = Problem.volatility

# Settings chosen by hand.
params_human = {
    "forecast_problem": forecast_problem.name,
    "cpus": 1,
    "training_w": 240 * 2,
    "forecast_w": 240,
    "start_date": "2001-12-31",
    "end_date": "2021-03-31",
    "dtype": "float32",
}

# Settings derived from the loaded frames; the helper receives a copy of the
# hand-picked settings.
params_from_data = get_params_from_data(
    stock_df, fundamental_df, meta_df, macro_df, dict(params_human)
)

# Full run configuration; derived values win if a key appears in both.
config = dict(params_human)
config.update(params_from_data)

In [None]:
# Build and upload the monthly era datasets. Long-running: the saved output of
# this cell shows roughly 22 s per iteration over 232 iterations.
generate_data(config)

 36%|███▌      | 84/232 [25:13<54:56, 22.27s/it]

## In case an escape hatch is needed

In [None]:
# Refill the local dataset folder from files under artifacts/ instead of
# regenerating them (assumes that artifact directory already exists locally).
!mkdir era_datasets
!cp artifacts/era-datasets\:v2/* era_datasets

In [None]:
# Delete the local dataset folder and everything in it.
!rm -rf era_datasets