<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/arima_forecacst.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [6]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
%%capture
# Fetch the project repository (source of the `npmf.utils` helpers imported
# further down) and install the wandb client. %%capture silences the output
# of both shell commands.
!git clone https://github.com/Krankile/npmf.git
!pip install wandb

In [8]:
# Log this runtime in to Weights & Biases from the shell.
# NOTE(review): presumably required by the artifact download/upload cells
# further down — confirm.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mkrankile[0m (use `wandb login --relogin` to force relogin)


## General setup

In [4]:
%%capture
# pmdarima supplies the AutoARIMA estimator used in the training section.
!pip install pmdarima

In [136]:
%%capture
!cd npmf && git pull

import os
import pickle
from collections import defaultdict
from datetime import datetime
from operator import itemgetter

import numpy as np
from numpy.ma.core import outerproduct
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

import pmdarima as pm

import wandb as wb

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from npmf.utils.colors import main, main2, main3
from npmf.utils.wandb import get_df_artifact
from npmf.utils.data import test_start_pd

In [9]:
# Global matplotlib styling: cycle through the three project colours and then
# black, and pin the figure size.
palette = [main, main2, main3, "black"]
mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=palette)
# (6, 4) is the default size and the one used in the paper.
mpl.rcParams["figure.figsize"] = (6, 4)

In [10]:
# Use the GPU when torch reports one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [11]:
# Seed NumPy's global random state so NumPy-based randomness is repeatable.
np.random.seed(seed=420)

# Let's get the data and split into training and testing 😂✨KAWAIII ^^✨



In [118]:
# Download the "stock-oil-final:v3" dataframe artifact from W&B.
# NOTE(review): the second argument looks like the W&B project name — confirm
# against npmf.utils.wandb.get_df_artifact.
data = get_df_artifact("stock-oil-final:v3", "master-test")

# Keep only rows dated after the start of 2000.
# NOTE(review): "01.01.2000" is an ambiguous day/month literal; it only reads
# the same either way because day and month are both 01 — confirm `date` is a
# datetime column rather than a string.
after_2000 = data.date > "01.01.2000"
data = data.loc[after_2000]

[34m[1mwandb[0m: Downloading large artifact stock-oil-final:v3, 82.95MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [133]:
# Training window: from 2018-10-01 up to (not including) the test start.
# `test_start_pd` comes from npmf.utils.data; its value is not visible here.
in_train_window = (data["date"] >= "2018-10-01") & (data["date"] < test_start_pd)
in_test_window = data["date"] >= test_start_pd
trn = data.loc[in_train_window].copy()
tst = data.loc[in_test_window].copy()

# Number each ticker's test observations 1, 2, ... in date order and keep the
# first 20 per ticker.
tst["obs_number"] = (
    tst.groupby("ticker")["date"]
    .rank(method="first", ascending=True)
    .astype(int)
)
tst = tst.loc[tst["obs_number"] <= 20]

# Only tickers present in both windows can be trained on and evaluated.
tickers = set(trn["ticker"].unique()).intersection(tst["ticker"].unique())

trn.shape, tst.shape

((58355, 5), (19898, 6))

## Train ARIMA models on the last quarter of 2018 🎯

The fitted models are then pickled and uploaded to WandB as a model artifact.


In [135]:
# Fit one AutoARIMA model (default settings) per ticker on its market-cap
# history from the training window. In the recorded run this took roughly
# 13 minutes for 933 tickers.
trained_models = {}

for ticker in tqdm(tickers):
    # Rows for this ticker, reduced to a date-indexed market_cap series.
    history = trn.loc[trn.ticker == ticker, ["date", "market_cap"]]
    series = history.set_index("date").squeeze()

    mod = pm.arima.AutoARIMA()
    mod.fit(series)

    trained_models[ticker] = mod

100%|██████████| 933/933 [13:15<00:00,  1.17it/s]


In [137]:
# Serialise every fitted model into a single pickle file on local disk.
filename = "arima-trained-1Q.pickle"

with open(filename, "wb") as f:
    pickle.dump(trained_models, f)

# Log the pickle to W&B as a "model" artifact named after the file without
# its extension ("arima-trained-1Q").
with wb.init(project="master-test") as run:
    art = wb.Artifact(os.path.splitext(filename)[0], type="model")
    art.add_file(filename)

    run.log_artifact(art)
    

VBox(children=(Label(value='119.941 MB of 119.941 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

## Predict for first month of 2019

In [143]:
# Number of steps to forecast per ticker; matches the `obs_number <= 20` cut
# applied to the test set.
horizon = 20

# Build a long-format table with one row per (ticker, forecast step).
ob_num = list(range(1, horizon + 1))
out = dict(ticker=[], market_cap=[], obs_number=[])

for ticker, model in tqdm(trained_models.items()):
    forec = model.predict(horizon)

    out["ticker"].extend([ticker] * horizon)
    out["market_cap"].extend(forec)
    out["obs_number"].extend(ob_num)

forec_df = pd.DataFrame(out)

forec_df

100%|██████████| 933/933 [00:03<00:00, 273.80it/s]


Unnamed: 0,ticker,market_cap,obs_number
0,CSAN3.SA,3.517287e+09,1
1,CSAN3.SA,3.517287e+09,2
2,CSAN3.SA,3.517287e+09,3
3,CSAN3.SA,3.517287e+09,4
4,CSAN3.SA,3.517287e+09,5
...,...,...,...
18655,OANDO.LG,2.024660e+08,16
18656,OANDO.LG,2.024660e+08,17
18657,OANDO.LG,2.024660e+08,18
18658,OANDO.LG,2.024660e+08,19


In [160]:
# Forecast error = predicted minus realised market cap, aligned on
# (ticker, obs_number).
#
# Only the market_cap column is selected on each side before subtracting, so
# the result does not depend on which other columns `tst` happens to carry
# (the earlier version subtracted whole frames and then dropped a hard-coded
# list of leftover columns, which raises KeyError if that list goes stale).
keys = ["ticker", "obs_number"]
predicted = forec_df.set_index(keys)[["market_cap"]]
actual = tst.set_index(keys)[["market_cap"]]

# (ticker, obs_number) pairs present on only one side become NaN after index
# alignment and are dropped.
errors = (predicted - actual).dropna()

In [164]:
# Mean absolute error per ticker, then the unweighted average across tickers.
# NOTE(review): errors are in raw market_cap units; `tst` has a `currency`
# column, so tickers may not be on a common scale — confirm this is intended.
abs_errors = errors.abs()
per_ticker_mae = abs_errors.groupby(level="ticker").mean()
per_ticker_mae.mean()

market_cap    3.680191e+08
dtype: float64