<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/inner_join_fundamentals_and_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
%%capture
!pip install wandb
!git clone https://github.com/Krankile/npmf.git

In [57]:
# Log the wandb CLI in (prompts for an API key if no session exists).
# NOTE(review): presumably required by the get_dataset/put_dataset calls
# further down, which download from and upload to W&B — confirm.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mkjartan[0m ([33mkrankile[0m). Use [1m`wandb login --relogin`[0m to force relogin


## General setup

In [58]:
%%capture
# %%capture (first line of the cell) suppresses all output of this cell.
# Bring the local npmf clone up to date before the `from npmf...` imports below.
!cd npmf && git pull

import os
from collections import defaultdict
from collections import Counter
from datetime import datetime
from operator import itemgetter

import numpy as np
from numpy.ma.core import outerproduct
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

import wandb as wb

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from npmf.utils.colors import main, main2, main3
from npmf.utils.wandb import get_dataset, put_dataset
from npmf.utils.eikon import column_mapping 

In [59]:
# Global matplotlib defaults for every figure in this notebook: lines cycle
# through the three colours imported from npmf.utils.colors, then black.
mpl.rcParams.update(
    {
        "axes.prop_cycle": mpl.cycler(color=[main, main2, main3, "black"]),
        "figure.figsize": (6, 4),  # (6, 4) is default and used in the paper
    }
)

In [60]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")

Using cpu device


In [61]:
# Seed NumPy's global (legacy) random state so random draws are repeatable.
np.random.seed(seed=420)

# Data processing

## Collect stock and fundamental data and find the intersection of their unique tickers

In [62]:
# Fetch the three ":latest" artifacts from the "master-test" project through
# the project's get_dataset helper. The generator is consumed in order, so the
# calls happen in the same sequence as the names on the left-hand side.
stock_df, fundamental_df, meta_df = (
    get_dataset(artifact, "master-test")
    for artifact in (
        "stock-data-clean:latest",
        "fundamental-data-clean:latest",
        "oil-meta-data:latest",
    )
)

[34m[1mwandb[0m: Downloading large artifact stock-data-clean:latest, 126.38MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [63]:
# Preview the fundamentals frame as loaded, before any column renaming
# (pandas shows a truncated view of the first and last rows/columns).
fundamental_df

Unnamed: 0,index,Instrument,Date,Period End Date,Balance Sheet Orig Announce Date,Total Revenue,Gross Profit,EBITDA,EBIT,Net Income after Tax,...,Total Current Assets,Total Liabilities,Total Current Liabilities,Long Term Debt Percentage of Total Assets,Short Term Debt Percentage of Total Assets,Free Cash Flow,Gross Profitp,EBITDAp,EBITp,Net Income after Taxp
0,1,OMVV.VI,2000-06-30T00:00:00Z,2000-06-30,2000-10-23,1591395023.28372,230190745.555143,181045434.200878,105530002.190288,,...,,,,,,,0.144647,0.113765,0.066313,
1,3,OMVV.VI,2000-12-31T00:00:00Z,2000-12-31,2001-04-30,1938098647.94837,372273990.672257,228378009.139304,132398360.578508,,...,,,,,,,0.192082,0.117836,0.068314,
2,4,OMVV.VI,2001-03-31T00:00:00Z,2001-03-31,2001-11-15,1675754784.0278,270127135.374168,246253937.336036,174937923.894256,,...,,,,,,,0.161197,0.146951,0.104394,
3,5,OMVV.VI,2001-06-30T00:00:00Z,2001-06-30,2001-11-15,1704178457.61632,261593371.574251,199438283.407691,139758657.318887,,...,,,,,,,0.153501,0.117029,0.082009,
4,6,OMVV.VI,2001-09-30T00:00:00Z,2001-09-30,2001-11-08,1692966706.46458,247021992.14371,223213423.136865,133206040.77615,,...,,,,,,,0.145911,0.131847,0.078682,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57598,88073,ARACA.NFF,2008-12-31T00:00:00Z,2008-12-31,2011-03-03,92904.554159,-2117171.324493,-4619760.125553,-7909109.757678,-2137357.171855,...,5109582.451442,21447887.060314,9690425.035636,18.99665,,,-22.788671,-49.725874,-85.131561,-23.005946
57599,88074,ARACA.NFF,2009-03-31T00:00:00Z,2009-03-31,2009-07-02,10549.150125,-144567.930584,-1106769.285629,-1137376.678949,-4988326.377089,...,6308391.774634,26654730.773803,11597379.056223,16.66563,,-6881572.556783,-13.704225,-104.915493,-107.816901,-472.865237
57600,88075,ARACA.NFF,2009-06-30T00:00:00Z,2009-06-30,2009-09-03,37654.234545,-51969.067513,-1521106.598827,-1548491.496678,-1570316.767768,...,6098274.440243,27039941.4959,12361947.439668,17.77714,,-2599751.794234,-1.380165,-40.396694,-41.123967,-41.703590
57601,88076,ARACA.NFF,2009-09-30T00:00:00Z,2009-09-30,2010-01-07,478926.658462,345792.09208,-961160.706547,-969172.632564,-4784882.925321,...,2744454.228709,29735080.691293,5689130.510149,34.19498,,-8585810.204493,0.722015,-2.006906,-2.023635,-9.990847


In [64]:
# Distinct company identifiers in each source. The stock frame keys them as
# "ticker", while the not-yet-renamed fundamentals frame uses "Instrument".
unique_stock = set(stock_df["ticker"].unique())
unique_fundamental = set(fundamental_df["Instrument"].unique())

# Companies for which both fundamentals and stock data exist.
full_data_companies = unique_fundamental.intersection(unique_stock)

## Inspect the raw data

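Restrict every dataset to the companies present in both sources and check how much data remains, so that we know what we're working with.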

In [65]:
# Harmonise column names via the project's column_mapping and parse the
# announcement date as a timestamp.
# NOTE(review): presumably column_mapping maps "Instrument" -> "ticker" and the
# announce-date column -> "announce_date"; verify in npmf.utils.eikon.
fundamental_df = (
    fundamental_df
    .rename(columns=column_mapping)
    .astype({"announce_date": "datetime64[ns]"})
)
meta_df = meta_df.rename(columns=column_mapping)

# Keep only rows belonging to companies present in both sources; the
# fundamentals frame also sheds its "index" column.
full_data_fundamentals = (
    fundamental_df
    .loc[fundamental_df["ticker"].isin(full_data_companies)]
    .drop(columns=["index"])
)
full_data_stock = stock_df.loc[stock_df["ticker"].isin(full_data_companies)]
full_data_meta = meta_df.loc[meta_df["ticker"].isin(full_data_companies)]

In [66]:
# Size summary of the filtered data. Note the entries are not like-for-like:
# the first is a row count, while the other two are totals of non-null cells
# summed over the numeric columns only.
(
    len(full_data_stock),
    full_data_fundamentals.count(numeric_only=True).sum(),
    full_data_meta.count(numeric_only=True).sum(),
)

(3206180, 621023, 861)

## Upload full data to WandB

In [67]:
# Publish the three filtered frames to the "master-test" project as feather
# files through the project's put_dataset helper.
# NOTE(review): each run presumably creates a new artifact version — confirm
# before re-running this cell.
put_dataset(full_data_fundamentals, filename="fundamentals-oil-final.feather", project="master-test")
put_dataset(full_data_stock, filename="stock-oil-final.feather", project="master-test")
put_dataset(full_data_meta, filename="meta-oil-final.feather", project="master-test")

VBox(children=(Label(value='4.995 MB of 4.995 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…