<a href="https://colab.research.google.com/github/Krankile/npmf/blob/main/notebooks/inner_join_fundamentals_and_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Kernel setup

In [1]:
%%capture
!pip install wandb

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell execution, so edits to imported code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [3]:
%%capture
# Fetch the project repository into ./npmf; the `npmf.utils` imports further
# down presumably resolve against this checkout.
!git clone https://github.com/Krankile/npmf.git

In [4]:
%%capture
# Bring an already-existing checkout up to date (covers the case where the
# clone above did nothing because ./npmf was already present).
!cd npmf && git pull

In [5]:
# Authenticate this session with Weights & Biases (presumably required by the
# artifact download/upload helpers used below).
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mkrankile[0m (use `wandb login --relogin` to force relogin)


## General setup

In [6]:
import os
from collections import defaultdict
from collections import Counter
from datetime import datetime
from operator import itemgetter

import numpy as np
from numpy.ma.core import outerproduct
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

import wandb as wb

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from npmf.utils.colors import main, main2, main3
from npmf.utils.wandb import get_df_artifact, put_dataset
from npmf.utils.eikon import column_mapping 

In [7]:
# Plot defaults for the whole notebook: cycle through the three project
# colours followed by black, and fix the figure size.
mpl.rc("axes", prop_cycle=mpl.cycler(color=[main, main2, main3, "black"]))
mpl.rc("figure", figsize=(6, 4))  # (6, 4) is default and used in the paper

In [8]:
# Pick the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [9]:
# Seed NumPy's legacy global random generator so random draws are repeatable.
np.random.seed(seed=420)

## Collect stock and fundamental data and find the intersection of their unique tickers

In [10]:
# Download the three input tables from the "master-test" project, in the
# order stock -> fundamentals -> meta.
stock_df, fundamental_df, meta_df = (
    get_df_artifact(artifact_name, "master-test")
    for artifact_name in (
        "stock-data-sufficiency-cleaned:latest",
        "fundamental-data-clean:latest",
        "oil-meta-data:latest",
    )
)

[34m[1mwandb[0m: Currently logged in as: [33mkrankile[0m (use `wandb login --relogin` to force relogin)


[34m[1mwandb[0m: Downloading large artifact stock-data-sufficiency-cleaned:latest, 156.14MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [11]:
# Distinct tickers in each source. The fundamentals table still uses the raw
# "Instrument" column name at this point.
unique_stock = set(stock_df["ticker"].unique())
unique_fundamental = set(fundamental_df["Instrument"].unique())

# Companies that appear in BOTH the fundamentals and the stock data.
full_data_companies = unique_fundamental.intersection(unique_stock)

In [12]:
# Display the meta table for inspection.
meta_df

Unnamed: 0,Instrument,Exchange Market Identifier Code,Region of Headquarters,Country of Headquarters,State or Province of Headquarters,Organization Founded Year,TRBC Economic Sector Name,TRBC Business Sector Name,TRBC Industry Group Name,TRBC Industry Name,TRBC Activity Name
0,OMVV.VI,WBAH,Europe,Austria,WIEN,1956,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Refining and Marketing,Oil & Gas Refining and Marketing (NEC)
1,ENRA.KL,XKLS,Asia,Malaysia,WILAYAH PERSEKUTUAN,1992,Energy,Energy - Fossil Fuels,Oil & Gas,Integrated Oil & Gas,Integrated Oil & Gas
2,MDINp.TA,XTAE,Asia,Israel,,1992,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Exploration and Production,Oil & Gas Exploration and Production (NEC)
3,CWV.V,XTSX,America,Canada,ALBERTA,2012,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Exploration and Production,Oil & Gas Exploration and Production (NEC)
4,2178.HK,XHKG,Asia,China,GUANGDONG,2007,Energy,Energy - Fossil Fuels,Oil & Gas Related Equipment and Services,Oil & Gas Drilling,Oil & Gas Drilling (NEC)
...,...,...,...,...,...,...,...,...,...,...,...
2063,ALHAF.PA,ALXP,Europe,France,GRAND EST,2015,Energy,Renewable Energy,Renewable Energy,Renewable Fuels,Biomass & Biogas Fuels
2064,SMRTG.IS,XEQY,Asia,Turkey,,2014,Energy,Renewable Energy,Renewable Energy,Renewable Energy Equipment & Services,Renewable Energy Equipment & Services (NEC)
2065,FDR.AX,XASX,Oceania,Australia,WESTERN AUSTRALIA,2004,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Exploration and Production,Oil & Gas Exploration and Production (NEC)
2066,DINO.N,XNYS,America,United States of America,TEXAS,2021,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Refining and Marketing,Petroleum Refining


In [13]:
# Show the raw-name -> project-name column mapping that the rename step below applies.
print(column_mapping)

{'Instrument': 'ticker', 'Date': 'date', 'Company Market Cap': 'market_cap', 'Price Close': 'close_price', 'Currency': 'currency', 'Period End Date': 'period_end_date', 'Balance Sheet Orig Announce Date': 'announce_date', 'Total Revenue': 'revenue', 'Gross Profit': 'gross_profit', 'EBITDA': 'ebitda', 'EBIT': 'ebit', 'Net Income after Tax': 'net_income', 'Total Assets': 'total_assets', 'Total Current Assets': 'total_current_assets', 'Total Liabilities': 'total_liabilites', 'Total Current Liabilities': 'total_current_liabilities', 'Long Term Debt Percentage of Total Assets': 'long_term_debt_p_assets', 'Short Term Debt Percentage of Total Assets': 'short_term_debt_p_assets', 'Free Cash Flow': 'fcf', 'Gross Profitp': 'gross_profit_p', 'EBITDAp': 'ebitda_p', 'EBITp': 'ebit_p', 'Net Income after Taxp': 'net_income_p', 'Exchange Market Identifier Code': 'exchange_code', 'Region of Headquarters': 'region_hq', 'Country of Headquarters': 'country_hq', 'State or Province of Headquarters': 'state_

In [14]:
# Translate the raw column names (e.g. "Instrument") into the project's
# names (e.g. "ticker") using the shared mapping.
meta_df = meta_df.rename(columns=column_mapping)
fundamental_df = fundamental_df.rename(columns=column_mapping)

# Restrict every table to the companies present in both stock and fundamentals.
full_data_stock = stock_df.loc[stock_df["ticker"].isin(full_data_companies)]
full_data_meta = meta_df.loc[meta_df["ticker"].isin(full_data_companies)]
full_data_fundamentals = fundamental_df.loc[
    fundamental_df["ticker"].isin(full_data_companies)
]

In [15]:
# Remove the stray "index" column carried by the loaded tables (presumably a
# leftover from a reset_index() before they were stored -- TODO confirm).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
full_data_fundamentals = full_data_fundamentals.drop(columns=["index"], errors="ignore")
full_data_stock = full_data_stock.drop(columns=["index"], errors="ignore")

In [18]:
# Total number of non-NA cells over the numeric columns of the fundamentals
# table: per-column counts (axis=0), summed into a single figure.
full_data_fundamentals.count(axis=0, numeric_only=True).sum()

667382

## Upload full data to W&B

In [16]:
# Upload the three filtered tables (fundamentals, stock, meta) to the
# "master-test" project, one feather-named dataset per table.
put_dataset(full_data_fundamentals, filename="fundamentals-oil-final.feather", project="master-test")
put_dataset(full_data_stock, filename="stock-oil-final.feather", project="master-test")
put_dataset(full_data_meta, filename="meta-oil-final.feather", project="master-test")

VBox(children=(Label(value='5.477 MB of 5.477 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='82.954 MB of 82.954 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

VBox(children=(Label(value='0.077 MB of 0.077 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…