In [24]:
# Import packages
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [25]:
# Load all required raw data files (pipe-delimited CSV extracts).
# NOTE: the stated plan is to ignore earnings and the Euronext stock info for now;
# stock_info_euronext.csv is nevertheless loaded below so it is available if needed.
# To make things easier, the index is always a combination of stock_isin and date
# (the two stock_info tables have no date column in their index and use stock_isin only).

# Build paths with pathlib so the notebook runs on Windows and POSIX alike
# (a hard-coded backslash separator only resolves correctly on Windows).
RAW_DATA_DIR = Path('raw_data')
ISIN_DATE_INDEX = ['stock_isin', 'date']
ISIN_INDEX = ['stock_isin']


def load_raw_table(file_name, index_cols):
    """Read one raw extract from RAW_DATA_DIR and index it.

    Parameters
    ----------
    file_name : str
        File name inside ``RAW_DATA_DIR``, e.g. ``'price_yahoo.csv'``.
    index_cols : list of str
        Column names passed to ``DataFrame.set_index``.

    Returns
    -------
    pandas.DataFrame
        The file contents, parsed with ``sep='|'`` and indexed by ``index_cols``.
    """
    return pd.read_csv(RAW_DATA_DIR / file_name, sep='|').set_index(index_cols)


balance_sheet_yahoo = load_raw_table('balance_sheet_yahoo.csv', ISIN_DATE_INDEX)
cashflow_yahoo = load_raw_table('cashflow_yahoo.csv', ISIN_DATE_INDEX)
financials_yahoo = load_raw_table('financials_yahoo.csv', ISIN_DATE_INDEX)
price_yahoo = load_raw_table('price_yahoo.csv', ISIN_DATE_INDEX)
stock_info_yahoo = load_raw_table('stock_info_yahoo.csv', ISIN_INDEX)
stock_info_euronext = load_raw_table('stock_info_euronext.csv', ISIN_INDEX)


In [26]:
# Every distinct combination of stock_isin and date (year-end) in the financials
# becomes one row of the full dataset.
isin_date_keys = financials_yahoo.index.unique()
df = isin_date_keys.to_frame()
df.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5552 entries, ('BMG9156K1018', '2021-12-31') to ('NO0010934748', '2019-12-31')
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   stock_isin  5552 non-null   object
 1   date        5552 non-null   object
dtypes: object(2)
memory usage: 157.3+ KB


In [28]:
# Start with the prediction target: the one-year holding return following the financial year end.
# The focus is on total return, which includes dividends, so the dividend is added to the
# close price; days that pay a dividend therefore get a higher return.
price_yahoo['close_price_plus_dividends'] = price_yahoo['close_price'].add(
    price_yahoo['dividends'])
price_yahoo.head(5)


Unnamed: 0_level_0,Unnamed: 1_level_0,open_price,high_price,low_price,close_price,trading_volume,dividends,stock_splits,adjusted_close_price,close_price_plus_dividends
stock_isin,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BMG9156K1018,2019-07-12,83.686412,86.373593,79.655647,79.655647,40831.0,0.0,0.0,,79.655647
BMG9156K1018,2019-07-15,79.655642,80.596158,77.736229,77.736229,8810.0,0.0,0.0,,77.736229
BMG9156K1018,2019-07-16,77.736229,77.736229,76.776522,77.736229,21400.0,0.0,0.0,,77.736229
BMG9156K1018,2019-07-17,77.736229,77.736229,77.265975,77.736229,12966.0,0.0,0.0,,77.736229
BMG9156K1018,2019-07-18,76.786127,76.786127,73.907007,76.776527,4789.0,0.0,0.0,,76.776527


In [30]:
# Order the prices by date within each unique ISIN (ascending on both keys).
price_yahoo = price_yahoo.sort_values(by=["stock_isin", "date"], ascending=True)
price_yahoo


Unnamed: 0_level_0,Unnamed: 1_level_0,open_price,high_price,low_price,close_price,trading_volume,dividends,stock_splits,adjusted_close_price,close_price_plus_dividends
stock_isin,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AN8068571086,2017-02-23,62.592112,63.196190,62.484539,62.484539,1068.0,0.0,0.0,,62.484539
AN8068571086,2017-02-24,62.550735,62.964485,62.170085,62.790710,4463.0,0.0,0.0,,62.790710
AN8068571086,2017-02-27,63.146519,63.154796,62.476246,63.113419,1020.0,0.0,0.0,,63.113419
AN8068571086,2017-02-28,63.072062,63.535460,62.807262,62.807262,664.0,0.0,0.0,,62.807262
AN8068571086,2017-03-01,62.757614,64.089890,62.757614,64.007141,3064.0,0.0,0.0,,64.007141
...,...,...,...,...,...,...,...,...,...,...
ZM0000000037,2022-02-17,0.910000,0.910000,0.910000,0.910000,1972.0,0.0,0.0,,0.910000
ZM0000000037,2022-02-18,0.950000,0.950000,0.950000,0.950000,20.0,0.0,0.0,,0.950000
ZM0000000037,2022-02-21,0.950000,0.950000,0.950000,0.950000,0.0,0.0,0.0,,0.950000
ZM0000000037,2022-02-22,0.860000,0.900000,0.860000,0.900000,4877.0,0.0,0.0,,0.900000


In [32]:
# Daily gross total-return factor per stock:
#     (close price + dividends) on this row / close price on the previous row.
# The previous close is taken within each stock_isin, so the first row of every stock is NaN
# instead of being compared with the last price of the preceding stock in the frame.
# Dividends are included because the target is a total return.
# Relies on price_yahoo already being sorted by date within each stock_isin.
previous_close = price_yahoo.groupby(level='stock_isin')['close_price'].shift(1)
price_yahoo['pct_change'] = price_yahoo['close_price_plus_dividends'] / previous_close
price_yahoo.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,open_price,high_price,low_price,close_price,trading_volume,dividends,stock_splits,adjusted_close_price,close_price_plus_dividends,pct_change
stock_isin,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AN8068571086,2017-02-23,62.592112,63.19619,62.484539,62.484539,1068.0,0.0,0.0,,62.484539,
AN8068571086,2017-02-24,62.550735,62.964485,62.170085,62.79071,4463.0,0.0,0.0,,62.79071,1.0049
AN8068571086,2017-02-27,63.146519,63.154796,62.476246,63.113419,1020.0,0.0,0.0,,63.113419,1.005139
AN8068571086,2017-02-28,63.072062,63.53546,62.807262,62.807262,664.0,0.0,0.0,,62.807262,0.995149
AN8068571086,2017-03-01,62.757614,64.08989,62.757614,64.007141,3064.0,0.0,0.0,,64.007141,1.019104
