# Data transformation

# 1. Introduction and purpose

This notebook has the purpose of transformating/cleaning the data obtained in the extraction step from `notebooks/1.data_extraction.ipynb`.

Transformation goals are:
- Cleaning and formatting of tables extracted
- Generation of Technical Indicators from OHLCV data

TO-DO:
- [] Modify/clean technical indicators function
- [] Add ffil for macroindicators when missing with respect to other markets due to non-working days (as opposed to not yet reported), as of "last price".
- [] Inthe case of BTC, as we are predicting stocks, exclude non coincidental days.
- [] Save all transformed files. Explore cloud sources to host the data.

# 2. Data transformation

In [2]:
%load_ext autoreload
%autoreload 2

# Data preparation
import numpy as np
import polars as pl
import pandas as pd

# Data
import sys
sys.path.append("..")

from src.support.data_transformation import TickerExtender, TechnicalIndicators, FileHandler

# 1. OHLCV

## 1.1 Adding simple temporal growth and simple technical indicators

In [3]:
ticker_extender = TickerExtender()

stocks_df_list = ticker_extender.transform_daily_tickers_parallel("../../data/extracted/OHLCV")

## 1.2 Adding Technical Indicators

In [4]:
print("Enriching ticker dataframes.")
ticker_df_list = ticker_extender.transform_daily_tickers_parallel("../../data/extracted/OHLCV")
print("Merging into one collection.")
merged_df_with_tech_ind = ticker_extender.merge_tickers(ticker_df_list)

merged_df_with_tech_ind

Enriching ticker dataframes.
Merging into one collection.


datetime,close,high,low,open,volume,symbol,currency,industry,sector,country,region,year,month,weekday,quarter_n,month_dt,quarter,growth_adj_1d,growth_adj_3d,growth_adj_7d,growth_adj_30d,growth_adj_90d,growth_adj_365d,SMA10,SMA22,30d_volatility,high_minus_low_relative,is_growing_moving_average,growth_adj_future_7d,is_positive_growth_7d,growth_adj_future_30d,is_positive_growth_30d,adx,adxr,apo,aroon_1,…,cdlhangingman,cdlharami,cdlharamicross,cdlhighwave,cdlhikkake,cdlhikkakemod,cdlhomingpigeon,cdlidentical3crows,cdlinneck,cdlinvertedhammer,cdlkicking,cdlkickingbylength,cdlladderbottom,cdllongleggeddoji,cdllongline,cdlmarubozu,cdlmatchinglow,cdlmathold,cdlmorningdojistar,cdlmorningstar,cdlonneck,cdlpiercing,cdlrickshawman,cdlrisefall3methods,cdlseparatinglines,cdlshootingstar,cdlshortline,cdlspinningtop,cdlstalledpattern,cdlsticksandwich,cdltakuru,cdltasukigap,cdlthrusting,cdltristar,cdlunique3river,cdlupsidegap2crows,cdlxsidegap3methods
datetime[ns],f64,f64,f64,f64,i64,str,str,str,str,str,str,i32,i8,i8,i8,datetime[ns],date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,i8,f64,i8,f64,f64,f64,f64,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
2009-08-06 00:00:00,1.150074,1.201963,1.106005,1.17282,241978000,"""AVGO""","""USD""","""semiconductors""","""technology""","""united_states""","""US_AMERICA""",2009,8,4,3,2009-08-01 00:00:00,2009-07-01,,,,,,,,,,0.083436,,1.022356,1,1.085875,1,,,,,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2009-08-07 00:00:00,1.167844,1.191301,1.139412,1.147942,24543000,"""AVGO""","""USD""","""semiconductors""","""technology""","""united_states""","""US_AMERICA""",2009,8,5,3,2009-08-01 00:00:00,2009-07-01,1.015451,,,,,,,,,0.044431,,1.010241,1,1.048088,1,,,,,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2009-08-10 00:00:00,1.135147,1.18206,1.109558,1.18206,24210000,"""AVGO""","""USD""","""semiconductors""","""technology""","""united_states""","""US_AMERICA""",2009,8,1,3,2009-08-01 00:00:00,2009-07-01,0.972002,,,,,,,,,0.06387,,1.015413,1,1.063343,1,,,,,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2009-08-11 00:00:00,1.113823,1.13728,1.10174,1.135858,23054000,"""AVGO""","""USD""","""semiconductors""","""technology""","""united_states""","""US_AMERICA""",2009,8,2,3,2009-08-01 00:00:00,2009-07-01,0.981215,0.968479,,,,,,,,0.031908,,1.082015,1,1.071682,1,,,,,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2009-08-12 00:00:00,1.13728,1.151495,1.113112,1.147941,14513000,"""AVGO""","""USD""","""semiconductors""","""technology""","""united_states""","""US_AMERICA""",2009,8,3,3,2009-08-01 00:00:00,2009-07-01,1.021059,0.973828,,,,,,,,0.03375,,1.105646,1,1.039039,1,,,,,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-01-17 00:00:00,59.490002,59.639999,58.970001,59.23,2748200,"""TTE""","""USD""","""oil-gas-integrated""","""energy""","""france""","""EU""",2025,1,5,1,2025-01-01 00:00:00,2025-01-01,1.008134,1.03353,1.049577,1.079933,0.884761,0.992873,57.303,55.282934,33.222224,0.011262,1,,,,,34.117609,42.486492,1.571198,14.285714,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-21 00:00:00,59.110001,59.459999,58.740002,59.009998,2480500,"""TTE""","""USD""","""oil-gas-integrated""","""energy""","""france""","""EU""",2025,1,2,1,2025-01-01 00:00:00,2025-01-01,0.993612,1.024792,1.040486,1.107892,0.881304,0.974348,57.666,55.544589,34.881658,0.012181,1,,,,,34.43987,42.47543,1.831681,7.142857,…,0,0,0,100,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0,0,0,0,0,0,0,0
2025-01-22 00:00:00,58.110001,58.790001,58.080002,58.779999,1263600,"""TTE""","""USD""","""oil-gas-integrated""","""energy""","""france""","""EU""",2025,1,3,1,2025-01-01 00:00:00,2025-01-01,0.983082,0.984748,1.009555,1.087744,0.911634,0.959514,57.825,55.757657,35.066826,0.012218,1,,,,,33.8584,41.274417,1.995928,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-100,-100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2025-01-23 00:00:00,58.75,59.02,58.25,58.869999,1871100,"""TTE""","""USD""","""oil-gas-integrated""","""energy""","""france""","""EU""",2025,1,4,1,2025-01-01 00:00:00,2025-01-01,1.011014,0.987561,1.018551,1.111401,0.925679,0.979282,57.999001,56.02533,34.893309,0.013106,1,,,,,33.488507,39.928288,2.128669,0.0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-100,0,0,0,0,0,0,0,0,0


In [5]:
merged_df_with_tech_ind[["datetime","symbol"]]

datetime,symbol
datetime[ns],str
2009-08-06 00:00:00,"""AVGO"""
2009-08-07 00:00:00,"""AVGO"""
2009-08-10 00:00:00,"""AVGO"""
2009-08-11 00:00:00,"""AVGO"""
2009-08-12 00:00:00,"""AVGO"""
…,…
2025-01-17 00:00:00,"""TTE"""
2025-01-21 00:00:00,"""TTE"""
2025-01-22 00:00:00,"""TTE"""
2025-01-23 00:00:00,"""TTE"""


# 2. Macroindicators

## 2.1 Indices

In [8]:
pl.read_parquet("../data/extracted/macro/^GSPC.parquet")

date,close,high,low,open,volume,symbol,currency,country,region
datetime[ns],f64,f64,f64,f64,i64,str,str,str,str
1927-12-30 00:00:00,17.66,17.66,17.66,17.66,0,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
1928-01-03 00:00:00,17.76,17.76,17.76,17.76,0,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
1928-01-04 00:00:00,17.719999,17.719999,17.719999,17.719999,0,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
1928-01-05 00:00:00,17.549999,17.549999,17.549999,17.549999,0,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
1928-01-06 00:00:00,17.66,17.66,17.66,17.66,0,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
…,…,…,…,…,…,…,…,…,…
2025-01-17 00:00:00,5996.660156,6014.959961,5978.439941,5995.399902,4366830000,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
2025-01-21 00:00:00,6049.240234,6051.509766,6006.879883,6014.120117,4702920000,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
2025-01-22 00:00:00,6086.370117,6100.810059,6076.129883,6081.390137,4323040000,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""
2025-01-23 00:00:00,6118.709961,6118.72998,6074.669922,6076.319824,4432250000,"""^GSPC""","""USD""","""United States""","""US_AMERICA"""


In [52]:
from functools import partial

indices = ["^GSPC","^DJI","^GDAXI","EPI"]

prefixed_growth = partial(ticker_extender.compute_daily_index_features,prefix=f"{indices[0]}_")
merged_indices = ticker_extender.read_transform_save(prefixed_growth,f"../../data/extracted/macro/{indices[0]}.parquet")
for index in indices[1:]:
    prefixed_growth = partial(ticker_extender.compute_daily_index_features,prefix=f"{index}_")
    index_df = ticker_extender.read_transform_save(prefixed_growth,f"../../data/extracted/macro/{index}.parquet")
    merged_indices = merged_indices.join(index_df,how="full",on="datetime")
    merged_indices = merged_indices.with_columns(pl.col("datetime")
                                                 .fill_null(pl.col("datetime_right"))) \
                                    .sort(by="datetime") \
                                    .select(pl.exclude("datetime_right")) \
                                    .fill_null(strategy="forward") 


merged_indices

datetime,^GSPC_growth_adj_1d,^GSPC_growth_adj_3d,^GSPC_growth_adj_7d,^GSPC_growth_adj_30d,^GSPC_growth_adj_90d,^GSPC_growth_adj_365d,^GSPC_SMA10,^GSPC_SMA22,^GSPC_30d_volatility,^GSPC_high_minus_low_relative,^GSPC_is_growing_moving_average,^DJI_growth_adj_1d,^DJI_growth_adj_3d,^DJI_growth_adj_7d,^DJI_growth_adj_30d,^DJI_growth_adj_90d,^DJI_growth_adj_365d,^DJI_SMA10,^DJI_SMA22,^DJI_30d_volatility,^DJI_high_minus_low_relative,^DJI_is_growing_moving_average,^GDAXI_growth_adj_1d,^GDAXI_growth_adj_3d,^GDAXI_growth_adj_7d,^GDAXI_growth_adj_30d,^GDAXI_growth_adj_90d,^GDAXI_growth_adj_365d,^GDAXI_SMA10,^GDAXI_SMA22,^GDAXI_30d_volatility,^GDAXI_high_minus_low_relative,^GDAXI_is_growing_moving_average,EPI_growth_adj_1d,EPI_growth_adj_3d,EPI_growth_adj_7d,EPI_growth_adj_30d,EPI_growth_adj_90d,EPI_growth_adj_365d,EPI_SMA10,EPI_SMA22,EPI_30d_volatility,EPI_high_minus_low_relative,EPI_is_growing_moving_average
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8
1927-12-30 00:00:00,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1928-01-03 00:00:00,1.005663,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1928-01-04 00:00:00,0.997748,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1928-01-05 00:00:00,0.990406,0.993771,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1928-01-06 00:00:00,1.006268,0.994369,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-01-21 00:00:00,1.008768,1.016694,1.0365,0.995911,1.03232,1.265283,5924.198047,5935.673184,1116.980945,0.007378,0,1.012371,1.018608,1.04087,1.007053,1.022305,1.175005,42851.228516,42828.904297,7869.451471,0.011845,1,1.002463,1.018717,1.038018,1.031171,1.078618,1.265525,20543.17793,20287.221325,5596.451063,0.005839,1,0.995912,0.997271,1.01317,0.921711,0.891074,1.058137,44.179,45.112237,16.36082,0.003877,0
2025-01-22 00:00:00,1.006138,1.025101,1.041668,1.00591,1.046621,1.257564,5935.29707,5937.298651,1166.59265,0.004055,0,1.002974,1.023257,1.038535,1.016268,1.033137,1.166199,42996.245703,42861.033026,8842.262975,0.003765,1,1.010088,1.016786,1.03303,1.046297,1.090735,1.2584,20635.610938,20329.96946,6484.715341,0.007931,1,0.993159,0.991801,0.99634,0.925102,0.889485,1.039638,44.027999,44.951969,15.617646,0.005741,0
2025-01-23 00:00:00,1.005313,1.020353,1.02837,1.041986,1.047281,1.261478,5956.265088,5948.50546,1292.975812,0.007201,1,1.009248,1.024771,1.031084,1.052879,1.034528,1.172709,43199.916797,42962.769354,10337.993507,0.010136,1,1.007399,1.020067,1.036607,1.057549,1.101824,1.266436,20745.053906,20382.931286,7434.004099,0.00789,1,1.009185,0.998183,0.999545,0.953188,0.896559,1.03978,43.914999,44.853858,15.40674,0.004323,0
2025-01-24 00:00:00,0.997145,1.008596,1.027605,1.039911,1.04447,1.254212,5974.564111,5959.149103,1357.296013,0.006464,1,0.99684,1.00905,1.029456,1.049171,1.02741,1.171975,43378.821875,43057.40625,11203.574905,0.004801,1,0.999225,1.016773,1.023515,1.056928,1.092504,1.26139,20863.067969,20435.311257,8160.475788,0.007829,1,0.990899,0.993159,0.991801,0.941458,0.899008,1.061887,43.768999,44.730764,15.169271,0.003215,0


# 2.2 GDP

In [6]:
file_handler = FileHandler()
gdppot = pd.read_csv("../data/extracted/macro/GDPPOT.csv",index_col=0)

gdppot['gdppot_us_yoy'] = gdppot["GDPPOT"].pct_change(4)
gdppot['gdppot_us_qoq'] = gdppot["GDPPOT"].pct_change(1)
gdppot.drop(columns="GDPPOT",inplace=True)

gdppot.index = pd.to_datetime(gdppot.index, utc=False, format="%Y-%m-%d")
gdppot

Unnamed: 0_level_0,gdppot_us_yoy,gdppot_us_qoq
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1955-01-01,,
1955-04-01,,0.006516
1955-07-01,,0.006366
1955-10-01,,0.006565
1956-01-01,0.026246,0.006545
...,...,...
2024-01-01,0.020357,0.005102
2024-04-01,0.020474,0.005143
2024-07-01,0.020675,0.005201
2024-10-01,0.020854,0.005247


## 2.3 CPI

In [8]:
cpilfesl = pd.read_csv("../data/extracted/macro/CPILFESL.csv",index_col=0)
cpilfesl.index = pd.to_datetime(cpilfesl.index, utc=False, format="%Y-%m-%d")

# information in current month is actually lagging from previous
cpilfesl.index = cpilfesl.index + pd.DateOffset(months=1)

cpilfesl['cpi_core_yoy_prev_month'] = cpilfesl["CPILFESL"].pct_change(12)
cpilfesl['cpi_core_mom_prev_month'] = cpilfesl["CPILFESL"].pct_change(1)

# cpilfesl.drop(columns="CPILFESL",inplace=True)
# cpilfesl.index = pd.to_datetime(cpilfesl.index, utc=False, format="%Y-%m-%d")
cpilfesl.tail(3)

Unnamed: 0_level_0,CPILFESL,cpi_core_yoy_prev_month,cpi_core_mom_prev_month
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-11-01,321.666,0.033,0.002803
2024-12-01,322.657,0.033002,0.003081
2025-01-01,323.383,0.032483,0.00225


## 2.4 FEDFUNDS

In [8]:
fedfunds = pd.read_csv("../data/extracted/macro/FEDFUNDS.csv",index_col=0)
fedfunds.index = pd.to_datetime(fedfunds.index, utc=False, format="%Y-%m-%d")

# information in current month is actually lagging from previous
fedfunds.index = fedfunds.index + pd.DateOffset(months=1)
fedfunds.columns = ["FEDFUNDS_prev_month"]

fedfunds

Unnamed: 0_level_0,FEDFUNDS_prev_month
DATE,Unnamed: 1_level_1
1955-02-01,1.39
1955-03-01,1.29
1955-04-01,1.35
1955-05-01,1.43
1955-06-01,1.43
...,...
2024-09-01,5.33
2024-10-01,5.13
2024-11-01,4.83
2024-12-01,4.64


## 2.5 Credit Risk - Euro AAA Bonds (Eurozone)

In [9]:
euro_yield_df = pd.read_csv("../data/extracted/macro/eurostat_euro_yield.csv", index_col=0)

# information in current day is actually lagging 2 days
euro_yield_df = ticker_extender.transform_euro_yield_df(euro_yield_df)


euro_yield_df


  eurostat_euro_yield_df = pd.concat([eurostat_euro_yield_df,pd.DataFrame(columns=eurostat_euro_yield_df.columns, index=new_index)],axis=0).shift(2)


Unnamed: 0,eur_yld_Y1_prev_2d,eur_yld_Y10_prev_2d,eur_yld_Y5_prev_2d
2004-09-08 00:00:00+00:00,2.29884,4.20922,3.45722
2004-09-09 00:00:00+00:00,2.32889,4.20963,3.47952
2004-09-10 00:00:00+00:00,2.34667,4.22842,3.50789
2004-09-13 00:00:00+00:00,2.30899,4.16187,3.43063
2004-09-14 00:00:00+00:00,2.27157,4.12098,3.37473
...,...,...,...
2025-01-13 00:00:00+00:00,2.33536,2.61630,2.27237
2025-01-14 00:00:00+00:00,2.36144,2.65624,2.32642
2025-01-15 00:00:00+00:00,2.37913,2.68203,2.35564
2025-01-16 00:00:00+00:00,2.37854,2.71869,2.39536


## 2.6 Credit Risk - DGS (Deposit Guarantee Schemes, US)

In [10]:
dgs = file_handler.read_csv_file("../data/extracted/macro/DGS.csv")
dgs.columns = [col + "_prev_1d" for col in dgs.columns] # lagged 1d
dgs.index = dgs.index + pd.offsets.BusinessDay(1)
dgs

Unnamed: 0_level_0,DGS1_prev_1d,DGS5_prev_1d,DGS10_prev_1d
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1962-01-03 00:00:00+00:00,3.22,3.88,4.06
1962-01-04 00:00:00+00:00,3.24,3.87,4.03
1962-01-05 00:00:00+00:00,3.24,3.86,3.99
1962-01-08 00:00:00+00:00,3.26,3.89,4.02
1962-01-09 00:00:00+00:00,3.31,3.91,4.03
...,...,...,...
2025-01-13 00:00:00+00:00,4.25,4.59,4.77
2025-01-14 00:00:00+00:00,4.24,4.61,4.79
2025-01-15 00:00:00+00:00,4.22,4.59,4.78
2025-01-16 00:00:00+00:00,4.19,4.45,4.66


## 2.7 Volatility - VIX (US)  

In [11]:
vix = file_handler.read_csv_file("../data/extracted/macro/VIX.csv")
vix = vix[["Close"]]
vix.columns = ["VIX_close"]
vix

Unnamed: 0_level_0,VIX_close
Date,Unnamed: 1_level_1
1990-01-02 00:00:00+00:00,17.240000
1990-01-03 00:00:00+00:00,18.190001
1990-01-04 00:00:00+00:00,19.219999
1990-01-05 00:00:00+00:00,20.110001
1990-01-08 00:00:00+00:00,20.260000
...,...
2025-01-14 00:00:00+00:00,18.709999
2025-01-15 00:00:00+00:00,16.120001
2025-01-16 00:00:00+00:00,16.600000
2025-01-17 00:00:00+00:00,15.970000


## 2.8 Commodities - GOLD  

In [12]:
prefixed_growth_gold = partial(ticker_extender.calculate_growth_features,prefix=f"gold_")
gold = ticker_extender.read_transform_save(prefixed_growth_gold,f"../data/extracted/macro/GOLD.csv")
gold

Unnamed: 0_level_0,gold_growth_adj_1d,gold_growth_adj_3d,gold_growth_adj_7d,gold_growth_adj_30d,gold_growth_adj_90d,gold_growth_adj_365d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-08-30 00:00:00+00:00,,,,,,
2000-08-31 00:00:00+00:00,1.016064,,,,,
2000-09-01 00:00:00+00:00,0.995329,,,,,
2000-09-05 00:00:00+00:00,0.995668,1.006937,,,,
2000-09-06 00:00:00+00:00,0.994199,0.985268,,,,
...,...,...,...,...,...,...
2025-01-10 00:00:00+00:00,1.009203,1.019498,1.024008,1.016361,1.024085,1.339714
2025-01-13 00:00:00+00:00,0.987078,1.003378,1.013304,0.991066,1.022371,1.327260
2025-01-14 00:00:00+00:00,1.001496,0.997653,1.007829,0.979406,1.027437,1.308203
2025-01-15 00:00:00+00:00,1.013072,1.001477,1.018015,1.009302,1.035068,1.338845


## 2.9 Commodities - WTI Crude Oil (Futures and Spot)  

In [13]:
prefixed_growth_crude = partial(ticker_extender.calculate_growth_features,prefix=f"crude_oil_")
crude_oil = ticker_extender.read_transform_save(prefixed_growth_crude,f"../data/extracted/macro/WTI_oil_futures.csv")
crude_oil

Unnamed: 0_level_0,crude_oil_growth_adj_1d,crude_oil_growth_adj_3d,crude_oil_growth_adj_7d,crude_oil_growth_adj_30d,crude_oil_growth_adj_90d,crude_oil_growth_adj_365d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-08-23 00:00:00+00:00,,,,,,
2000-08-24 00:00:00+00:00,0.986895,,,,,
2000-08-25 00:00:00+00:00,1.013279,,,,,
2000-08-28 00:00:00+00:00,1.025585,1.025585,,,,
2000-08-29 00:00:00+00:00,0.995437,1.034461,,,,
...,...,...,...,...,...,...
2025-01-10 00:00:00+00:00,1.035850,1.031246,1.035289,1.119936,0.992611,1.072860
2025-01-13 00:00:00+00:00,1.029385,1.075014,1.071506,1.149147,1.071361,1.094418
2025-01-14 00:00:00+00:00,0.983253,1.048431,1.043771,1.102575,1.058165,1.066318
2025-01-15 00:00:00+00:00,1.032774,1.045318,1.091653,1.143102,1.055241,1.105525


## 2.10 Commodities - Brent Oil (Futures and Spot)  

In [14]:
prefixed_growth_brent = partial(ticker_extender.calculate_growth_features,prefix=f"brent_oil_")
brent_oil = ticker_extender.read_transform_save(prefixed_growth_brent,f"../data/extracted/macro/Brent_oil_futures.csv")
brent_oil

Unnamed: 0_level_0,brent_oil_growth_adj_1d,brent_oil_growth_adj_3d,brent_oil_growth_adj_7d,brent_oil_growth_adj_30d,brent_oil_growth_adj_90d,brent_oil_growth_adj_365d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-07-30 00:00:00+00:00,,,,,,
2007-07-31 00:00:00+00:00,1.017296,,,,,
2007-08-01 00:00:00+00:00,0.977936,,,,,
2007-08-02 00:00:00+00:00,1.005441,1.000264,,,,
2007-08-03 00:00:00+00:00,0.986668,0.970149,,,,
...,...,...,...,...,...,...
2025-01-10 00:00:00+00:00,1.036922,1.035172,1.042478,1.105628,0.985543,1.038542
2025-01-13 00:00:00+00:00,1.015672,1.063682,1.061730,1.122178,1.049624,1.046506
2025-01-14 00:00:00+00:00,0.986545,1.039002,1.037248,1.087051,1.043614,1.020820
2025-01-15 00:00:00+00:00,1.026401,1.028460,1.077075,1.117423,1.033123,1.047771


## 2.11 Cryptocurrency - BTC-USD  

Non coincidental days from stocks market to BTC, as stocks are the goal, exclude those days of data from BTC.

In [15]:
prefixed_growth_btc = partial(ticker_extender.calculate_growth_features,prefix=f"BTC_")
BTC_usd = ticker_extender.read_transform_save(prefixed_growth_btc,f"../data/extracted/macro/BTC_usd.csv")
BTC_usd

Unnamed: 0_level_0,BTC_growth_adj_1d,BTC_growth_adj_3d,BTC_growth_adj_7d,BTC_growth_adj_30d,BTC_growth_adj_90d,BTC_growth_adj_365d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-09-17 00:00:00+00:00,,,,,,
2014-09-18 00:00:00+00:00,0.928074,,,,,
2014-09-19 00:00:00+00:00,0.930157,,,,,
2014-09-20 00:00:00+00:00,1.035735,0.894104,,,,
2014-09-21 00:00:00+00:00,0.975341,0.939640,,,,
...,...,...,...,...,...,...
2025-01-16 00:00:00+00:00,0.992562,1.055444,1.054885,1.004609,1.124636,1.582188
2025-01-17 00:00:00+00:00,1.047166,1.082126,1.105554,1.090469,1.187665,1.718330
2025-01-18 00:00:00+00:00,0.999483,1.038840,1.104654,1.108780,1.152608,1.717416
2025-01-19 00:00:00+00:00,0.968216,1.013360,1.047191,1.062268,1.158614,1.645114


# 3. Merging all together

In [16]:
stocks_indices_merged = pd.merge(merged_df_with_tech_ind,merged_indices, on="date", how="left")

In [17]:
stocks_indices_merged.tail()

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,^GDAXI_growth_adj_7d,^GDAXI_growth_adj_30d,^GDAXI_growth_adj_90d,^GDAXI_growth_adj_365d,EPI_growth_adj_1d,EPI_growth_adj_3d,EPI_growth_adj_7d,EPI_growth_adj_30d,EPI_growth_adj_90d,EPI_growth_adj_365d
226949,349.140015,350.910004,344.369995,345.130005,2711200.0,ACN,ireland,EU,information-technology-services,technology,...,0.995878,0.988902,1.052922,1.211201,0.986102,0.960071,0.944154,0.905175,0.901227,1.04918
226950,348.98999,352.519989,345.630005,351.209991,1825400.0,ACN,ireland,EU,information-technology-services,technology,...,0.996596,0.994443,1.061098,1.223251,1.009935,0.971118,0.97004,0.915505,0.893845,1.058069
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.012038,1.011241,1.079103,1.252134,1.005948,1.001823,0.975377,0.918265,0.895146,1.045433
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,1.016651,1.016047,1.072733,1.246753,0.998635,1.014556,0.975561,0.9253,0.897203,1.051487
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,1.034064,1.024718,1.088101,1.262653,1.002733,1.007321,1.00319,0.925494,0.896003,1.069948


In [18]:
stocks_to_gdppot = pd.merge(stocks_indices_merged, gdppot, how="left", left_on="quarter", right_index=True)

In [19]:
stocks_to_cpilfesl = pd.merge(stocks_to_gdppot, cpilfesl, how="left", left_on="month_dt", right_index=True)
stocks_to_cpilfesl.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,EPI_growth_adj_3d,EPI_growth_adj_7d,EPI_growth_adj_30d,EPI_growth_adj_90d,EPI_growth_adj_365d,gdppot_us_yoy,gdppot_us_qoq,CPILFESL,cpi_core_yoy_prev_month,cpi_core_mom_prev_month
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.001823,0.975377,0.918265,0.895146,1.045433,0.021063,0.005308,323.383,0.032483,0.00225
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,1.014556,0.975561,0.9253,0.897203,1.051487,0.021063,0.005308,323.383,0.032483,0.00225
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,1.007321,1.00319,0.925494,0.896003,1.069948,0.021063,0.005308,323.383,0.032483,0.00225


In [20]:
stocks_to_fedfunds = pd.merge(stocks_to_cpilfesl, fedfunds, how="left", left_on="month_dt", right_index=True)
stocks_to_fedfunds.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,EPI_growth_adj_7d,EPI_growth_adj_30d,EPI_growth_adj_90d,EPI_growth_adj_365d,gdppot_us_yoy,gdppot_us_qoq,CPILFESL,cpi_core_yoy_prev_month,cpi_core_mom_prev_month,FEDFUNDS_prev_month
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,0.975377,0.918265,0.895146,1.045433,0.021063,0.005308,323.383,0.032483,0.00225,4.48
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,0.975561,0.9253,0.897203,1.051487,0.021063,0.005308,323.383,0.032483,0.00225,4.48
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,1.00319,0.925494,0.896003,1.069948,0.021063,0.005308,323.383,0.032483,0.00225,4.48


In [21]:
stocks_to_euro_yld = pd.merge(stocks_to_fedfunds, euro_yield_df, how="left", left_on="date", right_index=True)
stocks_to_euro_yld.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,EPI_growth_adj_365d,gdppot_us_yoy,gdppot_us_qoq,CPILFESL,cpi_core_yoy_prev_month,cpi_core_mom_prev_month,FEDFUNDS_prev_month,eur_yld_Y1_prev_2d,eur_yld_Y10_prev_2d,eur_yld_Y5_prev_2d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.045433,0.021063,0.005308,323.383,0.032483,0.00225,4.48,2.37913,2.68203,2.35564
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,1.051487,0.021063,0.005308,323.383,0.032483,0.00225,4.48,2.37854,2.71869,2.39536
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,1.069948,0.021063,0.005308,323.383,0.032483,0.00225,4.48,2.33828,2.63028,2.30341


In [22]:
stocks_to_dgs = pd.merge(stocks_to_euro_yld, dgs, how="left", left_on="date", right_index=True)
stocks_to_dgs.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,CPILFESL,cpi_core_yoy_prev_month,cpi_core_mom_prev_month,FEDFUNDS_prev_month,eur_yld_Y1_prev_2d,eur_yld_Y10_prev_2d,eur_yld_Y5_prev_2d,DGS1_prev_1d,DGS5_prev_1d,DGS10_prev_1d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,323.383,0.032483,0.00225,4.48,2.37913,2.68203,2.35564,4.22,4.59,4.78
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,323.383,0.032483,0.00225,4.48,2.37854,2.71869,2.39536,4.19,4.45,4.66
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,323.383,0.032483,0.00225,4.48,2.33828,2.63028,2.30341,4.18,4.39,4.61


In [23]:
stocks_to_vix = pd.merge(stocks_to_dgs, vix, how="left", left_on="date", right_index=True)
stocks_to_vix.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,cpi_core_yoy_prev_month,cpi_core_mom_prev_month,FEDFUNDS_prev_month,eur_yld_Y1_prev_2d,eur_yld_Y10_prev_2d,eur_yld_Y5_prev_2d,DGS1_prev_1d,DGS5_prev_1d,DGS10_prev_1d,VIX_close
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,0.032483,0.00225,4.48,2.37913,2.68203,2.35564,4.22,4.59,4.78,16.120001
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,0.032483,0.00225,4.48,2.37854,2.71869,2.39536,4.19,4.45,4.66,16.6
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,0.032483,0.00225,4.48,2.33828,2.63028,2.30341,4.18,4.39,4.61,15.97


In [24]:
stocks_to_gold = pd.merge(stocks_to_vix, gold, how="left", left_on="date", right_index=True)
stocks_to_gold.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,DGS1_prev_1d,DGS5_prev_1d,DGS10_prev_1d,VIX_close,gold_growth_adj_1d,gold_growth_adj_3d,gold_growth_adj_7d,gold_growth_adj_30d,gold_growth_adj_90d,gold_growth_adj_365d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,4.22,4.59,4.78,16.120001,1.013072,1.001477,1.018015,1.009302,1.035068,1.338845
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,4.19,4.45,4.66,16.6,,,,,,
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,4.18,4.39,4.61,15.97,,,,,,


In [25]:
stocks_to_crude = pd.merge(stocks_to_gold, crude_oil, how="left", left_on="date", right_index=True)
stocks_to_crude.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,gold_growth_adj_7d,gold_growth_adj_30d,gold_growth_adj_90d,gold_growth_adj_365d,crude_oil_growth_adj_1d,crude_oil_growth_adj_3d,crude_oil_growth_adj_7d,crude_oil_growth_adj_30d,crude_oil_growth_adj_90d,crude_oil_growth_adj_365d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.018015,1.009302,1.035068,1.338845,1.032774,1.045318,1.091653,1.143102,1.055241,1.105525
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,,,,,,
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,,,,,,


In [26]:
stocks_to_brent = pd.merge(stocks_to_crude, brent_oil, how="left", left_on="date", right_index=True)
stocks_to_brent.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,crude_oil_growth_adj_7d,crude_oil_growth_adj_30d,crude_oil_growth_adj_90d,crude_oil_growth_adj_365d,brent_oil_growth_adj_1d,brent_oil_growth_adj_3d,brent_oil_growth_adj_7d,brent_oil_growth_adj_30d,brent_oil_growth_adj_90d,brent_oil_growth_adj_365d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.091653,1.143102,1.055241,1.105525,1.026401,1.02846,1.077075,1.117423,1.033123,1.047771
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,,,,,,
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,,,,,,


In [27]:
stocks_to_btc = pd.merge(stocks_to_brent, BTC_usd, how="left", left_on="date", right_index=True)
stocks_to_btc.tail(3)

Unnamed: 0,close,high,low,open,volume,symbol,country,region,industry,sector,...,brent_oil_growth_adj_7d,brent_oil_growth_adj_30d,brent_oil_growth_adj_90d,brent_oil_growth_adj_365d,BTC_growth_adj_1d,BTC_growth_adj_3d,BTC_growth_adj_7d,BTC_growth_adj_30d,BTC_growth_adj_90d,BTC_growth_adj_365d
226951,349.730011,355.200012,349.059998,352.350006,2624000.0,ACN,ireland,EU,information-technology-services,technology,...,1.077075,1.117423,1.033123,1.047771,1.04113,1.06367,1.061277,1.018529,1.248903,1.642554
226952,350.559998,353.25,347.0,349.109985,2025800.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,0.992562,1.055444,1.054885,1.004609,1.124636,1.582188
226953,352.589996,357.0,351.910004,354.920013,4061300.0,ACN,ireland,EU,information-technology-services,technology,...,,,,,1.047166,1.082126,1.105554,1.090469,1.187665,1.71833


# 4. Dataset save

In [28]:
stocks_to_btc.to_parquet("../data/transformed/dataset.parquet")