In [1]:
import sys
from pathlib import Path
import pandas as pd 
import numpy as np

sys.path.append(str(Path("..").resolve()))

from src.constants import raw_data_dir, raw_data_name, processed_data_dir

In [2]:
def lag_return(x, lag=1):
    r = ((x - x.shift(lag)) / x.shift(lag)) 
    return pd.Series(np.round(r * 100, 3))

def impute_col(df_sc, col, values):
    df_col = df_sc.loc[:, col].copy()
    idx_non_missing = df_col.notnull()
    df_col.loc[~idx_non_missing] = values.loc[~idx_non_missing]
    df_col.loc[idx_non_missing] = lag_return(df_col).loc[idx_non_missing]
    return df_col

In [3]:
factors = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=1)
factors.set_index("Date", inplace=True)
factors = factors.apply(lambda x: (x - np.mean(x)) / np.std(x))
categories = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=2)

In [16]:
factors = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=1)

In [47]:
factors

Unnamed: 0,Date,VIX,V2X,MSCI EM Vol,MOVE,Oil Vol,Gold Vol,HY Vol,JPM Global FX Vol,JPM G7 FX Vol,...,EURUSD ATM Vol 1Y-3M,G10 Carry Trade Index,JPY/USD Carry,GBP/USD Carry,EUR/USD Carry,CAD/USD Carry,AUD/USD Carry,NZD/USD Carry,CHF/USD Carry,Copper 1 year carry
0,2000-05-30,23.62,25.6300,,105.47,,,,12.21,12.68,...,13.400,110.765,98.7917,90.2306,76.1611,101.1919,93.7378,86.1437,77.0754,
1,2000-05-31,23.65,25.2600,,103.10,,,,12.11,12.56,...,13.275,110.382,97.7475,90.5178,76.8011,101.7900,93.2954,85.8620,77.4453,
2,2000-06-01,22.36,24.1400,,101.26,,,,11.95,12.40,...,13.175,110.611,96.9475,90.0934,76.2372,102.1823,93.2278,85.7492,77.1084,
3,2000-06-02,21.48,22.7600,,98.23,,,,11.96,12.42,...,13.400,110.789,97.3623,91.0199,77.4666,103.1199,94.4805,87.2503,78.2057,
4,2000-06-05,22.71,23.4500,,96.08,,,,11.86,12.33,...,13.175,110.838,97.9722,91.6087,77.5905,102.9840,94.8368,87.8504,78.2794,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5301,2021-06-24,15.97,17.3518,17.65,54.83,,13.0375,5.28,6.62,5.93,...,6.015,221.912,65.4594,92.9385,89.4526,128.5486,189.4258,218.1859,105.9092,3.45
5302,2021-06-25,15.62,16.6823,17.37,55.58,,12.6175,5.05,6.45,5.74,...,5.960,221.949,65.5295,92.6514,89.4726,128.8834,189.5998,218.6194,105.9633,4.00
5303,2021-06-28,15.76,17.6455,18.16,56.33,,12.7000,5.13,6.55,5.85,...,5.950,221.761,65.5999,92.6780,89.4026,128.4135,189.0244,217.3842,105.7177,3.25
5304,2021-06-29,16.02,16.9495,18.50,56.80,,13.5000,4.78,6.69,6.03,...,6.065,221.373,65.6585,92.3576,89.1828,127.7513,187.6497,216.0874,105.5419,7.05


In [112]:
sc_group = (
    categories.groupby("Subcategory")["Variable"]
    .agg("unique")
    .reset_index()
    .to_dict("list")
)

In [113]:
dfs = []

for idx, sc in enumerate(sc_group["Subcategory"]):
    if sc not in ["Policy Uncertainty", "Sentiment", "Inflation"]:
        cols = sc_group["Variable"][idx]
        df_sc = factors.loc[:, cols]
        mean_returns = df_sc.apply(lag_return).mean(axis=1)
        for col in cols:
            df_col = impute_col(df_sc, col, mean_returns)
            dfs.append(df_col)

In [114]:
pd.concat(dfs, axis=1).apply(np.isinf).sum().sort_values(ascending=False)

JPM EMBI+ Sovereign Spread                                                                 3
JPM EMBI Global Spread                                                                     3
EURJPY Risk Reversal                                                                       1
Ted Spread                                                                                 1
US 2Yr swap spread                                                                         1
                                                                                          ..
MSCI USA Enhanced Value Index                                                              0
MSCI USA Momentum Index                                                                    0
MSCI USA Sector Neutral Quality Index                                                      0
MSCI USA High Dividend Yield Index                                                         0
VIX                                                                   

In [115]:
# drop first and last row (not in targets)
df_all = pd.concat(dfs, axis=1).reset_index().replace([-np.inf, np.inf], np.nan)
# forward fill remaining nas
df_all = df_all.loc[~df_all["Date"].isin(["2000-05-30", "2021-06-30"])].fillna(method="ffill")
# fill S&P 500 VRP, which is has no other subcategory members and has consecutive missing values on top
for col in ["Global Inflation-linked debt", "S&P 500 VRP"]:
    df_all.loc[df_all[col].isna(), col] = df_all.drop(["Date", col], axis=1).loc[df_all[col].isna(), :].mean(axis=1)

df_all.to_csv(processed_data_dir / "df_imputed.csv", index=False)

In [116]:
df_x = pd.read_csv(processed_data_dir / "df_imputed.csv")
df_x["Date"] = df_x["Date"].astype(str)
df_y = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=3)[["Date", "value_1d_fwd_rel_ret"]]
df_y["Date"] = df_y["Date"].astype(str)
df_y = df_y.query("Date != '2000-05-30'")
df_model_value = df_x.merge(df_y, on = "Date")
df_model_value.head()

Unnamed: 0,Date,S&P 500,MSCI DM,MSCI EM,WTI Crude Oil,S&P GSCI,Gold,Global Inflation-linked debt,iTraxx Europe 5Yr,iTraxx Crossover 5Yr,...,S&P 500 Price-to-Earnings,P/B,US Value P/E over Growth P/E,US Value P/B over Growth P/B,EquityBond premia,S&P 500 Skew,EURUSD Risk Reversal,USDJPY Risk Reversal,EURJPY Risk Reversal,value_1d_fwd_rel_ret
0,2000-05-31,0.636,1.357,-0.975,4.305,2.188,0.15,-1.798349,1.276833,1.276833,...,-2.556,-2.556,-2.556,-2.556,-2.556,10.28,10.28,10.28,10.28,0.004223
1,2000-06-01,-9.632,-4.054,-0.561,-3.481,-1.406,-0.095,-3.204833,-3.204833,-3.204833,...,-0.206,-0.206,-0.206,-0.206,-0.206,-2.725,-2.725,-2.725,-2.725,-0.00308
2,2000-06-02,-10.749,-9.483,-1.93,-0.702,-0.501,-1.135,-4.083333,-4.083333,-4.083333,...,0.591,0.591,0.591,0.591,0.591,-9.435,-9.435,-9.435,-9.435,0.004965
3,2000-06-05,4.077,-1.211,-0.514,2.121,0.514,-0.591,0.732667,0.732667,0.732667,...,-1.127,-1.127,-1.127,-1.127,-1.127,5.209,5.209,5.209,5.209,-0.005362
4,2000-06-06,3.982,-3.157,0.448,-0.157,0.013,-0.491,0.106333,0.106333,0.106333,...,-0.498,-0.498,-0.498,-0.498,-0.498,-0.928,-0.928,-0.928,-0.928,-0.000852
