In [109]:
import sys
from pathlib import Path
import pandas as pd 
import numpy as np

sys.path.append(str(Path("..").resolve()))

from src.constants import raw_data_dir, raw_data_name, processed_data_dir

In [110]:
def lag_return(x, lag=1):
    r = ((x - x.shift(lag)) / x.shift(lag)) 
    return pd.Series(np.round(r * 100, 3))

def impute_col(df_sc, col, values):
    df_col = df_sc.loc[:, col].copy()
    idx_non_missing = df_col.notnull()
    df_col.loc[~idx_non_missing] = values.loc[~idx_non_missing]
    df_col.loc[idx_non_missing] = lag_return(df_col).loc[idx_non_missing]
    return df_col

In [111]:
factors = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=1)
factors.set_index("Date", inplace=True)
factors = factors.apply(lambda x: (x - np.mean(x)) / np.std(x))
categories = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=2)

In [112]:
sc_group = (
    categories.groupby("Subcategory")["Variable"]
    .agg("unique")
    .reset_index()
    .to_dict("list")
)

In [113]:
dfs = []

for idx, sc in enumerate(sc_group["Subcategory"]):
    if sc not in ["Policy Uncertainty", "Sentiment", "Inflation"]:
        cols = sc_group["Variable"][idx]
        df_sc = factors.loc[:, cols]
        mean_returns = df_sc.apply(lag_return).mean(axis=1)
        for col in cols:
            df_col = impute_col(df_sc, col, mean_returns)
            dfs.append(df_col)

In [114]:
pd.concat(dfs, axis=1).apply(np.isinf).sum().sort_values(ascending=False)

JPM EMBI+ Sovereign Spread                                                                 3
JPM EMBI Global Spread                                                                     3
EURJPY Risk Reversal                                                                       1
Ted Spread                                                                                 1
US 2Yr swap spread                                                                         1
                                                                                          ..
MSCI USA Enhanced Value Index                                                              0
MSCI USA Momentum Index                                                                    0
MSCI USA Sector Neutral Quality Index                                                      0
MSCI USA High Dividend Yield Index                                                         0
VIX                                                                   

In [115]:
# drop first and last row (not in targets)
df_all = pd.concat(dfs, axis=1).reset_index().replace([-np.inf, np.inf], np.nan)
# forward fill remaining nas
df_all = df_all.loc[~df_all["Date"].isin(["2000-05-30", "2021-06-30"])].fillna(method="ffill")
# fill S&P 500 VRP, which is has no other subcategory members and has consecutive missing values on top
for col in ["Global Inflation-linked debt", "S&P 500 VRP"]:
    df_all.loc[df_all[col].isna(), col] = df_all.drop(["Date", col], axis=1).loc[df_all[col].isna(), :].mean(axis=1)

df_all.to_csv(processed_data_dir / "df_imputed.csv", index=False)

In [116]:
df_x = pd.read_csv(processed_data_dir / "df_imputed.csv")
df_x["Date"] = df_x["Date"].astype(str)
df_y = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=3)[["Date", "value_1d_fwd_rel_ret"]]
df_y["Date"] = df_y["Date"].astype(str)
df_y = df_y.query("Date != '2000-05-30'")
df_model_value = df_x.merge(df_y, on = "Date")
df_model_value.head()

Unnamed: 0,Date,S&P 500,MSCI DM,MSCI EM,WTI Crude Oil,S&P GSCI,Gold,Global Inflation-linked debt,iTraxx Europe 5Yr,iTraxx Crossover 5Yr,...,S&P 500 Price-to-Earnings,P/B,US Value P/E over Growth P/E,US Value P/B over Growth P/B,EquityBond premia,S&P 500 Skew,EURUSD Risk Reversal,USDJPY Risk Reversal,EURJPY Risk Reversal,value_1d_fwd_rel_ret
0,2000-05-31,0.636,1.357,-0.975,4.305,2.188,0.15,-1.798349,1.276833,1.276833,...,-2.556,-2.556,-2.556,-2.556,-2.556,10.28,10.28,10.28,10.28,0.004223
1,2000-06-01,-9.632,-4.054,-0.561,-3.481,-1.406,-0.095,-3.204833,-3.204833,-3.204833,...,-0.206,-0.206,-0.206,-0.206,-0.206,-2.725,-2.725,-2.725,-2.725,-0.00308
2,2000-06-02,-10.749,-9.483,-1.93,-0.702,-0.501,-1.135,-4.083333,-4.083333,-4.083333,...,0.591,0.591,0.591,0.591,0.591,-9.435,-9.435,-9.435,-9.435,0.004965
3,2000-06-05,4.077,-1.211,-0.514,2.121,0.514,-0.591,0.732667,0.732667,0.732667,...,-1.127,-1.127,-1.127,-1.127,-1.127,5.209,5.209,5.209,5.209,-0.005362
4,2000-06-06,3.982,-3.157,0.448,-0.157,0.013,-0.491,0.106333,0.106333,0.106333,...,-0.498,-0.498,-0.498,-0.498,-0.498,-0.928,-0.928,-0.928,-0.928,-0.000852
