In [126]:
import sys
from pathlib import Path
import pandas as pd 
import numpy as np
import dask


sys.path.append(str(Path("..").resolve()))

from src.constants import raw_data_dir, raw_data_name, processed_data_dir

In [271]:
def lag_return(x, lag=1):
    return ((x - x.shift(lag)) / x.shift(lag))

def impute_col(df_sc, col, value):
    df_col = df_sc.loc[:, col].copy()
    idx_non_missing = df_col.notnull()
    df_col.loc[~idx_non_missing] = value.loc[~idx_non_missing]
    df_col.loc[idx_non_missing] = lag_return(df_col).loc[idx_non_missing]
    return df_col

In [340]:
factors = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=1)
factors.set_index("Date", inplace=True)
categories = pd.read_excel(raw_data_dir / raw_data_name, sheet_name=2)

In [158]:
sc_group = categories.groupby("Subcategory")["Variable"].agg("unique").reset_index().to_dict("list")
sc_group

{'Subcategory': ['Asset Class Performances',
  'Credit Spread',
  'Currency',
  'ETF',
  'FX Carry',
  'Growth',
  'Implied Volatility',
  'Inflation',
  'Liquidity',
  'Policy Uncertainty',
  'Rates',
  'Sentiment',
  'TERM/CARRY',
  'VRP',
  'Valuation',
  'Volatility Skew & Hedge Ratio'],
 'Variable': [array(['S&P 500', 'MSCI DM', 'MSCI EM', 'WTI Crude Oil', 'S&P GSCI',
         'Gold', 'Global Inflation-linked debt', 'iTraxx Europe 5Yr',
         'iTraxx Crossover 5Yr', 'CDX IG 5Yr', 'CDX HY 5Yr'], dtype=object),
  array(['Credit Spread A', 'Credit Spread BBB', 'High Yield Spread',
         'JPM EMBI Global Spread', 'JPM EMBI+ Sovereign Spread',
         'US IG CDS spread', 'US HY CDS spread', 'US HY over IG CDS spread',
         'EU IG CDS spread'], dtype=object),
  array(['US Dollar', 'EUR', 'JPY', 'Asian currency', 'EM currency'],
        dtype=object),
  array(['Min Vol Index', 'IWF', 'IWM',
         'MSCI USA High Dividend Yield Index',
         'MSCI USA Sector Neutral Qualit

In [315]:
dfs = []

for idx, sc in enumerate(sc_group["Subcategory"]):
    if sc not in ["Policy Uncertainty", "Sentiment", "Inflation"]:
        cols = sc_group["Variable"][idx]
        df_sc = factors.loc[:, cols]
        mean_returns = df_sc.apply(lag_return).mean(axis=1)
        for col in cols:
            df_col = impute_col(df_sc, col, mean_returns)
            dfs.append(df_col)

In [346]:
# drop first and last row (not in targets)
df_all = pd.concat(dfs, axis=1).reset_index()
# forward fill remaining nas
df_all = df_all.loc[~df_all["Date"].isin(["2000-05-30", "2021-06-30"])].fillna(method="ffill")
# fill S&P 500 VRP, which is has no other subcategory members and has consective missing values on top
for col in ["Global Inflation-linked debt", "S&P 500 VRP"]:
    df_all.loc[df_all[col].isna(), col] = df_all.drop(["Date", col], axis=1).loc[df_all[col].isna(), :].mean(axis=1)

df_all.to_csv(processed_data_dir / "df_imputed.csv", index=False)

In [347]:
df_all

Unnamed: 0,Date,S&P 500,MSCI DM,MSCI EM,WTI Crude Oil,S&P GSCI,Gold,Global Inflation-linked debt,iTraxx Europe 5Yr,iTraxx Crossover 5Yr,...,S&P 500 VRP,S&P 500 Price-to-Earnings,P/B,US Value P/E over Growth P/E,US Value P/B over Growth P/B,EquityBond premia,S&P 500 Skew,EURUSD Risk Reversal,USDJPY Risk Reversal,EURJPY Risk Reversal
1,2000-05-31,-0.001301,-0.004770,0.015917,-0.044152,-0.020002,-0.004026,-0.014489,-0.009722,-0.009722,...,-0.014489,-0.045988,-0.045988,-0.045988,-0.045988,-0.045988,-0.011825,-0.011825,-0.011825,-0.011825
2,2000-06-01,0.019858,0.014511,0.008920,0.038952,0.013407,0.002573,0.016370,0.016370,0.016370,...,0.003813,-0.003790,-0.003790,-0.003790,-0.003790,-0.003790,0.003498,0.003498,0.003498,0.003498
3,2000-06-02,0.019637,0.032103,0.030264,0.007299,0.004650,0.030609,0.020760,0.020760,0.020760,...,-0.044493,0.010877,0.010877,0.010877,0.010877,0.010877,0.011741,0.011741,0.011741,0.011741
4,2000-06-05,-0.006519,0.003596,0.007675,-0.021739,-0.004720,0.015294,-0.001069,-0.001069,-0.001069,...,0.004492,-0.020645,-0.020645,-0.020645,-0.020645,-0.020645,-0.005802,-0.005802,-0.005802,-0.005802
5,2000-06-06,-0.006671,0.009226,-0.006599,0.001684,-0.000117,0.012437,0.001660,0.001660,0.001660,...,0.001823,-0.009201,-0.009201,-0.009201,-0.009201,-0.009201,0.001094,0.001094,0.001094,0.001094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5300,2021-06-23,-0.001083,-0.002636,0.010285,0.003011,0.005710,-0.000034,0.005205,0.000304,0.002261,...,-0.571545,-0.001083,-0.001083,-0.001947,-0.001946,-0.009651,-0.035263,-0.480000,-0.435484,-0.031579
5301,2021-06-24,0.005811,0.004152,0.005384,0.002320,0.000546,-0.001973,-0.000975,0.000294,0.001568,...,4.154264,0.005789,0.005790,0.001365,0.001365,-0.013899,0.039974,-0.000000,0.114286,-0.038043
5302,2021-06-25,0.003331,0.004929,0.008972,0.010892,0.002517,0.003532,-0.001607,0.000261,0.001349,...,0.939571,0.003001,0.003286,0.005436,0.005429,-0.022790,0.059054,-0.051282,-0.256410,-0.152542
5303,2021-06-28,0.002315,-0.005298,0.001262,-0.018047,-0.002078,-0.001662,0.002128,-0.000391,-0.002420,...,-0.291623,-0.010330,0.001783,-0.099512,-0.109279,0.045537,-0.049604,0.054054,0.034483,-0.040000
