In [None]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV
from tqdm import tqdm
import statsmodels.api as sm

In [86]:
## Fetching data - using value weighted returns

industries = pd.read_csv("data/17_Industry_Portfolios.csv",index_col="Date")
industries.index = pd.to_datetime(industries.index,format="%Y%m")
industries = industries.resample("M").last()

mom_size = pd.read_csv("data/MOM-size.csv",index_col="Date")
mom_size.index = pd.to_datetime(mom_size.index,format="%Y%m")
mom_size = mom_size.resample("M").last()

size_value = pd.read_csv("data/size_value.csv",index_col="Date")
size_value.index = pd.to_datetime(size_value.index,format="%Y%m")
size_value = size_value.resample("M").last()

# momentum
momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
momentum = momentum.resample("M").last().squeeze()
momentum.rename(level=0,index="Mom")

momentum = pd.read_csv("data/F-F_Momentum_Factor.csv",index_col="Date")
momentum.index = pd.to_datetime(momentum.index,format="%Y%m")
momentum = momentum.resample("M").last().squeeze()
# Value weighted market
vwm = pd.read_csv("data/VWM.csv",index_col="Date")
vwm.index = pd.to_datetime(vwm.index, format="%Y%m")
vwm = vwm.resample("M").last()

In [101]:

industries = industries["1927-01-31":]
momentum = momentum["1927-01-31":]
vwm = vwm["1927-01-31":]
hml = 0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"]) - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )
hml = hml["1927-01-31":]
momentum

Date
1927-01-31     0.44
1927-02-28    -1.32
1927-03-31     3.59
1927-04-30     4.17
1927-05-31     2.96
              ...  
2020-07-31     7.61
2020-08-31     0.51
2020-09-30     3.05
2020-10-31    -3.03
2020-11-30   -12.25
Freq: M, Name: Mom   , Length: 1127, dtype: float64

In [100]:
hml = 0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"]) - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )

In [88]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    # Use numpy arrays for simplicity
    y = np.asarray(y)
    x = np.asarray(x)
    n = y.shape[0]
    if random:
        # If randomisation is needed, use either the default or provided seed
        rg = np.random.default_rng(seed)
        # Generate a set of index values to use to randomly reorder the data
        # After randomisation, we can use the data as if it is is inorder!
        ind = rg.permutation(np.arange(n))
        y = y[ind]
        x = x[ind]
    # Compute the block size
    block = n / fold
    sse = 0.0
    for i in range(int(fold)):
        # Start and end of each block need to be integers or we lose an observation
        # Rounding ensures that we get all observations since int rounds down
        st = int(np.round(i * block))
        en = int(np.round((i + 1) * block))
        # Construct the indicies of the observations that we leave out
        leave_out = np.r_[st:en]
        # The included are the one that we don't leave out
        include = np.setdiff1d(np.arange(n), leave_out)
        # Compute the regression coefficients
        beta = lstsq(x[include], y[include], rcond=None)[0]
        # Compute the residuals and add to the sse
        resid = y[st:en]- x[st:en] @ beta
        sse += resid @ resid
    return sse

# Randomisation is better when we suspect the model may not be totally stable

In [91]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    included = []

    for i in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
                included.append(next_var)
    
    fsr_sse_sv = {}
    for i in range(1,p+1):
        fsr_sse_sv[i] = xval_5fold(y,x[included[:i]],random=random, fold=fold)
    fsr_sse_sv = pd.Series(fsr_sse_sv)
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model

In [102]:

nobs = hml.shape[0]

vwm_sse = np.zeros(())

# 20 year rolling regressions
for i in range(240,nobs-60,12): 
    # Select the in sample training data
    y = hml.iloc[i-240:i]
    y = y.squeeze()
    x = industries.iloc[i-240:i]
    t, p = x.shape
    
    #res = sm.OLS(y,x).fit(cov_type="HC0")  ## 

    # Form the out of sample data
    y_oos = (hml.iloc[i:i+60]).squeeze()
    x_oos = industries.iloc[i:i+60]

    # Evaluate a model selection procedure using previously defined function
    model = forward_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred = x_oos[model] @ res.params
    resid_oos = y_oos - pred
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    #vwm_sse[i] = oos_sse

    print(oos_sse)
    print(oos_tss)
    print(oos_r2)
    print(model)
    print()



112.90987520009811
458.34638016
0.7536581936990897
['Food ', 'Mines', 'Oil  ', 'Clths', 'Durbl', 'Chems', 'Cnsum', 'Cnstr', 'Steel', 'FabPr', 'Machn', 'Cars ', 'Trans', 'Utils', 'Rtail']

115.37361966846794
444.13448857749995
0.7402281907041415
['Food ', 'Mines', 'Oil  ', 'Clths', 'Durbl', 'Chems', 'Cnsum', 'Cnstr', 'Steel', 'FabPr', 'Machn', 'Cars ', 'Trans', 'Utils', 'Rtail', 'Finan', 'Other']

133.16285687854497
403.17618826999995
0.6697154724093771
['Food ', 'Mines', 'Oil  ', 'Clths', 'Durbl', 'Chems', 'Cnsum', 'Cnstr', 'Steel', 'FabPr', 'Machn', 'Cars ', 'Trans', 'Utils', 'Rtail']

151.09792807670325
476.260789005
0.6827411964936778
['Food ', 'Mines', 'Oil  ', 'Clths', 'Durbl', 'Chems', 'Cnsum', 'Cnstr', 'Steel', 'FabPr', 'Machn', 'Cars ', 'Trans', 'Utils', 'Rtail', 'Finan']

155.22427930818253
232.11126689
0.3312505619051014
['Food ', 'Mines', 'Oil  ', 'Clths', 'Durbl', 'Chems', 'Cnsum', 'Cnstr', 'Steel', 'FabPr', 'Machn', 'Cars ', 'Trans', 'Utils', 'Rtail', 'Finan']

182.6672208