In [1]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from numpy.linalg import lstsq
from itertools import combinations
from statsmodels.api import OLS
from sklearn.linear_model import LassoCV, RidgeCV
#from tqdm import tqdm
import statsmodels.api as sm
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
## Fetching data - using value weighted returns

def _read_monthly(path, start=None):
    """Read a CSV keyed by a ``Date`` column in YYYYMM form as a monthly frame.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a ``Date`` column.
    start : str, optional
        If given, rows before this date label are dropped.

    Returns
    -------
    pandas.DataFrame
        The file contents indexed by the parsed dates, resampled with the
        ``"M"`` rule (last observation of each month).
    """
    frame = pd.read_csv(path, index_col="Date")
    frame.index = pd.to_datetime(frame.index, format="%Y%m")
    frame = frame.resample("M").last()
    if start is not None:
        frame = frame[start:]
    return frame


# Every portfolio panel is trimmed to start on the same date.
SAMPLE_START = "1927-01-31"

industries = _read_monthly("data/17_Industry_Portfolios.csv", SAMPLE_START)
mom_size = _read_monthly("data/MOM-size.csv", SAMPLE_START)
size_value = _read_monthly("data/size_value.csv", SAMPLE_START)


#momentum = pdr.get_data_famafrench("F-F_Momentum_Factor_daily", start="1920", end="2020-12-31")[0]
#momentum = momentum.resample("M").last().squeeze()
#momentum.rename(level=0,index="Mom")

# The momentum factor file is not trimmed to SAMPLE_START; squeeze() collapses
# a single-column frame to a Series.
momentum = _read_monthly("data/F-F_Momentum_Factor.csv").squeeze()

# Value weighted market

vwm = _read_monthly("data/VWM.csv", SAMPLE_START)


In [3]:
# Long-short factor series: the average of the two "Hi" portfolios minus the
# average of the two "Lo" portfolios, for book-to-market (hml) and prior return (mom).
hml = (
    0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"])
    - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"])
)
mom = (
    0.5 * (mom_size["SMALL HiPRIOR"] + mom_size["BIG HiPRIOR"])
    - 0.5 * (mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"])
)

In [4]:
## Cross Validation function

def xval_5fold(y, x , random=False, seed = 20201231,fold=5):
    """Return the k-fold cross-validated sum of squared errors of a least-squares fit.

    Parameters
    ----------
    y : array-like
        Dependent variable; converted with ``np.asarray``.
    x : array-like
        Regressors with one row per observation; converted with ``np.asarray``.
    random : bool, default False
        When True, the rows of ``y`` and ``x`` are shuffled (jointly) before
        the folds are cut; otherwise folds are contiguous blocks in input order.
    seed : int, default 20201231
        Seed for the shuffle; only used when ``random`` is True.
    fold : int, default 5
        Number of folds.

    Returns
    -------
    float
        Sum over all folds of the squared out-of-fold residuals.
    """
    targets = np.asarray(y)
    features = np.asarray(x)
    nobs = targets.shape[0]

    if random:
        # A seeded permutation reorders the rows; afterwards the shuffled data
        # can be split into contiguous blocks exactly as in the ordered case.
        order = np.random.default_rng(seed).permutation(np.arange(nobs))
        targets, features = targets[order], features[order]

    # Fractional block width; rounding each boundary to an integer keeps every
    # observation in exactly one fold.
    width = nobs / fold
    edges = [int(np.round(k * width)) for k in range(int(fold) + 1)]

    every_row = np.arange(nobs)
    total = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Fit on everything outside the held-out block [lo, hi)
        train = np.setdiff1d(every_row, np.arange(lo, hi))
        coef = lstsq(features[train], targets[train], rcond=None)[0]
        # Score the fit on the held-out block
        err = targets[lo:hi] - features[lo:hi] @ coef
        total += err @ err
    return total

# Randomisation is better when we suspect the model may not be totally stable

In [5]:
# forward stepwise function: 
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def forward_stepwise(x, y, p,random=False,fold=5):
    """Forward stepwise selection with the model size chosen by cross-validation.

    Starting from an empty model, the column that gives the lowest in-sample
    SSE when added is included at each step. The nested models made of the
    first 1, 2, ... selected columns are then scored with ``xval_5fold`` and
    the size with the lowest cross-validated SSE is returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate regressors, one column per candidate.
    y : pandas.Series
        Dependent variable, row-aligned with ``x``.
    p : int
        Maximum number of columns to select. Values larger than the number of
        columns in ``x`` are treated as "all columns".
    random, fold
        Passed through to ``xval_5fold``.

    Returns
    -------
    list
        Selected column labels in order of inclusion; empty if no column
        could be selected.
    """
    # More steps than columns would leave nothing to choose from and, in the
    # absence of this cap, re-append the previously chosen column.
    p = min(p, len(x.columns))

    included = []
    for _ in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        next_var = None
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
        if next_var is None:
            # No candidate produced a comparable SSE (e.g. all NaN): stop
            # rather than reuse a column chosen in an earlier step.
            break
        included.append(next_var)

    if not included:
        return []

    # Cross-validated SSE of each nested model, keyed by model size
    fsr_sse_sv = pd.Series({
        i: xval_5fold(y, x[included[:i]], random=random, fold=fold)
        for i in range(1, len(included) + 1)
    })
    forward_step_model = included[:fsr_sse_sv.idxmin()]
    return forward_step_model

In [6]:
# Hybrid stepwise function
# requires x: x dataframe , y series, p = number of columns, requires xval function too
# input x, y, p
def hybrid_stepwise(x, y, p,random=False,fold=5):
    """Hybrid stepwise selection: forward ordering, backward pruning, CV choice.

    A forward pass orders the columns by greedy in-sample SSE reduction. For
    each prefix of that ordering, ``back_stepwise`` is run on the prefix to
    obtain a pruned model. The pruned models are then scored with
    ``xval_5fold`` and the one with the lowest cross-validated SSE is returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate regressors, one column per candidate.
    y : pandas.Series
        Dependent variable, row-aligned with ``x``.
    p : int
        Maximum number of columns considered in the forward pass. Values
        larger than the number of columns in ``x`` are treated as "all columns".
    random, fold
        Passed through to ``back_stepwise`` and ``xval_5fold``.

    Returns
    -------
    list
        Column labels of the selected model; empty if the forward pass could
        not select any column.
    """
    # More steps than columns would leave nothing to choose from and, in the
    # absence of this cap, re-append the previously chosen column.
    p = min(p, len(x.columns))

    included = []
    for _ in range(p):
        excluded = [col for col in x if col not in included]
        best_sse = np.inf
        next_var = None
        for col in excluded:
            try_x = x[included + [col]]
            beta = lstsq(try_x, y, rcond=None)[0]
            resid = y - try_x @ beta
            sse = resid @ resid
            if sse < best_sse:
                best_sse = sse
                next_var = col
        if next_var is None:
            # No candidate produced a comparable SSE (e.g. all NaN): stop
            # rather than reuse a column chosen in an earlier step.
            break
        included.append(next_var)

    if not included:
        return []

    sizes = range(1, len(included) + 1)

    # Backward-pruned model for each prefix of the forward ordering
    end_models = {}
    for i in sizes:
        regressors = x[included[:i]]
        end_models[i] = back_stepwise(regressors,y,p,random=random,fold=fold)

    # Cross-validated SSE of each pruned model, keyed by prefix length
    hsr_sse_sv = pd.Series({
        i: xval_5fold(y, x[end_models[i]], random=random, fold=fold)
        for i in sizes
    })
    hybrid_step_model = end_models[hsr_sse_sv.idxmin()]

    return hybrid_step_model

In [7]:
# Backward stepwise components

def back_stepwise(x,y,p,random=False,fold=5):
    """Backward stepwise selection with the model size chosen by cross-validation.

    Starting from all columns of ``x``, the column whose removal leaves the
    lowest in-sample SSE is dropped at each step until none remain. Reversing
    the drop order ranks the columns from last-dropped to first-dropped; the
    nested models built from that ranking are scored with ``xval_5fold`` and
    the size with the lowest cross-validated SSE is returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate regressors.
    y : pandas.Series
        Dependent variable, row-aligned with ``x``.
    p : int
        Ignored: it is overwritten with the number of columns in ``x``.
    random, fold
        Passed through to ``xval_5fold``.

    Returns
    -------
    list
        Column labels of the selected model, last-dropped column first.
    """
    remaining = list(x.columns)
    drop_order = []
    # The caller's p is replaced by the actual width of x
    p = x.shape[1]

    for _ in range(p):
        lowest = np.inf
        for candidate in remaining:
            # Fit the model on every remaining column except the candidate
            kept = list(remaining)
            kept.remove(candidate)
            trial = x[kept]
            coef = lstsq(trial, y, rcond=None)[0]
            err = y - trial @ coef
            trial_sse = err @ err
            if trial_sse < lowest:
                lowest = trial_sse
                next_drop = candidate
        drop_order.append(next_drop)
        remaining.remove(next_drop)

    # Last-dropped column first
    ranking = drop_order[::-1]

    # Cross-validated SSE of each nested model, keyed by model size
    cv_scores = pd.Series({
        size: xval_5fold(y, x[ranking[:size]], random=random, fold=fold)
        for size in range(1, p + 1)
    })
    return ranking[:cv_scores.idxmin()]

In [8]:
### LASSO function

def lasso_reg(x,y):
    """Fit a cross-validated LASSO (no intercept) and return original-scale coefficients.

    Both ``x`` and ``y`` are divided by their population standard deviation
    (``ddof=0``) before fitting; the fitted coefficients are then multiplied
    by ``y_scale / x_scale`` so they apply to the unscaled data.

    Parameters
    ----------
    x : pandas.DataFrame
        Regressors.
    y : pandas.Series
        Dependent variable.

    Returns
    -------
    Coefficients on the original scale, one per column of ``x``.
    """
    sd_x = x.std(ddof=0)
    sd_y = y.std(ddof=0)

    # Penalty strength is chosen by LassoCV's built-in cross-validation
    fitted = LassoCV(fit_intercept=False).fit(x / sd_x, y / sd_y)
    #print(f"Optimal Alpha = {fitted.alpha_}")

    # Undo the scaling
    rescale = sd_y / sd_x
    return fitted.coef_ * rescale


In [9]:
# General to Specific selection

def gts(x,y):
    """General-to-specific selection using heteroskedasticity-robust t-statistics.

    Starting from all columns of ``x``, an OLS model is fitted with HC0
    standard errors. While the smallest absolute t-statistic is below the
    two-sided 1% normal critical value, that regressor is dropped and the
    model is refitted.

    At least one regressor is always retained, so at most
    ``x.shape[1] - 1`` columns are removed. For a 17-column panel this gives
    the same limit of 16 removals as a fixed cap of 16, while also working
    for narrower or wider inputs.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate regressors.
    y : pandas.Series
        Dependent variable, row-aligned with ``x``.

    Returns
    -------
    list
        Labels of the retained columns, in their original order.
    """
    # Two-sided 1% critical value from the standard normal
    cv = stats.norm.ppf(.995)

    included = list(x.columns)
    while len(included) > 1:
        res = sm.OLS(y, x.loc[:, included]).fit(cov_type="HC0")
        abs_tstats = res.tvalues.abs()
        if abs_tstats.min() < cv:
            # Drop the least significant regressor and refit
            included.remove(abs_tstats.idxmin())
        else:
            break
    return included

In [10]:
# Ridge regression 

def ridge_reg(x,y):
    """Fit a cross-validated ridge regression (no intercept) in two passes.

    ``x`` and ``y`` are divided by their population standard deviation
    (``ddof=0``). A first RidgeCV searches penalties 1, 2, ..., 100; a second
    RidgeCV searches 2001 evenly spaced penalties within +/-20% of the first
    pass's choice. The second pass's coefficients are multiplied by
    ``y_scale / x_scale`` so they apply to the unscaled data.

    Parameters
    ----------
    x : pandas.DataFrame
        Regressors.
    y : pandas.Series
        Dependent variable.

    Returns
    -------
    Coefficients on the original scale, one per column of ``x``.
    """
    sd_x = x.std(ddof=0)
    sd_y = y.std(ddof=0)
    x_unit, y_unit = x / sd_x, y / sd_y

    # Coarse search over integer penalties
    coarse = RidgeCV(fit_intercept=False, alphas=np.linspace(1, 100, 100)).fit(x_unit, y_unit)
    #print(f"Optimal alpha = {coarse.alpha_}")
    centre = coarse.alpha_
    lower = centre - (1/5 * centre)
    upper = centre + (1/5 * centre)

    # Fine search around the coarse optimum
    fine = RidgeCV(fit_intercept=False, alphas=np.linspace(lower, upper, 2001)).fit(x_unit, y_unit)
    #print(f"Optimal alpha = {fine.alpha_}")
    return fine.coef_ * (sd_y / sd_x)


HML (Value) and Momentum, rolling
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [11]:
# Long-short factor series: the average of the two "Hi" portfolios minus the
# average of the two "Lo" portfolios, for book-to-market (hml) and prior return (mom).
hml = (
    0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"])
    - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"])
)
mom = (
    0.5 * (mom_size["SMALL HiPRIOR"] + mom_size["BIG HiPRIOR"])
    - (0.5 * (mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"]))
)

In [12]:

# Long-short factor series: average of the two "Hi" portfolios minus the
# average of the two "Lo" portfolios.
hml = 0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"]) - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )
mom = 0.5 * (mom_size["SMALL HiPRIOR"] + mom_size["BIG HiPRIOR"]) - (0.5 * (mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"]))

# Toggle to select the number of years of data trained on: 20, 10 or 5
years = 20
# Training-window length in months. Computed once, outside the loop, so the
# conversion is not re-applied on every iteration and both training slices
# below always use the same length.
window = 12 * years
# The loop starts at observation 240; a longer window would reach before the
# first row and iloc would silently wrap to the end of the sample.
assert 0 < window <= 240

nobs = industries.shape[0]

# rolling regressions: step forward one year (12 months) at a time
for i in range(240,nobs-60,12):

    # Select the in sample training data
    y = mom.iloc[i-window:i]              # ----  change to mom or hml as desired
    y = y.squeeze()
    x = industries.iloc[i-window:i]
    t, p = x.shape

    # Form the out of sample data: the 60 months following the training window
    y_oos = (mom.iloc[i:i+60]).squeeze()    # ----  change to mom or hml as desired
    x_oos = industries.iloc[i:i+60]

    #---------------------------------------------------------------------------------------------------------
    # Simple OLS Regression
    """
    res = sm.OLS(y,x).fit(cov_type="HC0")
    pred_ols = x_oos @ res.params
    resid_oos = y_oos - pred_ols
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Simple OLS")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Hybrid Stepwise
    """
    model = hybrid_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_hy = x_oos[model] @ res.params
    resid_oos = y_oos - pred_hy
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Hybrid Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """

      #---------------------------------------------------------------------------------------------------------
    # Forward Stepwise
    """
    model = forward_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_fwd = x_oos[model] @ res.params
    resid_oos = y_oos - pred_fwd
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    #print("Forward Stepwise")
    #print(oos_sse)
    #print(oos_tss)
    #print(oos_r2)
    #print(model)
    #print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on the forward stepwise
    """
    beta = lasso_reg(x[model],y)
    pred_fwd_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_fwd_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    

    
    #print("Lasso on Forward Stepwise")
    #print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso regression on all industries
    """
    beta = lasso_reg(x,y)
    pred_lasso = x_oos @ beta
    resid_oos = y_oos - pred_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Lasso")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

     #---------------------------------------------------------------------------------------------------------
    # Ridge selection on all industries
    """
    beta = ridge_reg(x,y)
    pred_ridge = x_oos @ beta
    resid_oos = y_oos - pred_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # General to Specific selection
    """
    model = gts(x,y)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    pred_gts = x_oos[model] @ res.params
    resid_oos = y_oos - pred_gts
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on general to specific
    """
    beta = lasso_reg(x[model],y)
    pred_gts_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    #print("Lasso on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Ridge on general to specific
    """
    beta = ridge_reg(x[model],y)
    pred_gts_ridge = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Mixed model
    """
    pred_mixed = (1/3)* pred_gts_ridge + (1/3) * pred_fwd_lasso + (1/3)* pred_hy
    resid_oos = y_oos - pred_mixed
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """
    
    """
    print("Naive Averaging")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """


    #---------------------------------------------------------------------------------------------------------------------------
    # Random Forest estimators
    """
    rfr = RandomForestRegressor(max_features="sqrt", random_state=20201231)
    rfr = rfr.fit(x, y)
    resid = y - rfr.predict(x)
    #print(f"The RandomForest SSE is {resid.T@resid:0.1f}")


    pred = rfr.predict(x_oos)
    resid = y_oos - pred
    rf_oos_sse = resid.T @ resid
    oos_tss = (y_oos **2).sum()
    print(rf_oos_sse)
    print(1 - rf_oos_sse/oos_tss)
    """

    # Boosting
    """
    gbr = GradientBoostingRegressor(random_state=20201231)
    gbr.fit(x, y)
    pred = gbr.predict(x_oos)
    resid = y_oos - pred
    oos_tss = (y_oos **2).sum()
    gbr_oos_sse = resid @ resid
    print(gbr_oos_sse)
    print(1 - gbr_oos_sse/oos_tss)
    print()
    """


HML (Value) and Momentum, positive side
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
# Positive side only: the average of the two "Hi" portfolios.
hml = 0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"])
mom = 0.5 * (mom_size["SMALL HiPRIOR"] + mom_size["BIG HiPRIOR"])

# Toggle to select the number of years of data trained on: 20, 10 or 5
years = 20
# Training-window length in months. Computed once, outside the loop, so the
# conversion is not re-applied on every iteration and both training slices
# below always use the same length.
window = 12 * years
# The loop starts at observation 240; a longer window would reach before the
# first row and iloc would silently wrap to the end of the sample.
assert 0 < window <= 240

nobs = industries.shape[0]
# rolling regressions: step forward one year (12 months) at a time
for i in range(240,nobs-60,12):

    # Select the in sample training data
    y = mom.iloc[i-window:i]                # ----  change to mom or hml as desired
    y = y.squeeze()
    x = industries.iloc[i-window:i]
    t, p = x.shape

    # Form the out of sample data: the 60 months following the training window
    y_oos = (mom.iloc[i:i+60]).squeeze()      # ----  change to mom or hml as desired
    x_oos = industries.iloc[i:i+60]

    #---------------------------------------------------------------------------------------------------------
    # Simple OLS Regression
    """
    res = sm.OLS(y,x).fit(cov_type="HC0")
    pred_ols = x_oos @ res.params
    resid_oos = y_oos - pred_ols
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    #print("Simple OLS")
    #print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """
    #---------------------------------------------------------------------------------------------------------
    # Hybrid Stepwise
    """
    model = hybrid_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_hy = x_oos[model] @ res.params
    resid_oos = y_oos - pred_hy
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Hybrid Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """
    #---------------------------------------------------------------------------------------------------------
    # Forward Stepwise
    """
    model = forward_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_fwd = x_oos[model] @ res.params
    resid_oos = y_oos - pred_fwd
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Forward Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on the forward stepwise
    """
    beta = lasso_reg(x[model],y)
    pred_fwd_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_fwd_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Lasso on Forward Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso regression on all industries
    """
    beta = lasso_reg(x,y)
    pred_lasso = x_oos @ beta
    resid_oos = y_oos - pred_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Lasso")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

     #---------------------------------------------------------------------------------------------------------
    # Ridge selection on all industries
    """
    beta = ridge_reg(x,y)
    pred_ridge = x_oos @ beta
    resid_oos = y_oos - pred_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # General to Specific selection
    """
    model = gts(x,y)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    pred_gts = x_oos[model] @ res.params
    resid_oos = y_oos - pred_gts
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on general to specific
    """
    beta = lasso_reg(x[model],y)
    pred_gts_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    #print("Lasso on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Ridge on general to specific
    """
    beta = ridge_reg(x[model],y)
    pred_gts_ridge = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Mixed model
    """
    pred_mixed = (1/3)* pred_gts_ridge + (1/3) * pred_fwd_lasso + (1/3)* pred_hy
    resid_oos = y_oos - pred_mixed
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """
    
    """
    print("Naive Averaging")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """



0.883200650916961
0.8972379576938263
0.8901139472896202
0.884685727337866
0.8425453153671465
0.872415177968377
0.8861294795141489
0.9192243061437586
0.9175337934552302
0.9289751747027283
0.9078039508192347
0.8886064933203105
0.8686424699255462
0.8530631072866139
0.853695029766153
0.7853607528321793
0.7500018934500536
0.7692065063152621
0.7816201365854005
0.8511772358771328
0.9168965367965052
0.9269753002379958
0.9101505227357185
0.864825216525838
0.863550321664373
0.8608275342080995
0.8432114529575225
0.8741392208560914
0.9116389749878926
0.8925353879019473
0.8815837083982165
0.8894471113578603
0.8961602830382152
0.8783511696908622
0.7872478737315775
0.727619612420244
0.8709366874901106
0.8684120421164127
0.8963964568902045
0.9198672101361179
0.96005670436959
0.9196152766790114
0.8957343833112954
0.8741427361132329
0.8397061052481627
0.8166104089868556
0.8740131582818693
0.9247250321602638
0.881317642804424
0.7252539694323676
0.6283495351523638
0.5691555772151418
0.48809495442367645
0.

HML (Value) and Momentum, negative side
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Negative side only: minus the average of the two "Lo" portfolios.
hml = -0.5 * (
    size_value["SMALL LoBM"] + size_value["BIG LoBM"]
)
mom = -0.5 * (
    mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"]
)

In [15]:
# Negative side only: minus the average of the two "Lo" portfolios.
hml = - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )
mom = - 0.5 * (mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"])

# Toggle to select the number of years of data trained on: 20, 10 or 5.
# This cell trains on 60 months, i.e. 5 years.
years = 5
# Training-window length in months. Computed once, outside the loop, so the
# conversion is not re-applied on every iteration and both training slices
# below always use the same length.
window = 12 * years
# The loop starts at observation 240; a longer window would reach before the
# first row and iloc would silently wrap to the end of the sample.
assert 0 < window <= 240

# NOTE(review): the row count is taken from hml while x is sliced from
# industries by position; this assumes both cover the same dates - confirm.
nobs = hml.shape[0]
# rolling regressions: step forward one year (12 months) at a time
for i in range(240,nobs-60,12):

    # Select the in sample training data
    y = mom.iloc[i-window:i]                # ----  change to mom or hml as desired
    y = y.squeeze()
    x = industries.iloc[i-window:i]
    t, p = x.shape

    # Form the out of sample data: the 60 months following the training window
    y_oos = (mom.iloc[i:i+60]).squeeze()      # ----  change to mom or hml as desired
    x_oos = industries.iloc[i:i+60]

    #---------------------------------------------------------------------------------------------------------
    # Simple OLS Regression
    """
    res = sm.OLS(y,x).fit(cov_type="HC0")
    pred_ols = x_oos @ res.params
    resid_oos = y_oos - pred_ols
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Simple OLS")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """
    #---------------------------------------------------------------------------------------------------------
    # Hybrid Stepwise
    """
    model = hybrid_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_hy = x_oos[model] @ res.params
    resid_oos = y_oos - pred_hy
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Hybrid Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """
    #---------------------------------------------------------------------------------------------------------
    # Forward Stepwise
    """
    model = forward_stepwise(x, y, p)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    
    pred_fwd = x_oos[model] @ res.params
    resid_oos = y_oos - pred_fwd
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Forward Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on the forward stepwise
    """
    beta = lasso_reg(x[model],y)
    pred_fwd_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_fwd_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Lasso on Forward Stepwise")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso regression on all industries
    """
    beta = lasso_reg(x,y)
    pred_lasso = x_oos @ beta
    resid_oos = y_oos - pred_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Lasso")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

     #---------------------------------------------------------------------------------------------------------
    # Ridge selection on all industries
    """
    beta = ridge_reg(x,y)
    pred_ridge = x_oos @ beta
    resid_oos = y_oos - pred_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # General to Specific selection
    """
    model = gts(x,y)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    pred_gts = x_oos[model] @ res.params
    resid_oos = y_oos - pred_gts
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Lasso on general to specific
    """
    beta = lasso_reg(x[model],y)
    pred_gts_lasso = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_lasso
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    #print("Lasso on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Ridge on general to specific
    """
    beta = ridge_reg(x[model],y)
    pred_gts_ridge = x_oos[model] @ beta
    resid_oos = y_oos - pred_gts_ridge
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    """

    """
    print("Ridge on General to Specific")
    print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    print()
    """

    #---------------------------------------------------------------------------------------------------------
    # Mixed model
    """
    pred_mixed = (1/3)* pred_gts_ridge + (1/3) * pred_fwd_lasso + (1/3)* pred_hy
    resid_oos = y_oos - pred_mixed
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss
    
    
    
    #print("Naive Averaging")
    #print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()
    """

0.8161281795502736
0.8757602644099745
0.8506839903363462
0.7125482600059485
0.7727255253864111
0.7665133090524834
0.738534173675979
0.8287456140916042
0.7854712955761592
0.8446688789086685
0.8144249576941711
0.8352643301793339
0.8349728748074635
0.8210486877650638
0.8015387096214829
0.7800115290318218
0.7435672536375699
0.8043754857680353
0.8824650094991794
0.8622883626228227
0.8419051556548526
0.8151074443170211
0.8007109862021045
0.8544839464906604
0.9050862645994229
0.887208870129436
0.8641567247727611
0.832711150951174
0.8200781904008588
0.7000461437425995
0.5847240606783506
0.6989353883118707
0.7709585682374388
0.7815115657212801
0.7533925399359377
0.6561766443837838
0.8026741610674444
0.7955174557527056
0.718342623531919
0.7486794844973563
0.7419383258442172
0.7658329404172741
0.7027439678340663
0.7573199929478349
0.5954610177322726
0.4220659518621437
0.6648930849761645
0.7074923613899607
0.6083100653046793
0.7118354015129911
0.6649383168835343
0.7171019844165749
0.71283316528991

--------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------
START: Combining neg and pos predictions with general to specific
---------------------------

Because the correlations differ between the positive and negative weights, we cannot simply combine the up and down weights to create a better tracking model.

In [65]:


"""Toggle to select the number of years data trained on, 20, 10 or 5"""
#years = 10

# Series that do not change between windows are built once, before the loop.

# Unscaled sums of the two "Lo" and the two "Hi" book-to-market portfolios;
# the 0.5 weights are applied to the predictions further down.
hml_low = (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )
hml_high = (size_value["SMALL HiBM"] + size_value["BIG HiBM"])

# Long-short targets used out of sample. Defined here so that nobs no longer
# depends on whatever hml an earlier cell happened to leave behind.
hml = 0.5 * (size_value["SMALL HiBM"] + size_value["BIG HiBM"]) - 0.5 * (size_value["SMALL LoBM"] + size_value["BIG LoBM"] )
mom = 0.5 * (mom_size["SMALL HiPRIOR"] + mom_size["BIG HiPRIOR"]) - (0.5 * (mom_size["SMALL LoPRIOR"] + mom_size["BIG LoPRIOR"]))

nobs = hml.shape[0]
# rolling regressions: 60 months (5 years) of training data, 60 months out of sample
# NOTE(review): this loop stops at nobs-72 whereas the other rolling cells
# stop at nobs-60 - confirm which bound is intended.
for i in range(240,nobs-72,12):

    # The regressors are shared by both sides
    x = industries.iloc[i-60:i]
    x_oos = industries.iloc[i:i+60]
    t, p = x.shape

    # negative ----------------------------------------------------------------
    # Select the in sample training data
    y = hml_low.iloc[i-60:i]                # ----  change to mom or hml as desired
    y = y.squeeze()

    # General to Specific selection
    model = gts(x,y)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    pred_gts_neg = x_oos[model] @ res.params
    #print(pred_gts_neg)

    # positive --------------------------------------------
    # Select the in sample training data
    y = hml_high.iloc[i-60:i]               # ----  change to mom or hml as desired
    y = y.squeeze()

    # General to Specific selection
    model = gts(x,y)
    res = sm.OLS(y,x[model]).fit(cov_type="HC0")
    pred_gts_pos = x_oos[model] @ res.params
    #print(pred_gts_pos)
    #print()

    # Combine the two sides with the same +0.5 / -0.5 weights as the factor itself
    comb_pred = 0.5 * pred_gts_pos - 0.5* pred_gts_neg
    #print(comb_pred)

    # ---- out of sample
    # Form the out of sample data
    y_oos = (hml.iloc[i:i+60]).squeeze()      # ----  change to mom or hml as desired

    resid_oos = y_oos - comb_pred
    #print(comb_pred)

    # Out-of-sample R2 measured against an uncentred total sum of squares
    oos_sse = resid_oos @ resid_oos
    oos_tss = (y_oos **2).sum()
    oos_r2 = 1- oos_sse/ oos_tss

    #print("General to Specific")
    #print(oos_sse)
    #print(oos_tss)
    print(oos_r2)
    #print(model)
    #print()


0.05105613761589234
0.6388538039972356
0.7795557214982296
0.8129354376110102
0.3800085826460615
0.03519036963175848
0.23006086055018837
0.056627801406854084
-0.7473198576749822
-0.017725476999929768
0.0024685513713508245
-0.7808991578877411
-0.3762349195489496
-0.1178921462140301
0.286820228128721
0.09913790800544242
-0.5369825152350423
-0.07235714328728493
-0.10125384752585287
-0.07447389773022617
0.07940125663493236
-0.06031984368714771
-0.18305301656691042
0.3428147909402188
0.35074994790651237
0.21567954281745305
0.06909932050871304
-0.43319228169310864
-0.42691447922422543
0.013014167179637437
-0.21807078815500813
-0.11826792141399256
-0.492480409057525
-0.08270676993014825
0.06859618719870386
0.007977715244798178
-0.0022000187504802593
0.3946452240168502
0.25900973215067613
-0.04796028680732034
0.28279344146011265
-0.21784487403716102
0.1320955645117562
0.4949806364287699
0.5018821799329805
0.33135590947928495
0.3504077029853043
0.512771672002621
0.4325280547295802
0.647390217518