In [3]:
import numpy as np
import pandas as pd
import math
import sklearn.preprocessing
import datetime
from TimeBasedCV import TimeBasedCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import make_scorer, r2_score
import statsmodels.api as sm
import warnings
import pickle

warnings.simplefilter(action='ignore', category=Warning)
from sklearn.linear_model import HuberRegressor
# pd.set_option('display.max_rows', None)
# more

In [4]:
# df = pd.read_csv('data/factors_1965.csv', parse_dates=['DATE'])

In [5]:
# with open('data/features_1965.pkl', 'wb') as f:
#     pickle.dump(df, f)

with open('data/features_1965.pkl', 'rb') as f:
    df = pickle.load(f)
    print(df.head())



   permno       DATE        mvel1      beta    betasq     chmom     dolvol  \
0   10145 1965-02-26   1498872.00  0.983510  0.967291  0.105988  11.546907   
1   10401 1965-02-26  35392058.00  0.780829  0.609694 -0.063768  12.240330   
2   10786 1965-02-26   1695284.75  0.806119  0.649827 -0.130519  12.005040   
3   10989 1965-02-26   1295887.75  1.199748  1.439395  0.073609  11.756961   
4   11260 1965-02-26   2302001.25  1.257269  1.580725 -0.167320  12.240330   

    idiovol    indmom     mom1m  ...  macro_ep  macro_bm  macro_ntis  \
0  0.022307  0.035075  0.104116  ...  2.936836  0.471399    0.014823   
1  0.013395  0.335139 -0.007326  ...  2.936836  0.471399    0.014823   
2  0.024366  0.104106  0.060498  ...  2.936836  0.471399    0.014823   
3  0.022717  0.118513  0.068807  ...  2.936836  0.471399    0.014823   
4  0.035883  0.185424 -0.036885  ...  2.936836  0.471399    0.014823   

   macro_tbl  macro_tms  macro_dfy  macro_svar  macro_mkt-rf  macro_hml  \
0     0.0393    -0.0379

In [6]:
#Sort observations by date and stock id
df[df.columns[2:]] = df[df.columns[2:]].astype('float32')
df = df.sort_values(by = ['DATE', 'permno'], ascending = True)

In [7]:
df['permno2'] = df['permno'].copy()
df['DATE2'] = df['DATE'].copy()
df['mvel12'] = df['mvel1'].copy()
df = df.set_index(['DATE2','permno2'])


In [8]:
p=0.3 
df_top= df.groupby('DATE').apply(lambda x: x.nlargest(int(len(x)*p),'mvel1')).reset_index(drop=True)  
df_bottom = df.groupby('DATE').apply(lambda x: x.nsmallest(int(len(x)*p),'mvel1')).reset_index(drop=True)  




In [9]:

def calculate_r2(y_true, y_pred, in_sample=True, benchmark=None):
    if in_sample:
        return 1 - (np.sum((y_true - y_pred) ** 2) / 
                    np.sum((y_true - np.mean(y_true)) ** 2))
    else:
        if benchmark is None:
            raise ValueError("Benchmark must be provided for out-of-sample R-squared calculation.")
        return 1 - (np.sum((y_true - y_pred) ** 2) / 
                    np.sum((y_true - benchmark) ** 2))

In [10]:
features = df.columns[~df.columns.isin(['DATE', 'DATE2', "mvel2",'sic2' ,'permno',"permno2",'risk_premium'])].tolist()
df[features]=df.groupby('DATE')[features].rank(pct=True)

df[features] = 2*df[features] - 1


In [11]:
tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df[features]
y = df[['risk_premium']]

predictions = []
y_test_list =[]
dates = []
dic_r2_all = {}


for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):


    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]
    
    #OLS regression with huber loss function
    reg_huber = HuberRegressor(max_iter=1000)
    #No hyperparameters in OLS --> use df and validation set for training
    reg_huber.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    #Use test set to generate predictions
    preds = reg_huber.predict(X_test) 
    #Save predictions, dates and the true values of the dependent variable to list  
    predictions.append(preds)
    dates.append(y_test.index)
    y_test_list.append(y_test)
    
    #Calculate OOS model performance the for current window
    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    #Save OOS model performance and the respective month to dictionary
    dic_r2_all["r2." + str(y_test.index)] = r2
   
    
    
#Concatenate to get results over the whole OOS test period (Jan 2010-Dec 2019)
predictions_all= np.concatenate(predictions, axis=0)
y_test_list_all= np.concatenate(y_test_list, axis=0) 
dates_all= np.concatenate(dates, axis=0)

#Calculate OOS model performance over the entire test period in line with Gu et al (2020)
# R2OOS_LR = 1-np.sum(pow(y_test_list_all-predictions_all,2))/np.sum(pow(y_test_list_all,2))
# print("R2OOS Hubber Regression: ", R2OOS_LR)
R2FULL = r2_score(y_test_list_all, predictions_all)


Train period: 1976-01-31 - 1981-01-31 ,val period: 1981-01-31 - 1983-01-31 , Test period 1983-01-31 - 1984-01-31 # train records 11430 ,# val records 6471 , # test records 4416
Train period: 1977-01-31 - 1982-01-31 ,val period: 1982-01-31 - 1984-01-31 , Test period 1984-01-31 - 1985-01-31 # train records 13241 ,# val records 7309 , # test records 4368
Train period: 1978-01-31 - 1983-01-31 ,val period: 1983-01-31 - 1985-01-31 , Test period 1985-01-31 - 1986-01-31 # train records 14193 ,# val records 8784 , # test records 4870
Train period: 1979-01-31 - 1984-01-31 ,val period: 1984-01-31 - 1986-01-31 , Test period 1986-01-31 - 1987-01-31 # train records 16579 ,# val records 9238 , # test records 6416
Train period: 1980-01-31 - 1985-01-31 ,val period: 1985-01-31 - 1987-01-31 , Test period 1987-01-31 - 1988-01-31 # train records 18589 ,# val records 11286 , # test records 6641
Train period: 1981-01-31 - 1986-01-31 ,val period: 1986-01-31 - 1988-01-31 , Test period 1988-01-31 - 1989-01-31 #

KeyboardInterrupt: 

In [None]:

tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12','sic2' , 'DATE2', 'risk_premium'])].tolist()

X = df_top[features]
y = df_top[['risk_premium']]

predictions_top = []
y_test_list_top =[]
dates_top = []
dic_r2_all_top = {}

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):

    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    reg_huber = HuberRegressor(max_iter=1000)
    reg_huber.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))


    preds = reg_huber.predict(X_test)    
    predictions_top.append(preds)
    dates_top.append(y_test.index)
    y_test_list_top.append(y_test)
    
    r2_top = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    dic_r2_all_top["r2." + str(y_test.index)] = r2

predictions_all_top= np.concatenate(predictions_top, axis=0)
y_test_list_all_top= np.concatenate(y_test_list_top, axis=0) 
dates_all_top= np.concatenate(dates_top, axis=0)

R2TOP = r2_score(y_test_list_all_top, predictions_all_top)

Train period: 1976-01-31 - 1981-01-31 ,val period: 1981-01-31 - 1983-01-31 , Test period 1983-01-31 - 1984-01-31 # train records 3405 ,# val records 1930 , # test records 1320
Train period: 1977-01-31 - 1982-01-31 ,val period: 1982-01-31 - 1984-01-31 , Test period 1984-01-31 - 1985-01-31 # train records 3947 ,# val records 2182 , # test records 1304
Train period: 1978-01-31 - 1983-01-31 ,val period: 1983-01-31 - 1985-01-31 , Test period 1985-01-31 - 1986-01-31 # train records 4232 ,# val records 2624 , # test records 1458
Train period: 1979-01-31 - 1984-01-31 ,val period: 1984-01-31 - 1986-01-31 , Test period 1986-01-31 - 1987-01-31 # train records 4949 ,# val records 2762 , # test records 1919
Train period: 1980-01-31 - 1985-01-31 ,val period: 1985-01-31 - 1987-01-31 , Test period 1987-01-31 - 1988-01-31 # train records 5549 ,# val records 3377 , # test records 1987
Train period: 1981-01-31 - 1986-01-31 ,val period: 1986-01-31 - 1988-01-31 , Test period 1988-01-31 - 1989-01-31 # train

In [None]:
tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')


features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12','sic2' , 'DATE2', 'risk_premium'])].tolist()
X = df_bottom[features]
y = df_bottom[['risk_premium']]

#Empty containers to save results from each window

predictions_bottom = []
y_test_list_bottom =[]
dates_bottom = []
dic_r2_all_bottom = {}

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):

    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    reg_huber = HuberRegressor(max_iter=1000)
    reg_huber.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))


    preds = reg_huber.predict(X_test)    
    predictions_bottom.append(preds)
    dates_bottom.append(y_test.index)
    y_test_list_bottom.append(y_test)
    
    r2_bottom = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    dic_r2_all_bottom["r2." + str(y_test.index)] = r2

predictions_all_bottom= np.concatenate(predictions_bottom, axis=0)
y_test_list_all_bottom= np.concatenate(y_test_list_bottom, axis=0) 
dates_all_bottom= np.concatenate(dates_bottom, axis=0)

R2BOTTOM = r2_score(y_test_list_all_bottom, predictions_all_bottom)

Train period: 1976-01-31 - 1981-01-31 ,val period: 1981-01-31 - 1983-01-31 , Test period 1983-01-31 - 1984-01-31 # train records 3405 ,# val records 1930 , # test records 1320
Train period: 1977-01-31 - 1982-01-31 ,val period: 1982-01-31 - 1984-01-31 , Test period 1984-01-31 - 1985-01-31 # train records 3947 ,# val records 2182 , # test records 1304
Train period: 1978-01-31 - 1983-01-31 ,val period: 1983-01-31 - 1985-01-31 , Test period 1985-01-31 - 1986-01-31 # train records 4232 ,# val records 2624 , # test records 1458
Train period: 1979-01-31 - 1984-01-31 ,val period: 1984-01-31 - 1986-01-31 , Test period 1986-01-31 - 1987-01-31 # train records 4949 ,# val records 2762 , # test records 1919
Train period: 1980-01-31 - 1985-01-31 ,val period: 1985-01-31 - 1987-01-31 , Test period 1987-01-31 - 1988-01-31 # train records 5549 ,# val records 3377 , # test records 1987
Train period: 1981-01-31 - 1986-01-31 ,val period: 1986-01-31 - 1988-01-31 , Test period 1988-01-31 - 1989-01-31 # train

In [None]:
chart = np.array([[R2FULL],
                  [R2TOP],
                  [R2BOTTOM]])

huber = pd.DataFrame(chart, columns=['Huber Regression'],
                     index=['Full Sample', 'Large Firms', 'Small Firms'])

huber

Unnamed: 0,Huber Regression
Full Sample,-0.006539
Large Firms,-0.054777
Small Firms,2.6e-05


In [23]:
huber.to_csv(r'huber_model.csv', index=False)

In [37]:
features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12','sic2' , 'DATE2', 'DATE', 'risk_premium'])].tolist()
df['year'] = df['DATE'].dt.year

X_train = df[features].loc[(df["year"]>=2013) & (df["year"]<=2018)]
y_train = df["risk_premium"].loc[(df["year"]>=2013) & (df["year"]<=2018)]

X_val = df[features].loc[(df["year"]>=2018) & (df["year"]<=2020)]
y_val = df["risk_premium"].loc[(df["year"]>=2018) & (df["year"]<=2020)]


reg_huber = HuberRegressor(max_iter=1000)
reg_huber.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
preds = reg_huber.predict(np.concatenate((X_train, X_val))) 

R2OOS_all = 1-np.sum(pow(np.concatenate((y_train, y_val))-preds,2))/np.sum(pow(np.concatenate((y_train, y_val)),2))
print(R2OOS_all)


0.0008624898788281987


: 

In [24]:
for j in features:
    globals()['df_' + str(j)] =  df.copy()
    globals()['df_' + str(j)][str(j)] = 0

In [28]:
dic = {}

    
for j in features:
    df_var = globals()['df_' + str(j)]
    
    X_train = df[features].loc[(df["year"]>=2013) & (df["year"]<=2018)]
    y_train = df["risk_premium"].loc[(df["year"]>=2013) & (df["year"]<=2018)]

    X_val = df[features].loc[(df["year"]>=2018) & (df["year"]<=2020)]
    y_val = df["risk_premium"].loc[(df["year"]>=2018) & (df["year"]<=2020)]
    
    reg_huber = HuberRegressor()
    reg_huber.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    kpreds = reg_huber.predict(np.concatenate((X_train, X_val))) 

    R2OOS_var = 1-np.sum(pow(np.concatenate((y_train, y_val))-preds,2))/np.sum(pow(np.concatenate((y_train, y_val)),2))
    dic['R2OOS_' + str(j)] = R2OOS_var

In [29]:
dic

{'R2OOS_mvel1': np.float64(0.00011999259471884294),
 'R2OOS_beta': np.float64(0.00011999259471884294),
 'R2OOS_betasq': np.float64(0.00011999259471884294),
 'R2OOS_chmom': np.float64(0.00011999259471884294),
 'R2OOS_dolvol': np.float64(0.00011999259471884294),
 'R2OOS_idiovol': np.float64(0.00011999259471884294),
 'R2OOS_indmom': np.float64(0.00011999259471884294),
 'R2OOS_mom1m': np.float64(0.00011999259471884294),
 'R2OOS_mom6m': np.float64(0.00011999259471884294),
 'R2OOS_mom12m': np.float64(0.00011999259471884294),
 'R2OOS_mom36m': np.float64(0.00011999259471884294),
 'R2OOS_pricedelay': np.float64(0.00011999259471884294),
 'R2OOS_turn': np.float64(0.00011999259471884294),
 'R2OOS_absacc': np.float64(0.00011999259471884294),
 'R2OOS_acc': np.float64(0.00011999259471884294),
 'R2OOS_age': np.float64(0.00011999259471884294),
 'R2OOS_agr': np.float64(0.00011999259471884294),
 'R2OOS_bm': np.float64(0.00011999259471884294),
 'R2OOS_bm_ia': np.float64(0.00011999259471884294),
 'R2OOS_ca

In [31]:
pd.DataFrame(dic.items())
imp=pd.DataFrame(dic.items(), columns=['Feature', 'R2OOS'])
# Feature: name of the variable whose values are set to zero
imp["Feature"] = imp["Feature"].str[6:]

# Calculate reduction in predictive R2OOS 
imp["red_R2OOS"] = R2OOS_all -imp["R2OOS"]
imp["var_imp"] = imp["red_R2OOS"]/np.sum(imp["red_R2OOS"])
imp=imp.sort_values(by = ['var_imp'], ascending = False)
imp.head()

Unnamed: 0,Feature,R2OOS,red_R2OOS,var_imp
0,mvel1,0.00012,0.0,
1,beta,0.00012,0.0,
2,betasq,0.00012,0.0,
3,chmom,0.00012,0.0,
4,dolvol,0.00012,0.0,


In [None]:
yhat = predictions_all_full.tolist()
y_true = y_test_list_all_full.tolist()
i = dates_all_full.tolist()

results = pd.DataFrame(
    {'identifier': i,
     'yhat': yhat,
     'y_true': y_true
    })


results["identifier"]= results["identifier"].astype("str")
results["date"] = results["identifier"].str[12:22]
results["id"] = results["identifier"].str[35:40]
results.drop(["identifier"],axis = 1, inplace=True)
results['date'] = pd.to_datetime(results['date'], format='%Y-%m-%d')
results['MonthYear'] = results['date'].dt.to_period('M')
results = results.sort_values(by = ['date', 'id'], ascending = True)
results = results.set_index(['MonthYear','id'])
results.head()


In [None]:
# results['yhat'] = results['yhat'].apply(lambda x: x[0])
results['y_true'] = results['y_true'].apply(lambda x: x[0])

In [None]:
data = df[['mvel12', 'macro_tbl', 'macro_svar']].copy()
data.reset_index(inplace=True)
data['permno2'] = data['permno2'].astype('str')
data['MonthYear'] = data['DATE2'].dt.to_period('M')
data.drop('DATE2', axis=1, inplace=True)
data.rename(columns={'permno2': 'id'}, inplace=True)
data.rename(columns={'mvel12': 'market_cap'}, inplace=True)
data.rename(columns={'macro_tbl': 'risk_free_rate'}, inplace=True)
data = data.set_index(['MonthYear','id'])


In [None]:
bigdata = pd.merge(results, data,left_index=True, right_index=True)
bigdata.reset_index(inplace=True)
bigdata

In [None]:
bigdata['returns'] = bigdata['y_true'] + bigdata['risk_free_rate']
bigdata

In [None]:
bigdata['MonthYear1'] = bigdata['MonthYear'].copy()
bigdata['MonthYear'] = bigdata['MonthYear'].astype('int64')
bigdata['NumMonth'] = bigdata['MonthYear'] - 179
bigdata['NumMonth'].unique()

In [None]:
for i in bigdata['NumMonth'].unique():
    globals()['df_' + str(i)] = bigdata[bigdata['NumMonth'] == i]

for i in bigdata["NumMonth"].unique():
    globals()['df_' + str(i)]["rank"]= globals()['df_' + str(i)]['yhat'].rank(method='first')
    
for i in bigdata["NumMonth"].unique():
    globals()['df_' + str(i)]["DecileRank"]=pd.qcut(globals()['df_' + str(i)]['rank'].values, 10, labels = False)

#Drop normal rank, retain only decile ranks 
for i in bigdata["NumMonth"].unique():
     globals()['df_' + str(i)].drop('rank', axis=1, inplace=True)

In [None]:

for i in bigdata["NumMonth"].unique():
    for j,g in globals()['df_' + str(i)].groupby('DecileRank'):
        globals()['df_' + str(i)+ "_" + str(j)] =  g

for j in np.arange(0,10,1):
    globals()['rank_' + str(j)] = pd.concat([globals()['df_1_'+ str(j)], globals()['df_2_'+ str(j)]], axis=0)
    
# Generate 10 Dataframes for the 10 Decile portfolios 0-9: rank_9: top portfolio, rank_0: bottom portfolio
for i in np.arange(2,361,1):
    for j in np.arange(0,10,1):
        globals()['rank_' + str(j)] = pd.concat([globals()['rank_' + str(j)], globals()['df_' + str(i+1)+ "_" + str(j)]], axis = 0)

In [None]:
rank_9

In [None]:
# Get equal und value weights per stock per month in each decile portfolio:
# ew = Equally weighted
# vw = Value weighted

for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]["eq_weights"] = 1/globals()['rank_' + str(j)].groupby('MonthYear')["id"].transform('size')
    globals()['rank_' + str(j)]["me_weights"] = globals()['rank_' + str(j)]["market_cap"]/globals()['rank_' + str(j)].groupby('MonthYear')["market_cap"].transform('sum')

In [None]:
# Weighted excess return per stock in t+1
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['excess_return_stock_ew'] = globals()['rank_' + str(j)]["y_true"]*globals()['rank_' + str(j)]["eq_weights"]
    globals()['rank_' + str(j)]['excess_return_stock_vw'] = globals()['rank_' + str(j)]["y_true"]*globals()['rank_' + str(j)]["me_weights"]

In [None]:
# weighted return per stock in t+1 (to use for the sharpe ratio)
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['return_stock_ew'] = globals()['rank_' + str(j)]["returns"]*globals()['rank_' + str(j)]["eq_weights"]
    globals()['rank_' + str(j)]['return_stock_vw'] = globals()['rank_' + str(j)]["returns"]*globals()['rank_' + str(j)]["me_weights"]

In [None]:

# Portfolio excess return in t+1
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['excess_return_portfolio_ew'] = globals()['rank_' + str(j)].groupby('MonthYear')["excess_return_stock_ew"].transform('sum')
    globals()['rank_' + str(j)]['excess_return_portfolio_vw'] = globals()['rank_' + str(j)].groupby('MonthYear')["excess_return_stock_vw"].transform('sum')

In [None]:
# Portfolio return in t+1 (to use for the sharpe ratio) 
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['return_portfolio_ew'] = globals()['rank_' + str(j)].groupby('MonthYear')["return_stock_ew"].transform('sum')
    globals()['rank_' + str(j)]['return_portfolio_vw'] = globals()['rank_' + str(j)].groupby('MonthYear')["return_stock_vw"].transform('sum')

In [None]:

# Weighted predicted excess return per stock in t+1
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['pred_excess_return_stock_ew'] = globals()['rank_' + str(j)]["yhat"]*globals()['rank_' + str(j)]["eq_weights"]
    globals()['rank_' + str(j)]['pred_excess_return_stock_vw'] = globals()['rank_' + str(j)]["yhat"]*globals()['rank_' + str(j)]["me_weights"]

In [None]:
# Portfolio predicted excess return in t+1
for j in np.arange(0,10,1):
    globals()['rank_' + str(j)]['pred_excess_return_portfolio_ew'] = globals()['rank_' + str(j)].groupby('MonthYear')["pred_excess_return_stock_ew"].transform('sum')
    globals()['rank_' + str(j)]['pred_excess_return_portfolio_vw'] = globals()['rank_' + str(j)].groupby('MonthYear')["pred_excess_return_stock_vw"].transform('sum')

In [None]:
# Generate dataframes, containing the portfolio returns on mohtly basis for each decile portfolio
# e.g., montly_rank_0: dataframe, containing only the monthly portfolio excess returns (predicted and real) 
# for  the bottom rank

for j in np.arange(0,10,1):
    globals()['montly_rank_' + str(j)] = globals()['rank_' + str(j)][["MonthYear1", "DecileRank",
                                                                      "excess_return_portfolio_ew",
                                                                      "excess_return_portfolio_vw", 
                                                                      "pred_excess_return_portfolio_ew",
                                                                      "pred_excess_return_portfolio_vw",
                                                                      "return_portfolio_ew",
                                                                      "return_portfolio_vw"]]
    
for j in np.arange(0,10,1):
    globals()['montly_rank_' + str(j)]=globals()['montly_rank_' + str(j)].drop_duplicates()
    globals()['montly_rank_' + str(j)]=globals()['montly_rank_' + str(j)].set_index("MonthYear1")

In [None]:
for j in np.arange(0,10,1):
    #Time-series average of realized excess returns
    globals()["ew_mean_return_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["excess_return_portfolio_ew"].mean()
    globals()["vw_mean_return_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["excess_return_portfolio_vw"].mean()
    #Time-series average of predicted excess returns
    globals()["ew_mean_pred_return_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["pred_excess_return_portfolio_ew"].mean()
    globals()["vw_mean_pred_return_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["pred_excess_return_portfolio_vw"].mean()
    #Standard deviation of realized excess returns
    globals()["std_ew_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["excess_return_portfolio_ew"].std()
    globals()["std_vw_rank_" +  str(j)]= globals()['montly_rank_' + str(j)]["excess_return_portfolio_vw"].std()
    #Annualized sharpe ratio of realized excess returns
    globals()["sharpe_ew_rank_" +  str(j)]= (globals()['montly_rank_' + str(j)]["excess_return_portfolio_ew"].mean()/globals()['montly_rank_' + str(j)]["return_portfolio_ew"].std())* np.sqrt(12)
    globals()["sharpe_vw_rank_" +  str(j)]= (globals()['montly_rank_' + str(j)]["excess_return_portfolio_vw"].mean()/globals()['montly_rank_' + str(j)]["return_portfolio_vw"].std())* np.sqrt(12)

In [None]:
# For the zero-net-investment long-short portfolio the top (long) and bottom(short) decile portfolios are needed

long_monthly = rank_9[["NumMonth","MonthYear1", "DecileRank", "excess_return_portfolio_ew",
                       "excess_return_portfolio_vw","pred_excess_return_portfolio_ew",
                       "pred_excess_return_portfolio_vw","return_portfolio_ew",
                        "return_portfolio_vw"]].drop_duplicates()

short_monthly = rank_0[["NumMonth","MonthYear1", "DecileRank", "excess_return_portfolio_ew",
                       "excess_return_portfolio_vw","pred_excess_return_portfolio_ew",
                       "pred_excess_return_portfolio_vw","return_portfolio_ew",
                        "return_portfolio_vw"]].drop_duplicates()

# Create a column, indication the stategy 
long_monthly["Strategy"]= "long"
short_monthly["Strategy"]= "short"

# Merge to get the zero net investment portfolio
zeronet_monthly= pd.concat([long_monthly, short_monthly])
zeronet_monthly = zeronet_monthly.sort_values(by = ['NumMonth',"Strategy"])
zeronet_monthly["return_portfolio_vw"] = zeronet_monthly["return_portfolio_vw"].astype('float64')

#Create two new columns containing the exess return of the portfolio and initially set the values to zero.
zeronet_monthly["excess_return_zeronet_ew"] =0
zeronet_monthly["excess_return_zeronet_vw"] =0

# excess return zeronet in t = (weigted excess return long in t) - (weigted excess return short in t)
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i, 3]-zeronet_monthly.iloc[i+1, 3]
    else:
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i-1, 3]-zeronet_monthly.iloc[i, 3]
        
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i, 4]-zeronet_monthly.iloc[i+1, 4]
    else:
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i-1, 4]-zeronet_monthly.iloc[i, 4]

#Create two new columns containing predicted the exess return of the portfolio and initially set the values to zero.
zeronet_monthly["pred_excess_return_zeronet_ew"] =0
zeronet_monthly["pred_excess_return_zeronet_vw"] =0

# predicted excess return zeronet in t = (weigted predicted excess return long in t) - (weigted predicted excess return short in t)
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i, 5]-zeronet_monthly.iloc[i+1, 5]
    else:
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i-1, 5]-zeronet_monthly.iloc[i, 5]
        
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i, 6]-zeronet_monthly.iloc[i+1, 6]
    else:
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i-1, 6]-zeronet_monthly.iloc[i, 6]

#Create two new columns containing return of the portfolio and initially set the values to zero.       
zeronet_monthly["return_zeronet_ew"] =0
zeronet_monthly["return_zeronet_vw"] =0

# return zeronet in t = (weigted return long in t) - (weigted return short in t)
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i, 7]-zeronet_monthly.iloc[i+1, 7]
    else:
        zeronet_monthly.iloc[i, -2] = zeronet_monthly.iloc[i-1, 7]-zeronet_monthly.iloc[i, 7]
        
for i in range(0, len(zeronet_monthly)):
    if zeronet_monthly.iloc[i,9] == "long":
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i, 8]-zeronet_monthly.iloc[i+1, 8]
    else:
        zeronet_monthly.iloc[i, -1] = zeronet_monthly.iloc[i-1, 8]-zeronet_monthly.iloc[i, 8]

In [None]:
zeronet_monthly

In [None]:
#Only the measures at portfolio level are needed
zeronet_monthly = zeronet_monthly[['NumMonth', 'MonthYear1', 'excess_return_zeronet_ew',
                                   'excess_return_zeronet_vw', 'pred_excess_return_zeronet_ew',
                                   'pred_excess_return_zeronet_vw','return_zeronet_ew',
                                   'return_zeronet_vw']].drop_duplicates()

In [None]:

#Calculate zero-net portfolio performance measures                                            
#Time-series average of realized excess returns                                             
ew_mean_return_zeronet= zeronet_monthly["excess_return_zeronet_ew"].mean()
vw_mean_return_zeronet= zeronet_monthly["excess_return_zeronet_vw"].mean()
#Time-series average of predicted excess returns
ew_mean_pred_return_zeronet = zeronet_monthly["pred_excess_return_zeronet_ew"].mean()
vw_mean_pred_return_zeronet = zeronet_monthly["pred_excess_return_zeronet_vw"].mean()
#Standard deviation of realized excess returns
std_ew_zeronet = zeronet_monthly["excess_return_zeronet_ew"].std()
std_vw_zeronet = zeronet_monthly["excess_return_zeronet_vw"].std()
#Annualized sharpe ratio of realized excess returns
sharpe_ew_zeronet = (zeronet_monthly["excess_return_zeronet_ew"].mean()/zeronet_monthly["return_zeronet_ew"].std())* np.sqrt(12)
sharpe_vw_zeronet = (zeronet_monthly["excess_return_zeronet_vw"].mean()/zeronet_monthly["return_zeronet_vw"].std())* np.sqrt(12)

In [None]:
chart_np = np.array([[ew_mean_pred_return_rank_0, ew_mean_return_rank_0, std_ew_rank_0, sharpe_ew_rank_0],
                     [ew_mean_pred_return_rank_1, ew_mean_return_rank_1, std_ew_rank_1, sharpe_ew_rank_1],
                     [ew_mean_pred_return_rank_2, ew_mean_return_rank_2, std_ew_rank_2, sharpe_ew_rank_2],
                     [ew_mean_pred_return_rank_3, ew_mean_return_rank_3, std_ew_rank_3, sharpe_ew_rank_3],
                     [ew_mean_pred_return_rank_4, ew_mean_return_rank_4, std_ew_rank_4, sharpe_ew_rank_4],
                     [ew_mean_pred_return_rank_5, ew_mean_return_rank_5, std_ew_rank_5, sharpe_ew_rank_5],
                     [ew_mean_pred_return_rank_6, ew_mean_return_rank_6, std_ew_rank_6, sharpe_ew_rank_6],
                     [ew_mean_pred_return_rank_7, ew_mean_return_rank_7, std_ew_rank_7, sharpe_ew_rank_7],
                     [ew_mean_pred_return_rank_8, ew_mean_return_rank_8, std_ew_rank_8, sharpe_ew_rank_8],
                     [ew_mean_pred_return_rank_9, ew_mean_return_rank_9, std_ew_rank_9, sharpe_ew_rank_9],
                     [ew_mean_pred_return_zeronet, ew_mean_return_zeronet, std_ew_zeronet, sharpe_ew_zeronet]])

ew_df = pd.DataFrame(chart_np, columns=['Pred', 'Real', 'Std', 'Sharpe'],
                              index=['Low (L)', '2', '3', '4', '5','6','7','8',"9",'High (H)', "H-L"])

ew_df['Pred'] = pd.Series(["{0:.2f}%".format(val) for val in ew_df['Pred']], index = ew_df.index)
ew_df['Real'] = pd.Series(["{0:.2f}%".format(val) for val in ew_df['Real']], index = ew_df.index)
ew_df['Std'] = pd.Series(["{0:.2f}%".format(val) for val in ew_df['Std']], index = ew_df.index)
ew_df['Sharpe'] = pd.Series([("%.2f" % round(val, 2)) for val in ew_df['Sharpe']], index = ew_df.index)
ew_df