In [1]:
import numpy as np
import pandas as pd
import math
import sklearn.preprocessing
import datetime
from TimeBasedCV import TimeBasedCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import make_scorer, r2_score
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
import pickle
from sklearn.neural_network import MLPRegressor

pd.set_option('display.max_rows', None)
# more

In [2]:
# df = pd.read_csv('data/factors_1965.csv', parse_dates=['DATE'])

In [3]:
# with open('data/features_1965.pkl', 'wb') as f:
#     pickle.dump(df, f)

with open('data/features_1965.pkl', 'rb') as f:
    df = pickle.load(f)
    print(df.head())

   permno       DATE        mvel1      beta    betasq     chmom     dolvol  \
0   10145 1965-02-26   1498872.00  0.983510  0.967291  0.105988  11.546907   
1   10401 1965-02-26  35392058.00  0.780829  0.609694 -0.063768  12.240330   
2   10786 1965-02-26   1695284.75  0.806119  0.649827 -0.130519  12.005040   
3   10989 1965-02-26   1295887.75  1.199748  1.439395  0.073609  11.756961   
4   11260 1965-02-26   2302001.25  1.257269  1.580725 -0.167320  12.240330   

    idiovol    indmom     mom1m  ...  macro_ep  macro_bm  macro_ntis  \
0  0.022307  0.035075  0.104116  ...  2.936836  0.471399    0.014823   
1  0.013395  0.335139 -0.007326  ...  2.936836  0.471399    0.014823   
2  0.024366  0.104106  0.060498  ...  2.936836  0.471399    0.014823   
3  0.022717  0.118513  0.068807  ...  2.936836  0.471399    0.014823   
4  0.035883  0.185424 -0.036885  ...  2.936836  0.471399    0.014823   

   macro_tbl  macro_tms  macro_dfy  macro_svar  macro_mkt-rf  macro_hml  \
0     0.0393    -0.0379

In [4]:
#Sort observations by date and stock id
df[df.columns[2:]] = df[df.columns[2:]].astype('float32')
df = df.sort_values(by = ['DATE', 'permno'], ascending = True)
df.head()


Unnamed: 0,permno,DATE,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,macro_ep,macro_bm,macro_ntis,macro_tbl,macro_tms,macro_dfy,macro_svar,macro_mkt-rf,macro_hml,macro_smb
0,10145,1965-02-26,1498872.0,0.98351,0.967291,0.105988,11.546906,0.022307,0.035075,0.104116,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
1,10401,1965-02-26,35392056.0,0.780829,0.609694,-0.063768,12.240331,0.013395,0.335139,-0.007326,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
2,10786,1965-02-26,1695284.75,0.806119,0.649827,-0.130519,12.00504,0.024366,0.104106,0.060498,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
3,10989,1965-02-26,1295887.75,1.199748,1.439395,0.073609,11.756961,0.022717,0.118513,0.068807,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
4,11260,1965-02-26,2302001.25,1.257269,1.580725,-0.16732,12.240331,0.035883,0.185424,-0.036885,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55


In [5]:
df['permno2'] = df['permno'].copy()
df['DATE2'] = df['DATE'].copy()

#Make a copy of  the "me" variable (market equity) before rank standartization to use afterwards for value weighting
df['mvel12'] = df['mvel1'].copy()
df = df.set_index(['DATE2','permno2'])

In [6]:
p=0.3 
df_large= df.groupby('DATE').apply(lambda x: x.nlargest(int(len(x)*p),'mvel1')).reset_index(drop=True)  
df_small = df.groupby('DATE').apply(lambda x: x.nsmallest(int(len(x)*p),'mvel1')).reset_index(drop=True)  


  df_large= df.groupby('DATE').apply(lambda x: x.nlargest(int(len(x)*p),'mvel1')).reset_index(drop=True)
  df_small = df.groupby('DATE').apply(lambda x: x.nsmallest(int(len(x)*p),'mvel1')).reset_index(drop=True)


In [7]:
features = df.columns[~df.columns.isin(['DATE', 'DATE2', "mvel2",'sic2' ,'permno',"permno2",'risk_premium'])].tolist()
df[features]=df.groupby('DATE')[features].rank(pct=True)

df[features] = 2*df[features] - 1

In [8]:
tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df[features]
y = df[['risk_premium']]

#Empty containers to save results from each window

#Empty containers to save results from each window
predictions = []
y_test_list =[]
dates = []
dic_r2_all = {}

learning = np.linspace(start=0.01, stop=0.001, num=20)
mse = np.full((len(learning),1),np.nan, dtype = np.float32)

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):

    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    model = MLPRegressor(learning_rate_init=0.00001, hidden_layer_sizes=32, activation='relu',
                            batch_size=250, early_stopping=True, n_iter_no_change=10, random_state=42)
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions.append(preds)
    dates.append(y_test.index)
    y_test_list.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    dic_r2_all["r2." + str(y_test.index)] = r2

predictions_all_full= np.concatenate(predictions, axis=0)
y_test_list_all_full= np.concatenate(y_test_list, axis=0) 
dates_all_full= np.concatenate(dates, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2FULL = r2_score(y_test_list_all_full, predictions_all_full)
R2FULL



Train period: 1976-01-31 - 1981-01-31 ,val period: 1981-01-31 - 1983-01-31 , Test period 1983-01-31 - 1984-01-31 # train records 11430 ,# val records 6471 , # test records 4416
Train period: 1977-01-31 - 1982-01-31 ,val period: 1982-01-31 - 1984-01-31 , Test period 1984-01-31 - 1985-01-31 # train records 13241 ,# val records 7309 , # test records 4368
Train period: 1978-01-31 - 1983-01-31 ,val period: 1983-01-31 - 1985-01-31 , Test period 1985-01-31 - 1986-01-31 # train records 14193 ,# val records 8784 , # test records 4870
Train period: 1979-01-31 - 1984-01-31 ,val period: 1984-01-31 - 1986-01-31 , Test period 1986-01-31 - 1987-01-31 # train records 16579 ,# val records 9238 , # test records 6416
Train period: 1980-01-31 - 1985-01-31 ,val period: 1985-01-31 - 1987-01-31 , Test period 1987-01-31 - 1988-01-31 # train records 18589 ,# val records 11286 , # test records 6641
Train period: 1981-01-31 - 1986-01-31 ,val period: 1986-01-31 - 1988-01-31 , Test period 1988-01-31 - 1989-01-31 #

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

0.017733963551429643

In [None]:

tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df_large[features]
y = df_large[['risk_premium']]

#Empty containers to save results from each window

#Empty containers to save results from each window
predictions_top = []
y_test_list_top =[]
dates_top = []
dic_r2_top = {}

learning = np.linspace(start=0.01, stop=0.001, num=20)
mse = np.full((len(learning),1),np.nan, dtype = np.float32)

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):

    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    model = MLPRegressor(learning_rate_init=0.00001, hidden_layer_sizes=32, activation='relu',
                             early_stopping=True, n_iter_no_change=10, random_state=42)
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions_top.append(preds)
    dates_top.append(y_test.index)
    y_test_list_top.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    dic_r2_top["r2." + str(y_test.index)] = r2

predictions_all_top = np.concatenate(predictions_top, axis=0)
y_test_list_all_top= np.concatenate(y_test_list_top, axis=0) 
dates_all_top= np.concatenate(dates_top, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2TOP = r2_score(y_test_list_all_top, predictions_all_top)
R2TOP



In [None]:

tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df[features]
y = df[['risk_premium']]

#Empty containers to save results from each window

#Empty containers to save results from each window
predictions_bottom = []
y_test_list_bottom =[]
dates_bottom = []
dic_r2_bottom = {}

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1981,1,31), second_split_date= datetime.date(1991,1,31)):

    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    model = MLPRegressor(learning_rate_init=0.00001, hidden_layer_sizes=32, activation='relu',
                            batch_size=250, early_stopping=True, n_iter_no_change=10, random_state=42)
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions_bottom.append(preds)
    dates_bottom.append(y_test.index)
    y_test_list_bottom.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    dic_r2_top["r2." + str(y_test.index)] = r2

predictions_all_bottom = np.concatenate(predictions_bottom, axis=0)
y_test_list_all_bottom= np.concatenate(y_test_list_bottom, axis=0) 
dates_all_bottom= np.concatenate(dates_bottom, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2BOTTOM = r2_score(y_test_list_bottom, predictions_all_bottom)
R2BOTTOM



In [9]:

chart = np.array([[R2FULL],
                  [R2TOP],
                  [R2BOTTOM]])

NN1 = pd.DataFrame(chart, columns=['ENet Regression'],
                     index=['Full Sample', 'Large Firms', 'Small Firms'])

NN1

NameError: name 'R2TOP' is not defined

In [None]:
NN1.to_csv(r'r2_NN1_model.csv')

In [10]:

yhat = predictions_all_full.tolist()
y_true = y_test_list_all_full.tolist()
i = dates_all_full.tolist()

results = pd.DataFrame(
    {'identifier': i,
     'yhat': yhat,
     'y_true': y_true
    })


results["identifier"]= results["identifier"].astype("str")
results["date"] = results["identifier"].str[12:22]
results["id"] = results["identifier"].str[35:40]
results.drop(["identifier"],axis = 1, inplace=True)
results['date'] = pd.to_datetime(results['date'], format='%Y-%m-%d')
results['MonthYear'] = results['date'].dt.to_period('M')
results = results.sort_values(by = ['date', 'id'], ascending = True)
results = results.set_index(['MonthYear','id'])
results.head()
nnet = results.reset_index()
nnet.to_csv('predictions/nnet.csv', index=False)


In [None]:

results['y_true'] = results['y_true'].apply(lambda x: x[0])

In [None]:

data = df[['mvel12', 'macro_tbl', 'macro_svar']].copy()
data.reset_index(inplace=True)
data['permno2'] = data['permno2'].astype('str')
data['MonthYear'] = data['DATE2'].dt.to_period('M')
data.drop('DATE2', axis=1, inplace=True)
data.rename(columns={'permno2': 'id'}, inplace=True)
data.rename(columns={'mvel12': 'market_cap'}, inplace=True)
data.rename(columns={'macro_tbl': 'risk_free_rate'}, inplace=True)
data = data.set_index(['MonthYear','id'])


In [None]:
bigdata = pd.merge(results, data,left_index=True, right_index=True)
bigdata.reset_index(inplace=True)
bigdata

In [None]:
bigdata['returns'] = bigdata['y_true'] + bigdata['risk_free_rate']
bigdata

In [None]:
bigdata['MonthYear1'] = bigdata['MonthYear'].copy()
bigdata['MonthYear'] = bigdata['MonthYear'].astype('int64')
bigdata['NumMonth'] = bigdata['MonthYear'] - 179
bigdata['NumMonth'].unique()
