In [7]:
import numpy as np
import pandas as pd
import math
import sklearn.preprocessing
import datetime
from TimeBasedCV import TimeBasedCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import make_scorer, r2_score
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
import pickle
from sklearn.neural_network import MLPRegressor

import warnings
warnings.simplefilter(action='ignore', category=Warning)
from itertools import product

In [8]:
# df = pd.read_csv('data/factors_1965.csv', parse_dates=['DATE'])

In [9]:
# with open('data/features_1965.pkl', 'wb') as f:
#     pickle.dump(df, f)

with open('data/features_1965.pkl', 'rb') as f:
    df = pickle.load(f)
    print(df.head())

   permno       DATE        mvel1      beta    betasq     chmom     dolvol  \
0   10145 1965-02-26   1498872.00  0.983510  0.967291  0.105988  11.546907   
1   10401 1965-02-26  35392058.00  0.780829  0.609694 -0.063768  12.240330   
2   10786 1965-02-26   1695284.75  0.806119  0.649827 -0.130519  12.005040   
3   10989 1965-02-26   1295887.75  1.199748  1.439395  0.073609  11.756961   
4   11260 1965-02-26   2302001.25  1.257269  1.580725 -0.167320  12.240330   

    idiovol    indmom     mom1m  ...  macro_ep  macro_bm  macro_ntis  \
0  0.022307  0.035075  0.104116  ...  2.936836  0.471399    0.014823   
1  0.013395  0.335139 -0.007326  ...  2.936836  0.471399    0.014823   
2  0.024366  0.104106  0.060498  ...  2.936836  0.471399    0.014823   
3  0.022717  0.118513  0.068807  ...  2.936836  0.471399    0.014823   
4  0.035883  0.185424 -0.036885  ...  2.936836  0.471399    0.014823   

   macro_tbl  macro_tms  macro_dfy  macro_svar  macro_mkt-rf  macro_hml  \
0     0.0393    -0.0379

In [10]:
#Sort observations by date and stock id
df[df.columns[2:]] = df[df.columns[2:]].astype('float32')
df = df.sort_values(by = ['DATE', 'permno'], ascending = True)
df.head()


Unnamed: 0,permno,DATE,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,macro_ep,macro_bm,macro_ntis,macro_tbl,macro_tms,macro_dfy,macro_svar,macro_mkt-rf,macro_hml,macro_smb
0,10145,1965-02-26,1498872.0,0.98351,0.967291,0.105988,11.546906,0.022307,0.035075,0.104116,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
1,10401,1965-02-26,35392056.0,0.780829,0.609694,-0.063768,12.240331,0.013395,0.335139,-0.007326,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
2,10786,1965-02-26,1695284.75,0.806119,0.649827,-0.130519,12.00504,0.024366,0.104106,0.060498,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
3,10989,1965-02-26,1295887.75,1.199748,1.439395,0.073609,11.756961,0.022717,0.118513,0.068807,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55
4,11260,1965-02-26,2302001.25,1.257269,1.580725,-0.16732,12.240331,0.035883,0.185424,-0.036885,...,2.936836,0.471399,0.014823,0.0393,-0.0379,0.0055,0.000393,0.44,0.11,3.55


In [11]:
df['permno2'] = df['permno'].copy()
df['DATE2'] = df['DATE'].copy()

#Make a copy of  the "me" variable (market equity) before rank standartization to use afterwards for value weighting
df['mvel12'] = df['mvel1'].copy()
df = df.set_index(['DATE2','permno2'])

In [12]:
p=0.3 
df_large= df.groupby('DATE').apply(lambda x: x.nlargest(int(len(x)*p),'mvel1')).reset_index(drop=True)  
df_small = df.groupby('DATE').apply(lambda x: x.nsmallest(int(len(x)*p),'mvel1')).reset_index(drop=True)  


In [13]:
features = df.columns[~df.columns.isin(['DATE', 'DATE2', "mvel2",'sic2' ,'permno',"permno2",'risk_premium'])].tolist()
df[features]=df.groupby('DATE')[features].rank(pct=True)
df[features] = 2*df[features] - 1

df_large[features]=df_large.groupby('DATE')[features].rank(pct=True)
df_large[features] = 2*df_large[features] - 1

df_small[features]=df_small.groupby('DATE')[features].rank(pct=True)
df_small[features] = 2*df_small[features] - 1

In [15]:
tscv = TimeBasedCV(train_period=120,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df[features]
y = df[['risk_premium']]

param_grid = {
    'learning_rate_init': [0.01, 0.001],  # Learning rate for the MLP
    'alpha': np.linspace(start=0.00001,stop=0.001,num=10) 
}

###########################################
# Validation
###########################################

pred_val = []
y_val_list =[]
r2_list = []

#Empty containers to save results from each window
predictions = []
y_test_list =[]
dates = []
dic_r2_all = {}


param_combinations =list(product(param_grid['learning_rate_init'],  param_grid['alpha']))

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1975,1,31), second_split_date= datetime.date(1985,1,31)):
    print('-------')
    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    for lr, alpha in param_combinations:
        
        test_results = pd.DataFrame(columns=["model", "learning rate", "alpha"])

        nn_model = MLPRegressor( learning_rate_init=lr, alpha=alpha, activation='relu', max_iter=1000, 
                            batch_size = 5000, early_stopping = True, n_iter_no_change = 10, random_state=42)


        nn_model.fit(X_train,y_train)
        Yval_predict = nn_model.predict(X_val)
        mse = np.sqrt(mean_squared_error(y_val,Yval_predict))

        test_results = pd.concat([test_results, pd.DataFrame([{
            "model":"MLPRegressor",
            "learning rate": lr,
            "alpha": alpha,
            "mse": mse
        }])], ignore_index=True)

        lr = test_results[test_results['mse']==test_results['mse'].min()]['learning rate'].values[0]
        act = test_results[test_results['mse']==test_results['mse'].min()]['alpha'].values[0]
    

    model = MLPRegressor(learning_rate_init=lr, alpha=alpha, hidden_layer_sizes=32, activation='relu',
                            batch_size=5000, early_stopping=True, n_iter_no_change=10, random_state=42)

    model.fit(X_train, y_train)
    y_train_preds = model.predict(X_train)
    r2_train = 1-np.sum(pow(y_train['risk_premium']-y_train_preds,2))/np.sum(pow(y_train['risk_premium'],2))

    r2_list.append(r2_train)
    
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds=model.predict(X_test)

    print(f'R2 {y_train.index[0][0].date()} - {y_train.index[-1][0].date()} training set {r2_train}')


    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions.append(preds)
    dates.append(y_test.index)
    y_test_list.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    print(f'R2 {y_test.index[0][0].date()} - {y_test.index[-1][0].date()} validation set {r2}')
    dic_r2_all["r2." + str(y_test.index)] = r2

predictions_all_full= np.concatenate(predictions, axis=0)
y_test_list_all_full= np.concatenate(y_test_list, axis=0) 
dates_all_full= np.concatenate(dates, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2FULL = r2_score(y_test_list_all_full, predictions_all_full)
R2FULL



Train period: 1965-01-31 - 1975-01-31 ,val period: 1975-01-31 - 1977-01-31 , Test period 1977-01-31 - 1978-01-31 # train records 13670 ,# val records 3499 , # test records 1941
Train period: 1966-01-31 - 1976-01-31 ,val period: 1976-01-31 - 1978-01-31 , Test period 1978-01-31 - 1979-01-31 # train records 14434 ,# val records 3708 , # test records 2030
Train period: 1967-01-31 - 1977-01-31 ,val period: 1977-01-31 - 1979-01-31 , Test period 1979-01-31 - 1980-01-31 # train records 15118 ,# val records 3971 , # test records 2358
Train period: 1968-01-31 - 1978-01-31 ,val period: 1978-01-31 - 1980-01-31 , Test period 1980-01-31 - 1981-01-31 # train records 15843 ,# val records 4388 , # test records 3334
Train period: 1969-01-31 - 1979-01-31 ,val period: 1979-01-31 - 1981-01-31 , Test period 1981-01-31 - 1982-01-31 # train records 16573 ,# val records 5692 , # test records 3578
Train period: 1970-01-31 - 1980-01-31 ,val period: 1980-01-31 - 1982-01-31 , Test period 1982-01-31 - 1983-01-31 # 

0.018925321114587113

In [35]:

tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df_large[features]
y = df_large[['risk_premium']]

param_grid = {
    'learning_rate_init': [0.01, 0.001],  # Learning rate for the MLP
    'alpha': np.linspace(start=0.00001,stop=0.001,num=10) 
}

###########################################
# Validation
###########################################

pred_val = []
y_val_list =[]
r2_list_top = []

#Empty containers to save results from each window
predictions_top = []
y_test_list_top =[]
dates_top = []
dic_r2_all_top = {}


param_combinations =list(product(param_grid['learning_rate_init'],  param_grid['alpha']))

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1975,1,31), second_split_date= datetime.date(1985,1,31)):
    print('-------')
    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    for lr, alpha in param_combinations:
        
        test_results = pd.DataFrame(columns=["model", "learning rate", "alpha"])

        nn_model = MLPRegressor( learning_rate_init=lr, alpha=alpha, activation='relu', max_iter=1000, 
                            batch_size = 5000, early_stopping = True, n_iter_no_change = 10, random_state=42)


        nn_model.fit(X_train,y_train)
        Yval_predict = nn_model.predict(X_val)
        mse = np.sqrt(mean_squared_error(y_val,Yval_predict))

        test_results = pd.concat([test_results, pd.DataFrame([{
            "model":"MLPRegressor",
            "learning rate": lr,
            "alpha": alpha,
            "mse": mse
        }])], ignore_index=True)

        lr = test_results[test_results['mse']==test_results['mse'].min()]['learning rate'].values[0]
        act = test_results[test_results['mse']==test_results['mse'].min()]['alpha'].values[0]
    

    model = MLPRegressor(learning_rate_init=lr, alpha=alpha, hidden_layer_sizes=32, activation='relu',
                            batch_size=5000, early_stopping=True, n_iter_no_change=10, random_state=42)

    model.fit(X_train, y_train)
    y_train_preds = model.predict(X_train)
    r2_train = 1-np.sum(pow(y_train['risk_premium']-y_train_preds,2))/np.sum(pow(y_train['risk_premium'],2))

    r2_list_top.append(r2_train)
    
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds=model.predict(X_test)

    print(f'R2 training set {r2_train}')


    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions_top.append(preds)
    dates_top.append(y_test.index)
    y_test_list_top.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    print(f'R2 validation set {r2}')
    dic_r2_all_top["r2." + str(y_test.index)] = r2

predictions_all_top= np.concatenate(predictions_top, axis=0)
y_test_list_all_top= np.concatenate(y_test_list_top, axis=0) 
dates_all_top= np.concatenate(dates_top, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2TOP = r2_score(y_test_list_all_top, predictions_all_top)
R2TOP



Train period: 1970-01-31 - 1975-01-31 ,val period: 1975-01-31 - 1977-01-31 , Test period 1977-01-31 - 1978-01-31 # train records 2251 ,# val records 1040 , # test records 577
Train period: 1971-01-31 - 1976-01-31 ,val period: 1976-01-31 - 1978-01-31 , Test period 1978-01-31 - 1979-01-31 # train records 2430 ,# val records 1103 , # test records 603
Train period: 1972-01-31 - 1977-01-31 ,val period: 1977-01-31 - 1979-01-31 , Test period 1979-01-31 - 1980-01-31 # train records 2563 ,# val records 1180 , # test records 704
Train period: 1973-01-31 - 1978-01-31 ,val period: 1978-01-31 - 1980-01-31 , Test period 1980-01-31 - 1981-01-31 # train records 2639 ,# val records 1307 , # test records 995
Train period: 1974-01-31 - 1979-01-31 ,val period: 1979-01-31 - 1981-01-31 , Test period 1981-01-31 - 1982-01-31 # train records 2673 ,# val records 1699 , # test records 1068
Train period: 1975-01-31 - 1980-01-31 ,val period: 1980-01-31 - 1982-01-31 , Test period 1982-01-31 - 1983-01-31 # train rec

0.03054756197216446

In [36]:

tscv = TimeBasedCV(train_period=60,
                   val_period=24,
                   test_period=12,
                   freq='months')

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12', 'sic2', 'DATE2', 'risk_premium'])].tolist()

X = df_small[features]
y = df_small[['risk_premium']]

param_grid = {
    'learning_rate_init': [0.01, 0.001],  # Learning rate for the MLP
    'alpha': np.linspace(start=0.00001,stop=0.001,num=10) 
}

###########################################
# Validation
###########################################

pred_val = []
y_val_list =[]
r2_list_bottom = []

#Empty containers to save results from each window
predictions_bottom = []
y_test_list_bottom =[]
dates_bottom = []
dic_r2_all_bottom = {}


param_combinations =list(product(param_grid['learning_rate_init'],  param_grid['alpha']))

for train_index, val_index, test_index in tscv.split(X, first_split_date= datetime.date(1975,1,31), second_split_date= datetime.date(1985,1,31)):
    print('-------')
    X_train   = X.loc[train_index].drop('DATE', axis=1)
    y_train = y.loc[train_index]
    
    X_val   = X.loc[val_index].drop('DATE', axis=1)
    y_val = y.loc[val_index]

    X_test    = X.loc[test_index].drop('DATE', axis=1)
    y_test  = y.loc[test_index]

    for lr, alpha in param_combinations:
        
        test_results = pd.DataFrame(columns=["model", "learning rate", "alpha"])

        nn_model = MLPRegressor( learning_rate_init=lr, alpha=alpha, activation='relu', max_iter=1000, 
                            batch_size = 5000, early_stopping = True, n_iter_no_change = 10, random_state=42)


        nn_model.fit(X_train,y_train)
        Yval_predict = nn_model.predict(X_val)
        mse = np.sqrt(mean_squared_error(y_val,Yval_predict))

        test_results = pd.concat([test_results, pd.DataFrame([{
            "model":"MLPRegressor",
            "learning rate": lr,
            "alpha": alpha,
            "mse": mse
        }])], ignore_index=True)

        lr = test_results[test_results['mse']==test_results['mse'].min()]['learning rate'].values[0]
        act = test_results[test_results['mse']==test_results['mse'].min()]['alpha'].values[0]
    

    model = MLPRegressor(learning_rate_init=lr, alpha=alpha, hidden_layer_sizes=32, activation='relu',
                            batch_size=5000, early_stopping=True, n_iter_no_change=10, random_state=42)

    model.fit(X_train, y_train)
    y_train_preds = model.predict(X_train)
    r2_train = 1-np.sum(pow(y_train['risk_premium']-y_train_preds,2))/np.sum(pow(y_train['risk_premium'],2))

    r2_list_bottom.append(r2_train)
    
    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds=model.predict(X_test)

    print(f'R2 training set {r2_train}')


    model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
    preds = model.predict(X_test)

    predictions_bottom.append(preds)
    dates_bottom.append(y_test.index)
    y_test_list_bottom.append(y_test)

    r2 = 1-np.sum(pow(y_test['risk_premium']-preds,2))/np.sum(pow(y_test['risk_premium'],2))
    print(f'R2 validation set {r2}')
    dic_r2_all_top["r2." + str(y_test.index)] = r2

predictions_all_bottom= np.concatenate(predictions_bottom, axis=0)
y_test_list_all_bottom= np.concatenate(y_test_list_bottom, axis=0) 
dates_all_bottom= np.concatenate(dates_bottom, axis=0)

# R2FULL = 1-np.sum(pow(y_test_list_all_full-predictions_all_full,2))/np.sum(pow(y_test_list_all_full,2))
# print("R2OOS Linear Regression: ", R2FULL)
R2BOTTOM = r2_score(y_test_list_all_bottom, predictions_all_bottom)
R2BOTTOM



Train period: 1970-01-31 - 1975-01-31 ,val period: 1975-01-31 - 1977-01-31 , Test period 1977-01-31 - 1978-01-31 # train records 2251 ,# val records 1040 , # test records 577
Train period: 1971-01-31 - 1976-01-31 ,val period: 1976-01-31 - 1978-01-31 , Test period 1978-01-31 - 1979-01-31 # train records 2430 ,# val records 1103 , # test records 603
Train period: 1972-01-31 - 1977-01-31 ,val period: 1977-01-31 - 1979-01-31 , Test period 1979-01-31 - 1980-01-31 # train records 2563 ,# val records 1180 , # test records 704
Train period: 1973-01-31 - 1978-01-31 ,val period: 1978-01-31 - 1980-01-31 , Test period 1980-01-31 - 1981-01-31 # train records 2639 ,# val records 1307 , # test records 995
Train period: 1974-01-31 - 1979-01-31 ,val period: 1979-01-31 - 1981-01-31 , Test period 1981-01-31 - 1982-01-31 # train records 2673 ,# val records 1699 , # test records 1068
Train period: 1975-01-31 - 1980-01-31 ,val period: 1980-01-31 - 1982-01-31 , Test period 1982-01-31 - 1983-01-31 # train rec

0.017735426602415894

In [37]:

chart = np.array([[R2FULL],
                  [R2TOP],
                  [R2BOTTOM]])

NN1 = pd.DataFrame(chart, columns=['NN-1'],
                     index=['Full Sample', 'Large Firms', 'Small Firms'])

NN1

Unnamed: 0,NN-1
Full Sample,0.018925
Large Firms,0.030548
Small Firms,0.017735


In [38]:
NN1.to_csv(r'r2_NN1_model.csv')

In [None]:

features = df.columns[~df.columns.isin(['permno', 'permno2', 'mvel12','sic2' , 'DATE2', 'DATE', 'risk_premium', 'year'])].tolist()
df['year'] = df['DATE'].dt.year


param_grid = {
    'learning_rate_init': [0.01, 0.005, 0.001],  # Learning rate for the MLP
    'activation': ['relu', 'logistic']  # Activation functions to try
}
test_results = pd.DataFrame(columns=["model", "learning rate", "activation"])

X_train = df[features].loc[(df["year"]>=2008) & (df["year"]<=2018)]
y_train = df["risk_premium"].loc[(df["year"]>=2008) & (df["year"]<=2018)]

X_val = df[features].loc[(df["year"]>=2018) & (df["year"]<=2020)]
y_val = df["risk_premium"].loc[(df["year"]>=2018) & (df["year"]<=2020)]


param_combinations =list(product(param_grid['learning_rate_init'],  param_grid['activation']))

for lr, activation in param_combinations:
    nn_model = MLPRegressor( learning_rate_init=lr, activation=activation, max_iter=1000, 
                            batch_size = 128, early_stopping = True, n_iter_no_change = 10, random_state=42)


    nn_model.fit(X_train,y_train)
    Yval_predict = nn_model.predict(X_val)
    mse = np.sqrt(mean_squared_error(y_val,Yval_predict))
    print(mse)


    test_results = pd.concat([test_results, pd.DataFrame([{
        "model":"MLPRegressor",
        "learning rate": lr,
        "activation": activation,
        "mse": mse
    }])], ignore_index=True)


In [24]:

yhat = predictions_all_full.tolist()
y_true = y_test_list_all_full.tolist()
i = dates_all_full.tolist()

results = pd.DataFrame(
    {'identifier': i,
     'yhat': yhat,
     'y_true': y_true
    })

results["identifier"]= results["identifier"].astype("str")
results["date"] = results["identifier"].str[12:22]
results["id"] = results["identifier"].str[35:40]
results.drop(["identifier"],axis = 1, inplace=True)
results['date'] = pd.to_datetime(results['date'], format='%Y-%m-%d')
results['MonthYear'] = results['date'].dt.to_period('M')
results = results.sort_values(by = ['date', 'id'], ascending = True)
results = results.set_index(['MonthYear','id'])
results

# results['yhat'] = results['yhat'].apply(lambda x: x[0])
results['y_true'] = results['y_true'].apply(lambda x: x[0])

data = df[['mvel12', 'macro_tbl', 'macro_svar']].copy()
data.reset_index(inplace=True)
data['permno2'] = data['permno2'].astype('str')
data['MonthYear'] = data['DATE2'].dt.to_period('M')
data.drop('DATE2', axis=1, inplace=True)
data.rename(columns={'permno2': 'id'}, inplace=True)
data.rename(columns={'mvel12': 'market_cap'}, inplace=True)
data.rename(columns={'macro_tbl': 'risk_free_rate'}, inplace=True)
data = data.set_index(['MonthYear','id'])

bigdata = pd.merge(results, data,left_index=True, right_index=True)
bigdata.reset_index(inplace=True)
bigdata
bigdata['returns'] = bigdata['y_true'] + bigdata['risk_free_rate']

In [26]:
bigdata['MonthYear1'] = bigdata['MonthYear'].copy()
bigdata['MonthYear'] = bigdata['MonthYear'].astype('int64')
bigdata['NumMonth'] = bigdata['MonthYear'] - 83
bigdata['NumMonth'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [27]:
bigdata.to_csv('predictions/nnet1.csv', index=False)

In [28]:
bigdata = pd.read_csv('predictions/nnet1.csv')

In [29]:
top_100 = bigdata.sort_values(['NumMonth','yhat'], ascending=[True, True]).groupby(['MonthYear'],
                                                                  as_index=False,
                                                                  sort=False).tail(100)

In [30]:
portfolio = top_100[['date', 'NumMonth','MonthYear', 'id', 'yhat', 'y_true', 'risk_free_rate', 'MonthYear1']]
portfolio.reset_index(inplace=True)
portfolio.drop(columns=['index'],inplace=True)
portfolio['eq_weights'] = 1/portfolio.groupby('MonthYear')['id'].transform('size')
portfolio['excess_return_stock_ew'] = portfolio['y_true'] *portfolio['eq_weights']
portfolio['pred_excess_return_stock_ew'] = portfolio["yhat"]*portfolio["eq_weights"]

In [31]:
mean_pred_return = portfolio.groupby('MonthYear')['pred_excess_return_stock_ew'].transform('sum').mean()
mean_port_return = portfolio.groupby('MonthYear')['excess_return_stock_ew'].transform('sum').mean()
port_vol =  portfolio.groupby('MonthYear')["pred_excess_return_stock_ew"].transform('sum').std()
sharp_ratio = (mean_pred_return/port_vol)*np.sqrt(12)

In [33]:
chart_np = np.array([[mean_port_return, mean_pred_return, port_vol, sharp_ratio]])

ew_df = pd.DataFrame(chart_np, columns=['Real', 'Pred', 'Std', 'Sharpe'],
                                index=['NN-1'])

ew_df['Real'] = pd.Series(['{0:.2f}%'.format(val) for val in ew_df['Real']], index= ew_df.index)
ew_df['Pred'] = pd.Series(['{0:.2f}%'.format(val) for val in ew_df['Pred']], index= ew_df.index)
ew_df['Std'] = pd.Series(['{0:.2f}%'.format(val) for val in ew_df['Std']], index= ew_df.index)
ew_df['Sharpe'] = pd.Series(['{0:.2f}%'.format(val) for val in ew_df['Sharpe']], index= ew_df.index)
ew_df

Unnamed: 0,Real,Pred,Std,Sharpe
NN-1,-3.40%,-1.74%,2.98%,-2.02%
