In [1]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import *

from scipy import stats
import datetime as dt

import matplotlib.pyplot as plt
import statsmodels.api as sm

import pickle

In [2]:
# Loading the five pickled LSTM prediction arrays
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of being left open for the lifetime of
    the kernel.

    NOTE(review): pickle.load can execute arbitrary code while loading; only
    use this on files produced by a trusted run.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


y_pred1 = _load_pickle('lstm_predict.pkl')
y_pred2 = _load_pickle('lstm_predict_2.pkl')
y_pred3 = _load_pickle('lstm_predict_3.pkl')
y_pred4 = _load_pickle('lstm_predict_4.pkl')
y_pred5 = _load_pickle('lstm_predict_5.pkl')

In [3]:
# Shape of each loaded prediction array, in load order
tuple(pred.shape for pred in (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5))

((270000, 1), (270000, 1), (270000, 1), (270000, 1), (270000, 1))

In [4]:
# Ensemble prediction: element-wise average of the five prediction arrays
y_pred = np.stack([y_pred1, y_pred2, y_pred3, y_pred4, y_pred5]).mean(axis=0)

In [5]:
# Display the averaged predictions
y_pred

array([[6.1427723e-05],
       [1.8269480e-05],
       [8.4781765e-05],
       ...,
       [1.7059020e-04],
       [2.0133998e-04],
       [2.0943268e-04]], dtype=float32)

In [6]:
# Read the targets, parse 'date' as datetimes and use it as the index
y = (
    pd.read_csv('y_reg.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Rows dated after 1975-12-01, flattened to a 1-D array
y_test = y.loc[y.index > '1975-12-01'].to_numpy().ravel()

In [7]:
# R2 measured against a zero forecast (no demeaning of y):
#     1 - sum((y - y_hat)^2) / sum(y^2)
r2_df = pd.DataFrame({'y': y_test})
r2_df['y_hat'] = y_pred
r2_df['difference'] = r2_df['y'].sub(r2_df['y_hat'])
r2_df['y_2'] = r2_df['y'].pow(2)
r2_df['dif2'] = r2_df['difference'].pow(2)

# Sum of squared targets and sum of squared prediction errors
y_2, dif2 = r2_df['y_2'].sum(), r2_df['dif2'].sum()
print(1 - (dif2 / y_2))

0.0002504527347331198


In [8]:
# Spearman rank correlation between prediction and target, computed separately
# for each consecutive, non-overlapping block of 500 rows (a trailing partial
# block is ignored).
# NOTE(review): presumably each block is one date's cross-section - confirm
# that every date has exactly 500 rows in this order.
spear = [
    stats.spearmanr(y_pred[start:start + 500], y_test[start:start + 500])[0]
    for start in range(0, (len(y_pred) // 500) * 500, 500)
]

In [9]:
# Mean and sample standard deviation (ddof=1) of the block-wise correlations
spear_values = np.asarray(spear)
spear_values.mean(), spear_values.std(ddof=1)

(0.023992212257426537, 0.18100208841385337)

In [10]:
# One-sample t-test of the block-wise correlations against a mean of zero
stats.ttest_1samp(spear, popmean=0.0)

Ttest_1sampResult(statistic=3.080233139605294, pvalue=0.002173894202390625)

In [11]:
# Loading the stock universe from 'universe_test.csv' into a DataFrame
universe_test = pd.read_csv('universe_test.csv')

In [12]:
# Insert the averaged predictions as column 'y_hat' at position 1.
# Predictions are attached purely by row position, so universe_test must hold
# one row per prediction, in the same order as y_pred.
# DataFrame.insert raises if the column already exists; dropping a stale copy
# first lets this cell be re-run without restarting the kernel.
if 'y_hat' in universe_test.columns:
    universe_test = universe_test.drop(columns='y_hat')
universe_test.insert(1, "y_hat", np.ravel(y_pred))

In [13]:
# Work on an independent (deep) copy so universe_test itself stays unchanged
y_rank = universe_test.copy(deep=True)

In [14]:
# Column names, non-null counts and dtypes of y_rank
y_rank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270000 entries, 0 to 269999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    270000 non-null  object 
 1   y_hat   270000 non-null  float32
 2   permno  270000 non-null  int64  
dtypes: float32(1), int64(1), object(1)
memory usage: 5.1+ MB


In [15]:
# Display y_rank
y_rank

Unnamed: 0,date,y_hat,permno
0,1976-01-01,0.000061,10137
1,1976-01-01,0.000018,10145
2,1976-01-01,0.000085,10161
3,1976-01-01,0.000066,10225
4,1976-01-01,0.000075,10233
...,...,...,...
269995,2020-12-01,0.000120,93096
269996,2020-12-01,0.000148,93132
269997,2020-12-01,0.000171,93246
269998,2020-12-01,0.000201,93312


In [16]:
# Number of distinct predicted values
y_rank['y_hat'].nunique()

267093

In [17]:
# (rows, columns) of y_rank
y_rank.shape

(270000, 3)

In [18]:
# For each date, ranking stocks into 1-10 portfolios based on the prediction y_hat
# 1 = lowest predictions (loser), 10 = highest predictions (winner) portfolio

# One two-key sort gives the same layout as sorting every date group on its
# own (dates ascending, y_hat descending within a date), without going through
# groupby.apply and without relying on apply passing the grouping column through.
y_rank = y_rank.sort_values(['date', 'y_hat'], ascending=[True, False]).reset_index(drop=True)

# Decile code (0-9) of y_hat within each date
y_rank['rank'] = y_rank.groupby('date')['y_hat'].transform(lambda x: pd.qcut(x, 10, labels=False))

# For visualization: shift rank from 0-9 to 1-10
y_rank['rank'] = 1 + y_rank['rank']

In [19]:
# Average predicted value (y_hat) within each rank portfolio
y_rank.groupby('rank')['y_hat'].mean()

rank
1.0     0.000072
2.0     0.000087
3.0     0.000096
4.0     0.000103
5.0     0.000110
6.0     0.000117
7.0     0.000124
8.0     0.000131
9.0     0.000141
10.0    0.000156
Name: y_hat, dtype: float32

### 2. Merging daily data with LSTM rank

In [20]:
# Read the daily return file and parse 'daily_date' as datetimes.
# NOTE(review): the recorded output of this cell ends with a truncated warning
# and the 'ret' column shows the non-numeric marker 'C' - confirm whether an
# explicit dtype should be passed to read_csv.
d_ret1 = pd.read_csv('daily_1931.csv')
d_ret1['daily_date'] = pd.to_datetime(d_ret1['daily_date'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [21]:
# Display the daily return data
d_ret1

Unnamed: 0,permno,daily_date,siccd,dlret,ret,date_merge
0,10000,1986-01-07,3990,,C,1986/01
1,10000,1986-01-08,3990,,-0.024390,1986/01
2,10000,1986-01-09,3990,,0.000000,1986/01
3,10000,1986-01-10,3990,,0.000000,1986/01
4,10000,1986-01-13,3990,,0.050000,1986/01
...,...,...,...,...,...,...
75696491,93436,2020-12-24,9999,,0.024444,2020/12
75696492,93436,2020-12-28,9999,,0.002901,2020/12
75696493,93436,2020-12-29,9999,,0.003465,2020/12
75696494,93436,2020-12-30,9999,,0.043229,2020/12


In [22]:
# Preparing the date columns of y_rank for the merge with the daily data

# Rename 'date' to 'monthly_date' so it stays distinct from 'daily_date'
y_rank = y_rank.rename(columns={'date': 'monthly_date'})

# Convert to datetime format
y_rank['monthly_date'] = pd.to_datetime(y_rank['monthly_date'])

# Merge key: the month formatted as "YYYY/MM"
y_rank['date_merge'] = y_rank['monthly_date'].dt.strftime("%Y/%m")

In [23]:
# Left-join the rank data onto every daily row by (month key, permno);
# daily rows without a match keep NaN / NaT in the y_rank columns
daily = d_ret1.merge(y_rank, how='left', on=['date_merge', 'permno'])

In [24]:
# Looking at the first rows of the merged daily data
daily.head()

Unnamed: 0,permno,daily_date,siccd,dlret,ret,date_merge,monthly_date,y_hat,rank
0,10000,1986-01-07,3990,,C,1986/01,NaT,,
1,10000,1986-01-08,3990,,-0.024390,1986/01,NaT,,
2,10000,1986-01-09,3990,,0.000000,1986/01,NaT,,
3,10000,1986-01-10,3990,,0.000000,1986/01,NaT,,
4,10000,1986-01-13,3990,,0.050000,1986/01,NaT,,


In [25]:
# Keep only the daily rows that received a rank in the merge.
# .copy() makes daily_filtered an independent frame, so overwriting one of its
# columns afterwards does not trigger pandas' SettingWithCopyWarning.
daily_filtered = daily[daily['rank'].notna()].copy()

In [26]:
# Cast 'ret' to 64-bit floats; the raw file also holds non-numeric entries in
# this column (e.g. 'C'), so it is not read in as a purely numeric column
daily_filtered['ret'] = daily_filtered['ret'].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_filtered['ret'] = daily_filtered['ret'].astype(float)


In [27]:
# Creating the dataframe for the portfolio calculation: trading day, return
# and rank portfolio
portfolio_columns = ['daily_date', 'ret', 'rank']
port_d_return = pd.DataFrame(daily_filtered[portfolio_columns])

In [28]:
# Portfolio return = plain average of 'ret' over the rows sharing a
# (daily_date, rank) pair
ret_by_day_and_rank = port_d_return.groupby(['daily_date', 'rank'])['ret']
port_d_return = ret_by_day_and_rank.mean().reset_index()

# Separate copy that the following cells reshape
port_d_return_t = port_d_return.copy()

In [29]:
# Check portfolio returns: count, mean and standard deviation per rank
rank_summary = port_d_return_t.groupby('rank')['ret'].describe()
rank_summary.loc[:, ['count', 'mean', 'std']].reset_index()

Unnamed: 0,rank,count,mean,std
0,1.0,11350.0,0.000383,0.012621
1,2.0,11350.0,0.000471,0.01143
2,3.0,11350.0,0.000508,0.010905
3,4.0,11350.0,0.000496,0.010651
4,5.0,11350.0,0.000562,0.010832
5,6.0,11350.0,0.00055,0.010906
6,7.0,11350.0,0.000573,0.011206
7,8.0,11350.0,0.000602,0.011754
8,9.0,11350.0,0.00069,0.013052
9,10.0,11350.0,0.000824,0.014787


In [30]:
# Reshape to one row per trading day and one column per rank portfolio,
# then prefix every column name with 'P'
port_d_return_t = (
    port_d_return
    .pivot(index='daily_date', columns='rank', values='ret')
    .add_prefix('P')
)

In [31]:
# First rows of the wide portfolio-return table
port_d_return_t.head()

rank,P1.0,P2.0,P3.0,P4.0,P5.0,P6.0,P7.0,P8.0,P9.0,P10.0
daily_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1976-01-02,0.006738,0.011817,0.009438,0.00933,0.008639,0.00791,0.006683,0.008224,0.005939,0.002824
1976-01-05,0.016448,0.014676,0.017907,0.021792,0.022432,0.019771,0.015957,0.015352,0.017298,0.015593
1976-01-06,0.01657,0.016324,0.014518,0.010212,0.013775,0.01139,0.010069,0.012719,0.016626,0.020221
1976-01-07,0.006484,0.001816,0.007775,0.005127,0.006212,0.005975,0.005786,0.009859,0.013412,0.015671
1976-01-08,0.004569,0.010558,0.005687,0.009325,0.005191,0.007027,0.011425,0.003769,0.009764,0.006901


In [32]:
# Creating the long-short portfolio: the rank-1 column becomes 'losers', the
# rank-10 column becomes 'winners', and long_short = winners - losers
port_d_return_t = port_d_return_t.rename(columns={'P1.0': 'losers', 'P10.0': 'winners'})
port_d_return_t['long_short'] = port_d_return_t['winners'].sub(port_d_return_t['losers'])

In [33]:
# Display the portfolio returns including the long-short column
port_d_return_t

rank,losers,P2.0,P3.0,P4.0,P5.0,P6.0,P7.0,P8.0,P9.0,winners,long_short
daily_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1976-01-02,0.006738,0.011817,0.009438,0.009330,0.008639,0.007910,0.006683,0.008224,0.005939,0.002824,-0.003914
1976-01-05,0.016448,0.014676,0.017907,0.021792,0.022432,0.019771,0.015957,0.015352,0.017298,0.015593,-0.000855
1976-01-06,0.016570,0.016324,0.014518,0.010212,0.013775,0.011390,0.010069,0.012719,0.016626,0.020221,0.003652
1976-01-07,0.006484,0.001816,0.007775,0.005127,0.006212,0.005975,0.005786,0.009859,0.013412,0.015671,0.009187
1976-01-08,0.004569,0.010558,0.005687,0.009325,0.005191,0.007027,0.011425,0.003769,0.009764,0.006901,0.002332
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,0.005737,0.004170,0.003559,0.005007,0.000941,0.002003,-0.000434,0.000302,-0.001444,-0.002921,-0.008657
2020-12-28,0.006195,0.004060,-0.000397,-0.000705,-0.004408,0.002720,-0.002912,-0.004699,-0.003557,-0.015348,-0.021543
2020-12-29,-0.001550,-0.000428,-0.002402,-0.002708,-0.004079,-0.005113,-0.005941,-0.005402,-0.007445,-0.011205,-0.009655
2020-12-30,0.000291,0.001125,0.002158,0.004038,0.002928,0.005746,0.006992,0.009797,0.006870,0.016512,0.016221


In [34]:
# Count the NaN values in the long-short series
port_d_return_t['long_short'].isnull().sum()

3

In [35]:
# NOTE: NaN long-short returns are replaced with 0 here - should not be
# necessary with new file
port_d_return_t['long_short'] = port_d_return_t['long_short'].fillna(value=0)

# Re-count the remaining NaN values
port_d_return_t['long_short'].isna().sum()

0

In [36]:
# Defining the long-short return series (the 'long_short' column, indexed by
# daily_date) and displaying it
long_short_ret = port_d_return_t['long_short']
long_short_ret

daily_date
1976-01-02   -0.003914
1976-01-05   -0.000855
1976-01-06    0.003652
1976-01-07    0.009187
1976-01-08    0.002332
                ...   
2020-12-24   -0.008657
2020-12-28   -0.021543
2020-12-29   -0.009655
2020-12-30    0.016221
2020-12-31   -0.014621
Name: long_short, Length: 11353, dtype: float64

In [37]:
# Dropping the dates on which the long-short return is undefined, i.e. the
# winners or the losers portfolio has no return for that day. The dates are
# read from port_d_return_t instead of being listed by hand, so the filter
# keeps working when the input files change. Filtering by index membership
# also makes the cell safe to re-run.
# NOTE(review): this assumes the three dates that used to be hard-coded here
# ('1985-09-27', '2012-10-29', '2017-09-30') were exactly the dates with a
# missing leg - confirm the resulting length is unchanged.
both_legs_present = port_d_return_t[['winners', 'losers']].notna().all(axis=1)
valid_dates = port_d_return_t.index[both_legs_present]
long_short_ret = long_short_ret[long_short_ret.index.isin(valid_dates)]

In [38]:
# Save the long-short return series to 'lstm_results.csv', keeping the
# daily_date index as the first column
long_short_ret.to_csv('lstm_results.csv', index=True)

In [39]:
# Display the filtered long-short return series
long_short_ret

daily_date
1976-01-02   -0.003914
1976-01-05   -0.000855
1976-01-06    0.003652
1976-01-07    0.009187
1976-01-08    0.002332
                ...   
2020-12-24   -0.008657
2020-12-28   -0.021543
2020-12-29   -0.009655
2020-12-30    0.016221
2020-12-31   -0.014621
Name: long_short, Length: 11350, dtype: float64