In [40]:
import pandas as pd
import numpy as np
from pandas.tseries.offsets import *

from scipy import stats
import datetime as dt

import matplotlib.pyplot as plt
import statsmodels.api as sm

import pickle

In [41]:
# Loading the model predictions stored in rf_predict.pkl.
# The context manager closes the file handle as soon as the load finishes
# (the bare open() call left it open until garbage collection).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('rf_predict.pkl', 'rb') as pred_file:
    y_pred = pickle.load(pred_file)

In [42]:
# Check how many predictions were loaded
y_pred.shape

(270000,)

In [43]:
# Read the targets, index them by parsed date, and keep the observations
# dated after 1975-12-01 as a flat array.
y = pd.read_csv('y_reg.csv')
y = y.assign(date=pd.to_datetime(y['date'])).set_index(['date'])
after_cutoff = y.index > '1975-12-01'
y_test = y.loc[after_cutoff].values.ravel()

In [44]:
# Testing R2, computed as 1 - sum((y - y_hat)^2) / sum(y^2)
# (the denominator uses raw squared targets, not deviations from their mean).
r2_df = pd.DataFrame({'y': y_test})
r2_df['y_hat'] = y_pred
r2_df['difference'] = r2_df['y'].sub(r2_df['y_hat'])
r2_df['y_2'] = r2_df['y'].pow(2)
r2_df['dif2'] = r2_df['difference'].pow(2)
y_2, dif2 = r2_df['y_2'].sum(), r2_df['dif2'].sum()
print(1 - (dif2 / y_2))

0.006810558037537384


In [45]:
# Number of predictions per part when the sample is split into 540 equal parts
# NOTE(review): 540 is presumably the number of monthly dates in the test period - confirm
len(y_pred)//540

500

In [46]:
# Predictions and targets are paired by position, so their lengths must match
y_pred.shape, y_test.shape

((270000,), (270000,))

In [48]:
# Spearman rank correlation between predictions and targets, computed over
# consecutive, non-overlapping slices of 500 observations each.
spear = [
    stats.spearmanr(y_pred[start:start + 500], y_test[start:start + 500])[0]
    for start in range(0, (len(y_pred) // 500) * 500, 500)
]
spear

[0.15612952316111467,
 0.0343830528074932,
 -0.01871437728875601,
 -0.08492143331278248,
 -0.08211869878406963,
 0.042434788660929194,
 -0.22941859307298995,
 -0.22269399011073826,
 -0.12497823467189043,
 -0.1471459976602807,
 -0.0042143733078716826,
 0.16002220113995963,
 -0.07156904672813746,
 0.01695381996641927,
 0.12478887323945895,
 0.012293248881414273,
 -0.12803766717382728,
 0.1593326945939534,
 -0.15940397500805137,
 -0.021916429097073746,
 -0.025299600789382624,
 -0.1378950796436596,
 0.2906835109883697,
 0.004140603490125168,
 -0.16050060471291916,
 -0.08077321518736554,
 0.199076674206991,
 0.30121996086216646,
 0.2038199092338744,
 -0.07455158556235775,
 0.2957802663254758,
 0.09452771841112001,
 -0.21180121982582606,
 -0.4727580129569642,
 0.1542746442493618,
 0.18407762978688189,
 -0.04388951440052289,
 -0.17123960909178837,
 0.42456285036656827,
 0.08494564132891597,
 -0.18388842624638443,
 0.14124164780922527,
 0.0169090974939045,
 0.23905728188574157,
 0.251412805109

In [49]:
# Mean and sample standard deviation (ddof=1) of the Spearman correlations
np.mean(spear),np.std(spear,ddof=1)

(0.0006993396865526645, 0.19126976670851922)

In [50]:
# One-sample t-test of the Spearman correlations against a mean of zero
stats.ttest_1samp(spear, 0.0)

Ttest_1sampResult(statistic=0.08496473873431286, pvalue=0.9323209726693938)

In [51]:
# Loading stock universe for the test period
# (the info() output further down lists `date` and `permno` as its own columns)
universe_test = pd.read_csv('universe_test.csv')

In [52]:
# Insert predictions (scaled by 100) as the second column.
# DataFrame.insert raises ValueError when the column already exists, so any
# earlier copy is dropped first; this keeps the cell safe to re-run on a
# live kernel.
if "y_hat" in universe_test.columns:
    universe_test = universe_test.drop(columns="y_hat")
universe_test.insert(1, "y_hat", y_pred*100)

In [53]:
# Work on a copy so the ranking steps below do not modify universe_test
y_rank = universe_test.copy()

In [54]:
# Column dtypes and non-null counts
y_rank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270000 entries, 0 to 269999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    270000 non-null  object 
 1   y_hat   270000 non-null  float64
 2   permno  270000 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 6.2+ MB


In [55]:
# Preview the frame before ranking
y_rank

Unnamed: 0,date,y_hat,permno
0,1976-01-01,0.894265,10137
1,1976-01-01,0.873039,10145
2,1976-01-01,0.876159,10161
3,1976-01-01,0.860705,10225
4,1976-01-01,0.905740,10233
...,...,...,...
269995,2020-12-01,0.815737,93096
269996,2020-12-01,0.850236,93132
269997,2020-12-01,0.864399,93246
269998,2020-12-01,0.851359,93312


In [56]:
# Number of distinct predicted values
y_rank['y_hat'].nunique()

241340

In [57]:
# Rows and columns of the frame
y_rank.shape

(270000, 3)

In [58]:
# For each date, ranking stocks into 1-10 portfolios based on y_hat
# 1 = lowest y_hat ("loser"), 10 = highest y_hat ("winner") portfolio

# Sort by date, then by y_hat descending within each date. This replaces
# groupby('date').apply(sort_values): applying on the grouping column is
# deprecated in recent pandas and, once removed, would drop `date` from the
# result and break the groupby below. The direct sort gives the same ordering
# and is faster.
y_rank = (
    y_rank
    .sort_values(['date', 'y_hat'], ascending=[True, False])
    .reset_index(drop=True)
)
y_rank['rank'] = y_rank.groupby('date')['y_hat'].transform(lambda x: pd.qcut(x, 10, labels=False))

# For visualization: Shift rank from 0-9 to 1-10
y_rank['rank'] = 1 + y_rank['rank']

In [59]:
# Average predicted value (y_hat) for each rank portfolio
y_rank.groupby('rank')['y_hat'].mean()

rank
1.0     0.825018
2.0     0.830935
3.0     0.836620
4.0     0.842721
5.0     0.848671
6.0     0.854005
7.0     0.858888
8.0     0.864021
9.0     0.870870
10.0    0.885994
Name: y_hat, dtype: float64

### 2. Merging daily data with RF rank

In [60]:
# Loading daily returns and parsing the daily date column to datetime
# NOTE(review): the preview of this frame shows non-numeric entries such as 'C'
# in `ret`, so that column is presumably read as object dtype - confirm, and
# consider passing an explicit dtype to read_csv.
d_ret1 = pd.read_csv('daily_1931.csv')
d_ret1['daily_date'] = pd.to_datetime(d_ret1.daily_date)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [61]:
# Preview the daily returns
d_ret1

Unnamed: 0,permno,daily_date,siccd,dlret,ret,date_merge
0,10000,1986-01-07,3990,,C,1986/01
1,10000,1986-01-08,3990,,-0.024390,1986/01
2,10000,1986-01-09,3990,,0.000000,1986/01
3,10000,1986-01-10,3990,,0.000000,1986/01
4,10000,1986-01-13,3990,,0.050000,1986/01
...,...,...,...,...,...,...
75696491,93436,2020-12-24,9999,,0.024444,2020/12
75696492,93436,2020-12-28,9999,,0.002901,2020/12
75696493,93436,2020-12-29,9999,,0.003465,2020/12
75696494,93436,2020-12-30,9999,,0.043229,2020/12


In [62]:
# Preparing the date column in y_rank for the merge with the daily data

# Rename the date column to monthly_date
y_rank = y_rank.rename(columns={'date': 'monthly_date'})

# Convert to datetime format
y_rank['monthly_date'] = pd.to_datetime(y_rank['monthly_date'])

# Year/month string used as the merge key
y_rank['date_merge'] = y_rank['monthly_date'].dt.strftime("%Y/%m")

In [63]:
# Attach each stock's monthly rank to its daily rows; the left join keeps
# every daily row, including those without a rank for that month.
daily = d_ret1.merge(y_rank, how='left', on=['date_merge', 'permno'])

In [64]:
# Looking at the first rows of the merged daily data
daily.head()

Unnamed: 0,permno,daily_date,siccd,dlret,ret,date_merge,monthly_date,y_hat,rank
0,10000,1986-01-07,3990,,C,1986/01,NaT,,
1,10000,1986-01-08,3990,,-0.024390,1986/01,NaT,,
2,10000,1986-01-09,3990,,0.000000,1986/01,NaT,,
3,10000,1986-01-10,3990,,0.000000,1986/01,NaT,,
4,10000,1986-01-13,3990,,0.050000,1986/01,NaT,,


In [65]:
# Keep only the daily rows that received a rank in the merge
has_rank = daily['rank'].notna()
daily_filtered = daily.loc[has_rank]

In [66]:
# Ensure returns are number format.
# assign() returns a new frame, so the cast no longer writes into a filtered
# slice of `daily`; that write is what triggered SettingWithCopyWarning.
# NOTE(review): astype(float) raises ValueError on non-numeric entries such as
# the 'C' visible in the raw daily file - confirm none survive the rank filter.
daily_filtered = daily_filtered.assign(ret=daily_filtered['ret'].astype(float))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_filtered['ret'] = daily_filtered['ret'].astype(float)


In [67]:
# Creating the dataframe used for the portfolio calculation
portfolio_columns = ['daily_date', 'ret', 'rank']
port_d_return = pd.DataFrame(daily_filtered.loc[:, portfolio_columns])

In [68]:
# Portfolio return = mean of `ret` across the rows sharing a (date, rank) pair
ret_by_day_and_rank = port_d_return.groupby(['daily_date', 'rank'])['ret']
port_d_return = ret_by_day_and_rank.mean().reset_index()
port_d_return_t = port_d_return.copy()

In [69]:
# Check portfolio return: observation count, mean and standard deviation per rank
port_d_return_t.groupby('rank')['ret'].describe()[['count','mean','std']].reset_index()

Unnamed: 0,rank,count,mean,std
0,1.0,11350.0,0.000526,0.008651
1,2.0,11350.0,0.000526,0.009192
2,3.0,11350.0,0.000556,0.009749
3,4.0,11350.0,0.000534,0.010122
4,5.0,11350.0,0.000569,0.010946
5,6.0,11350.0,0.000502,0.01163
6,7.0,11350.0,0.000537,0.012528
7,8.0,11350.0,0.000546,0.013255
8,9.0,11350.0,0.000629,0.014421
9,10.0,11350.0,0.000732,0.016805


In [70]:
# Reshape to wide layout: one row per day, one column per rank portfolio,
# with each column label prefixed by 'P'.
port_d_return_t = (
    port_d_return
    .pivot(index='daily_date', columns='rank', values='ret')
    .add_prefix('P')
)

In [71]:
# First rows of the wide layout (one column per rank portfolio)
port_d_return_t.head()

rank,P1.0,P2.0,P3.0,P4.0,P5.0,P6.0,P7.0,P8.0,P9.0,P10.0
daily_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1976-01-02,0.009194,0.008036,0.008553,0.012283,0.006042,0.005962,0.006258,0.004939,0.009485,0.006791
1976-01-05,0.015757,0.019665,0.017873,0.020653,0.016106,0.01797,0.014935,0.016929,0.02232,0.015016
1976-01-06,0.010377,0.008208,0.017792,0.01197,0.01241,0.012107,0.016315,0.019384,0.018626,0.015234
1976-01-07,0.005772,0.006967,0.009959,0.004195,0.007162,0.002493,0.011734,0.007262,0.005204,0.017368
1976-01-08,0.005214,0.005818,0.01199,0.007059,0.002315,0.008296,0.003843,0.013522,0.00859,0.00757


In [72]:
# Label the two extreme portfolios and build the long-short return
# (winners minus losers).
decile_labels = {'P1.0': 'losers', 'P10.0': 'winners'}
port_d_return_t = port_d_return_t.rename(columns=decile_labels)
port_d_return_t['long_short'] = port_d_return_t['winners'].sub(port_d_return_t['losers'])

In [73]:
# Inspect the portfolio returns including the new long_short column
port_d_return_t

rank,losers,P2.0,P3.0,P4.0,P5.0,P6.0,P7.0,P8.0,P9.0,winners,long_short
daily_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1976-01-02,0.009194,0.008036,0.008553,0.012283,0.006042,0.005962,0.006258,0.004939,0.009485,0.006791,-0.002403
1976-01-05,0.015757,0.019665,0.017873,0.020653,0.016106,0.017970,0.014935,0.016929,0.022320,0.015016,-0.000741
1976-01-06,0.010377,0.008208,0.017792,0.011970,0.012410,0.012107,0.016315,0.019384,0.018626,0.015234,0.004857
1976-01-07,0.005772,0.006967,0.009959,0.004195,0.007162,0.002493,0.011734,0.007262,0.005204,0.017368,0.011597
1976-01-08,0.005214,0.005818,0.011990,0.007059,0.002315,0.008296,0.003843,0.013522,0.008590,0.007570,0.002356
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,0.005833,0.006209,0.005116,0.003318,0.000159,-0.000155,0.000023,0.000372,0.000522,-0.004479,-0.010312
2020-12-28,0.006089,0.004279,0.003899,0.000508,0.004322,0.000634,-0.000922,-0.007501,-0.007460,-0.022898,-0.028987
2020-12-29,-0.002321,0.000350,-0.004070,-0.003975,-0.002354,-0.004957,-0.006822,-0.006263,-0.006034,-0.009826,-0.007505
2020-12-30,0.001585,0.002230,0.001904,0.003837,0.005165,0.003976,0.007589,0.006491,0.008334,0.015346,0.013761


In [74]:
# Count NaN values in the long-short series (this only reports them; nothing is removed here)
port_d_return_t['long_short'].isnull().sum()

3

In [75]:
# NOTE: replaces NaN long-short returns with 0 - should not be necessary with new file.
# The second line re-counts the NaN values to confirm none remain.
port_d_return_t['long_short'] = port_d_return_t['long_short'].fillna(0)
port_d_return_t['long_short'].isnull().sum()

0

In [76]:
# Defining long_short return as a standalone Series (indexed by daily_date) and displaying it
long_short_ret = port_d_return_t['long_short']
long_short_ret

daily_date
1976-01-02   -0.002403
1976-01-05   -0.000741
1976-01-06    0.004857
1976-01-07    0.011597
1976-01-08    0.002356
                ...   
2020-12-24   -0.010312
2020-12-28   -0.028987
2020-12-29   -0.007505
2020-12-30    0.013761
2020-12-31   -0.017539
Name: long_short, Length: 11353, dtype: float64

In [77]:
# Dropping NA values.
# Derive the days to drop from the data instead of hard-coding dates: a day
# has no valid long-short return when either leg is missing. The winners and
# losers columns still carry those NaN (only long_short was filled with 0),
# so the mask tracks whichever days are missing in the input file.
has_both_legs = port_d_return_t[['winners', 'losers']].notna().all(axis=1)
# Look the mask up on the series' own index so the cell can be re-run safely.
long_short_ret = long_short_ret[has_both_legs.loc[long_short_ret.index].to_numpy()]

In [78]:
# Save the long-short return series to CSV, writing the date index as well
long_short_ret.to_csv('rf_results.csv', index=True)

In [79]:
# Final look at the series that was written to disk
long_short_ret

daily_date
1976-01-02   -0.002403
1976-01-05   -0.000741
1976-01-06    0.004857
1976-01-07    0.011597
1976-01-08    0.002356
                ...   
2020-12-24   -0.010312
2020-12-28   -0.028987
2020-12-29   -0.007505
2020-12-30    0.013761
2020-12-31   -0.017539
Name: long_short, Length: 11350, dtype: float64