## 1. Import Relevant Packages

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

import statsmodels.api as sm

from statsmodels.regression.rolling import RollingOLS

from functools import reduce
import math

## 2. Variables created for ANN 

**Market variables** (merged on date)
* Market return (1M)
* Cumulative market return (12-1M)
* Standard deviation (12-1M)

**Stock variables** (merged on date and permno)
* Stock returns (1M) 
* Stock cumulative returns (12-1M) 
* Standard deviation of stock returns (12-1M)
* Alpha(12-1M)
* Beta (12-1M) 
* Idiosyncratic returns (12-1M)

## 3. Importing and preprocessing data

### 3.1 Importing CRSP data

In [2]:
# Load the CRSP input panel from CSV.
# OBS: Change file to "LSTM file" when momentum strategy code is discussed
crsp_m = pd.read_csv('Input.csv', low_memory=False)

In [3]:
# Convert the 'date' column to pandas datetime values
crsp_m['date'] =  pd.to_datetime(crsp_m['date'])

In [4]:
# Order the rows chronologically, then display the frame as a sanity check
crsp_m=crsp_m.sort_values(by='date')
crsp_m

Unnamed: 0,permno,date,cumret,ret
0,10006,1926-01-31,,0.032732
80206,11666,1926-01-31,,0.048110
80624,11674,1926-01-31,,0.015668
81764,11690,1926-01-31,,0.067669
83080,11703,1926-01-31,,-0.020474
...,...,...,...,...
1083039,84606,2020-12-31,-0.080691,0.077953
215907,14795,2020-12-31,0.505261,0.053377
337087,17830,2020-12-31,-0.113068,-0.002928
1126635,89540,2020-12-31,0.313579,-0.005061


In [5]:
# Sort by stock identifier and then by date, and index the panel by date
sorted_crsp = (
    crsp_m
    .sort_values(by=['permno', 'date'])
    .set_index('date')
)

In [6]:
# Preview the first 11 rows of the permno/date-sorted panel
sorted_crsp.head(11)

Unnamed: 0_level_0,permno,cumret,ret
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1926-01-31,10006,,0.032732
1926-02-28,10006,,-0.071429
1926-03-31,10006,,-0.042735
1926-04-30,10006,,-0.025907
1926-05-31,10006,,0.023936
1926-06-30,10006,,0.044156
1926-08-31,10006,,0.020202
1926-09-30,10006,,0.009901
1926-10-31,10006,,-0.014925
1926-11-30,10006,,0.04798


### 3.2 Importing Fama French data

In [7]:
# Loading Fama French 3 Factors (only include rows with factors)
# skiprows=3 skips the first three lines of the file; nrows=1134 stops reading after
# 1134 data rows -- presumably the end of the monthly block (TODO confirm against the file).
# The first column is used as the index.
ff_3f = pd.read_csv('FF_3.csv', skiprows = 3, nrows=1134, index_col = 0)

In [8]:
# Convert the factors from percent to decimal format.
# NOTE: not idempotent -- re-running this cell divides by 100 again.
ff_3f = ff_3f / 100

In [9]:
# Checking the first rows after the conversion to decimals
ff_3f.head()

Unnamed: 0,Mkt-RF,SMB,HML,RF
192607,0.0296,-0.023,-0.0287,0.0022
192608,0.0264,-0.014,0.0419,0.0025
192609,0.0036,-0.0132,0.0001,0.0023
192610,-0.0324,0.0004,0.0051,0.0032
192611,0.0253,-0.002,-0.0035,0.0031


In [10]:
# Parse the YYYYMM index labels and store them as monthly periods
ff_3f.index = pd.to_datetime(ff_3f.index, format='%Y%m').to_period('M')

## 4. Market variables

In [11]:
# Total market return = market excess return plus the risk-free rate
market = pd.DataFrame({'Mkt': ff_3f['Mkt-RF'] + ff_3f['RF']})

In [12]:
# Cumulative market return over the past 11 months, computed in log space:
# take log(1 + r), sum it over an 11-month window (all 11 months required),
# then convert the sum back to a simple return.
market_log = market.copy()
market_log['Mkt_logret'] = np.log(1 + market_log['Mkt'])
market_log['Mkt_cumlog'] = (
    market_log['Mkt_logret']
    .rolling(window=11, min_periods=11)
    .sum()
)

# Back from log return to simple return; stored on the main market frame
market['Mkt_cumret'] = np.exp(market_log['Mkt_cumlog']) - 1

In [13]:
# 11-month rolling standard deviation of the market return (all 11 observations required)
market['Mkt_std'] = market['Mkt'].rolling(11, min_periods=11).std()

In [14]:
# Lag once in order to ensure that previous months are used for prediction
# Lag once more in order to include skipping period
# (shift(2) moves every column down two rows, so row t holds the values computed at t-2)
market_lagged = market.shift(2)

In [15]:
# Include only relevant period: drop everything before 1928-12, then split into
# a training sample (through 1975-12) and a test sample (after 1975-12)
market_lagged = market_lagged.loc[market_lagged.index >= '1928-12']
market_train = market_lagged.loc[market_lagged.index <= '1975-12']
market_test = market_lagged.loc[market_lagged.index > '1975-12']

In [16]:
# Importing StandardScaler to standardise features (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler

In [17]:
# Define the scaler (StandardScaler: zero mean, unit variance -- not a -1 to 1 range)
scaler = StandardScaler()

In [18]:
# Fit scaler to training values only; the test sample is not used for fitting
scaler = scaler.fit(market_train)

In [19]:
# Scale train data (wrapping the transformed array gives default integer row/column labels)
market_train_scaled = pd.DataFrame(scaler.transform(market_train))

In [20]:
# Scale test data with the scaler that was fitted on the training sample
market_test_scaled =  pd.DataFrame(scaler.transform(market_test))

In [21]:
# Summary statistics of the scaled test features
market_test_scaled.describe()

Unnamed: 0,0,1,2
count,540.0,540.0,540.0
mean,0.038833,0.100492,-0.273513
std,0.724474,0.675525,0.47513
min,-3.825014,-2.248445,-1.187975
25%,-0.38027,-0.243858,-0.656022
50%,0.096795,0.152802,-0.300221
75%,0.505708,0.504707,0.046594
max,2.098917,2.651119,1.238474


In [22]:
# Give the scaled frames their feature names back and restore the monthly index
# of the unscaled frame each one was computed from
for scaled_frame, unscaled_frame in (
    (market_train_scaled, market_train),
    (market_test_scaled, market_test),
):
    scaled_frame.columns = ['Mkt', 'Mkt_cumret', 'Mkt_std']
    scaled_frame.index = unscaled_frame.index

In [23]:
# Inspect the scaled training features
market_train_scaled

Unnamed: 0,Mkt,Mkt_cumret,Mkt_std
1928-12,0.154744,0.722116,-0.371499
1929-01,1.860588,1.251616,-0.059420
1929-02,-0.060731,1.302944,-0.075847
1929-03,0.686902,1.694581,-0.142417
1929-04,-0.126026,1.157308,-0.216312
...,...,...,...
1975-08,0.726080,0.823018,1.068768
1975-09,-1.126679,0.984572,0.974928
1975-10,-0.516166,1.538866,0.650214
1975-11,-0.738171,0.439998,0.315588


**Testing for stationarity**

In [83]:
# Augmented Dickey-Fuller tests on the lagged market series.
# A p-value <= 0.05 rejects the unit-root null hypothesis, i.e. indicates stationarity.
from statsmodels.tsa.stattools import adfuller

for heading, column in (
    ("Stationarity of market returns:", 'Mkt'),
    ("Stationarity of market cum. returns:", 'Mkt_cumret'),
):
    print(heading)
    result = adfuller(market_lagged[column])
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    # result[4] maps each significance level to its critical value
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

Stationarity of market returns:
ADF Statistic: -8.348412
p-value: 0.000000
Critical Values:
	1%: -3.436
	5%: -2.864
	10%: -2.568
Stationarity of market cum. returns:
ADF Statistic: -5.801281
p-value: 0.000000
Critical Values:
	1%: -3.436
	5%: -2.864
	10%: -2.568


In [84]:
# Write the scaled train/test market features to CSV, plus the unscaled (lagged)
# market frame, which is kept for plotting
for frame, csv_name in (
    (market_train_scaled, 'market_train.csv'),
    (market_test_scaled, 'market_test.csv'),
    (market_lagged, 'market_unscalled.csv'),
):
    frame.to_csv(csv_name, index=True)

## 3. Stock variables 

### 3.1 Return

In [26]:
# Make into pivot format: one row per date, one column per permno
# (values is passed as a list, so the columns carry an outer 'ret' level)
sorted_crsp_p = sorted_crsp.pivot(columns='permno', values=['ret'])

In [27]:
# Select the 'ret' level to get a plain date x permno frame of returns
sorted_ret = sorted_crsp_p['ret']

In [28]:
# Ensure that there is no N/A values in between returns.
# A cell is "interior" when the stock has a non-missing return at or before that date
# AND at or after it; only those cells are overwritten, so the leading and trailing
# NaN runs (before listing / after delisting) stay missing.
# The original ran this statement twice; the second run assigned the same values again,
# so it is executed once here.
interior_cells = sorted_ret.bfill().notnull() & sorted_ret.ffill().notnull()
sorted_ret[interior_cells] = sorted_ret.fillna(0)

In [29]:
# Keep a copy of the wide return frame as it is before the index conversion below
m_sorted_ret = sorted_ret.copy()

# Convert the date index to monthly periods, keep 1927-01 onwards, and lag by one month.
# NOTE: the index is replaced in place on the existing frame before sorted_ret is rebound
# to the filtered frame.
sorted_ret.index = sorted_ret.index.to_period('M')
sorted_ret = sorted_ret[sorted_ret.index >= '1927-01']
sorted_ret_lagged = sorted_ret.shift(1)

In [30]:
# Preview the lagged returns (the first row is NaN because of the one-row shift)
sorted_ret_lagged.head()

permno,10006,10014,10022,10030,10057,10073,10078,10081,10095,10102,...,93152,93174,93179,93223,93246,93295,93312,93422,93429,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1927-01,,,,,,,,,,,...,,,,,,,,,,
1927-02,-0.013547,0.0,-0.075893,0.009545,-0.05102,0.094595,,-0.075,,-0.017986,...,,,,,,,,,,
1927-03,0.066167,0.619048,0.033816,0.044575,-0.182796,0.333333,,0.0,,0.084249,...,,,,,,,,,,
1927-04,-0.028103,-0.117647,0.090234,-0.013793,0.184211,-0.083333,,0.013514,,0.028716,...,,,,,,,,,,
1927-05,-0.01467,-0.266667,-0.149123,0.010601,-0.022222,-0.010101,,-0.123288,,0.039735,...,,,,,,,,,,


In [31]:
# Write the one-month-lagged returns to CSV, keeping the monthly index
sorted_ret_lagged.to_csv('input_ret.csv', index=True)

### 3.2 Beta

In [32]:
# Defining beginning period for FF: keep 1927-01 onwards,
# the same start month used for sorted_ret
ff_3f = ff_3f[ff_3f.index >= '1927-01']

In [33]:
# Calculating excess return (output variable for CAPM regression)
# Subtract the risk-free rate from every stock column in one vectorised operation.
# axis=0 aligns the RF series with the frame's monthly row index, which is what the
# previous per-stock subtraction did, without growing the frame one column at a time.
excess_return = sorted_ret.sub(ff_3f['RF'], axis=0)

# The column-by-column build produced an unnamed column axis; keep it unnamed.
# rename_axis returns a new frame, so sorted_ret's own column axis is left untouched.
excess_return = excess_return.rename_axis(columns=None)

In [34]:
# Inspect the excess returns (date x permno)
excess_return

Unnamed: 0_level_0,10006,10014,10022,10030,10057,10073,10078,10081,10095,10102,...,93152,93174,93179,93223,93246,93295,93312,93422,93429,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1927-01,-0.016047,-0.002500,-0.078393,0.007045,-0.053520,0.092095,,-0.077500,,-0.020486,...,,,,,,,,,,
1927-02,0.063567,0.616448,0.031216,0.041975,-0.185396,0.330733,,-0.002600,,0.081649,...,,,,,,,,,,
1927-03,-0.031103,-0.120647,0.087234,-0.016793,0.181211,-0.086333,,0.010514,,0.025716,...,,,,,,,,,,
1927-04,-0.017170,-0.269167,-0.151623,0.008101,-0.024722,-0.012601,,-0.125788,,0.037235,...,,,,,,,,,,
1927-05,0.075164,-0.093909,0.009887,0.073923,0.178818,-0.043816,,-0.284250,,0.047955,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08,,,,,,,,,,,...,,0.145967,0.042245,,0.205510,,0.110509,-0.115746,,0.741352
2020-09,,,,,,,,,,,...,,-0.169650,-0.064787,,0.019165,,-0.050320,-0.305638,,-0.139187
2020-10,,,,,,,,,,,...,,0.011011,0.118510,,0.085161,,-0.021580,-0.003202,,-0.095599
2020-11,,,,,,,,,,,...,,0.239598,0.128335,,0.025834,,0.165553,0.788789,,0.462636


In [35]:
# Estimating beta: rolling 11-month regression of each stock's excess return
# on a constant and the market excess return.
CAPM_dict = {}

# The design matrix (constant + market factor) is the same for every stock,
# so it is built once here instead of once per loop iteration.
X = sm.add_constant(ff_3f['Mkt-RF'])

for stock in excess_return.columns:
    y = excess_return[stock]
    # min_nobs equals the window length; per the original note this is meant to
    # exclude windows that contain NaN values.
    rols = RollingOLS(y, X, window=11, min_nobs=11)
    rres = rols.fit()

    # Rolling parameter estimates ('const', 'Mkt-RF') for this stock
    CAPM_dict[stock] = rres.params

In [36]:
# Creating dataframe for beta: one column per stock holding its rolling 'Mkt-RF' slope.
# All columns are collected first and the frame is built in a single step, instead of
# inserting thousands of columns one at a time into an initially empty frame.
beta = pd.DataFrame({
    stock: CAPM_dict[stock]['Mkt-RF']
    for stock in excess_return.columns
})

In [37]:
# Lag beta once to get one month skipping period
# (shift(1) moves every column down one row)
beta_lagged = beta.shift(1)

In [38]:
# Save the lagged betas as csv file, keeping the index
beta_lagged.to_csv('input_beta.csv', index=True)

### 3.3 Idiosyncratic Momentum (Error Term)

In [39]:
# Estimating alpha and beta using Fama/French three-factor model
# NOTE: this reassigns CAPM_dict, replacing the single-factor estimates stored under
# the same name by the beta section; from here on it holds the three-factor fits.
CAPM_dict = {}

# The design matrix (constant + three factors) is the same for every stock,
# so it is built once here instead of once per loop iteration.
X = sm.add_constant(ff_3f[['Mkt-RF', 'SMB', 'HML']])

for stock in excess_return.columns:
    y = excess_return[stock]
    # Alternative: set window to 36 and min_nobs to 12
    rols = RollingOLS(y, X, window=11, min_nobs=11)
    rres = rols.fit()

    # Rolling parameter estimates ('const', 'Mkt-RF', 'SMB', 'HML') for this stock
    CAPM_dict[stock] = rres.params

In [40]:
# Calculating and creating DataFrame for idiosyncratic mom / error term (OBS Blitz uses 36 month - discuss)
# For each stock and month: excess return minus the three-factor fit, where the fit uses
# the rolling coefficients stored on the same row (same month) as the return.
# All columns are collected first and the frame is built in a single step, instead of
# inserting thousands of columns one at a time into an initially empty frame.
error_df = pd.DataFrame({
    stock: (
        excess_return[stock]
        - CAPM_dict[stock]['const']
        - ff_3f['Mkt-RF'] * CAPM_dict[stock]['Mkt-RF']
        - ff_3f['SMB'] * CAPM_dict[stock]['SMB']
        - ff_3f['HML'] * CAPM_dict[stock]['HML']
    )
    for stock in excess_return.columns
})

In [41]:
# Lag error once to get one-month skipping period
# (shift(1) moves every column down one row)
error_lagged = error_df.shift(1)

In [42]:
# Save the lagged error terms as csv file, keeping the index
error_lagged.to_csv('input_idio.csv', index=True)

### 3.4 Alpha

In [43]:
# Creating alpha dataframe: one column per stock holding its rolling intercept ('const').
# CAPM_dict was reassigned in the idiosyncratic-momentum section, so when the notebook is
# run top to bottom this intercept comes from the three-factor regression.
# All columns are collected first and the frame is built in a single step, instead of
# inserting thousands of columns one at a time into an initially empty frame.
alpha_df = pd.DataFrame({
    stock: CAPM_dict[stock]['const']
    for stock in excess_return.columns
})

In [44]:
# Lag alpha once to get one-month skipping period
# (shift(1) moves every column down one row)
alpha_lagged = alpha_df.shift(1)

In [45]:
# Save the lagged alphas as csv file, keeping the index
alpha_lagged.to_csv('input_alpha.csv', index=True)

### 3.5 Standard deviation

In [46]:
# Calculating 12-1M rolling stddev: 11-month rolling standard deviation of each
# stock's return, requiring all 11 observations
std_df = sorted_ret.rolling(11, min_periods=11).std()
# Preview the first 11 rows
std_df.head(11)

permno,10006,10014,10022,10030,10057,10073,10078,10081,10095,10102,...,93152,93174,93179,93223,93246,93295,93312,93422,93429,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1927-01,,,,,,,,,,,...,,,,,,,,,,
1927-02,,,,,,,,,,,...,,,,,,,,,,
1927-03,,,,,,,,,,,...,,,,,,,,,,
1927-04,,,,,,,,,,,...,,,,,,,,,,
1927-05,,,,,,,,,,,...,,,,,,,,,,
1927-06,,,,,,,,,,,...,,,,,,,,,,
1927-07,,,,,,,,,,,...,,,,,,,,,,
1927-08,,,,,,,,,,,...,,,,,,,,,,
1927-09,,,,,,,,,,,...,,,,,,,,,,
1927-10,,,,,,,,,,,...,,,,,,,,,,


In [47]:
# Lag std once to get one month skipping period
# (shift(1) moves every column down one row)
std_lagged = std_df.shift(1)

In [48]:
# Lagged standard deviation to CSV, keeping the index
std_lagged.to_csv('input_std.csv', index=True)

### 3.6 Cumulative return

In [49]:
# Calculating log return, log(1 + r), for every stock and month
# NOTE(review): the stored output shows a warning pointing at this line -- presumably
# caused by returns <= -1, for which the log is -inf or undefined; verify how those
# cells should be treated before they enter the rolling sum.
log_df = np.log(1+sorted_ret)
log_df

  log_df = np.log(1+sorted_ret)


permno,10006,10014,10022,10030,10057,10073,10078,10081,10095,10102,...,93152,93174,93179,93223,93246,93295,93312,93422,93429,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1927-01,-0.013640,0.000000,-0.078927,0.009500,-0.052368,0.090384,,-0.077962,,-0.018150,...,,,,,,,,,,
1927-02,0.064070,0.481838,0.033257,0.043610,-0.201867,0.287682,,0.000000,,0.080888,...,,,,,,,,,,
1927-03,-0.028505,-0.125163,0.086392,-0.013889,0.169077,-0.087011,,0.013424,,0.028311,...,,,,,,,,,,
1927-04,-0.014779,-0.310155,-0.161488,0.010545,-0.022473,-0.010152,,-0.131577,,0.038966,...,,,,,,,,,,
1927-05,0.075260,-0.095310,0.012805,0.074108,0.167054,-0.041672,,-0.330242,,0.049699,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08,,,,,,,,,,,...,,0.136336,0.041473,,0.186986,,0.104909,-0.122898,,0.554719
2020-09,,,,,,,,,,,...,,-0.185788,-0.066874,,0.019082,,-0.051525,-0.364618,,-0.149762
2020-10,,,,,,,,,,,...,,0.011050,0.112087,,0.081821,,-0.021714,-0.003107,,-0.100372
2020-11,,,,,,,,,,,...,,0.214868,0.120832,,0.025603,,0.153281,0.581595,,0.380309


In [50]:
# Cumulative return: sum the log returns over an 11-month window (all 11 months
# required), then convert the summed log return back to a simple return
cumret_log = (
    log_df
    .rolling(window=11, min_periods=11)
    .sum()
)
cumret = np.exp(cumret_log) - 1

In [51]:
# Lag cum.return once to get one-month skipping period
# (shift(1) moves every column down one row)
cumret_lagged = cumret.shift(1)

In [52]:
# Lagged cumulative return to CSV file, keeping the index
cumret_lagged.to_csv('input_ret_cum.csv', index=True)

## 4. Output variable

### 4a. Regression output variable 

In [47]:
# Preprocessing output variable: read the holding file, keep only the identifier,
# date and return columns, parse the dates, and index the result by date
y_file = pd.read_csv('holding_df.csv', low_memory=False)[['permno', 'date', 'ret']]
y_file['date'] = pd.to_datetime(y_file['date'])
y_reg = y_file.set_index('date')

In [48]:
# Convert the date index to monthly periods (a relabelling; no rows are aggregated)
y_reg.index = y_reg.index.to_period('M')
# Keep observations after 1930-12
y_reg = y_reg[y_reg.index > '1930-12']

In [49]:
# Check the number of permnos (nunique reports the distinct-value count per column)
y_reg.nunique()

permno      3250
ret       163644
dtype: int64

In [50]:
# Inspect the filtered target frame
y_reg

Unnamed: 0_level_0,permno,ret
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1931-01,12562,0.016892
1931-01,13071,0.176471
1931-01,14007,0.178571
1931-01,17021,0.187500
1931-01,15050,0.091743
...,...,...
2020-12,13356,0.154506
2020-12,38703,0.103473
2020-12,19561,0.015899
2020-12,91103,-0.039956


In [51]:
# Sort values by 'date' (the index) and then by permno
y_reg = y_reg.sort_values(['date','permno'])

In [52]:
# Drop the identifier column in place; from here on a row is identified only by its
# position within the (date, permno) ordering applied by the preceding sort.
y_reg.drop('permno', axis=1, inplace=True)

In [53]:
# Inspect the sorted target frame (returns only)
y_reg

Unnamed: 0_level_0,ret
date,Unnamed: 1_level_1
1931-01,0.136364
1931-01,0.203448
1931-01,0.037879
1931-01,0.251786
1931-01,0.300000
...,...
2020-12,-0.037881
2020-12,0.205307
2020-12,0.054777
2020-12,0.056031


In [54]:
# Merge delisting return into missing_dlret dataframe
y_reg = pd.merge(y_reg, ff_3f['RF'], left_on=y_reg.index, right_on=ff_3f.index, how='left')

In [55]:
# Rename the 'key_0' column left by the merge to 'date'
y_reg.rename(columns={'key_0': 'date'},inplace=True)

In [56]:
# Use the 'date' column as the index again
y_reg = y_reg.set_index('date')

In [57]:
# Excess return = raw return minus the risk-free rate
y_reg['excess_returns'] = y_reg['ret'] - y_reg['RF']

In [58]:
# Drop the raw return and risk-free columns in place, leaving only 'excess_returns'
y_reg.drop(['ret','RF'], axis=1, inplace=True)

In [59]:
# Inspect the final target frame
y_reg

Unnamed: 0_level_0,excess_returns
date,Unnamed: 1_level_1
1931-01,0.134864
1931-01,0.201948
1931-01,0.036379
1931-01,0.250286
1931-01,0.298500
...,...
2020-12,-0.037981
2020-12,0.205207
2020-12,0.054677
2020-12,0.055931


In [60]:
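# NOTE: disabled percent-scaling step, kept for reference only; y_reg no longer has a
# 'ret' column at this point, so it cannot run as written.
#y_reg = y_reg.ret * 100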

In [61]:
# Output (target) CSV file: write the excess-return target together with its date index
y_reg.to_csv('y_reg.csv', index=True)