In [1]:
# Importing packages
import pandas as pd
import numpy as np
import pickle

# Visualization

import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "dark" style globally, so every figure drawn later in the notebook uses it
sns.set_style("dark")

In [2]:
# Base location of the input CSV files: every file name below is appended to `url`.
# Option 1 (remote): uncomment the next line to read the files from the GitHub repository.
#url = 'https://raw.githubusercontent.com/MariaRosendal/Enhancing-Price-Momentum-with-RNN/main/'
# Option 2 (local, currently active): an empty prefix makes the file names relative paths.
url = ''

## 1. Importing input and output variables

In [3]:
def _read_dated_csv(file_name):
    """Read the CSV at `url + file_name` and normalise its leading date column.

    Parameters
    ----------
    file_name : str
        Name of the CSV file; it is appended to the notebook-level `url` prefix.

    Returns
    -------
    pandas.DataFrame
        The file contents with the first column renamed to "date" and converted
        with `pd.to_datetime`. All other columns are returned as read.
    """
    frame = pd.read_csv(url + file_name)
    # The first column is renamed whatever its header is in the file.
    frame = frame.rename(columns={frame.columns[0]: "date"})
    frame["date"] = pd.to_datetime(frame["date"])
    return frame


# For plotting
# NOTE: the spelling of this file name is kept exactly as in the original call.
market_unscaled = _read_dated_csv('market_unscalled.csv')

# Stock variables (one frame per variable, all loaded the same way)
input_ret = _read_dated_csv('input_ret.csv')          # Return
input_ret_cum = _read_dated_csv('input_ret_cum.csv')  # Cum. Return
input_std = _read_dated_csv('input_std.csv')          # Standard deviation
input_alpha = _read_dated_csv('input_alpha.csv')      # Alpha
input_beta = _read_dated_csv('input_beta.csv')        # Beta
input_idio = _read_dated_csv('input_idio.csv')        # Idiosyncratic momentum

In [4]:
# Checking shape: one (rows, columns) pair per stock-variable frame, in loading order
tuple(
    frame.shape
    for frame in (input_ret, input_ret_cum, input_std, input_alpha, input_beta, input_idio)
)

((1128, 3251),
 (1128, 3251),
 (1128, 3251),
 (1128, 3251),
 (1128, 3251),
 (1128, 3251))

## 2. Combining Stock variables

In [5]:
# The six per-variable frames, in the fixed order used for each cell's feature list.
feature_frames = (input_ret, input_ret_cum, input_std, input_alpha, input_beta, input_idio)

# For every column of input_ret, pair up the six frames row by row, so each cell
# becomes a 6-element list [ret, cum_ret, std, alpha, beta, idio].
stock_var_cols = [
    list(map(list, zip(*(frame[col] for frame in feature_frames))))
    for col in input_ret.columns
]

# One inner list per source column -> transpose so rows line up with the input rows again.
stock_var = pd.DataFrame(stock_var_cols).T
stock_var.columns = input_ret.columns

In [6]:
# Sanity check: (rows, columns) of the combined frame built from the six input frames
stock_var.shape

(1128, 3251)

In [7]:
# The combining loop iterates over every column of input_ret, so 'date' was also turned
# into 6-element lists; overwrite it with the plain date column from input_ret.
stock_var['date'] = input_ret['date']

In [8]:
# Stamp every row with the date of the row that follows it, i.e. move the features one
# month forward so that each date carries the previous month's values (we predict next month).
# The last row has no following date after the shift, so it is removed.
stock_var_lagged = (
    stock_var
    .assign(date=stock_var['date'].shift(-1))
    .drop(index=stock_var.index[-1:])
)

## 3. Defining train and test stock universe

In [9]:
# Load the stock universe; parse the dates and keep permno as a string
# (permno is later used as a column label to select from the stock-variable frame).
universe = pd.read_csv(url + 'universe.csv').assign(
    date=lambda raw: pd.to_datetime(raw['date']),
    permno=lambda raw: raw['permno'].astype(str),
)

# Number of distinct stocks in the universe
universe['permno'].nunique()

3250

In [10]:
# Display the lagged frame: each stock column holds one 6-element list per row
# ([ret, cum_ret, std, alpha, beta, idio]) and 'date' is the shifted date column.
stock_var_lagged

Unnamed: 0,date,10006,10014,10022,10030,10057,10073,10078,10081,10095,...,93152,93174,93179,93223,93246,93295,93312,93422,93429,93436
0,1927-02-01,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
1,1927-03-01,"[-0.013547, nan, nan, nan, nan, nan]","[0.0, nan, nan, nan, nan, nan]","[-0.07589299999999999, nan, nan, nan, nan, nan]","[0.009545, nan, nan, nan, nan, nan]","[-0.051019999999999996, nan, nan, nan, nan, nan]","[0.094595, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[-0.075, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
2,1927-04-01,"[0.06616699999999999, nan, nan, nan, nan, nan]","[0.6190479999999999, nan, nan, nan, nan, nan]","[0.033816000000000006, nan, nan, nan, nan, nan]","[0.044575, nan, nan, nan, nan, nan]","[-0.18279600000000001, nan, nan, nan, nan, nan]","[0.333333, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[0.0, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
3,1927-05-01,"[-0.028102999999999996, nan, nan, nan, nan, nan]","[-0.117647, nan, nan, nan, nan, nan]","[0.09023400000000001, nan, nan, nan, nan, nan]","[-0.013793000000000001, nan, nan, nan, nan, nan]","[0.18421099999999999, nan, nan, nan, nan, nan]","[-0.083333, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[0.013513999999999998, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
4,1927-06-01,"[-0.014669999999999999, nan, nan, nan, nan, nan]","[-0.266667, nan, nan, nan, nan, nan]","[-0.149123, nan, nan, nan, nan, nan]","[0.010601000000000001, nan, nan, nan, nan, nan]","[-0.022222, nan, nan, nan, nan, nan]","[-0.010101, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[-0.12328800000000001, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,2020-08-01,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[-0.041115, -0.4686280289781247, 0.14663832280...","[-0.182874, -0.3758319703849616, 0.14068384987...","[nan, nan, nan, nan, nan, nan]","[0.095803, 0.6864444804344647, 0.0855785371134...","[nan, nan, nan, nan, nan, nan]","[-0.024441, 0.18810090116145228, 0.12663032305...","[0.511246, -0.7205614037199345, 0.714596084941...","[nan, nan, nan, nan, nan, nan]","[0.293186, 3.4692297516423274, 0.2359078331879..."
1123,2020-09-01,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[0.099564, -0.41176308739656897, 0.15277622989...","[-0.091178, -0.5073814609355516, 0.12727856569...","[nan, nan, nan, nan, nan, nan]","[0.292381, 1.0205143265675107, 0.1123647894586...","[nan, nan, nan, nan, nan, nan]","[0.018059, 0.2416663060089175, 0.1256230906882...","[0.139535, -0.5597124286370012, 0.704536548059...","[nan, nan, nan, nan, nan, nan]","[0.325011, 5.3417413801282025, 0.2264247734714..."
1124,2020-10-01,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[0.146067, -0.2762466089124397, 0.161544505197...","[0.042345, -0.5007025743812901, 0.128256439035...","[nan, nan, nan, nan, nan, nan]","[0.20561, 1.4250685695130818, 0.11670575522437...","[nan, nan, nan, nan, nan, nan]","[0.11060899999999999, 0.2463729924578553, 0.12...","[-0.115646, -0.6253629035690885, 0.70789139148...","[nan, nan, nan, nan, nan, nan]","[0.741452, 9.344168965265428, 0.27253106787633..."
1125,2020-11-01,"[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan]",...,"[nan, nan, nan, nan, nan, nan]","[-0.16955, -0.4064537054328351, 0.167576824488...","[-0.064687, -0.5070820133798255, 0.12829874830...","[nan, nan, nan, nan, nan, nan]","[0.019265, 1.0049686579139814, 0.1079037796342...","[nan, nan, nan, nan, nan, nan]","[-0.05022, 0.17376557290856587, 0.128003494626...","[-0.30553800000000003, -0.7109208585982182, 0....","[nan, nan, nan, nan, nan, nan]","[-0.139087, 5.811416267518992, 0.2975930081236..."


### Testing scale

In [None]:
# Combining all variables
#
# For every (date, permno) pair in the universe, build one feature row made of the
# latest market variables and the latest (lagged) stock variables at or before that date.
# Each sample is kept as a (1, 9) array, so X_check ends up 3-D: (n_samples, 1, 9).

PROGRESS_EVERY = 672  # print a progress line every this many (date, permno) groups
MARKET_FEATURES = ["Mkt", "Mkt_cumret", "Mkt_std"]
STOCK_FEATURES = ['ret', 'cum_ret', 'std', 'alpha', 'beta', 'idio']

groups = universe.groupby(['date', 'permno'])
# Computed once: previously the full groupby was rebuilt for every progress print.
n_groups = len(groups)

X_check = []
counter = 0
cached_dt = None
for (dt, permno), _ in groups:
    if counter % PROGRESS_EVERY == 0:
        print(counter, '/', n_groups)
    counter += 1

    # The market row and the stock row depend only on the date, not on the stock,
    # so they are looked up once per date instead of once per (date, permno) pair.
    if cached_dt is None or dt != cached_dt:
        market = market_unscaled.loc[market_unscaled['date'] <= dt].tail(1)
        stock_row = stock_var_lagged.loc[stock_var_lagged["date"] <= dt].tail(1)
        cached_dt = dt

    # Construct features
    stock_variables = stock_row[["date", permno]]
    merged = market.merge(stock_variables, on="date")
    if merged.empty:
        # Inner join found nothing: either no data exists at or before `dt`, or the latest
        # market date and the latest stock date differ. Fail with a message that says so
        # instead of an opaque column-assignment error on the next line.
        raise ValueError(
            f"No matching market/stock row for date={dt}, permno={permno}"
        )

    # Expand the per-cell 6-element list into six named columns.
    merged[STOCK_FEATURES] = pd.DataFrame(merged[permno].tolist(), index=merged.index)
    features = merged[MARKET_FEATURES + STOCK_FEATURES].values

    X_check.append(features)

X_check = np.array(X_check)

0 / 540000
672 / 540000
1344 / 540000
2016 / 540000
2688 / 540000
3360 / 540000
4032 / 540000
4704 / 540000
5376 / 540000
6048 / 540000
6720 / 540000
7392 / 540000
8064 / 540000
8736 / 540000
9408 / 540000
10080 / 540000
10752 / 540000
11424 / 540000
12096 / 540000
12768 / 540000
13440 / 540000
14112 / 540000
14784 / 540000
15456 / 540000
16128 / 540000
16800 / 540000
17472 / 540000
18144 / 540000
18816 / 540000
19488 / 540000
20160 / 540000
20832 / 540000
21504 / 540000
22176 / 540000
22848 / 540000
23520 / 540000
24192 / 540000
24864 / 540000
25536 / 540000
26208 / 540000
26880 / 540000
27552 / 540000
28224 / 540000
28896 / 540000
29568 / 540000
30240 / 540000
30912 / 540000
31584 / 540000
32256 / 540000
32928 / 540000
33600 / 540000
34272 / 540000
34944 / 540000
35616 / 540000
36288 / 540000
36960 / 540000
37632 / 540000
38304 / 540000
38976 / 540000
39648 / 540000
40320 / 540000
40992 / 540000
41664 / 540000
42336 / 540000
43008 / 540000
43680 / 540000
44352 / 540000
45024 / 540000

In [None]:
# Save the assembled feature array X_check for the LSTM.
# The context manager guarantees the file is flushed and closed even if pickling fails;
# the previous bare open(...) left the handle open.
with open('X_check.pkl', "wb") as pkl_file:
    pickle.dump(X_check, pkl_file)

In [None]:
# Combine data
feature_names = ['Market Return', 'Market Cum. Return', 'Market Standard Deviation',
                 'Stock return', 'Stock Cum. Return', 'Stock Standard Deviation',
                 'Alpha', 'Beta', 'Idiosyncratic']

# X_check stores one (1, 9) feature row per sample, i.e. it is 3-D, and pd.DataFrame
# only accepts 2-D input. Collapse every axis except the feature axis first; an array
# that is already (n_samples, 9) passes through unchanged.
X_df = pd.DataFrame(
    np.asarray(X_check).reshape(-1, len(feature_names)),
    columns=feature_names,
)

# Pairplot to visualize the scale of the data and the relation between variables
sns.pairplot(X_df, corner=True, diag_kind='kde');