In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Base location that is prefixed to every CSV file name read below.
# An empty string means the files are read from the current working directory.
url = ''
# To load the data from GitHub instead, use:
# url = 'https://raw.githubusercontent.com/MariaRosendal/Enhancing-Price-Momentum-with-RNN/main/'

## 1. Importing input and output variables

In [None]:
def _read_dated_csv(filename):
    """Read ``url + filename`` and return it with a parsed ``date`` column.

    The first column of the file is renamed to ``"date"`` and converted with
    ``pd.to_datetime``; every other column is returned exactly as read.
    """
    frame = pd.read_csv(url + filename)
    # rename() returns a new frame, so no in-place mutation is needed.
    frame = frame.rename(columns={frame.columns[0]: "date"})
    frame["date"] = pd.to_datetime(frame["date"])
    return frame


# Market variables (train data)
input_market_train = _read_dated_csv('market_train.csv')

# Market variables (test data)
input_market_test = _read_dated_csv('market_test.csv')

# Outer merge on the date and the three market columns keeps the rows of both files.
input_market = pd.merge(input_market_train, input_market_test,
                        on=['date', 'Mkt', 'Mkt_cumret', 'Mkt_std'], how='outer')

# Stock variables (all six tables are loaded the same way)

# Return
input_ret = _read_dated_csv('input_ret.csv')

# Cum. Return
input_ret_cum = _read_dated_csv('input_ret_cum.csv')

# Standard deviation
input_std = _read_dated_csv('input_std.csv')

# Alpha
input_alpha = _read_dated_csv('input_alpha.csv')

# Beta
input_beta = _read_dated_csv('input_beta.csv')

# Idiosyncratic momentum
input_idio = _read_dated_csv('input_idio.csv')

In [None]:
# Display the shape of each stock-level input table side by side for comparison.
tuple(
    frame.shape
    for frame in (input_ret, input_ret_cum, input_std, input_alpha, input_beta, input_idio)
)

In [None]:
# Pack the six stock-level tables into one frame whose cells are
# [ret, cum_ret, std, alpha, beta, idio] lists (one list per row and column).
feature_frames = (input_ret, input_ret_cum, input_std, input_alpha, input_beta, input_idio)

stock_var_cols = [
    [list(row) for row in zip(*(frame[col_name] for frame in feature_frames))]
    for col_name in input_ret.columns
]

# Each entry of stock_var_cols holds one column, so transpose to get rows back.
stock_var = pd.DataFrame(stock_var_cols).T
stock_var.columns = input_ret.columns

In [None]:
# Display the shape of the combined stock-variable frame.
stock_var.shape

In [None]:
# The packing loop iterates over every column of input_ret, including 'date',
# so that column currently holds lists; overwrite it with the plain dates.
stock_var['date'] = input_ret['date']

In [None]:
# Lag date column one month back (i.e. move features one month forward, so we are predicting next month):
# every row keeps its features but is relabelled with the date of the following row.
next_dates = stock_var['date'].shift(-1)
stock_var_lagged = stock_var.copy()
stock_var_lagged['date'] = next_dates

# The final row has no following date after the shift, so it is removed.
stock_var_lagged = stock_var_lagged.drop(index=stock_var_lagged.index[-1:])

In [None]:
# Load the investment universe, parse its dates and store permno as a string.
universe = pd.read_csv(url + 'universe.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    permno=lambda frame: frame['permno'].astype(str),
)

# Number of distinct permno values in the universe.
universe['permno'].nunique()

In [None]:
# Define test and train universe:
#   train: 1930-12-01 <  date <= 1975-12-01
#   test:  date > 1975-12-01
universe_dates = universe['date']
in_train_window = (universe_dates > '1930-12-01') & (universe_dates <= '1975-12-01')
universe_train = universe[in_train_window]
universe_test = universe[universe_dates > '1975-12-01']

# Save as CSV
for split_frame, split_path in ((universe_train, 'universe_train.csv'),
                                (universe_test, 'universe_test.csv')):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Display the shapes of the train and test universes.
universe_train.shape, universe_test.shape

## LSTM variables

### Test variables

In [None]:
# In keras LSTM the time flows from top to bottom
# Assuming the lookback for features is 12-1M

# Column names the per-stock list cells are unpacked into, and the final feature order.
stock_feature_cols = ['ret', 'cum_ret', 'std', 'alpha', 'beta', 'idio']
feature_cols = ['Mkt', 'Mkt_cumret', 'Mkt_std'] + stock_feature_cols

# Group once and cache the number of groups; recomputing the groupby for every
# progress message would re-scan the whole universe each time.
test_groups = universe_test.groupby(['date', 'permno'])
n_test_groups = len(test_groups)

X_test = []
for counter, ((dt, permno), _data) in enumerate(test_groups):
    if counter % 672 == 0:
        print(counter, '/', n_test_groups)

    # Construct LSTM features
    # Last 12 market rows dated on or before dt.
    market = input_market.loc[input_market['date'] <= dt].tail(12)

    # Last 12 rows of this stock's feature lists dated on or before dt.
    stock_variables = stock_var_lagged.loc[
        stock_var_lagged['date'] <= dt, ['date', permno]
    ].tail(12)

    # Inner join on date, then unpack each list cell into six separate columns.
    merged = market.merge(stock_variables, on='date')
    merged[stock_feature_cols] = pd.DataFrame(merged[permno].tolist(), index=merged.index)
    features = merged[feature_cols].values

    X_test.append(features)

# Stack the per-sample matrices into one array (samples first).
X_test = np.array(X_test)

In [None]:
# Scaling variables cross-sectionally (stock variables only):
# within each consecutive block of num_permno samples, every stock feature at
# every time step is replaced by its rank mapped linearly onto [-1, 1].
idx_stock_features_begin = 3
num_permno = 500
time_steps = 12
num_stock_features = 6

rank_step = 2 / (num_permno - 1)
for block in range(len(X_test) // num_permno):
    first = block * num_permno
    # Basic slicing returns a view, so writes below modify X_test in place.
    cross_section = X_test[first:first + num_permno]
    for t in range(time_steps):
        for f in range(num_stock_features):
            feature_idx = idx_stock_features_begin + f
            # argsort of argsort gives each element's 0-based rank.
            ranks = np.argsort(np.argsort(cross_section[:, t, feature_idx]))
            cross_section[:, t, feature_idx] = -1 + rank_step * ranks
X_test

In [None]:
# Persist X_test; the context manager closes the file handle once the dump is
# done instead of leaving it open until garbage collection.
with open('X_test_scaled.pkl', "wb") as pickle_file:
    pickle.dump(X_test, pickle_file)

In [None]:
# Ensuring no NaN values: count the NaN entries in X_test (0 means none).
np.count_nonzero(np.isnan(X_test))

### Train variables

In [None]:
# Combining all variables

# Column names the per-stock list cells are unpacked into, and the final feature order.
stock_feature_cols = ['ret', 'cum_ret', 'std', 'alpha', 'beta', 'idio']
feature_cols = ['Mkt', 'Mkt_cumret', 'Mkt_std'] + stock_feature_cols

# Group once and cache the number of groups; recomputing the groupby for every
# progress message would re-scan the whole universe each time.
train_groups = universe_train.groupby(['date', 'permno'])
n_train_groups = len(train_groups)

X_train = []
for counter, ((dt, permno), _data) in enumerate(train_groups):
    if counter % 672 == 0:
        print(counter, '/', n_train_groups)

    # Construct LSTM features
    # Last 12 market rows dated on or before dt.
    market = input_market.loc[input_market['date'] <= dt].tail(12)

    # Last 12 rows of this stock's feature lists dated on or before dt.
    stock_variables = stock_var_lagged.loc[
        stock_var_lagged['date'] <= dt, ['date', permno]
    ].tail(12)

    # Inner join on date, then unpack each list cell into six separate columns.
    merged = market.merge(stock_variables, on='date')
    merged[stock_feature_cols] = pd.DataFrame(merged[permno].tolist(), index=merged.index)
    features = merged[feature_cols].values

    X_train.append(features)

# Stack the per-sample matrices into one array (samples first).
X_train = np.array(X_train)

In [None]:
# Scaling variables cross-sectionally (stock variables only):
# within each consecutive block of num_permno samples, every stock feature at
# every time step is replaced by its rank mapped linearly onto [-1, 1].
idx_stock_features_begin = 3
num_permno = 500
time_steps = 12
num_stock_features = 6

rank_step = 2 / (num_permno - 1)
for block in range(len(X_train) // num_permno):
    first = block * num_permno
    # Basic slicing returns a view, so writes below modify X_train in place.
    cross_section = X_train[first:first + num_permno]
    for t in range(time_steps):
        for f in range(num_stock_features):
            feature_idx = idx_stock_features_begin + f
            # argsort of argsort gives each element's 0-based rank.
            ranks = np.argsort(np.argsort(cross_section[:, t, feature_idx]))
            cross_section[:, t, feature_idx] = -1 + rank_step * ranks
X_train

In [None]:
# Count the NaN entries in X_train (0 means none).
np.count_nonzero(np.isnan(X_train))

In [None]:
# Display the shape of the LSTM training array.
X_train.shape

In [None]:
# Persist X_train; the context manager closes the file handle once the dump is
# done instead of leaving it open until garbage collection.
with open('X_train_scaled.pkl', "wb") as pickle_file:
    pickle.dump(X_train, pickle_file)

## MLP and RF variables

### Test variables

In [None]:
# Column names the per-stock list cells are unpacked into, and the final feature order.
stock_feature_cols = ['ret', 'cum_ret', 'std', 'alpha', 'beta', 'idio']
feature_cols = ['Mkt', 'Mkt_cumret', 'Mkt_std'] + stock_feature_cols

# Group once and cache the number of groups; recomputing the groupby for every
# progress message would re-scan the whole universe each time.
test_groups = universe_test.groupby(['date', 'permno'])
n_test_groups = len(test_groups)

X_test_vec = []
for counter, ((dt, permno), _data) in enumerate(test_groups):
    if counter % 672 == 0:
        print(counter, '/', n_test_groups)

    # Construct features
    # Most recent market row dated on or before dt.
    market = input_market.loc[input_market['date'] <= dt].tail(1)

    # Most recent row of this stock's feature lists dated on or before dt.
    stock_variables = stock_var_lagged.loc[
        stock_var_lagged['date'] <= dt, ['date', permno]
    ].tail(1)

    # Inner join on date, then unpack the list cell into six separate columns.
    merged = market.merge(stock_variables, on='date')
    merged[stock_feature_cols] = pd.DataFrame(merged[permno].tolist(), index=merged.index)
    features = merged[feature_cols].values

    X_test_vec.append(features)

# Stack the per-sample rows into one array (samples first).
X_test_vec = np.array(X_test_vec)

In [None]:
# Scaling variables cross-sectionally (stock variables only):
# within each consecutive block of num_permno samples, every stock feature is
# replaced by its rank mapped linearly onto [-1, 1].
idx_stock_features_begin = 3
num_permno = 500
time_steps = 1
num_stock_features = 6

rank_step = 2 / (num_permno - 1)
for block in range(len(X_test_vec) // num_permno):
    first = block * num_permno
    # Basic slicing returns a view, so writes below modify X_test_vec in place.
    cross_section = X_test_vec[first:first + num_permno]
    for t in range(time_steps):
        for f in range(num_stock_features):
            feature_idx = idx_stock_features_begin + f
            # argsort of argsort gives each element's 0-based rank.
            ranks = np.argsort(np.argsort(cross_section[:, t, feature_idx]))
            cross_section[:, t, feature_idx] = -1 + rank_step * ranks
X_test_vec

In [None]:
# Display the shape of the MLP/RF test array.
X_test_vec.shape

In [None]:
# Ensuring no NaN values: count the NaN entries in X_test_vec (0 means none).
np.count_nonzero(np.isnan(X_test_vec))

In [None]:
# Persist X_test_vec; the context manager closes the file handle once the dump
# is done instead of leaving it open until garbage collection.
with open('X_test_vec.pkl', "wb") as pickle_file:
    pickle.dump(X_test_vec, pickle_file)

### Train variables

In [None]:
# Combining all variables

# Column names the per-stock list cells are unpacked into, and the final feature order.
stock_feature_cols = ['ret', 'cum_ret', 'std', 'alpha', 'beta', 'idio']
feature_cols = ['Mkt', 'Mkt_cumret', 'Mkt_std'] + stock_feature_cols

# Group once and cache the number of groups; recomputing the groupby for every
# progress message would re-scan the whole universe each time.
train_groups = universe_train.groupby(['date', 'permno'])
n_train_groups = len(train_groups)

X_train_vec = []
for counter, ((dt, permno), _data) in enumerate(train_groups):
    if counter % 672 == 0:
        print(counter, '/', n_train_groups)

    # Construct features
    # Most recent market row dated on or before dt.
    market = input_market.loc[input_market['date'] <= dt].tail(1)

    # Most recent row of this stock's feature lists dated on or before dt.
    stock_variables = stock_var_lagged.loc[
        stock_var_lagged['date'] <= dt, ['date', permno]
    ].tail(1)

    # Inner join on date, then unpack the list cell into six separate columns.
    merged = market.merge(stock_variables, on='date')
    merged[stock_feature_cols] = pd.DataFrame(merged[permno].tolist(), index=merged.index)
    features = merged[feature_cols].values

    X_train_vec.append(features)

# Stack the per-sample rows into one array (samples first).
X_train_vec = np.array(X_train_vec)

In [None]:
# Scaling variables cross-sectionally (stock variables only):
# within each consecutive block of num_permno samples, every stock feature is
# replaced by its rank mapped linearly onto [-1, 1].
idx_stock_features_begin = 3
num_permno = 500
time_steps = 1
num_stock_features = 6

rank_step = 2 / (num_permno - 1)
for block in range(len(X_train_vec) // num_permno):
    first = block * num_permno
    # Basic slicing returns a view, so writes below modify X_train_vec in place.
    cross_section = X_train_vec[first:first + num_permno]
    for t in range(time_steps):
        for f in range(num_stock_features):
            feature_idx = idx_stock_features_begin + f
            # argsort of argsort gives each element's 0-based rank.
            ranks = np.argsort(np.argsort(cross_section[:, t, feature_idx]))
            cross_section[:, t, feature_idx] = -1 + rank_step * ranks
X_train_vec

In [None]:
# Count the NaN entries in X_train_vec (0 means none).
np.count_nonzero(np.isnan(X_train_vec))

In [None]:
# Display the shape of the MLP/RF training array.
X_train_vec.shape

In [None]:
# Persist X_train_vec; the context manager closes the file handle once the dump
# is done instead of leaving it open until garbage collection.
with open('X_train_vec.pkl', "wb") as pickle_file:
    pickle.dump(X_train_vec, pickle_file)