In [49]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('seaborn-whitegrid')

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD
import math
from sklearn.metrics import mean_squared_error

In [50]:
# Parsing general parameters
import os
import re
# Locations of the per-CTA CSV files and of the macro/benchmark parameter files.
path_to_CTA = '/home/jupyter/CTA data/'
path_to_params = '/home/jupyter/CTA params/'
# Earliest month kept in every series loaded below.
cutoff_date = '2006-12-01'
# Only the CSV files in the CTA directory are treated as CTA data sets.
fileNames = [entry for entry in os.listdir(path_to_CTA) if entry.endswith('.csv')]

In [51]:
# Crude oil series: strip whitespace from the headers, parse DATE and index by it.
oil = (
    pd.read_csv(path_to_params + 'MCOILWTICO.csv')
    .rename(columns=str.strip)
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE'], format='%Y-%m'))
    .set_index('DATE')
)

In [52]:
# Gold series: snap every date to the first of its month, keep the first row
# per month, index by date and reverse the row order.
aux = (
    pd.read_csv(path_to_params + 'AUX-USD-2592000-20200523172420.csv', parse_dates=['Date'])
    .rename(columns=str.strip)
    .assign(Date=lambda frame: frame['Date'].apply(lambda stamp: stamp.replace(day=1)))
    .drop_duplicates(subset='Date', keep='first')
    .set_index('Date')
    .iloc[::-1]
)

In [53]:
# Silver series: snap every date to the first of its month, keep the first row
# per month, index by date and reverse the row order.
agx = (
    pd.read_csv(path_to_params + 'AGX-USD-2592000-20200523172540.csv', parse_dates=['Date'])
    .rename(columns=str.strip)
    .assign(Date=lambda frame: frame['Date'].apply(lambda stamp: stamp.replace(day=1)))
    .drop_duplicates(subset='Date', keep='first')
    .set_index('Date')
    .iloc[::-1]
)


In [54]:
# Producer Price Index series: clean the headers, keep the first row per date
# and index by date.
ppi = (
    pd.read_csv(path_to_params + 'PPIACO.csv', parse_dates=['DATE'])
    .rename(columns=str.strip)
    .drop_duplicates(subset='DATE', keep='first')
    .set_index('DATE')
)

In [55]:
import calendar
from datetime import datetime
# Take BTOP50 index data
# The CSV is a wide table: the first column holds the year and every following
# column is labelled with a month abbreviation. Only the first 34 rows are used.
btop = pd.read_csv(path_to_params + 'BTOP50_Index_historical_data.csv')
btop = btop.iloc[:34]

# Position in this list == month number ('' sits at index 0).
month_abbrs = list(calendar.month_abbr)

# Collect one record per (year, month) cell and build the frame once, instead
# of growing a DataFrame row by row (quadratic, and DataFrame.append no longer
# exists in pandas 2.x).
btop_records = []
for _, row in btop.iterrows():
    year = int(row.iloc[0])
    for month_label, value in row.iloc[1:].items():
        month = month_abbrs.index(month_label)
        btop_records.append({
            'Date': datetime(year=year, month=month, day=1),
            'BTOP50': value,
        })

btop_df = pd.DataFrame(btop_records, columns=['Date', 'BTOP50'])
# Explicit conversion keeps the string comparisons below valid even when no
# records were collected.
btop_df['Date'] = pd.to_datetime(btop_df['Date'])

# Keep months in [cutoff_date, 2020-05-01); filtering once here is equivalent
# to re-filtering after every appended row.
in_window = (btop_df['Date'] >= cutoff_date) & (btop_df['Date'] < '2020-05-01')
btop_df = btop_df.loc[in_window].copy()

# Values are percentage strings such as '1.23%'; drop the sign and make them floats.
btop_df['BTOP50'] = btop_df['BTOP50'].str.strip('%').astype(float)
btop_df = btop_df.set_index('Date')

In [56]:
import ast
# Take CTA index data
# The file holds a Python literal (a list of records with at least the keys
# 'year', 'month' and 'value'); ast.literal_eval parses it without executing code.
# The context manager guarantees the file handle is closed after reading.
with open(path_to_params + "cta_index_formated.txt", "r") as cta_index_file:
    ctai_list = ast.literal_eval(cta_index_file.read())

ctai_df = pd.DataFrame.from_records(ctai_list)
# Build a month-start timestamp from the separate year / month fields.
ctai_df['Date'] = pd.to_datetime(ctai_df['year'].astype(str) + '-' + ctai_df['month'].astype(str), format='%Y-%m')
ctai_df = ctai_df.set_index('Date')

In [60]:
CTAs = {}
end_date = '2020-03-01'

# Macro / benchmark series attached to every CTA frame. They do not depend on
# the CTA file, so they are sliced once here instead of on every iteration.
# Dict order defines the order of the appended columns.
param_series = {
    'Oil': oil.loc[cutoff_date:]['MCOILWTICO'],
    'AUX': aux.loc[cutoff_date:]['Close (kg)'],
    'AGX': agx.loc[cutoff_date:]['Close (kg)'],
    'PPI': ppi.loc[cutoff_date:]['PPIACO'],
    'BTOP50': btop_df['BTOP50'],
    'CTA_IDX': ctai_df.loc[cutoff_date:]['value'],
}

# Take CTA data
for fName in fileNames:
    df = pd.read_csv(path_to_CTA + fName)
    df = df.rename(columns=lambda x: x.strip())
    # Build a month-start DatetimeIndex from the Year / Month columns.
    df["Month"] = df.Month.map("{:02}".format)
    df['Date'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str), format='%Y-%m')
    df = df.set_index('Date')
    df = df.loc[cutoff_date:]
    df = df.drop(columns=['Year', 'Month'])
    # Insert params to df (values are aligned on the date index)
    for column, series in param_series.items():
        df[column] = series
    # Forward-fill gaps down the rows. The deprecated `downcast` keyword is no
    # longer passed; the remaining arguments were the defaults.
    df = df.ffill()
    # Store an independent copy: later cells add columns to these frames, and
    # writing into a .loc slice triggers SettingWithCopyWarning.
    CTAs[fName] = df.loc[:end_date].copy()

In [61]:
def return_rmse(test,predicted):
    """Print and return the root mean squared error of a prediction.

    Parameters
    ----------
    test : array-like
        Ground-truth values, passed to `mean_squared_error` as `y_true`.
    predicted : array-like
        Predicted values, passed to `mean_squared_error` as `y_pred`.

    Returns
    -------
    float
        Square root of the mean squared error. The value is also printed, as
        before; previously the function returned None despite its name.
    """
    rmse = math.sqrt(mean_squared_error(test, predicted))
    print("The root mean squared error is {}.".format(rmse))
    return rmse

In [62]:
train_start = '2006-12-01'
train_end = '2016-01-01' # inclusive
test_end = '2020-04-01' # inclusive

# Tag every row of each CTA frame: rows dated train_start..train_end (both
# ends included by .loc label slicing) are 'Train', all remaining rows 'Test'.
for fName in fileNames:
    cta_frame = CTAs[fName]
    cta_frame['DataSet_1'] = 'Test'
    cta_frame.loc[train_start:train_end, 'DataSet_1'] = 'Train'

# Row counts for the first CTA as a sanity check.
# NOTE(review): both slices include the train_end row, so the second number is
# one larger than the number of rows actually labelled 'Test'.
first_cta = CTAs[fileNames[0]]
print(len(first_cta.loc[train_start:train_end]))
print(len(first_cta.loc[train_end:]))

110
51


In [65]:
# Scaling the training and testing set
# One MinMaxScaler per CTA, fitted on the training rows ONLY and then applied
# unchanged to the test rows. Refitting on the test split (the previous
# behaviour) scaled train and test inconsistently and left the stored scaler
# fitted on test data. Test values may therefore fall outside [0, 1].
transform_train = {}
transform_test = {}
scaler = {}

for fName in fileNames:
    sc = MinMaxScaler(feature_range=(0,1))
    df = CTAs[fName]
    # Every column except the trailing 'DataSet_1' label is a model feature.
    features = df.iloc[:, :-1]
    is_train = df["DataSet_1"] == 'Train'
    is_test = df["DataSet_1"] == 'Test'
    transform_train[fName] = sc.fit_transform(features[is_train].to_numpy())
    transform_test[fName] = sc.transform(features[is_test].to_numpy())
    # Kept so predictions can later be mapped back with the train-fitted scaler.
    scaler[fName] = sc

In [66]:
trainset = {}
testset = {}

timestep = 10
train_set_size = 110 # default value
test_set_size = 50 # default value
feature_count = 8

# For every CTA, build supervised samples from sliding windows:
#   X[k] = the `timestep` rows preceding row k (all features)
#   y[k] = column 0 of row k
# Inputs are 3D arrays (sample size, time steps, features). The first
# `timestep` rows of each split only serve as history, so with the default
# sizes above the train set is (100, 10, 8) and the test set is (40, 10, 8).


def _make_windows(scaled, timestep, feature_count):
    """Turn a 2D (rows, features) array into sliding-window samples.

    Returns `(X, y)` where `X` has shape
    `(max(rows - timestep, 0), timestep, feature_count)` and `y[k]` is column 0
    of the row that immediately follows window `k`. An input with at most
    `timestep` rows yields empty arrays instead of raising.
    """
    row_count = len(scaled)
    windows = [scaled[i - timestep:i, :] for i in range(timestep, row_count)]
    targets = [scaled[i, 0] for i in range(timestep, row_count)]
    # Explicit target shape: validates the feature count and gives empty
    # inputs a well-formed (0, timestep, feature_count) result.
    X = np.reshape(np.array(windows), (len(windows), timestep, feature_count))
    return X, np.array(targets)


for fName in fileNames:
    train_set_size = len(transform_train[fName])
    test_set_size = len(transform_test[fName])

    # Create train set
    X_train, y_train = _make_windows(transform_train[fName], timestep, feature_count)
    trainset[fName] = {"X": X_train, "y": y_train}

    # Create test set (shaped from its own windows, not from X_train)
    X_test, y_test = _make_windows(transform_test[fName], timestep, feature_count)
    testset[fName] = {"X": X_test, "y": y_test}

In [67]:
# Check the shapes of input
# One row per CTA file listing the shapes of its train/test inputs and targets.
arr_buff = [
    {
        "X_train": trainset[name]["X"].shape,
        "y_train": trainset[name]["y"].shape,
        "X_test": testset[name]["X"].shape,
        "y_test": testset[name]["y"].shape,
    }
    for name in fileNames
]

pd.DataFrame(arr_buff, index=fileNames)

Unnamed: 0,X_train,y_train,X_test,y_test
Mondiale-Asset-Management-Mondiale-Trading-Program-2X-_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
FTC-Capital-GmbH-FTC-Futures-Fund-Classic-EUR-_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
Global-Bayesian-Dynamics-LLC-SBF-Proprietary-_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
Molinero-Capital-Management-LLP-Global-Markets-Program_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
FORT-LP-Fort-Global-Contrarian_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
...,...,...,...,...
DUNN-Capital-Management-World-Monetary-and-Agriculture-Program-WMA-_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
Adalpha-Asset-Management-LLC-Adalpha-Diversified-Short-Term-Program_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
Mulvaney-Capital-Management-The-Mulvaney-Global-Markets-Fund_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"
AIS-Capital-Management-L-P-MAAP-3x-6x-Composite_data.csv,"(100, 10, 8)","(100,)","(40, 10, 8)","(40,)"


In [None]:
# The LSTM architecture
regressor = Sequential()
# Single LSTM layer with Dropout regularisation.
# return_sequences is left at its default (False): this is the last recurrent
# layer, so only the final hidden state is passed on and the Dense head emits
# ONE value per sample, matching the (samples,) targets. With
# return_sequences=True the model produced one value per timestep, i.e. an
# output of shape (samples, timestep, 1).
regressor.add(LSTM(units=50, input_shape=(timestep, feature_count)))
regressor.add(Dropout(0.2))
# The output layer
regressor.add(Dense(units=1))

# Compiling the RNN
regressor.compile(optimizer='adam', loss='mean_squared_error')

# Fitting to the training set
# The same model instance is trained on each CTA in turn, so the weights carry
# over from one file to the next.
for i, fName in enumerate(fileNames):
    print(i, ". Fitting to", fName)
    regressor.fit(trainset[fName]["X"], trainset[fName]["y"], epochs=50, batch_size=200)

0 . Fitting to Mondiale-Asset-Management-Mondiale-Trading-Program-2X-_data.csv
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
1 . Fitting to FTC-Capital-GmbH-FTC-Futures-Fund-Classic-EUR-_data.csv
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoc