In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('seaborn-whitegrid')

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD
import math
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.


In [2]:
# General configuration: data locations, history cutoff date, and the CTA files to load
import os
import re

path_to_CTA = '/home/jupyter/CTA data/'
path_to_params = '/home/jupyter/CTA params/'
cutoff_date = '2006-12-01'
# Only the CSV files found in the CTA directory are kept
fileNames = [entry for entry in os.listdir(path_to_CTA) if entry.endswith('.csv')]

In [3]:
# Take crude oil data
# Column names are stripped of surrounding whitespace, DATE is parsed with the
# '%Y-%m' format, and the parsed DATE column becomes the index.
oil = (
    pd.read_csv(path_to_params + 'MCOILWTICO.csv')
    .rename(columns=str.strip)
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE'], format='%Y-%m'))
    .set_index('DATE')
)

In [4]:
# Take gold data
# Every timestamp is snapped to the first day of its month, only the first row
# seen for each month is kept, and the row order is then reversed.
aux = (
    pd.read_csv(path_to_params + 'AUX-USD-2592000-20200523172420.csv', parse_dates=['Date'])
    .rename(columns=str.strip)
    .assign(Date=lambda frame: frame['Date'].apply(lambda stamp: stamp.replace(day=1)))
    .drop_duplicates(subset="Date", keep='first')
    .set_index('Date')
    .pipe(lambda frame: frame.reindex(index=frame.index[::-1]))
)

In [5]:
# Take silver data
# Every timestamp is snapped to the first day of its month, only the first row
# seen for each month is kept, and the row order is then reversed.
agx = (
    pd.read_csv(path_to_params + 'AGX-USD-2592000-20200523172540.csv', parse_dates=['Date'])
    .rename(columns=str.strip)
    .assign(Date=lambda frame: frame['Date'].apply(lambda stamp: stamp.replace(day=1)))
    .drop_duplicates(subset="Date", keep='first')
    .set_index('Date')
    .pipe(lambda frame: frame.reindex(index=frame.index[::-1]))
)


In [6]:
# Take Producer Price index data
# Column names are stripped, duplicate DATE rows are dropped (first one wins),
# and DATE becomes the index.
ppi = (
    pd.read_csv(path_to_params + 'PPIACO.csv', parse_dates=['DATE'])
    .rename(columns=str.strip)
    .drop_duplicates(subset="DATE", keep='first')
    .set_index('DATE')
)

In [7]:
import calendar
from datetime import datetime

# Take BTOP50 index data
# The CSV is read as a wide table: the first column of each row is converted to
# the year and every remaining column label is looked up as a month
# abbreviation. It is reshaped here to one row per (year, month).
btop = pd.read_csv(path_to_params + 'BTOP50_Index_historical_data.csv')
btop = btop.iloc[:34]  # only the first 34 rows of the file are used

# Position in this list == month number (index 0 is the empty string).
month_abbrs = list(calendar.month_abbr)

# Collect plain records and build the frame once: appending to a DataFrame
# inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.
btop_records = []
for _, row in btop.iterrows():
    year = int(row.iloc[0])
    for month_label, value in row.iloc[1:].items():
        month = month_abbrs.index(month_label)  # ValueError on an unknown label
        btop_records.append({'Date': datetime(year=year, month=month, day=1),
                             'BTOP50': value})

btop_df = pd.DataFrame(btop_records, columns=['Date', 'BTOP50'])
btop_df['Date'] = pd.to_datetime(btop_df['Date'])

# Keep cutoff_date <= Date < 2020-05-01 (applied once, after the reshape).
in_range = (btop_df['Date'] >= pd.Timestamp(cutoff_date)) & \
           (btop_df['Date'] < pd.Timestamp('2020-05-01'))
btop_df = btop_df.loc[in_range].copy()

# Values are strings with a percent sign (e.g. "2.08%"); strip it and convert.
btop_df['BTOP50'] = btop_df['BTOP50'].str.strip('%').astype(float)
btop_df = btop_df.set_index('Date')

In [8]:
import ast

# Take CTA index data
# The text file holds a Python literal that is parsed into records; the 'year'
# and 'month' fields of each record are combined into the Date index.
# A context manager is used so the file handle is closed deterministically.
with open(path_to_params + "cta_index_formated.txt", "r") as ctai_file:
    ctai_list = ast.literal_eval(ctai_file.read())

ctai_df = pd.DataFrame.from_records(ctai_list)
ctai_df['Date'] = pd.to_datetime(ctai_df['year'].astype(str) + '-' + ctai_df['month'].astype(str), format='%Y-%m')
ctai_df = ctai_df.set_index('Date')

In [31]:
CTAs = {}

# Market series joined onto every CTA frame. They do not depend on the file
# being processed, so they are sliced once here instead of once per file.
# Insertion order of this dict is the column order in the resulting frames.
market_params = {
    'Oil': oil.loc[cutoff_date:, 'MCOILWTICO'],
    'AUX': aux.loc[cutoff_date:, 'Close (kg)'],
    'AGX': agx.loc[cutoff_date:, 'Close (kg)'],
    'PPI': ppi.loc[cutoff_date:, 'PPIACO'],
    'BTOP50': btop_df['BTOP50'],
    'CTA_IDX': ctai_df.loc[cutoff_date:, 'value'],
}

# Take CTA data
for fName in fileNames:
    df = pd.read_csv(path_to_CTA + fName)
    df = df.rename(columns=lambda x: x.strip())
    # Build a month-start Date index from the Year and (zero-padded) Month columns.
    df['Date'] = pd.to_datetime(
        df['Year'].astype(str) + '-' + df['Month'].map("{:02}".format),
        format='%Y-%m',
    )
    df = df.set_index('Date').loc[cutoff_date:].drop(columns=['Year', 'Month'])
    # Insert params to df (values are aligned on the Date index)
    for column, series in market_params.items():
        df[column] = series
    # Store an independent copy: later cells add columns to these frames, and
    # doing that on a .loc slice triggers pandas' SettingWithCopy warning.
    CTAs[fName] = df.loc[:'2020-03-01'].copy()

In [10]:
def return_rmse(test, predicted):
    """Print and return the root mean squared error between two value sets.

    Parameters
    ----------
    test : array-like
        Reference values, passed as the first argument to
        ``mean_squared_error``.
    predicted : array-like
        Predicted values, passed as the second argument.

    Returns
    -------
    float
        Square root of ``mean_squared_error(test, predicted)``. Earlier
        versions of this helper only printed the value; it is now returned as
        well so callers can collect or compare it.
    """
    rmse = math.sqrt(mean_squared_error(test, predicted))
    print("The root mean squared error is {}.".format(rmse))
    return rmse

In [66]:
train_start = '2006-12-01'
train_end = '2016-01-01' # inclusive
test_end = '2020-04-01' # inclusive

for fName in fileNames:
    # Mark as train and test sets.
    # Work on a copy and store it back: the frames held in CTAs may be slices
    # of larger frames, and adding a column to a slice raises pandas'
    # SettingWithCopy warning. Reassigning also keeps this cell idempotent.
    labelled = CTAs[fName].copy()
    labelled['DataSet_1'] = 'Test'
    labelled.loc[train_start:train_end, 'DataSet_1'] = 'Train'
    CTAs[fName] = labelled

CTAs[fileNames[0]].loc[train_start:train_end]


Unnamed: 0_level_0,Return,Assets,Oil,AUX,AGX,PPI,BTOP50,CTA_IDX,DataSet_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-01,1.47,120500000,61.96,20077.35,412.82,165.6,2.08,1.94,Train
2007-01-01,0.47,119800000,54.51,21498.71,449.47,164.0,1.12,2.02,Train
2007-02-01,1.55,113200000,59.28,21028.07,420.21,166.8,-1.63,-1.90,Train
2007-03-01,-0.04,127600000,60.44,22205.52,453.00,169.3,-1.10,-1.49,Train
2007-04-01,1.70,223700000,63.98,21298.72,423.10,171.4,2.35,5.81,Train
...,...,...,...,...,...,...,...,...,...
2015-09-01,-3.51,222900000,45.48,35810.90,467.80,189.1,1.73,1.20,Train
2015-10-01,1.38,232600000,46.22,36717.92,500.19,187.5,-1.16,-1.36,Train
2015-11-01,-0.07,233300000,42.44,34219.00,452.88,185.7,2.88,2.23,Train
2015-12-01,0.18,240000000,37.19,35951.61,458.67,183.5,-1.61,-0.92,Train


In [64]:
# Scaling the training and testing set
# One MinMaxScaler per fund, fitted on the training rows ONLY and then applied
# unchanged to the test rows. Fitting again on the test rows (as this cell did
# before) scales the two splits differently and leaves the stored scaler
# carrying test-set statistics, which breaks any later inverse transform.
transform_train = {}
transform_test = {}
scaler = {}

for fName in fileNames:
    df = CTAs[fName]
    # Everything except the split label is treated as a feature column.
    features = df.drop(columns='DataSet_1')
    train_rows = np.array(features[df["DataSet_1"] == 'Train'])
    test_rows = np.array(features[df["DataSet_1"] == 'Test'])

    sc = MinMaxScaler(feature_range=(0,1))
    transform_train[fName] = sc.fit_transform(train_rows)
    # Test values may fall outside [0, 1]; that is expected with a train-fitted scaler.
    transform_test[fName] = sc.transform(test_rows)
    scaler[fName] = sc

In [65]:
trainset = {}
testset = {}

feature_count = 10      # look-back window length (was accidentally a 1-tuple: `10,`)
train_set_size = 110    # upper bound on the number of training rows used per fund


def _sliding_windows(series, window):
    """Split a 1-D array into look-back windows and next-step targets.

    Returns ``(X, y)`` where row ``k`` of ``X`` is ``series[k:k + window]`` and
    ``y[k]`` is ``series[k + window]``. ``X`` always has shape ``(n, window)``,
    including ``n == 0`` when the series is not longer than the window.
    """
    series = np.asarray(series)
    starts = range(len(series) - window)
    X = np.array([series[s:s + window] for s in starts]).reshape(-1, window)
    y = np.array([series[s + window] for s in starts])
    return X, y


for fName in fileNames:
    # Column 0 of the scaled arrays is the series being predicted.
    train_series = transform_train[fName][:train_set_size, 0]
    test_series = transform_test[fName][:, 0]

    X_train, y_train = _sliding_windows(train_series, feature_count)
    trainset[fName] = {
        "X": np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)),
        "y": y_train,
    }

    # The test windows use the same look-back length as training and are
    # bounded by the actual number of test rows (previously hard-coded to a
    # window of 60 over rows 60..754, which does not match this data).
    # Windows are built from test rows only, so the first `feature_count`
    # test rows serve as context and have no prediction target.
    X_test, y_test = _sliding_windows(test_series, feature_count)
    testset[fName] = {
        "X": np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)),
        "y": y_test,
    }

{'Mondiale-Asset-Management-Mondiale-Trading-Program-2X-_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'FTC-Capital-GmbH-FTC-Futures-Fund-Classic-EUR-_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Global-Bayesian-Dynamics-LLC-SBF-Proprietary-_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Molinero-Capital-Management-LLP-Global-Markets-Program_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'FORT-LP-Fort-Global-Contrarian_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Warrington-Asset-Management-Warrington-Strategic-Program_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Claughton-Capital-Institutional-Program_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Kaiser-Trading-Group-Kaiser-Global-Diversified-Program-Class-A_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'Hamer-Trading-Diversified-Systematic-Program_data.csv': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'FORT-LP-Fort-Global

In [None]:
# The LSTM architecture
# The number of timesteps is read from the prepared training tensors
# (shape: samples x timesteps x 1) rather than from a leftover loop variable.
timesteps = next(iter(trainset.values()))["X"].shape[1]

regressor = Sequential()
# First LSTM layer with Dropout regularisation
regressor.add(LSTM(units=50, return_sequences=True, input_shape=(timesteps, 1)))
regressor.add(Dropout(0.2))
# Second LSTM layer. It is the last recurrent layer, so it must return only its
# final output (return_sequences=False, the default); otherwise the Dense layer
# emits one value per timestep while the targets are one scalar per sample.
regressor.add(LSTM(units=50))
regressor.add(Dropout(0.2))
# The output layer
regressor.add(Dense(units=1))

# Compiling the RNN
regressor.compile(optimizer='adam', loss='mean_squared_error')

# Fitting to the training set
# The same model instance is fitted on each fund in turn, so training accumulates.
for fName in fileNames:
    print("Fitting to", fName)
    regressor.fit(trainset[fName]["X"], trainset[fName]["y"], epochs=50, batch_size=200)