In [5]:
# this version will take the dataset without index so see if this effects the momentum

import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
import numpy as np
import pandas as pd

#load the consolidated dataset
df = pd.read_csv(r'C:\Users\alexm\FX_project\2000_2024\data2.csv')

# remove extra rows and columns
df = df.iloc[200:5383, 0:62]

# reformat Local time variable as a datetime
from datetime import datetime

df['Date'] = df['Date'].str[6:10] +'-'+ df['Date'].str[3:5] +'-'+ df['Date'].str[0:2]
df['Date'] = pd.to_datetime(df['Date'])

df.shape[0]

  df = pd.read_csv(r'C:\Users\alexm\FX_project\2000_2024\data2.csv')


5183

In [7]:
# normalise data

df_norm = df.iloc[:, 5:]
df_dates = df.iloc[:, :5]

for col in df_norm.columns:
    df_norm[col] = (df_norm[col] - df_norm[col].mean())/ df_norm[col].std()

x = df_norm.to_numpy()
y = df_dates.to_numpy()
cols = df.columns
z = np.append(y,x, axis=1)

df_norm2 = pd.DataFrame(z, columns=cols)

# add an integer series to replace date for use in the ANN
df_norm2['DateID'] = range(df_norm2.shape[0])
first_column = df_norm2.pop('DateID') 
df_norm2.insert(0, 'DateID', first_column)

df_norm2.iloc[:,2:] = df_norm2.iloc[:,2:].astype(float)

df_norm2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5183 entries, 0 to 5182
Data columns (total 63 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   DateID           5183 non-null   int64         
 1   Date             5183 non-null   datetime64[ns]
 2   Day              5183 non-null   float64       
 3   Month            5183 non-null   float64       
 4   Year             5183 non-null   float64       
 5   Close_Mid        5183 non-null   float64       
 6   Volume_Bid       5183 non-null   float64       
 7   Volume_Ask       5183 non-null   float64       
 8   Volume_Tot       5183 non-null   float64       
 9   Close_Sprd       5183 non-null   float64       
 10  5D-MA            5183 non-null   float64       
 11  10D-MA           5183 non-null   float64       
 12  20D-MA           5183 non-null   float64       
 13  50D-MA           5183 non-null   float64       
 14  200D-MA          5183 non-null   float64

In [8]:
# first a function to generate an n-step timeseries from one feature

# i = length of time series, feat = name of column
def datats(i,feat):
    series = []
    data = df_norm2[feat]
    
    # create the dataset
    for j in range(len(data) - i +1):
        series.append(data[j:j+i])
    series = np.array(series)
    series = np.asfarray(series)
    
    # create the column names
    lst = []
    cols = np.array(lst)
    for k in range(i):
        cols = np.append(cols,feat+'-'+str(k))
    # generate the dataframe
    df_ts = pd.DataFrame(series, columns=cols)

    # add the date ID of the start of the timeseries for reference
    df_ts['DateID'] = df_norm2['DateID'][0:len(data) - i +1]
    # shift column 'C' to first position 
    first_column = df_ts.pop('DateID')
    # insert column using insert(position,column_name,first_column) function 
    df_ts.insert(0, 'DateID', first_column)
    
    return df_ts


In [9]:
# function test
x = datats(3,'Close_Mid')
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5181 entries, 0 to 5180
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DateID       5181 non-null   int64  
 1   Close_Mid-0  5181 non-null   float64
 2   Close_Mid-1  5181 non-null   float64
 3   Close_Mid-2  5181 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 162.0 KB


In [10]:
# build a time series of m steps for input features and n for outputs 

feats = ['Volume_Tot','Close_Sprd','US-BaseRate', 'UK-BaseRate','UK-5Y-Inf','US-2Y-Int',
                     'US-10Y-Int','UK-2Y-Int','UK-10Y-Int','US-RealGDP','US-Unemp', 'US-CPI','US-HousePrice',
                     'US-CommmRealEst','US-MktVol','UK-RealGDP','UK-CPI','UK-Unemp','UK-CorpProf',
                     'UK-HshldInc','UK-ResPropInc','UK-SecLend','UK-ConsCred','UK-RetailVol','UK-MktVol']
m = 200
n = 100

ts_in = datats(m,'Close_Mid')

for k in range(len(feats)):
    temp = datats(m,feats[k])
    ts_in = ts_in.merge(temp, on='DateID')

ts_out = datats(n,'Close_Mid')
ts_in.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4984 entries, 0 to 4983
Columns: 5201 entries, DateID to UK-MktVol-199
dtypes: float64(5200), int64(1)
memory usage: 197.8 MB


In [11]:
# trim the datasets to match assuming n<m (i.e. output timeseries is smaller than input)
# note we need to reindex

# remove the first m time steps from the output
ts_out = ts_out.iloc[m:ts_out.shape[0],].reset_index()

# remove the last n timesteps from the input
ts_in = ts_in.iloc[0:ts_in.shape[0]-n,].reset_index()

In [12]:
# align the dates of the 
print('input')
print(ts_in.shape)
print(ts_in['DateID'][0])
print(ts_in['DateID'][ts_in.shape[0]-1])

print('output')
print(ts_out.shape)
print(ts_out['DateID'][0])
print(ts_out['DateID'][ts_out.shape[0]-1])

ts_in.iloc[0,1:]

input
(4884, 5202)
0
4883
output
(4884, 102)
200
5083


DateID           0.000000
Close_Mid-0      1.860770
Close_Mid-1      1.867550
Close_Mid-2      1.890760
Close_Mid-3      1.892390
                   ...   
UK-MktVol-195   -0.761344
UK-MktVol-196   -0.761344
UK-MktVol-197   -0.761344
UK-MktVol-198   -0.761344
UK-MktVol-199   -0.761344
Name: 0, Length: 5201, dtype: float64

In [13]:
# generate data for test/train
# exclude the index for inputs
X = ts_in.iloc[:,1:].values

# drop the Startdate and index for output
y = ts_out.iloc[:,2:].values

# check a value is as expected
#X[0,]
y[0,:]

array([1.84662 , 1.8526  , 1.859325, 1.849905, 1.85677 , 1.86023 ,
       1.86862 , 1.88122 , 1.89174 , 1.89385 , 1.89335 , 1.90926 ,
       1.93435 , 1.92401 , 1.94367 , 1.9385  , 1.94472 , 1.93442 ,
       1.92538 , 1.91479 , 1.92502 , 1.9276  , 1.94176 , 1.93158 ,
       1.94159 , 1.94616 , 1.92768 , 1.9166  , 1.92353 , 1.92363 ,
       1.93386 , 1.9277  , 1.9182  , 1.92522 , 1.91819 , 1.90405 ,
       1.88232 , 1.88254 , 1.8751  , 1.87074 , 1.87495 , 1.87798 ,
       1.89048 , 1.88213 , 1.87033 , 1.8593  , 1.86606 , 1.87098 ,
       1.87168 , 1.87678 , 1.88052 , 1.86395 , 1.88195 , 1.88917 ,
       1.88789 , 1.88257 , 1.8839  , 1.8857  , 1.88188 , 1.8757  ,
       1.85712 , 1.85437 , 1.85802 , 1.8684  , 1.86829 , 1.88757 ,
       1.8968  , 1.88496 , 1.8951  , 1.89401 , 1.89684 , 1.91122 ,
       1.90833 , 1.90942 , 1.91928 , 1.92075 , 1.92088 , 1.91394 ,
       1.90659 , 1.92256 , 1.9141  , 1.92865 , 1.92521 , 1.9224  ,
       1.92588 , 1.91341 , 1.91274 , 1.92597 , 1.92501 , 1.921

In [14]:
# check the dataframes have what we need
print('input')
print(ts_in.iloc[:,1:].shape)
print(ts_in.iloc[:,1:]['DateID'][0])
print(ts_in.iloc[:,1:]['DateID'][ts_in.shape[0]-1])

print('output')
print(ts_out.iloc[:,1:].shape)
print(ts_out.iloc[:,1:]['DateID'][0])
print(ts_out.iloc[:,1:]['DateID'][ts_out.shape[0]-1])

ts_out.iloc[:,1:].head()

input
(4884, 5201)
0
4883
output
(4884, 101)
200
5083


Unnamed: 0,DateID,Close_Mid-0,Close_Mid-1,Close_Mid-2,Close_Mid-3,Close_Mid-4,Close_Mid-5,Close_Mid-6,Close_Mid-7,Close_Mid-8,...,Close_Mid-90,Close_Mid-91,Close_Mid-92,Close_Mid-93,Close_Mid-94,Close_Mid-95,Close_Mid-96,Close_Mid-97,Close_Mid-98,Close_Mid-99
0,200,1.84662,1.8526,1.859325,1.849905,1.85677,1.86023,1.86862,1.88122,1.89174,...,1.89638,1.88598,1.86874,1.86838,1.87031,1.86646,1.87386,1.87808,1.89022,1.88183
1,201,1.8526,1.859325,1.849905,1.85677,1.86023,1.86862,1.88122,1.89174,1.89385,...,1.88598,1.86874,1.86838,1.87031,1.86646,1.87386,1.87808,1.89022,1.88183,1.87511
2,202,1.859325,1.849905,1.85677,1.86023,1.86862,1.88122,1.89174,1.89385,1.89335,...,1.86874,1.86838,1.87031,1.86646,1.87386,1.87808,1.89022,1.88183,1.87511,1.88098
3,203,1.849905,1.85677,1.86023,1.86862,1.88122,1.89174,1.89385,1.89335,1.90926,...,1.86838,1.87031,1.86646,1.87386,1.87808,1.89022,1.88183,1.87511,1.88098,1.88046
4,204,1.85677,1.86023,1.86862,1.88122,1.89174,1.89385,1.89335,1.90926,1.93435,...,1.87031,1.86646,1.87386,1.87808,1.89022,1.88183,1.87511,1.88098,1.88046,1.86936


In [15]:
# the data we have generated is a "pure forecast" with no scenario analysis. to get a scenario analysis we need
# to include data for the scenario by appending the additional n steps for features to the input
# we end up with m nodes for the input layer of the timeseries we are forecasting and m+n nodes for each feature
# i will create the scenario forecast in the next version

In [16]:
# Generate an adaptable fully connected neural net with the correct size of input/ouput layers from the above dataset
nodes = 1000

model = nn.Sequential(nn.Linear(ts_in.iloc[:,1:].shape[1], nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes,nodes),nn.LeakyReLU(negative_slope=0.2),
                      nn.Linear(nodes, ts_out.iloc[:,2:].shape[1]),nn.LeakyReLU(negative_slope=0.2))

# Tanh, Sigmoid don't work very well since the range of the functions is limited to [-1,1] or [0,1]

# set parameters of the learning
criterion = nn.MSELoss()
epochs = 1000
rand_st = 42 # random state

# optimiser and learning rate
opt = torch.optim.Adam(model.parameters(), lr = 0.001)

# container for tests
results = np.empty((0, 5))


# train the model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=rand_st)

# convert to tensor for use in pyTorch
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train)
y_test = torch.FloatTensor(y_test)

# seed a random number to replicate data
torch.manual_seed(rand_st)

# train the model and capture the time taken

import time
t_start = time.time()

for i in range(epochs):
    # create predictions
    y_pred = model(X_train)

    # calculate loss and add to the list
    loss = criterion(y_pred, y_train)

    # performs back propogation
    opt.zero_grad()
    loss.backward()
    opt.step()

t_stop = time.time()
process_time = round((t_stop - t_start), 2)
print(process_time)

# calculate a prediction
pred = model(torch.FloatTensor(X))


1421.52


In [31]:
pred.shape

torch.Size([4884, 100])

In [18]:
pred

tensor([[1.9258, 1.9287, 1.9359,  ..., 1.8412, 1.8433, 1.8487],
        [1.9250, 1.9278, 1.9349,  ..., 1.8418, 1.8438, 1.8492],
        [1.9241, 1.9268, 1.9339,  ..., 1.8421, 1.8441, 1.8494],
        ...,
        [1.2722, 1.2823, 1.2726,  ..., 1.2218, 1.2234, 1.2182],
        [1.2749, 1.2851, 1.2752,  ..., 1.2221, 1.2236, 1.2185],
        [1.2770, 1.2871, 1.2773,  ..., 1.2244, 1.2260, 1.2209]],
       grad_fn=<LeakyReluBackward0>)

In [41]:
# calculate the MSE for this model
mse = nn.MSELoss()

# create a target tensor from the ts_out dataframe
target = torch.FloatTensor(ts_out.iloc[:,2:].values)
output = mse(pred, target)
print("MSE loss:", output.item())

MSE loss: tensor(0.0008, grad_fn=<MseLossBackward0>)
