In [41]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import sys
import math
from datetime import datetime
#import relu
from torch.nn import functional as F
sys.path.append('../data')
from data_helper_functions import create_study_periods,create_tensors

In [17]:
# Load the adjusted-returns CSV, parse the date column, discard rows that have
# no raw return, and drop the index column the CSV export left behind.
df = (
    pd.read_csv('../data/crsp_ff_adjusted.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(subset=['RET'])
    .drop(columns='Unnamed: 0')
)
df 

Unnamed: 0,date,TICKER,RET,Adj_RET_Mkt,Adj_RET_Mkt_SMB,Adj_RET_Mkt_SMB_HML
0,1990-02-01,SUNW,0.012903,7.292903,8.532903,7.682903
1,1990-02-01,MYG,0.014085,7.294085,8.534085,7.684085
2,1990-02-01,INTC,-0.012658,7.267342,8.507342,7.657342
3,1990-02-01,CB,0.005634,7.285634,8.525634,7.675634
4,1990-02-01,BUD,-0.026490,7.253510,8.493510,7.643510
...,...,...,...,...,...,...
3266862,2015-12-31,KMI,0.026135,-0.533865,-4.133865,-3.713865
3266863,2015-12-31,ADM,-0.005423,-0.565423,-4.165423,-3.745423
3266864,2015-12-31,HPE,-0.005236,-0.565236,-4.165236,-3.745236
3266865,2015-12-31,DIS,-0.011849,-0.571849,-4.171849,-3.751849


In [18]:
# Build the study periods from the cleaned frame. The meaning of each keyword
# is defined by create_study_periods in data_helper_functions.
study_periods = create_study_periods(
    df,
    n_periods=23,
    window_size=240,
    trade_size=250,
    train_size=750,
    forward_roll=250,
    start_date=datetime(1990, 1, 1),
    end_date=datetime(2015, 12, 31),
)

 89%|████████▉ | 34/38 [00:06<00:00,  5.27it/s]

Reached the end of the dataset.





In [19]:
# Convert every study period into tensors. The training loop later unpacks each
# element as (train_data, train_labels, val_data, val_labels).
# NOTE: the recorded run of this cell took roughly 100 minutes on 6 workers.
train_test_splits=create_tensors(study_periods)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:  2.1min
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:  7.5min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed: 26.5min
[Parallel(n_jobs=6)]: Done  27 out of  34 | elapsed: 98.3min remaining: 25.5min
[Parallel(n_jobs=6)]: Done  31 out of  34 | elapsed: 100.1min remaining:  9.7min
[Parallel(n_jobs=6)]: Done  34 out of  34 | elapsed: 100.1min finished


In [29]:
# Sanity check: shape of the second tensor of the first split (the one the
# training loop unpacks as train_labels).
train_test_splits[0][1].shape   

torch.Size([248383])

In [39]:
class ScaledMultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention for batch-first inputs.

    Args:
        num_heads: number of attention heads; must divide ``n_dim`` evenly.
        n_dim: model (embedding) dimension of the query/key/value inputs.
    """

    def __init__(self, num_heads, n_dim):
        super(ScaledMultiHeadAttention, self).__init__()
        assert n_dim % num_heads == 0
        self.num_heads = num_heads
        self.n_dim = n_dim
        self.head_dim = n_dim // num_heads  # Dimension of each head: commonly referred to as d_k

        self.fc_q = nn.Linear(n_dim, n_dim)  # Query
        self.fc_k = nn.Linear(n_dim, n_dim)  # Key
        self.fc_v = nn.Linear(n_dim, n_dim)  # Value
        self.fc_o = nn.Linear(n_dim, n_dim)  # Output

    @staticmethod
    def create_look_ahead_mask(size):
        """Build a causal mask of shape ``(size, size)``.

        Entry ``[i, j]`` is 1 when query position ``i`` may attend to key
        position ``j`` (``j <= i``) and 0 when ``j`` lies in the future.
        ``forward`` blocks exactly the positions where the mask is 0, so every
        row keeps at least its diagonal entry and softmax never sees a row of
        only ``-inf``.
        """
        return torch.tril(torch.ones((size, size)))

    def forward(self, query, key, value, mask=None):
        """Apply attention and return a tensor of shape ``(batch, seq, n_dim)``.

        Args:
            query, key, value: tensors whose first dimension is the batch and
                whose last dimension is ``n_dim``.
            mask: optional tensor broadcastable to ``(batch, heads, q_len, k_len)``;
                positions equal to 0 are excluded from attention.
        """
        batch_size = query.shape[0]
        # Project, then split the model dimension into heads: (batch, heads, seq, head_dim)
        Q = self.fc_q(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.fc_k(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.fc_v(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Dot product of query and key, scaled by sqrt(d_k), gives the attention scores
        key_out = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:  # Look ahead mask to prevent information leakage
            # The mask is usually built on the CPU; move it next to the scores.
            mask = mask.to(key_out.device)
            key_out = key_out.masked_fill(mask == 0, float('-inf'))

        attention = torch.softmax(key_out, dim=-1)  # Apply softmax to get probabilities
        value_out = torch.matmul(attention, V)  # Weight the values by the attention probabilities
        # Merge the heads back: (batch, seq, n_dim)
        value_out = value_out.transpose(1, 2).contiguous().view(batch_size, -1, self.n_dim)
        return self.fc_o(value_out)  # Apply final linear layer


class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size between the two linear layers.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.

    Args:
        d_model: feature size of the input and output.
        num_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout: dropout probability used on both sub-layer outputs.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=.1):
        super().__init__()
        self.self_attention = ScaledMultiHeadAttention(num_heads, d_model)
        self.position_wise_feed_forward = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Sub-layer 1: self-attention, then add & norm
        attended = self.self_attention(x, x, x, mask)
        x = self.layer_norm1(x + self.dropout(attended))
        # Sub-layer 2: position-wise feed-forward, then add & norm
        return self.layer_norm2(x + self.dropout(self.position_wise_feed_forward(x)))

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to batch-first sequences.

    The table is stored as a buffer of shape ``(1, max_len, d_model)`` so that
    it broadcasts over the batch dimension and is indexed by time step. The
    rest of this model treats dimension 0 as the batch and dimension 1 as time.

    Args:
        d_model: size of the encoding vector (odd sizes are supported).
        max_len: longest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encoding of each time step.

        Args:
            x: tensor shaped ``(batch, seq_len, features)``, where ``features``
                is ``d_model`` or broadcastable to it.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class TimeSeriesTransformer(nn.Module):
    """Encoder-only transformer that produces one output value per sequence.

    Args:
        d_model: feature size used throughout the encoder.
        num_heads: attention heads per encoder layer.
        d_ff: hidden size of each layer's feed-forward network.
        num_encoder_layers: number of stacked encoder layers.
        dropout: dropout probability passed to every encoder layer.
        max_len: maximum sequence length for the positional encoding.
    """

    def __init__(self, d_model, num_heads, d_ff, num_encoder_layers, dropout=0.1, max_len=512):
        super().__init__()

        self.d_model = d_model
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        # Stack of identical encoder layers
        stack = [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)]
        self.encoder_layers = nn.ModuleList(stack)

        # Head mapping the final context vector to a single value
        self.fc = nn.Linear(d_model, 1)

    def forward(self, src, src_mask=None):
        """Encode ``src`` and return the head's output for the last time step.

        Args:
            src: input tensor indexed as ``(batch, time, features)``.
            src_mask: optional attention mask handed to every encoder layer.
        """
        hidden = self.positional_encoding(src)

        for encoder in self.encoder_layers:
            hidden = encoder(hidden, src_mask)

        # Only the representation of the final time step feeds the output head
        last_step = hidden[:, -1, :]
        return self.fc(last_step)













In [43]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = TimeSeriesTransformer(d_model=64, num_heads=8, d_ff=256, num_encoder_layers=2,
                              dropout=.1, max_len=240).to(device)

# The model emits ONE logit per sequence and the target is the binary
# above/below cross-sectional median indicator, so the matching loss is binary
# cross entropy on logits. nn.CrossEntropyLoss would read the squeezed (batch,)
# vector as class scores of a single sample.
# NOTE(review): assumes labels are 0/1 — confirm against create_tensors.
# For a return-regression target switch to nn.L1Loss (MAE).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 100
patience = 5
best_loss = np.inf
best_model_state = None  # weights of the best epoch so far; consumed by the prediction cell
counter = 0

for epoch in range(n_epochs):
    total_train_loss = 0.0
    total_val_loss = 0.0

    for train_data, train_labels, val_data, val_labels in tqdm(train_test_splits):

        # Generate look-ahead masks on the same device as the model
        train_mask = ScaledMultiHeadAttention.create_look_ahead_mask(train_data.size(1)).to(device)
        val_mask = ScaledMultiHeadAttention.create_look_ahead_mask(val_data.size(1)).to(device)

        train_dataset = TensorDataset(train_data, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

        val_dataset = TensorDataset(val_data, val_labels)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        # Re-enable dropout for every split: the validation pass below switches
        # the model to eval mode, which would otherwise persist into the next split.
        model.train()
        train_loss = 0.0
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(data, src_mask=train_mask)
            # squeeze(-1) keeps the batch dimension even for a final batch of size 1
            loss = criterion(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()

            # Weight by batch size so the per-sample average is exact
            train_loss += loss.item() * data.size(0)

        total_train_loss += train_loss / len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data, labels in val_loader:
                data, labels = data.to(device), labels.to(device)

                outputs = model(data, src_mask=val_mask)
                loss = criterion(outputs.squeeze(-1), labels.float())
                val_loss += loss.item() * data.size(0)

        total_val_loss += val_loss / len(val_loader.dataset)

    average_train_loss = total_train_loss / len(train_test_splits)
    average_val_loss = total_val_loss / len(train_test_splits)

    print(f'Epoch {epoch+1}/{n_epochs}, '
          f'Average Train Loss: {average_train_loss:.4f}, '
          f'Average Validation Loss: {average_val_loss:.4f}')

    if average_val_loss < best_loss:
        best_loss = average_val_loss
        counter = 0
        # Snapshot the weights; clone so later optimizer steps do not overwrite them
        best_model_state = {name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()}
    else:
        counter += 1

    if counter >= patience:
        print('Early stopping!')
        break


Using device: cpu


  3%|▎         | 1/34 [48:40<26:46:26, 2920.80s/it]


KeyboardInterrupt: 

In [None]:
# Restore the best checkpoint when the training cell recorded one; otherwise
# fall back to the weights currently held by `model`.
best_state = globals().get('best_model_state')
if best_state is not None:
    model.load_state_dict(best_state)
model.eval()
predictions = []
returns_df = df[['date', 'TICKER', 'RET']].copy()
# Concatenate training and held-out data and labels
from torch.utils.data import ConcatDataset

# Run inference on whatever device the model currently lives on
model_device = next(model.parameters()).device

# One DataLoader per study period covering that period's full data. Built
# directly from train_test_splits so the cell runs on a fresh kernel.
entire_loaders = []

for train_data, train_labels, val_data, val_labels in train_test_splits:
    entire_dataset = ConcatDataset([
        TensorDataset(train_data, train_labels),
        TensorDataset(val_data, val_labels),
    ])

    # shuffle=False keeps predictions aligned with the order of the data
    entire_loader = DataLoader(entire_dataset, batch_size=64, shuffle=False)

    entire_loaders.append(entire_loader)

all_predictions = []  # One array of probabilities per study period

for entire_loader in tqdm(entire_loaders):
    predictions = []

    with torch.no_grad():
        for sequences, labels in entire_loader:
            sequences = sequences.to(model_device)
            # NOTE(review): the training loop feeds the tensors as-is; only add a
            # trailing feature axis if the data is stored as (batch, time).
            if sequences.dim() == 2:
                sequences = sequences.unsqueeze(-1)
            # Same look-ahead mask as during training, so inference matches it
            mask = ScaledMultiHeadAttention.create_look_ahead_mask(sequences.size(1)).to(model_device)
            outputs = model(sequences, src_mask=mask)
            # The model has a single output logit, so the positive-class
            # probability is its sigmoid (softmax(...)[:, 1] would index out of range).
            probabilities = torch.sigmoid(outputs).squeeze(-1).cpu().numpy()
            predictions.extend(probabilities)

    all_predictions.append(np.array(predictions))