In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import sys
import math
from datetime import datetime
#import relu
from torch.nn import functional as F
sys.path.append('../models')
sys.path.append('../data_func')
from data_helper_functions import create_study_periods,create_tensors
from transformer_model import TimeSeriesTransformer,ScaledMultiHeadAttention

We can train multiple transformer models: (1) a cross-sectional-median target, (2) a raw-returns target, (3) a Sharpe-ratio objective, and (4) a selective transformer model built on all of the previous iterations. We can also consider stochastic attention, using attention to calculate a confidence score, and building a confidence model as a first pass (cross-sectional-median confidence as the first filter, then using the confidence score to select the top 10% of stocks to train on).
All in all, because compute is limited, the goal should not be to train a model that produces an exceptionally high Sharpe ratio, but to show the potential of selective ML in portfolio construction and how it improves over current SOTA methods. The paper should mostly be an analysis of which types of stocks the model fails to learn from or abstains from. We cannot afford much hyperparameter tuning.

In [2]:
# df=pd.read_csv('../data/crsp_ff_adjusted.csv')
# #drop unamed 0
# df['date'] = pd.to_datetime(df['date'])
# df.dropna(subset=['RET'],inplace=True)
# df=df.drop(columns='Unnamed: 0')
# df 

In [3]:
#get just 2014
# df=df[df['date'].dt.year==2014]

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# Seed every RNG the notebook draws from so a Restart & Run All is reproducible:
# NumPy drives the synthetic price paths, torch drives weight initialisation,
# dropout and DataLoader shuffling.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

def generate_individual_stock_data(num_days=1000, mu=0.1, sigma=0.5, start_price=100):
    """Simulate a single price path as an arithmetic random walk.

    Draws ``num_days`` standard-normal shocks from NumPy's global RNG, scales
    them by ``sigma`` and shifts them by ``mu`` to obtain additive per-day
    price changes, then returns ``start_price`` plus their running sum.

    Args:
        num_days: Number of price points to generate.
        mu: Mean of each additive price change.
        sigma: Standard deviation of each additive price change.
        start_price: Level the cumulative changes are added to.

    Returns:
        1-D NumPy array of length ``num_days`` holding the simulated prices.
    """
    shocks = np.random.randn(num_days)
    increments = shocks * sigma + mu
    return start_price + np.cumsum(increments)

def process_stock_data(prices, sequence_length=250):
    """Turn a price series into (window, next-value) samples of log returns.

    Log returns are computed between consecutive prices. Sample ``j`` pairs the
    ``sequence_length`` returns starting at position ``j`` with the single
    return that immediately follows that window.

    Args:
        prices: 1-D array of prices.
        sequence_length: Number of consecutive log returns per input window.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays: ``data`` stacks the windows
        and ``labels`` holds the return following each window. Both are empty
        when there are not more than ``sequence_length`` log returns.
    """
    returns = np.log(prices[1:] / prices[:-1])
    n_samples = len(returns) - sequence_length

    windows = [returns[start:start + sequence_length] for start in range(n_samples)]
    targets = [returns[start + sequence_length] for start in range(n_samples)]

    return np.array(windows), np.array(targets)

# Set parameters
num_days = 1000
sequence_length = 250
train_period = 750
num_stocks = 3

# process_stock_data builds sample j from returns[j : j + sequence_length] with
# label returns[j + sequence_length]. A sample therefore belongs to the training
# period only if its label index is below train_period, i.e. j < train_period -
# sequence_length. Slicing the samples at train_period instead would put every
# sample in the training set and make the test set a subset of it.
n_train_samples = train_period - sequence_length
assert n_train_samples > 0, "train_period must exceed sequence_length"

# Generate data for each stock: (train inputs, train labels, test inputs, test labels)
stocks_data = []
for i in range(num_stocks):
    prices = generate_individual_stock_data(num_days=num_days, mu=(i+1)*0.1, sigma=0.5, start_price=(i+1)*100)
    stock_data, stock_labels = process_stock_data(prices, sequence_length=sequence_length)
    stocks_data.append((
        stock_data[:n_train_samples], stock_labels[:n_train_samples],  # Training samples
        # Test samples: their input windows reach back into the training
        # period, but their labels all lie after it.
        stock_data[n_train_samples:], stock_labels[n_train_samples:],
    ))

# Prepare the final datasets by stacking all stocks along the sample axis
train_data_combined = np.concatenate([split[0] for split in stocks_data], axis=0)
train_labels_combined = np.concatenate([split[1] for split in stocks_data], axis=0)
test_data_combined = np.concatenate([split[2] for split in stocks_data], axis=0)
test_labels_combined = np.concatenate([split[3] for split in stocks_data], axis=0)

# Convert to tensors; inputs get a trailing feature dimension of size 1
train_data_tensor = torch.tensor(train_data_combined, dtype=torch.float32).unsqueeze(-1)
train_labels_tensor = torch.tensor(train_labels_combined, dtype=torch.float32)
test_data_tensor = torch.tensor(test_data_combined, dtype=torch.float32).unsqueeze(-1)
test_labels_tensor = torch.tensor(test_labels_combined, dtype=torch.float32)

# Single study period, in the (train_x, train_y, test_x, test_y) layout the
# training loop iterates over.
train_test_splits=[(train_data_tensor,train_labels_tensor,test_data_tensor,test_labels_tensor)]


In [5]:
# #select returns to use
# returns='RET'
# df=df[['date','TICKER',f'{returns}']]
# if returns!='RET':
#     #rename returns column
#     df.rename(columns={f'{returns}':'RET'},inplace=True)

In [6]:
#Optional parameter target_type: 'cross_sectional_median(default)','buckets(10 buckets)','raw_returns'.
# study_periods=create_study_periods(df,n_periods=23,window_size=240,trade_size=250,train_size=750,forward_roll=250,start_date=datetime(1990,1,1),end_date=datetime(2015,12,31),target_type='raw_return')

In [7]:
# train_test_splits,task_types=create_tensors(study_periods)

In [8]:
#Optional code to verify tensor shapes
# for train_data, train_labels, test_data, test_labels in train_test_splits:
#     print(train_data.shape, train_labels.shape, test_data.shape, test_labels.shape)

In [9]:
# Task type read by the training cell (only index 0 is used there) to configure
# the model and to choose between the classification and regression loss.
task_types=['regression']

In [10]:
# TODO: look into making d_model > 1. The inputs would have to be stacked, because the current input is [batch, sequence, 1], not [batch, sequence, d_model].

In [11]:
# Check if CUDA, MPS, or CPU should be used
if torch.cuda.is_available():
    device = torch.device("cuda")
# elif torch.backends.mps.is_available():
#     device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Using device:", device)
best_model_path = "best_model.pth" 
model = TimeSeriesTransformer(d_model=1, num_heads=1, d_ff=256, num_encoder_layers=2, 
                               dropout=.1,task_type=task_types[0]).to(device)

# Loss depends on the target: MSE for return regression, negative log-likelihood
# for the above/below cross-sectional-median classification.
is_classification = task_types[0] == 'classification'
if is_classification:
    criterion = nn.NLLLoss()
else:
    criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 1
patience = 5
best_loss = np.inf
counter = 0
batch_size=64

# Look-ahead masks keyed by (rows in batch, sequence length). The last batch of
# a loader is usually smaller than batch_size, so the mask is built for the
# batch that is actually being fed to the model rather than the nominal size.
mask_cache = {}


def _look_ahead_mask(n_rows, seq_len):
    """Return the (cached) look-ahead mask for a batch of n_rows sequences."""
    key = (n_rows, seq_len)
    if key not in mask_cache:
        mask_cache[key] = ScaledMultiHeadAttention.create_look_ahead_mask(n_rows, seq_len).to(device)
    return mask_cache[key]


for epoch in range(n_epochs):
    total_train_loss = 0.0
    total_val_loss = 0.0

    for train_data, train_labels, val_data, val_labels in tqdm(train_test_splits):
        train_dataset = TensorDataset(train_data, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_dataset = TensorDataset(val_data, val_labels)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Re-enter training mode for every split: the validation pass below
        # switches the model to eval mode, which would otherwise persist into
        # the next split's training pass.
        model.train()
        train_loss = 0.0
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            mask = _look_ahead_mask(data.size(0), data.size(1))

            optimizer.zero_grad()
            # NOTE(review): a bare squeeze() would also drop the batch axis for a
            # batch of one sample; not hit with the current dataset sizes.
            outputs = model(data, src_mask=mask).squeeze()
            if is_classification:
                labels = labels.long()  # NLLLoss expects integer class indices
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the division below yields a per-sample mean
            train_loss += loss.item() * data.size(0)

        total_train_loss += train_loss / len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data, labels in val_loader:
                data, labels = data.to(device), labels.to(device)
                mask = _look_ahead_mask(data.size(0), data.size(1))

                outputs = model(data, src_mask=mask).squeeze()
                if is_classification:
                    labels = labels.long()  # NLLLoss expects integer class indices
                loss = criterion(outputs, labels)
                val_loss += loss.item() * data.size(0)

        total_val_loss += val_loss / len(val_loader.dataset)

    average_train_loss = total_train_loss / len(train_test_splits)
    average_val_loss = total_val_loss / len(train_test_splits)
    
    print(f'Epoch {epoch+1}/{n_epochs}, '
          f'Average Train Loss: {average_train_loss:.4f}, '
          f'Average Validation Loss: {average_val_loss:.4f}')

    # Checkpoint on improvement; stop after `patience` epochs without one
    if average_val_loss < best_loss:
        best_loss = average_val_loss
        torch.save(model.state_dict(), best_model_path)
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print('Early stopping!')
        break

# Restore the weights of the best epoch
best_model_state = torch.load(best_model_path, map_location=device)
model.load_state_dict(best_model_state)

Using device: cpu


  0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
#Sharpe ratio objective function?
# for epoch in range(n_epochs):
#     model.train()
#     total_train_loss = 0.0
#     total_val_loss = 0.0

#     for train_data, train_labels, val_data, val_labels in train_test_splits:
#         # Make sure your labels are in the correct shape
#         train_labels = train_labels.view(-1, 1).to(device)
#         val_labels = val_labels.view(-1, 1).to(device)

#         # Training section
#         train_mask = ScaledMultiHeadAttention.create_look_ahead_mask(train_data.size(1)).to(device)
#         train_data = train_data.to(device)
        
#         optimizer.zero_grad()
        
#         weights = model(train_data, src_mask=train_mask)
#         abs_sum = torch.sum(torch.abs(weights), axis=1, keepdim=True) + 1e-8  # Avoid division by zero
#         weights = weights / abs_sum

#         rets = torch.sum(weights * train_labels, axis=1)  

#         mean_ret = torch.mean(rets)
#         std = torch.std(rets) + 1e-8  # Avoid division by zero
#         sharpe_ratio = mean_ret / std  

#         train_loss = -sharpe_ratio
#         train_loss.backward()
#         optimizer.step()

#         total_train_loss += train_loss.item()

#         # Validation section
#         model.eval()
#         with torch.no_grad():
#             val_mask = ScaledMultiHeadAttention.create_look_ahead_mask(val_data.size(1)).to(device)
#             val_data = val_data.to(device)

#             val_weights = model(val_data, src_mask=val_mask)
#             val_abs_sum = torch.sum(torch.abs(val_weights), axis=1, keepdim=True) + 1e-8  # Avoid division by zero
#             val_weights = val_weights / val_abs_sum
            
#             val_rets = torch.sum(val_weights * val_labels, axis=1)  

#             val_mean_ret = torch.mean(val_rets)
#             val_std = torch.std(val_rets) + 1e-8  # Avoid division by zero
#             val_sharpe_ratio = val_mean_ret / val_std

#             val_loss = -val_sharpe_ratio
#             total_val_loss += val_loss.item()

#     avg_train_loss = total_train_loss / len(train_test_splits)
#     avg_val_loss = total_val_loss / len(train_test_splits)

#     print(f"Epoch {epoch + 1}/{n_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

#     # Inside your training loop
#     if avg_val_loss < best_loss:
#         best_loss = avg_val_loss
#         torch.save(model.state_dict(), 'best_model.pth')
#         counter = 0
#     else:
#         counter += 1

#     if counter == patience:
#         print('Early stopping!')
#         break

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): these hyper-parameters must match the checkpoint being loaded.
# The training cell in this notebook saves 'best_model.pth' from a d_model=1,
# num_heads=1 regression model, so loading it into this architecture will
# presumably fail with a size mismatch - confirm which checkpoint is intended.
model = TimeSeriesTransformer(d_model=64, num_heads=8, d_ff=256, num_encoder_layers=2, 
                               dropout=.1, max_len=240,task_type='classification')
model.load_state_dict(torch.load('best_model.pth',map_location=torch.device('cpu')) )
# The inputs below are moved to `device`, so the model has to live there too.
model.to(device)
model.eval()

k = 10  # Number of top assets to select in portfolios


def _long_short_by_date(scored_df, k):
    """Split a scored frame into per-date long and short legs.

    For each date (in order of first appearance) rows are ranked by
    'predicted_prob' in descending order; the first k rows form the long leg
    and the last k rows the short leg.

    Returns:
        Tuple (longs, shorts) of lists of DataFrames, one entry per date.
    """
    longs, shorts = [], []
    for date in scored_df['date'].unique():
        ranked = scored_df[scored_df['date'] == date].sort_values(by='predicted_prob', ascending=False)
        longs.append(ranked.head(k))
        shorts.append(ranked.tail(k))
    return longs, shorts


def _concat_or_empty(frames):
    """Concatenate frames once; return an empty DataFrame when there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


# Per-date frames are collected in lists and concatenated once at the end,
# instead of re-copying an ever-growing DataFrame on every date.
in_sample_long_frames, in_sample_short_frames = [], []
out_of_sample_long_frames, out_of_sample_short_frames = [], []

for train_data, train_labels, val_data, val_labels in tqdm(train_test_splits):
    # NOTE(review): the training cell calls create_look_ahead_mask with
    # (batch_size, sequence_length) while this cell passes only the sequence
    # length - confirm which signature the helper actually has.
    train_mask = ScaledMultiHeadAttention.create_look_ahead_mask(train_data.size(1))
    val_mask = ScaledMultiHeadAttention.create_look_ahead_mask(val_data.size(1))

    with torch.no_grad():
        train_predictions = model(train_data.to(device), src_mask=train_mask.to(device))
        val_predictions = model(val_data.to(device), src_mask=val_mask.to(device))

        # Score each sample by the softmax value of output column 1
        train_probs = torch.softmax(train_predictions, dim=1)[:, 1].cpu().numpy()
        val_probs = torch.softmax(val_predictions, dim=1)[:, 1].cpu().numpy()

    # NOTE(review): train_df / val_df are not defined anywhere in this notebook.
    # They are expected to hold one row per sample (with 'date' and ticker
    # columns) aligned with train_data / val_data - must be provided upstream.
    train_df['predicted_prob'] = train_probs
    val_df['predicted_prob'] = val_probs

    # In-Sample Portfolio Construction
    longs, shorts = _long_short_by_date(train_df, k)
    in_sample_long_frames.extend(longs)
    in_sample_short_frames.extend(shorts)

    # Out-of-Sample Portfolio Construction
    longs, shorts = _long_short_by_date(val_df, k)
    out_of_sample_long_frames.extend(longs)
    out_of_sample_short_frames.extend(shorts)

in_sample_long_portfolios = _concat_or_empty(in_sample_long_frames)
out_of_sample_long_portfolios = _concat_or_empty(out_of_sample_long_frames)

in_sample_short_portfolios = _concat_or_empty(in_sample_short_frames)
out_of_sample_short_portfolios = _concat_or_empty(out_of_sample_short_frames)

# At this point, in_sample_long_portfolios, out_of_sample_long_portfolios, etc. hold your portfolios



  0%|          | 0/34 [00:00<?, ?it/s]

: 

In [None]:
from pathlib import Path

# Write each portfolio table to <results_dir>/<name>.csv. The directory is
# created first because to_csv does not create missing parent directories.
results_dir = Path('../data/transformer_results')
results_dir.mkdir(parents=True, exist_ok=True)

portfolios_to_save = {
    'in_sample_long_portfolios': in_sample_long_portfolios,
    'in_sample_short_portfolios': in_sample_short_portfolios,
    'out_of_sample_long_portfolios': out_of_sample_long_portfolios,
    'out_of_sample_short_portfolios': out_of_sample_short_portfolios,
}
for portfolio_name, portfolio_df in portfolios_to_save.items():
    portfolio_df.to_csv(results_dir / f'{portfolio_name}.csv')