# In [None]:
import torch
import torch.nn as nn

# Set seeds for reproducibility (both the torch and the numpy global RNGs).
# NOTE(review): `np` is not imported in the visible part of this file --
# confirm numpy is imported as `np` earlier, otherwise this is a NameError.
SEED = 100
torch.manual_seed(SEED)
np.random.seed(SEED)

# Define constants
embedding_dim = 16  # embedding dimension for ticker
num_heads = 4  # Number of attention heads
ff_dim = 256  # Hidden layer size in feed forward network inside transformer
# NOTE(review): `df` is not defined in the visible code -- presumably a pandas
# DataFrame with a 'Ticker_encoded' column; verify. `num_tickers` is not
# referenced anywhere else in the visible code.
num_tickers = len(df['Ticker_encoded'].unique())  # Number of unique tickers

# Define the input layers for each feature.
# Each layer is an nn.Linear over a 1-wide input. The ticker layer projects to
# `embedding_dim` outputs; every other feature layer produces a single output.
# NOTE(review): the ticker uses a Linear projection rather than an
# nn.Embedding lookup -- confirm this is intended for an encoded ticker id.
input_ticker = nn.Linear(1, embedding_dim)
input_volume = nn.Linear(1, 1)
input_eps = nn.Linear(1, 1)
input_pe_ratio = nn.Linear(1, 1)
input_revenue_growth = nn.Linear(1, 1)
input_operating_margin = nn.Linear(1, 1)
input_total_equity = nn.Linear(1, 1)
input_debt_levels = nn.Linear(1, 1)
input_roe = nn.Linear(1, 1)
input_net_cash = nn.Linear(1, 1)
input_unemployment_rate = nn.Linear(1, 1)
input_gdp = nn.Linear(1, 1)
input_month_sin = nn.Linear(1, 1)
input_month_cos = nn.Linear(1, 1)
input_day_of_week_sin = nn.Linear(1, 1)
input_day_of_week_cos = nn.Linear(1, 1)
input_day_of_year_sin = nn.Linear(1, 1)
input_day_of_year_cos = nn.Linear(1, 1)

# Combine all feature layers, including the ticker embedding, into one module.
# NOTE(review): nn.Sequential chains its modules (each one receives the output
# of the previous one); it does not concatenate their outputs. As written, the
# `embedding_dim`-wide output of `input_ticker` is passed on to `input_volume`,
# which is declared with in_features=1. Verify the intended design (e.g. apply
# each layer to its own feature column and join the results with torch.cat).
combined_input = nn.Sequential(
    input_ticker,
    nn.Flatten(),
    input_volume,
    input_eps,
    input_pe_ratio,
    input_revenue_growth,
    input_operating_margin,
    input_total_equity,
    input_debt_levels,
    input_roe,
    input_net_cash,
    input_unemployment_rate,
    input_gdp,
    input_month_sin,
    input_month_cos,
    input_day_of_week_sin,
    input_day_of_week_cos,
    input_day_of_year_sin,
    input_day_of_year_cos,
    nn.Flatten()
)

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input.
    Each projection is split into ``num_heads`` heads of
    ``embed_dim // num_heads`` features, attention is computed per head, and
    the heads are merged again and passed through a final linear layer.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Size of the last dimension of the input; must be divisible
            by ``num_heads``.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, num_heads, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * self.num_heads == self.embed_dim, (
            "Embedding dimension must be divisible by the number of heads"
        )

        # Query / key / value projections followed by the output projection.
        self.wq = nn.Linear(self.embed_dim, self.embed_dim)
        self.wk = nn.Linear(self.embed_dim, self.embed_dim)
        self.wv = nn.Linear(self.embed_dim, self.embed_dim)
        self.fc = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_len, embed_dim = x.size()

        def split_heads(projected):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).permute(0, 2, 1, 3)

        query = split_heads(self.wq(x))
        key = split_heads(self.wk(x))
        value = split_heads(self.wv(x))

        # head_dim is a plain Python int; torch.sqrt only accepts tensors, so
        # the scaling factor is computed with ordinary float arithmetic.
        scale = self.head_dim ** 0.5
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        attention_weights = torch.softmax(attention_scores, dim=-1)

        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        attention_output = (
            torch.matmul(attention_weights, value)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(batch_size, seq_len, embed_dim)
        )

        output = self.fc(attention_output)
        return output

# Instantiate the multi-head attention layer
attention = MultiHeadAttention(num_heads, embedding_dim)


class _AddNorm(nn.Module):
    """Residual connection followed by layer normalisation.

    Computes ``LayerNorm(sublayer(x) + x)`` so that the "Add & Normalize" step
    is a real module whose parameters are registered with the model.

    Args:
        sublayer: Module applied to the input; its output must have the same
            shape as its input.
        dim: Size of the last dimension, passed to ``nn.LayerNorm``.
    """

    def __init__(self, sublayer, dim):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return self.norm(self.sublayer(x) + x)


# Add & Normalize around the attention layer
attention_block = _AddNorm(attention, embedding_dim)
# NOTE(review): `combined_input_reshaped` is not defined in the visible code --
# presumably a (batch, seq_len, embedding_dim) tensor built elsewhere; verify.
attn_output = attention_block(combined_input_reshaped)

# Feed Forward Network (FFN)
ffn = nn.Sequential(
    nn.Linear(embedding_dim, ff_dim),
    nn.ReLU(),
    nn.Linear(ff_dim, embedding_dim)
)

# Add & Normalize around the feed forward network
ffn_block = _AddNorm(ffn, embedding_dim)
transformer_output = ffn_block(attn_output)

# Flatten the output for the Dense layer
transformer_output_flat = transformer_output.view(transformer_output.size(0), -1)

# Output layer for predicting 'Adj. Close'
output = nn.Linear(transformer_output_flat.size(1), 1)

# Build the model.
# nn.Sequential only accepts modules, so the flattening step is expressed as
# nn.Flatten() rather than the already-computed tensor, and the residual +
# LayerNorm wrappers are used so their parameters are part of the model.
# NOTE(review): the eager pass above starts from `combined_input_reshaped`,
# whereas the model starts from `combined_input`; confirm whether a reshape
# step is needed between `combined_input` and `attention_block`.
model = nn.Sequential(
    combined_input,
    attention_block,
    ffn_block,
    nn.Flatten(),
    output
)

# Train with Mean Squared Error for regression.
# Other metrics can be used; to be discussed on the 8:30 call.
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

# Print the model summary
print(model)