In [7]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first tensor.

    A ``(max_len, d_model)`` table is precomputed once: even feature columns
    hold ``sin(pos * w_i)`` and odd feature columns hold ``cos(pos * w_i)``,
    where ``w_i = exp(-2i * ln(10000) / d_model)``. The table is registered
    as a buffer, so it follows the module across devices but is not a
    trainable parameter.

    Args:
        d_model: Size of the last (feature) dimension of the inputs. Both
            even and odd values are supported.
        max_len: Number of positions to precompute. Inputs with more than
            ``max_len`` entries along dim 0 cannot be encoded.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        # Shape (max_len, ceil(d_model / 2)): one angle per position/frequency.
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the positional encoding added.

        Args:
            x: Tensor laid out as ``(seq_len, batch, d_model)``; dim 0 is
                used as the position index.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Keep only as many positions as the input sequence has (dim 0).
        pe = self.pe[:x.size(0), :]
        # Insert a singleton batch axis so the same encoding broadcasts over
        # every sequence in the batch.
        return x + pe.unsqueeze(1)


class TransformerModel(nn.Module):
    """Transformer encoder followed by a per-position sigmoid output head.

    Pipeline: positional encoding -> ``nn.TransformerEncoder`` -> LayerNorm
    -> Linear -> ReLU -> LayerNorm -> Linear -> LayerNorm -> Sigmoid.

    The encoder layers are built with PyTorch's default sequence-first
    layout, so inputs are expected as ``(seq_len, batch, d_model)`` and the
    result is ``(seq_len, batch, num_outputs)``. Each output element passes
    through its own sigmoid, so values lie in (0, 1) but are not normalised
    to sum to 1 across the output dimension.

    Args:
        d_model: Feature size of the inputs and of the encoder.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each encoder layer's feedforward net.
        dropout: Dropout probability used inside the encoder layers.
        num_outputs: Number of values produced per position. Defaults to 4,
            the size that was previously hard-coded.
    """

    def __init__(self, d_model, nhead, num_layers, dim_feedforward, dropout,
                 num_outputs=4):
        super().__init__()
        self.positional_encoding = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )
        self.fc1 = nn.Linear(d_model, d_model)
        self.fc2 = nn.Linear(d_model, num_outputs)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(num_outputs)
        # NOTE: this module is kept as an attribute for compatibility, but
        # forward() does not apply it; dropout only happens inside the
        # encoder layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the encoder and output head.

        Args:
            x: Tensor of shape ``(seq_len, batch, d_model)``.

        Returns:
            Tensor of shape ``(seq_len, batch, num_outputs)`` with every
            element in (0, 1).
        """
        x = self.positional_encoding(x)
        x = self.transformer_encoder(x)
        # Hidden projection with normalisation before and after.
        x = self.layer_norm1(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.layer_norm2(x)
        # Output projection, normalised and squashed element-wise.
        x = self.fc2(x)
        x = self.layer_norm3(x)
        return self.sigmoid(x)

# Hyperparameters for the demo model
d_model = 64           # feature width of every token vector
nhead = 4              # attention heads per encoder layer
num_layers = 2         # stacked encoder layers
dim_feedforward = 128  # hidden width of each layer's feedforward net
dropout = 0.1          # dropout probability

# Build the model from the hyperparameters above
transformer_model = TransformerModel(
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
)

# Random sequence-first input: sequence length 3, batch size 5.
# The model is left in its default training mode, so dropout is active and
# the printed tensor carries a grad_fn.
input_data = torch.randn((3, 5, d_model))
output = transformer_model(input_data)
print(output)  # shape (3, 5, 4): four sigmoid outputs per position

tensor([[[0.3662, 0.3439, 0.3691, 0.8495],
         [0.7622, 0.2599, 0.6935, 0.2820],
         [0.3063, 0.3167, 0.4831, 0.8394],
         [0.7795, 0.1937, 0.6236, 0.4153],
         [0.4415, 0.2148, 0.5066, 0.8182]],

        [[0.3910, 0.3230, 0.3685, 0.8484],
         [0.2421, 0.5304, 0.3704, 0.8249],
         [0.4578, 0.5106, 0.2068, 0.8132],
         [0.5820, 0.1646, 0.7506, 0.5478],
         [0.6296, 0.2013, 0.3941, 0.7821]],

        [[0.3424, 0.4856, 0.8380, 0.2823],
         [0.7089, 0.2268, 0.3232, 0.7456],
         [0.7538, 0.2519, 0.2893, 0.7044],
         [0.5254, 0.3717, 0.2433, 0.8260],
         [0.3579, 0.4156, 0.3132, 0.8469]]], grad_fn=<SigmoidBackward0>)
