In [1]:
#!pip install torch

In [2]:
#first install torch in independent cell

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        batch_size = query.shape[0]

        # Linear transformations
        Q = self.query(query)
        K = self.key(key)
        V = self.value(value)

        # Split into heads
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled Dot-Product Attention
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))

        attention = torch.nn.functional.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Reshape and concatenate
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.d_model)

        # Final linear layer
        x = self.fc_out(x)

        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Self-attention
        attention = self.self_attention(x, x, x, mask)
        x = x + self.dropout(attention)
        x = self.layer_norm1(x)

        # Feedforward
        ffn_output = self.ffn(x)
        x = x + self.dropout(ffn_output)
        x = self.layer_norm2(x)

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, n_layers, max_seq_length, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, n_heads, ff_dim, dropout)
            for _ in range(n_layers)
        ])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.position_embedding = nn.Embedding(max_seq_length, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        positions = torch.arange(0, x.size(1)).expand(x.size(0), x.size(1)).to(self.device)
        x = x + self.position_embedding(positions)
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        return x

# Example usage:
d_model = 512
n_heads = 8
ff_dim = 2048
n_layers = 6
max_seq_length = 100
dropout = 0.1

# Create transformer encoder
transformer_encoder = TransformerEncoder(d_model, n_heads, ff_dim, n_layers, max_seq_length, dropout)

# Dummy input
input_data = torch.rand((16, 100, d_model))

# Mask for padding
padding_mask = (input_data.sum(dim=-1) != 0).unsqueeze(1).unsqueeze(2)

# Forward pass
output_data = transformer_encoder(input_data, padding_mask)
print(output_data)
print("Output shape:", output_data.shape)





tensor([[[ 1.1657e-01, -3.6905e-01, -1.2601e+00,  ...,  2.3323e-01,
          -7.1668e-02, -2.0650e-01],
         [-4.2603e-01,  5.1927e-01, -1.1892e+00,  ...,  8.8188e-01,
           5.2130e-01, -7.2420e-01],
         [-1.9452e-01,  1.1631e+00, -3.8681e-01,  ...,  2.9459e-01,
           1.3314e+00, -2.3014e-01],
         ...,
         [-4.6561e-01,  1.4225e-01, -1.3854e+00,  ...,  5.2771e-01,
           1.1409e+00, -2.8436e-01],
         [-7.0931e-01, -5.6777e-01,  2.4200e+00,  ...,  1.1207e+00,
           1.8961e+00, -2.5267e-02],
         [ 9.8844e-01,  9.9537e-01,  6.8374e-01,  ...,  2.0445e+00,
           2.6545e+00, -3.2404e-01]],

        [[ 7.6275e-02, -3.1531e-01, -5.8475e-01,  ..., -4.7538e-01,
           3.9984e-01,  4.0923e-01],
         [-3.2142e-01,  7.5179e-02,  3.0848e-02,  ...,  4.6124e-01,
           5.1557e-01, -9.2494e-01],
         [-3.8310e-01,  8.7626e-01, -3.7387e-01,  ..., -8.1742e-02,
           9.9139e-01,  2.5840e-01],
         ...,
         [-8.0533e-01, -2

In [3]:
# Definition: A deep learning model architecture based on self-attention, widely used in NLP tasks.
# Key Components:
# Self-Attention: Computes attention scores among input tokens.
# Positional Encoding: Adds positional information to input tokens.
# Multi-Head Attention: Allows the model to focus on different parts of the input simultaneously.
# Feed-Forward Networks: Apply additional transformation layers.
# Layer Normalization: Normalizes the output of each layer to stabilize training.