<a href="https://colab.research.google.com/github/Giovanni26/Colab/blob/main/self_training_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Standard libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [17]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with learned linear maps, split into
    ``num_heads`` slices of width ``d_k = d_model // num_heads``, attended
    independently per head, merged back and projected once more.

    Args:
        d_model: Size of the input embeddings (last dimension of Q, K and V).
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Every head gets an equal slice of the model dimension, so the split must be exact
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Initialize dimensions
        self.d_model = d_model  # Model's dimension, this is the size of the input embeddings
        self.num_heads = num_heads  # Number of attention heads

        # Dimension of each head's key, query, and value.
        # Here they are all equal, but lower-dimensional projection spaces are also possible.
        self.d_k = d_model // num_heads

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model)  # Query transformation
        self.W_k = nn.Linear(d_model, d_model)  # Key transformation
        self.W_v = nn.Linear(d_model, d_model)  # Value transformation
        self.W_o = nn.Linear(d_model, d_model)  # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

        Args:
            Q, K, V: Per-head tensors; the last two dimensions are
                (sequence length, d_k).
            mask: Optional tensor broadcastable to the attention scores.
                Positions where ``mask == 0`` receive (effectively) zero
                attention weight.

        Returns:
            The attention-weighted combination of ``V``.
        """
        # Calculate attention scores, scaled so their magnitude does not grow with d_k
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            # Use the most negative value the score dtype can hold: a fixed -1e9
            # is not representable in float16 and makes masked_fill raise there.
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        # Softmax over the key axis turns scores into attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Multiply by values to obtain the final output
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: (batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() leaves the tensor non-contiguous, so make it contiguous before view()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q, K, V: Tensors of shape (batch, seq, d_model).
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch, query seq, d_model).
        """
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        return self.W_o(self.combine_heads(attn_output))


In [22]:
# Exploring how a tensor reports its size
# The example tensor is laid out as (batch_size, seq_length, d_model):
#   batch_size -> number of sequences in a batch
#   seq_length -> length of each sequence
#   d_model    -> dimensionality of the model
batch_size, seq_length, d_model = 4, 10, 64

# A tensor is a multi-dimensional array; this one has three dimensions:
# 4 sequences, each with 10 positions, each position a 64-dimensional vector.
x = torch.randn((batch_size, seq_length, d_model))

# Read the three sizes back from the tensor itself
batch_size, seq_length, d_model = x.shape

# Show what was recovered
print("Batch Size:", batch_size)
print("Sequence Length:", seq_length)
print("Model Dimension:", d_model)

# Building a small class
class MyTransformer:
    """Tiny helper that only knows how to split a tensor into attention heads."""

    def __init__(self, num_heads, d_k):
        # Number of heads and the width of each head
        self.num_heads, self.d_k = num_heads, d_k

    def split_heads(self, x):
        """Turn a (batch, seq, d_model) tensor into (batch, num_heads, seq, d_k).

        The last dimension is cut into ``num_heads`` chunks of width ``d_k``,
        then the head axis is moved in front of the sequence axis.
        """
        n_batch, n_positions, _ = x.shape
        per_head = x.view(n_batch, n_positions, self.num_heads, self.d_k)
        # Swap the sequence and head axes
        return per_head.permute(0, 2, 1, 3)

# Build the helper: 8 heads of width 8, i.e. 8 * 8 = 64 features in total
transformer = MyTransformer(d_k=8, num_heads=8)

# Random example input with shape (batch_size, seq_length, d_model)
x = torch.randn((4, 10, 64))

# Split the last dimension into heads
x_1 = transformer.split_heads(x)

# Show the resulting shape
print(x_1.size())  # Should be (4, 8, 10, 8)



Batch Size: 4
Sequence Length: 10
Model Dimension: 64
torch.Size([4, 8, 10, 8])


torch.Size([2, 8])

In [29]:
# Random 2 x 4 x 4 tensor used as the running example for `view`
a = torch.randn((2, 4, 4))
a.shape  # bare expression: the value is discarded because it is not the cell's last line
print(a)



tensor([[[ 0.5375,  0.3533, -1.0756,  1.5183],
         [ 0.0887, -1.0855, -1.3476,  1.4100],
         [-0.3008, -0.1183, -0.2767, -1.0378],
         [-1.4390, -0.3711,  0.0848, -1.4813]],

        [[ 2.0656,  0.7584,  0.8162,  0.5624],
         [ 2.5525, -0.2654,  1.2728, -0.6032],
         [ 1.4448,  0.7006,  0.7994,  0.6087],
         [-0.4891,  0.8860, -0.8667,  0.9676]]])


In [30]:
# Collapse the two trailing 4 x 4 dimensions of `a` into a single one.
# The size -1 is inferred from the other dimensions: 32 elements / 2 rows = 16.
z = a.view(2, -1)
print(z)

tensor([[ 0.5375,  0.3533, -1.0756,  1.5183,  0.0887, -1.0855, -1.3476,  1.4100,
         -0.3008, -0.1183, -0.2767, -1.0378, -1.4390, -0.3711,  0.0848, -1.4813],
        [ 2.0656,  0.7584,  0.8162,  0.5624,  2.5525, -0.2654,  1.2728, -0.6032,
          1.4448,  0.7006,  0.7994,  0.6087, -0.4891,  0.8860, -0.8667,  0.9676]])


In [33]:
# View all 32 elements of `a` as a single flat vector
y = a.view((32,))
y.shape  # bare expression: the value is discarded because it is not the cell's last line
print(y)

tensor([ 0.5375,  0.3533, -1.0756,  1.5183,  0.0887, -1.0855, -1.3476,  1.4100,
        -0.3008, -0.1183, -0.2767, -1.0378, -1.4390, -0.3711,  0.0848, -1.4813,
         2.0656,  0.7584,  0.8162,  0.5624,  2.5525, -0.2654,  1.2728, -0.6032,
         1.4448,  0.7006,  0.7994,  0.6087, -0.4891,  0.8860, -0.8667,  0.9676])
