### Self Attention demo

In [None]:
import torch
import torch.nn.functional as F

# Toy sequence: three tokens, each described by three features.
input_sequence = torch.tensor(
    [
        [0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9],
    ]
)
embed_dim = input_sequence.size(-1)

# Random square projection matrices, drawn in the order key -> query -> value.
random_weights_key = torch.randn(embed_dim, embed_dim)
random_weights_query = torch.randn(embed_dim, embed_dim)
random_weights_value = torch.randn(embed_dim, embed_dim)

# Project the same input three different ways.
key = input_sequence @ random_weights_key
query = input_sequence @ random_weights_query
value = input_sequence @ random_weights_value

# Dot product of every query row with every key row, scaled by sqrt(feature size).
scale = torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32))
attention_scores = (query @ key.T) / scale

# Normalise each row of scores into weights that sum to 1.
attention_weights = F.softmax(attention_scores, dim=-1)

# Each output row is the weight-averaged combination of the value rows.
output = attention_weights @ value

print("Output after self-attention:")
print(output)

Output after self-attention:
tensor([[-0.7295,  1.0207,  1.1616],
        [-0.7925,  1.1056,  1.2716],
        [-0.8489,  1.1816,  1.3702]])


## Positional Encoding Implementation

In [None]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is built once and stored as the
    buffer ``pe``. Even feature columns hold ``sin(position * div_term)`` and
    odd feature columns hold ``cos(position * div_term)``, where
    ``div_term = exp(-ln(10000) * 2i / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Compute positional encodings
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd column than even columns, so trim
        # div_term to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: it is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the encoding of its first ``x.size(1)`` positions added.

        Args:
            x: Tensor whose dimension 1 indexes positions and whose last
                dimension has size ``d_model``; the leading singleton axis of
                ``pe`` broadcasts over dimension 0.

        Returns:
            A tensor of the same shape as ``x``.
        """
        # The encoding is added exactly once (the input must not be doubled).
        return x + self.pe[:, :x.size(1)]

# Demo configuration (num_heads is defined here but not used in this cell)
d_model, max_len, num_heads = 512, 100, 8

# Encoder with a table covering max_len positions
pos_encoder = PositionalEncoding(d_model, max_len)

# Random batch of 5 sequences, max_len positions each, passed straight
# through the positional encoder
input_sequence = pos_encoder(torch.randn(5, max_len, d_model))

print("Positional Encoding of input sequence:")
print(input_sequence.shape)

Positional Encoding of input sequence:
torch.Size([5, 100, 512])


## Multi-Head Attention Implementation

In [None]:
# Code implementation of Multi-Head Attention
import math
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Query, key and value are each passed through their own linear layer, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended per
    head, merged back to ``d_model`` features and passed through an output
    linear layer.

    Args:
        d_model: Feature size of the inputs and outputs.
        num_heads: Number of heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = d_model // num_heads

        # Linear projections for query, key, and value
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)

        # Output linear projection
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, depth)``."""
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.depth).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: Tensor of shape ``(batch, query_len, d_model)``.
            key: Tensor of shape ``(batch, key_len, d_model)``.
            value: Tensor of shape ``(batch, key_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, query_len, key_len)``. Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        # Linear projections
        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        # Split heads
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)

        # Scaled dot-product attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)

        # Apply mask if provided. This must REPLACE the scores: accumulating
        # the filled copy onto `scores` would double every unmasked score.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Softmax over the key axis turns scores into attention weights
        attention_weights = torch.softmax(scores, dim=-1)

        # Apply attention to values
        attention_output = torch.matmul(attention_weights, value)

        # Merge heads: (batch, heads, seq, depth) -> (batch, seq, d_model)
        batch_size, _, seq_length, _ = attention_output.size()
        attention_output = (
            attention_output.transpose(1, 2)
            .contiguous()
            .view(batch_size, seq_length, self.d_model)
        )

        # Linear projection
        return self.output_linear(attention_output)

# Demo hyperparameters (d_ff is defined here but not used in this cell)
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Attention module under test
multihead_attn = MultiHeadAttention(d_model, num_heads)

# Random batch: 5 sequences of max_len tokens with d_model features each
input_sequence = torch.randn(5, max_len, d_model)

# Self-attention: the same tensor serves as query, key and value
attention_output = multihead_attn(
    query=input_sequence,
    key=input_sequence,
    value=input_sequence,
)
print("attention_output shape:", attention_output.shape)

attention_output shape: torch.Size([5, 100, 512])


## Feed Forward Implementation

In [None]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    The first layer maps ``d_model`` features to ``d_ff``; the second maps
    them back to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = self.linear1(x)
        activated = self.relu(hidden)
        return self.linear2(activated)

# Demo hyperparameters
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Build the two sub-layers: attention first, then the feed-forward network
multihead_attn = MultiHeadAttention(d_model, num_heads)
ff_network = FeedForward(d_model, d_ff)

# Random batch: 5 sequences of max_len tokens with d_model features each
input_sequence = torch.randn(5, max_len, d_model)

# Self-attention (query = key = value), then the feed-forward network
attention_output = multihead_attn(input_sequence, input_sequence, input_sequence)
output_ff = ff_network(attention_output)

# The feed-forward output keeps the input's shape
for label, shown in (('input_sequence', input_sequence), ("output_ff", output_ff)):
    print(label, shown.shape)

input_sequence torch.Size([5, 100, 512])
output_ff torch.Size([5, 100, 512])


## Encoder Implementation

In [None]:
import torch.nn as nn

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input, and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # A single dropout module is shared by both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is forwarded to self-attention."""
        # Sub-layer 1: self-attention (query = key = value = x)
        attended = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward network
        transformed = self.feed_forward(x)
        return self.norm2(x + self.dropout(transformed))

# Demo hyperparameters
d_model, max_len, num_heads, d_ff = 512, 100, 8, 2048

# Single encoder layer with dropout probability 0.1
encoder_layer = EncoderLayer(d_model, num_heads, d_ff, 0.1)

# Random batch holding one sequence of max_len tokens
input_sequence = torch.randn(1, max_len, d_model)

# Run the encoder layer without an attention mask
encoder_output = encoder_layer(input_sequence, None)
print("encoder output shape:", encoder_output.shape)

encoder output shape: torch.Size([1, 100, 512])


## Decoder Implementation

In [None]:
import torch.nn as nn

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input, and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.masked_self_attention = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        # A single dropout module is shared by all three sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run ``x`` through the block.

        ``tgt_mask`` is passed to the self-attention over ``x``; ``src_mask``
        is passed to the attention over ``encoder_output``.
        """
        # Sub-layer 1: self-attention over the decoder input
        self_attended = self.masked_self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        cross_attended = self.enc_dec_attention(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Sub-layer 3: feed-forward network
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

# Define the DecoderLayer parameters
d_model = 512  # Dimensionality of the model
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimensionality of the feed-forward network
dropout = 0.1  # Dropout probability
batch_size = 1 # Batch Size
max_len = 100  # Max length of Sequence

# Define the DecoderLayer instance
decoder_layer = DecoderLayer(d_model, num_heads, d_ff, dropout)

# Mask convention: MultiHeadAttention blocks every position where the mask
# equals 0, so True/non-zero means "may attend".
# src_mask is random and only exercises the masking path of the demo.
src_mask = torch.rand(batch_size, max_len, max_len) > 0.5
# Causal mask: True on and below the diagonal, so position i may attend to
# positions <= i. (Comparing the lower triangle with 0 would invert this and
# expose only future positions.)
tgt_mask = torch.tril(torch.ones(max_len, max_len)).unsqueeze(0).bool()

# Pass the input tensors through the DecoderLayer.
# `input_sequence` and `encoder_output` come from the encoder cell above.
output = decoder_layer(input_sequence, encoder_output, src_mask, tgt_mask)

# Output shape
print("Output shape:", output.shape)

Output shape: torch.Size([1, 100, 512])


## Transformer Implementation

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer assembled from the layers defined in this notebook.

    Source and target token ids are embedded, combined with positional
    encodings, passed through ``num_layers`` encoder and decoder layers, and
    projected to ``tgt_vocab_size`` scores per target position. Token id 0 is
    treated as padding when the attention masks are built.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size
            of the output projection.
        d_model: Feature size used throughout the model.
        num_heads: Number of attention heads in every attention module.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the feed-forward sub-layers.
        max_len: Number of positions covered by the positional encoding.
        dropout: Dropout probability used in the layers and on the embeddings.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_len, dropout):
        super(Transformer, self).__init__()

        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # One positional-encoding module is shared by source and target.
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.linear = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from token ids (True = may attend).

        Args:
            src: Integer tensor of shape ``(batch, src_len)``.
            tgt: Integer tensor of shape ``(batch, tgt_len)``.

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            ``(batch, 1, 1, src_len)`` and marks non-zero source tokens, and
            ``tgt_mask`` has shape ``(batch, 1, tgt_len, tgt_len)`` and is True
            where the query token is non-zero and the key position is not
            after the query position.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Create the look-ahead mask on the same device as the tokens so the
        # `&` below also works when the inputs live on an accelerator.
        nopeak_mask = (
            1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)
        ).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return scores of shape ``(batch, tgt_len, tgt_vocab_size)`` for token-id inputs."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        # Embed + positional encoding + dropout for the source side
        encoder_embedding = self.encoder_embedding(src)
        en_positional_encoding = self.positional_encoding(encoder_embedding)
        src_embedded = self.dropout(en_positional_encoding)

        # Same pipeline for the target side
        decoder_embedding = self.decoder_embedding(tgt)
        de_positional_encoding = self.positional_encoding(decoder_embedding)
        tgt_embedded = self.dropout(de_positional_encoding)

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Every decoder layer attends to the final encoder output.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.linear(dec_output)
        return output

# Demo hyperparameters
src_vocab_size = tgt_vocab_size = 5000
d_model, num_heads, num_layers, d_ff = 512, 8, 6, 2048
max_len, dropout = 100, 0.1

transformer = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_len=max_len,
    dropout=dropout,
)

# Random sample data of shape (batch_size, seq_length); ids start at 1, so
# id 0 never occurs in these batches
src_data = torch.randint(1, src_vocab_size, (5, max_len))
tgt_data = torch.randint(1, tgt_vocab_size, (5, max_len))

# Feed the target without its last token
demo_logits = transformer(src_data, tgt_data[:, :-1])
print("Transformer model output shape:", demo_logits.shape)

Transformer model output shape: torch.Size([5, 99, 5000])


## Training and Evaluation of Transformer Model

In [None]:
import torch.optim as optim
import torch.nn.functional as F

# Cross-entropy that ignores targets equal to 0, optimised with Adam
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

# ---- Training: 10 steps on the same batch ----
transformer.train()

# The model reads tokens [0, L-1) and is scored against tokens [1, L)
decoder_input = tgt_data[:, :-1]
target_tokens = tgt_data[:, 1:].contiguous().view(-1)

for epoch in range(10):
    optimizer.zero_grad()
    output = transformer(src_data, decoder_input)
    flat_logits = output.contiguous().view(-1, tgt_vocab_size)
    loss = criterion(flat_logits, target_tokens)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}: Loss= {loss.item():.4f}")


# ---- Evaluation on a fresh random batch ----
src_data = torch.randint(1, src_vocab_size, (5, max_len))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (5, max_len))  # (batch_size, seq_length)

transformer.eval()
with torch.no_grad():
    output = transformer(src_data, tgt_data[:, :-1])
    flat_logits = output.contiguous().view(-1, tgt_vocab_size)
    loss = criterion(flat_logits, tgt_data[:, 1:].contiguous().view(-1))
    print(f"\nEvaluation Loss for dummy data= {loss.item():.4f}")

Epoch 1: Loss= 8.6726
Epoch 2: Loss= 8.1731
Epoch 3: Loss= 7.8844
Epoch 4: Loss= 7.6743
Epoch 5: Loss= 7.4506
Epoch 6: Loss= 7.1867
Epoch 7: Loss= 6.9412
Epoch 8: Loss= 6.6558
Epoch 9: Loss= 6.4241
Epoch 10: Loss= 6.1700

Evaluation Loss for dummy data= 8.7536


## BERT Implementation

In [None]:
from transformers import BertModel, BertTokenizer

# Load the pretrained tokenizer and model weights for 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize one sentence and return PyTorch tensors
inputs = tokenizer("Hello, world!", return_tensors="pt")

# Inference only: disable autograd so no computation graph is recorded for
# the forward pass
with torch.no_grad():
    outputs = model(**inputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Echo the result of the BERT forward pass from the previous cell; as the
# cell's last expression, its repr is rendered as the cell output.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0781,  0.1587,  0.0400,  ..., -0.2805,  0.0248,  0.4081],
         [-0.2016,  0.1781,  0.4184,  ..., -0.2522,  0.3630, -0.0979],
         [-0.7156,  0.6751,  0.6017,  ..., -1.1032,  0.0797,  0.0567],
         [ 0.0527, -0.1483,  1.3609,  ..., -0.4513,  0.1274,  0.2655],
         [-0.7122, -0.4815, -0.1438,  ...,  0.5602, -0.1062, -0.1301],
         [ 0.9955,  0.1328, -0.0621,  ...,  0.2460, -0.6502, -0.3296]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8130, -0.2470, -0.7289,  0.5582,  0.3357, -0.0758,  0.7851,  0.1526,
         -0.5705, -0.9997, -0.3183,  0.7643,  0.9550,  0.5801,  0.9046, -0.6037,
         -0.3113, -0.5445,  0.3740, -0.4197,  0.5471,  0.9996,  0.0560,  0.2710,
          0.3869,  0.9316, -0.7260,  0.8900,  0.9311,  0.5901, -0.5208,  0.0532,
         -0.9711, -0.1791, -0.8414, -0.9663,  0.2318, -0.6239,  0.0885,  0.1203,
         -0.8333,  0.1662,  0.9993,  0.1384,  

## GPT2 Implementation

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and language-model head
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize the prompt once and return PyTorch tensors
input_text = "Once upon a time, "
inputs = tokenizer(input_text, return_tensors='pt')

# Generate up to 100 new tokens. Passing pad_token_id explicitly makes the
# padding choice visible instead of relying on generate()'s implicit fallback
# to eos_token_id, which emits a warning on every call.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (only) generated sequence back to text
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Alias of the encoding above, kept in case a later cell uses this name;
# the prompt is no longer tokenized a second time.
input_ids = inputs

output

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Once upon a time, \xa0I was a little bit of a fan of the original series, but I was also a little bit of a fan of the original series. I was a little bit of a fan of the original series, but I was also a little bit of a fan of the original series. I was a little bit of a fan of the original series, but I was also a little bit of a fan of the original series. I was a little bit of a fan of the original series, but I'