<a href="https://colab.research.google.com/github/HarshitaBadiyasar/SuperAGI-Assignment/blob/main/Coding%20TASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
## TASK 1

In [15]:
import torch
import torch.nn as nn
import math

# Configuration for a small GPT-2 model
class GPT2Config:
    """Hyperparameters for the GPT-2 style model.

    Calling ``GPT2Config()`` with no arguments yields the GPT-2 "small"
    settings. Any field can be overridden by keyword, e.g.
    ``GPT2Config(n_layers=2, n_embd=64)`` for a quick experiment.

    Attributes:
        vocab_size: Number of rows in the token embedding table.
        max_position_embeddings: Number of rows in the position embedding
            table, i.e. the longest sequence that can be embedded.
        n_layers: Number of transformer blocks.
        n_heads: Number of attention heads per block.
        n_embd: Width of the hidden state.
        layer_norm_epsilon: ``eps`` passed to every ``nn.LayerNorm``.
        initializer_range: Stored on the config; the ``GPT2`` class in this
            file does not read it.
    """

    def __init__(
        self,
        *,
        vocab_size=50257,
        max_position_embeddings=1024,
        n_layers=12,
        n_heads=12,
        n_embd=768,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

    def __repr__(self):
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({fields})"

# Scaled dot-product attention function
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d)) V``.

    The last two dimensions of each input are treated as
    ``(sequence, features)``; any leading dimensions are batch dimensions,
    so both ``(batch, seq, dim)`` and ``(batch, heads, seq, dim)`` inputs
    work.

    Args:
        query: Tensor of shape ``(..., q_len, d)``.
        key: Tensor of shape ``(..., k_len, d)``.
        value: Tensor of shape ``(..., k_len, d_v)``.
        mask: Optional tensor broadcastable to ``(..., q_len, k_len)``.
            Non-zero / ``True`` entries mark key positions a query may
            attend to; all other positions get zero attention weight.
            ``None`` (the default) lets every query attend to every key.
            Each query row must keep at least one key, otherwise its
            softmax is NaN.

    Returns:
        Tensor of shape ``(..., q_len, d_v)``.
    """
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
    if mask is not None:
        # -inf scores become exactly zero weight after the softmax.
        scores = scores.masked_fill(mask.logical_not(), float("-inf"))
    weights = torch.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

# Single attention head
class AttentionHead(nn.Module):
    """A single self-attention head with learned query/key/value projections.

    Args:
        embd_dim: Size of the last dimension of the incoming hidden state.
        head_dim: Output width of the query, key and value projections.
            Defaults to ``embd_dim`` (each head works at full model width).
            Pass e.g. ``embd_dim // n_heads`` for the narrower heads used in
            standard multi-head attention.
    """

    def __init__(self, embd_dim, head_dim=None):
        super().__init__()
        if head_dim is None:
            head_dim = embd_dim
        self.query = nn.Linear(embd_dim, head_dim)
        self.key = nn.Linear(embd_dim, head_dim)
        self.value = nn.Linear(embd_dim, head_dim)

    def forward(self, hidden_state):
        """Project ``hidden_state`` to Q, K and V and attend over it.

        Args:
            hidden_state: Tensor whose last dimension is ``embd_dim``;
                the model in this file passes ``(batch, seq, embd_dim)``.

        Returns:
            The attention output, with the projection width as its last
            dimension.
        """
        return scaled_dot_product_attention(
            self.query(hidden_state), self.key(hidden_state), self.value(hidden_state)
        )

# Multi-head attention
class MultiHeadAttention(nn.Module):
    """Run several attention heads side by side and mix their outputs.

    Every head is an ``AttentionHead(embd_dim)``, so each one produces an
    ``embd_dim``-wide output. The head outputs are concatenated along the
    last dimension (``n_heads * embd_dim`` wide) and projected back to
    ``embd_dim`` by a single linear layer.

    Args:
        embd_dim: Width of the hidden state.
        n_heads: Number of attention heads.
    """

    def __init__(self, embd_dim, n_heads):
        super().__init__()
        self.heads = nn.ModuleList(AttentionHead(embd_dim) for _ in range(n_heads))
        concat_width = n_heads * embd_dim
        self.linear = nn.Linear(concat_width, embd_dim)

    def forward(self, hidden_state):
        """Return the output projection of the concatenated head outputs."""
        per_head = [head(hidden_state) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.linear(merged)

# Pointwise Feed Forward layer
class PointwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    The input is expanded from ``embd_dim`` to ``ff_dim``, passed through a
    ReLU, and projected back to ``embd_dim``.

    Args:
        embd_dim: Input and output width.
        ff_dim: Width of the hidden layer.
    """

    def __init__(self, embd_dim, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(embd_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embd_dim)

    def forward(self, hidden_state):
        """Return ``linear2(relu(linear1(hidden_state)))``."""
        expanded = self.linear1(hidden_state)
        activated = torch.relu(expanded)
        return self.linear2(activated)

# Transformer block
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection and the layer
    normalisation is applied *after* the residual sum.

    Args:
        embd_dim: Width of the hidden state.
        n_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        layer_norm_epsilon: ``eps`` for both layer norms.
    """

    def __init__(self, embd_dim, n_heads, ff_dim, layer_norm_epsilon):
        super().__init__()
        self.attention = MultiHeadAttention(embd_dim, n_heads)
        self.feed_forward = PointwiseFeedForward(embd_dim, ff_dim)
        self.layer_norm1 = nn.LayerNorm(embd_dim, eps=layer_norm_epsilon)
        self.layer_norm2 = nn.LayerNorm(embd_dim, eps=layer_norm_epsilon)

    def forward(self, hidden_state):
        """Apply the attention and feed-forward sub-layers in sequence."""
        # Sub-layer 1: attention + residual, then normalise.
        attended = self.layer_norm1(hidden_state + self.attention(hidden_state))
        # Sub-layer 2: feed-forward + residual, then normalise.
        return self.layer_norm2(attended + self.feed_forward(attended))

# GPT-2 model
class GPT2(nn.Module):
    """Stack of transformer blocks over token + learned position embeddings.

    The model returns the final hidden states; it has no language-model
    head.

    Args:
        config: Object exposing ``vocab_size``, ``max_position_embeddings``,
            ``n_embd``, ``n_heads``, ``n_layers`` and ``layer_norm_epsilon``
            (for example a ``GPT2Config``).
    """

    def __init__(self, config):
        super().__init__()
        self.embd_dim = config.n_embd
        self.token_embedding = nn.Embedding(config.vocab_size, self.embd_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, self.embd_dim)
        # The feed-forward hidden width is fixed at four times the model width.
        self.blocks = nn.ModuleList([
            TransformerBlock(self.embd_dim, config.n_heads, 4 * self.embd_dim, config.layer_norm_epsilon)
            for _ in range(config.n_layers)
        ])
        self.layer_norm = nn.LayerNorm(self.embd_dim, eps=config.layer_norm_epsilon)

    def forward(self, input_ids, positions_ids=None):
        """Embed ``input_ids`` and run them through every block.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq)``.
            positions_ids: Optional integer tensor of position indices,
                broadcastable against ``input_ids``. When omitted, positions
                ``0 .. seq - 1`` are used, so ``seq`` must not exceed the
                size of the position embedding table.

        Returns:
            Tensor of shape ``(batch, seq, n_embd)`` holding the
            layer-normalised output of the last block.
        """
        if positions_ids is None:
            # Build the indices directly on the input's device rather than
            # allocating on the CPU and copying on every forward pass.
            positions_ids = torch.arange(
                input_ids.size(1), device=input_ids.device
            ).unsqueeze(0)
        tokens = self.token_embedding(input_ids)
        positions = self.position_embedding(positions_ids)

        x = tokens + positions

        for block in self.blocks:
            x = block(x)

        return self.layer_norm(x)

# Example usage
if __name__ == "__main__":
    # Build the default configuration and the model it describes.
    config = GPT2Config()
    model = GPT2(config)

    # One batch holding a single 1024-token sequence of random token ids
    # drawn from [0, vocab_size).
    input_ids = torch.randint(config.vocab_size, (1, 1024))

    # Run a forward pass and display the resulting hidden states.
    output = model(input_ids)
    print(output)


tensor([[[ 0.9850, -1.3691,  0.2589,  ...,  1.3280, -1.0538,  0.6917],
         [ 0.8623,  0.3825, -1.0204,  ...,  0.1670, -0.1596, -1.5403],
         [-1.8322, -0.9128, -1.1802,  ..., -0.4246, -0.1565, -0.3061],
         ...,
         [-0.8388, -1.6704, -1.3986,  ..., -0.3119,  0.8182, -0.4821],
         [ 1.7029,  0.7901, -1.1380,  ...,  0.0844,  0.6409,  1.0030],
         [ 0.6698, -0.1128, -0.7353,  ...,  0.6422,  1.6727,  0.0911]]],
       grad_fn=<NativeLayerNormBackward0>)


In [7]:
## TASK 2

In [16]:
#Rotary Positional Embedding
import torch

def apply_rotary_pos_emb(x, sincos):
    """Rotate consecutive feature pairs of ``x`` by position-dependent angles.

    The last dimension of ``x`` is read as interleaved pairs
    ``(x[2i], x[2i + 1])``. Each pair is rotated by the angle whose sine and
    cosine are given at index ``i``::

        out[2i]     = x[2i] * cos[i] - x[2i + 1] * sin[i]
        out[2i + 1] = x[2i + 1] * cos[i] + x[2i] * sin[i]

    Args:
        x: Tensor whose last dimension has even size ``d``.
        sincos: Pair ``(sin, cos)`` of tensors whose last dimension is
            ``d // 2`` and which broadcast against ``x`` once that dimension
            has been expanded to ``d``.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # Duplicate each angle so it lines up with both members of its pair.
    sin, cos = (t.repeat_interleave(2, dim=-1) for t in sincos)
    x_even = x[..., 0::2]
    x_odd = x[..., 1::2]
    # Partner term of the 2-D rotation: (a, b) -> (-b, a), re-interleaved.
    rotated = torch.stack((-x_odd, x_even), dim=-1).flatten(-2)
    return (x * cos) + (rotated * sin)


In [17]:
#Group Query Attention
def group_query_attention(query, key, value, num_groups):
    """Attend with several query groups that all share one key/value set.

    Dimension 2 of ``query`` is split into ``num_groups`` equal chunks.
    Every chunk is passed, together with the same ``key`` and ``value``, to
    ``scaled_dot_product_attention``, and the per-group results are
    concatenated along the last dimension.

    Args:
        query: Tensor whose dimension 2 is divisible by ``num_groups``
            (presumably ``(batch, seq, num_groups * group_size)``).
        key: Keys shared by all groups; the feature size must match one
            query chunk for the dot product to be defined.
        value: Values shared by all groups.
        num_groups: Number of query groups.

    Returns:
        The per-group attention outputs joined along the last dimension.
    """
    per_group = query.size(2) // num_groups
    grouped = query.view(*query.size()[:2], num_groups, per_group)

    # One attention pass per query group, all against the shared key/value.
    outputs = [
        scaled_dot_product_attention(group_query, key, value)
        for group_query in grouped.unbind(dim=2)
    ]

    return torch.cat(outputs, dim=-1)


In [18]:
#Sliding Window Attention
def sliding_window_attention(query, key, value, window_size):
    """Attention where each position only sees a local, centred window.

    Query position ``i`` attends to key positions ``j`` with
    ``|i - j| <= window_size // 2``; windows are clipped at the sequence
    boundaries. Scores are plain (unscaled) dot products; divide ``query``
    by ``sqrt(dim)`` beforehand if scaled attention is wanted.

    The full ``(seq, seq)`` score matrix is computed and positions outside
    the window are masked out, so memory grows quadratically with the
    sequence length.

    Args:
        query: Tensor of shape ``(batch, seq, dim)``.
        key: Tensor of shape ``(batch, seq, dim)``.
        value: Tensor of shape ``(batch, seq, value_dim)``.
        window_size: Nominal window width, at least 1. An even value covers
            ``window_size + 1`` positions because the window is symmetric
            around the query position.

    Returns:
        Tensor of shape ``(batch, seq, value_dim)``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    seq_length = query.size(1)
    half_window = window_size // 2

    # outside[i, j] is True when key j lies beyond the window centred on i.
    positions = torch.arange(seq_length, device=query.device)
    outside = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs() > half_window

    attention_scores = torch.bmm(query, key.transpose(1, 2))
    # -inf becomes exactly zero weight after the softmax; the diagonal is
    # always inside the window, so no row is fully masked.
    attention_scores = attention_scores.masked_fill(outside, float("-inf"))
    attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1)

    return torch.bmm(attention_probs, value)


In [19]:
## TASK 3

In [12]:
# Distributed Data Parallel

In [13]:
# Fully Sharded Data Parallel