In [3]:
# Task 1

import torch
import torch.nn as nn

# The multi-head self-attention module
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs each go through their own
    ``d_model -> d_model`` linear layer, are split into ``n_heads`` heads of
    size ``d_model // n_heads``, attended per head, merged back together and
    passed through a final ``d_model -> d_model`` linear layer.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        n_heads: Number of attention heads; must evenly divide ``d_model``.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide
            ``d_model``. Without this check the head reshape in ``forward``
            either fails late or silently folds features into the sequence
            axis.
    """

    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        # Linear projections (creation order kept so seeded initialisation
        # matches earlier versions of this class).
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        # Output linear layer
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, bs):
        """Reshape ``(bs, seq, d_model)`` into ``(bs, n_heads, seq, head_dim)``."""
        return x.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose last
                dimension is ``d_model`` (the reshape below treats everything
                in between as the sequence axis).
            mask: Optional tensor broadcastable to
                ``(batch, n_heads, q_len, k_len)``. Positions where
                ``mask == 0`` are excluded from attention. ``None`` (the
                default) means every position is visible.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        bs = q.size(0)

        # Linear projections, split per head
        k = self._split_heads(self.k_linear(k), bs)
        q = self._split_heads(self.q_linear(q), bs)
        v = self._split_heads(self.v_linear(v), bs)

        # Scaled dot-product attention; a plain Python float avoids building a
        # fresh tensor on every call.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # The dtype's most negative finite value works for half precision
            # too, where a literal such as -1e9 is not representable.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        weights = nn.functional.softmax(scores, dim=-1)
        attention = torch.matmul(weights, v)

        # Concatenate heads and apply the output projection
        concat_attention = attention.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat_attention)


# The point-wise feed-forward module
class PositionwiseFeedforward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Computes ``linear2(dropout(relu(linear1(x))))``.

    Args:
        d_model: Size of the input and output features.
        d_ff: Size of the hidden layer.
        dropout: Dropout probability applied after the ReLU. Defaults to
            ``0.1``, the value that was previously hard-coded.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedforward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Return the transformed tensor; the shape of ``x`` is preserved."""
        hidden = nn.functional.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))


# Combine attention and feed forward to make transformer module called GPT2Layer
class GPT2Layer(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual add.

    The feed-forward hidden size is ``4 * d_model``. No layer normalisation
    is applied in this block.

    Args:
        d_model: Feature size of the hidden states.
        n_heads: Number of attention heads handed to ``MultiHeadAttention``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = PositionwiseFeedforward(d_model, 4 * d_model)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is forwarded to the attention module."""
        # Residual connection around self-attention (x serves as q, k and v).
        attended = x + self.self_attn(x, x, x, mask)

        # Residual connection around the position-wise feed-forward network.
        return attended + self.feedforward(attended)

# Using our custom transformer module, we can create the GPT2 model
class GPT2(nn.Module):
    """Token embedding followed by a stack of ``GPT2Layer`` blocks.

    The model returns the hidden states produced by the last block; it has no
    positional encoding and no projection back to the vocabulary.

    Args:
        d_model: Embedding / hidden size.
        n_heads: Attention heads per layer.
        num_layers: Number of stacked ``GPT2Layer`` blocks.
        vocab_size: Number of rows in the token embedding table. Defaults to
            ``50257``, the value that was previously hard-coded.
    """

    def __init__(self, d_model=768, n_heads=12, num_layers=12, vocab_size=50257):
        super(GPT2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList([GPT2Layer(d_model, n_heads) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        """Embed token ids and run them through every layer.

        Args:
            x: Integer token ids; the last dimension is the sequence axis.
            mask: Optional attention mask passed unchanged to every layer
                (positions where ``mask == 0`` are hidden). When omitted, a
                lower-triangular (causal) ``(seq, seq)`` mask is built so each
                position only sees itself and earlier positions. This lets
                callers such as the training loops invoke ``model(inputs)``.

        Returns:
            The hidden states after the last layer; the embedding makes the
            last dimension ``d_model``.
        """
        if mask is None:
            seq_len = x.size(-1)
            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))

        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x



In [None]:
# Task 2


import torch
import torch.nn as nn
import torch.nn.functional as F

# For all three alterations, a common pitfall is that their effectiveness may vary depending on the nature of the data,
# and all of them increase time/memory requirements.

# Rotary Positional Embedding doesn't significantly affect the model size, although run time may increase,
# because it involves additional computations without introducing new parameters.
# Ideally, it can capture longer-term dependencies in sequences
# (due to the introduction of a rotation-invariant positional encoding).
class RotaryPositionalEmbedding(nn.Module):
    """Parameter-free rotary positional encoding.

    Each position ``p`` rotates feature pairs ``(x[i], x[i + d/2])`` by the
    angle ``p * freq ** (-2i / d_model)``. Two properties follow from it being
    a rotation: vector norms are preserved, and the dot product between a
    vector rotated for position ``m`` and one rotated for position ``n``
    depends only on ``m - n``.

    The previous body multiplied a ``(1, seq)`` position row by a
    ``(d_model / 2,)`` frequency vector, which only broadcasts when
    ``seq == d_model / 2``, and then *added* sinusoids instead of rotating.

    Args:
        d_model: Size of the last dimension of the inputs; must be even so
            the features can be paired.

    Raises:
        ValueError: If ``d_model`` is not a positive even number.
    """

    def __init__(self, d_model):
        super(RotaryPositionalEmbedding, self).__init__()
        if d_model <= 0 or d_model % 2 != 0:
            raise ValueError(f"d_model must be a positive even number, got {d_model}")
        self.d_model = d_model

    def forward(self, x, freq=10000):
        """Rotate ``x`` according to the position of each element.

        Args:
            x: Tensor of shape ``(..., seq, d_model)``; the second-to-last
                dimension is taken as the position axis.
            freq: Base of the geometric frequency progression.

        Returns:
            Tensor with the same shape and dtype as ``x``.

        Raises:
            ValueError: If the last dimension of ``x`` is not ``d_model``.
        """
        seq_len, dim = x.size(-2), x.size(-1)
        if dim != self.d_model:
            raise ValueError(f"expected last dimension {self.d_model}, got {dim}")
        half = dim // 2

        # Angles are computed in float32 on the input's device, then cast, so
        # half-precision inputs do not lose positional resolution.
        positions = torch.arange(seq_len, device=x.device, dtype=torch.float32)
        exponents = torch.arange(half, device=x.device, dtype=torch.float32) / half
        inv_freq = float(freq) ** (-exponents)            # freq ** (-2i / d_model)
        angles = positions.unsqueeze(-1) * inv_freq       # (seq, d_model / 2)
        cos = angles.cos().to(x.dtype)
        sin = angles.sin().to(x.dtype)

        first, second = x[..., :half], x[..., half:]
        rotated_first = first * cos - second * sin
        rotated_second = first * sin + second * cos
        return torch.cat([rotated_first, rotated_second], dim=-1)

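# Group Query Attention increases the size of the model, as it introduces additional parameters,
# and it also increases run time due to additional computations.
# Because it allows attending to groups of queries simultaneously, it can potentially improve the model's ability to capture diverse information.
# NOTE(review): grouped-query attention normally shares each key/value head across a group of query heads,
# which does not add parameters relative to multi-head attention -- revisit the size claim above.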
class GroupQueryAttention(nn.Module):
    """Grouped-query attention.

    Queries use ``n_heads`` heads while keys and values use ``n_kv_heads``
    heads; each key/value head is shared by ``n_heads // n_kv_heads``
    consecutive query heads. With ``n_kv_heads == n_heads`` (the default) this
    is ordinary multi-head attention; with fewer key/value heads the
    ``linear_k`` / ``linear_v`` projections shrink to
    ``n_kv_heads * head_dim`` outputs.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        n_heads: Number of query heads; must divide ``d_model``.
        n_kv_heads: Number of key/value heads; must divide ``n_heads``.
            ``None`` means ``n_heads``.

    Raises:
        ValueError: If the head counts are not positive or do not divide as
            described above.
    """

    def __init__(self, d_model, n_heads, n_kv_heads=None):
        super(GroupQueryAttention, self).__init__()
        if n_kv_heads is None:
            n_kv_heads = n_heads
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        if n_kv_heads <= 0 or n_heads % n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({n_heads}) must be divisible by a positive n_kv_heads ({n_kv_heads})"
            )

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = d_model // n_heads
        self.group_size = n_heads // n_kv_heads

        kv_dim = n_kv_heads * self.head_dim
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, kv_dim)
        self.linear_v = nn.Linear(d_model, kv_dim)

        self.linear_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query, key, value: Tensors whose first dimension is the batch and
                whose last dimension is ``d_model``.
            mask: Optional tensor broadcastable to
                ``(batch, n_heads, q_len, k_len)``; positions where
                ``mask == 0`` are excluded.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = query.shape[0]

        Q = self.linear_q(query).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.linear_k(key).view(batch_size, -1, self.n_kv_heads, self.head_dim).transpose(1, 2)
        V = self.linear_v(value).view(batch_size, -1, self.n_kv_heads, self.head_dim).transpose(1, 2)

        if self.group_size > 1:
            # Query head h reads key/value head h // group_size.
            K = K.repeat_interleave(self.group_size, dim=1)
            V = V.repeat_interleave(self.group_size, dim=1)

        # Scale by the per-head dimension, matching MultiHeadAttention; the
        # previous sqrt(d_model) flattened the softmax by a factor sqrt(n_heads).
        energy = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = F.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Merge the heads back into the feature dimension.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.linear_out(x)

# Sliding Window Attention increases size due to the modification in attention mechanisms.
# It allows the model to attend to a specified range, which can be useful for processing long sequences efficiently.
# A major pitfall/design consideration is that information outside the sliding window can be lost,
# and the choice of window size is critical. Also, larger window sizes increase computational requirements.
# NOTE(review): the module below has the same four linear layers as MultiHeadAttention,
# so the claim that it increases model size should be double-checked.
class SlidingWindowAttention(nn.Module):
    """Multi-head attention restricted to a causal sliding window.

    Query position ``i`` may attend to key positions ``j`` with
    ``0 <= i - j < window_size``: itself and the ``window_size - 1`` positions
    before it.

    The previous body kept only keys at least ``window_size`` positions in the
    *future* (the mask was inverted), ignored ``n_heads``, and returned a
    tensor with an extra leading dimension because a 4-D mask was combined
    with 3-D scores.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        n_heads: Number of attention heads; must divide ``d_model``.
        window_size: Number of positions (including the current one) each
            query can see; must be at least 1.

    Raises:
        ValueError: If ``n_heads`` does not divide ``d_model`` or
            ``window_size`` is smaller than 1.
    """

    def __init__(self, d_model, n_heads, window_size):
        super(SlidingWindowAttention, self).__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")

        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.window_size = window_size

        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)

        self.linear_out = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, n_heads, seq, head_dim)``."""
        return x.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over the windowed part of ``key``/``value``.

        Args:
            query, key, value: Tensors of shape ``(batch, seq, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, n_heads, q_len, k_len)``; positions where
                ``mask == 0`` are excluded in addition to the window.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size, q_len, _ = query.size()
        k_len = key.size(1)

        Q = self._split_heads(self.linear_q(query), batch_size)
        K = self._split_heads(self.linear_k(key), batch_size)
        V = self._split_heads(self.linear_v(value), batch_size)

        energy = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # offset[i, j] = i - j; the window keeps 0 <= offset < window_size.
        q_pos = torch.arange(q_len, device=query.device).unsqueeze(1)
        k_pos = torch.arange(k_len, device=query.device).unsqueeze(0)
        offset = q_pos - k_pos
        outside_window = (offset < 0) | (offset >= self.window_size)

        fill = torch.finfo(energy.dtype).min
        if mask is not None:
            energy = energy.masked_fill(mask == 0, fill)
        energy = energy.masked_fill(outside_window, fill)

        attention = F.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Merge the heads back into the feature dimension.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.linear_out(x)

    
# Create a new GPT2Layer_task2 (transformer module) with support for the above 3 mechanisms
class GPT2Layer_task2(nn.Module):
    """Transformer block with optional rotary positions, grouped-query or sliding-window attention.

    Structure (layer norm before each sub-layer, residual add after it)::

        h = layer_norm1(x)                -> positional information -> attention
        x = x + dropout(attention(...))
        x = x + dropout(feed_forward(layer_norm2(x)))

    Problems fixed relative to the previous body: ``super()`` named the wrong
    class, the feed-forward class name did not exist, an unused second
    attention module was registered when sliding-window attention was
    selected, and ``forward`` referenced ``token_embedding`` / ``layers`` /
    ``fc`` attributes that a single layer never defines.

    Args:
        d_model: Feature size of the hidden states.
        n_heads: Number of attention heads.
        hidden_dim: Hidden size of the feed-forward network; ``None`` means
            ``4 * d_model``.
        rotary_positional_embedding: Use ``RotaryPositionalEmbedding`` instead
            of a learned ``nn.Embedding(512, d_model)`` position table.
        group_query_attention: Use ``GroupQueryAttention`` instead of
            ``MultiHeadAttention``.
        sliding_window_attention: Use ``SlidingWindowAttention``; takes
            precedence over ``group_query_attention`` when both are set.
        window_size: Window for sliding-window attention. Defaults to ``5``,
            the value that was previously hard-coded.
    """

    def __init__(self, d_model, n_heads, hidden_dim=None, rotary_positional_embedding=False,
                 group_query_attention=False, sliding_window_attention=False, window_size=5):
        super(GPT2Layer_task2, self).__init__()
        if hidden_dim is None:
            hidden_dim = 4 * d_model

        # Register exactly one attention module so no unused parameters exist.
        if sliding_window_attention:
            self.attention = SlidingWindowAttention(d_model, n_heads, window_size=window_size)
        elif group_query_attention:
            self.attention = GroupQueryAttention(d_model, n_heads)
        else:
            self.attention = MultiHeadAttention(d_model, n_heads)

        self.feed_forward = PositionwiseFeedforward(d_model, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

        self.use_rotary = bool(rotary_positional_embedding)
        if self.use_rotary:
            self.positional_embedding = RotaryPositionalEmbedding(d_model)
        else:
            self.positional_embedding = nn.Embedding(512, d_model)  # Learned positions, at most 512 tokens

    def forward(self, x, mask):
        """Apply the block to hidden states ``x``.

        Args:
            x: Hidden states of shape ``(batch, seq, d_model)``.
            mask: Attention mask forwarded unchanged to the attention module.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If learned positions are used and ``seq`` exceeds the
                size of the position table.
        """
        h = self.layer_norm1(x)

        if self.use_rotary:
            # The attention modules project q/k/v internally, so the positional
            # module is applied to the tensor that feeds the query and key
            # projections; the value path stays position-free.
            qk_input = self.positional_embedding(h)
            v_input = h
        else:
            seq_len = h.size(1)
            max_len = self.positional_embedding.num_embeddings
            if seq_len > max_len:
                raise ValueError(
                    f"sequence length {seq_len} exceeds the positional table size {max_len}"
                )
            positions = torch.arange(seq_len, device=h.device)
            qk_input = v_input = h + self.positional_embedding(positions)

        x = x + self.dropout(self.attention(qk_input, qk_input, v_input, mask))
        x = x + self.dropout(self.feed_forward(self.layer_norm2(x)))
        return x


# Using GPT2Layer_task2, we can create the GPT2 model for task 2
class GPT2_task2(nn.Module):
    """Token embedding followed by a stack of ``GPT2Layer_task2`` blocks.

    Fixes relative to the previous body: ``super()`` named the wrong class,
    and the layers were built without the ``hidden_dim`` argument. The three
    Task 2 switches are now exposed and forwarded to every layer.

    Args:
        d_model: Embedding / hidden size.
        n_heads: Attention heads per layer.
        num_layers: Number of stacked layers.
        rotary_positional_embedding: Forwarded to every layer.
        group_query_attention: Forwarded to every layer.
        sliding_window_attention: Forwarded to every layer.
    """

    def __init__(self, d_model=768, n_heads=12, num_layers=12, rotary_positional_embedding=False,
                 group_query_attention=False, sliding_window_attention=False):
        super(GPT2_task2, self).__init__()
        self.embedding = nn.Embedding(50257, d_model)
        self.layers = nn.ModuleList([
            GPT2Layer_task2(
                d_model,
                n_heads,
                4 * d_model,  # feed-forward hidden size, same ratio as GPT2Layer
                rotary_positional_embedding=rotary_positional_embedding,
                group_query_attention=group_query_attention,
                sliding_window_attention=sliding_window_attention,
            )
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        """Embed token ids and run them through every layer.

        Args:
            x: Integer token ids; the last dimension is the sequence axis.
            mask: Optional attention mask passed unchanged to every layer.
                When omitted, a lower-triangular (causal) ``(seq, seq)`` mask
                is used.

        Returns:
            The hidden states after the last layer (no vocabulary projection
            is applied).
        """
        if mask is None:
            seq_len = x.size(-1)
            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))

        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x



In [None]:
# task 3

import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Function to create a model and optimizer
def create_model_optimizer(lr=5e-5):
    """Build a default-configured ``GPT2`` and an AdamW optimizer over its parameters.

    Args:
        lr: Learning rate handed to ``optim.AdamW``.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    net = GPT2()
    return net, optim.AdamW(net.parameters(), lr=lr)

# Function to train the model on a single GPU
def train_single_gpu(model, optimizer, criterion, dataloader, device):
    """Run one pass over ``dataloader``, updating ``model`` in place on ``device``.

    Args:
        model: Module called as ``model(inputs)``.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable invoked as ``criterion(outputs, targets)`` that
            returns a tensor supporting ``backward()``.
        dataloader: Iterable yielding ``(inputs, targets)`` pairs of tensors.
        device: Device the model and every batch are moved to.

    Returns:
        None. The model is left in training mode.
    """
    model.to(device)
    model.train()

    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

# Function to train the model using Distributed Data Parallel (DDP)
def train_ddp(model, optimizer, criterion, dataloader, device):
    """Run one pass over the data with DistributedDataParallel on this process's GPU.

    Meant to run once per process under a distributed launcher (for example
    ``torchrun``) that provides the rendezvous environment variables.

    Fixes relative to the previous body:
      * the ``model`` and ``optimizer`` arguments are used instead of being
        replaced by freshly created ones;
      * the model is moved to the GPU before being wrapped;
      * the GPU index comes from ``LOCAL_RANK`` rather than the global rank;
      * ``shuffle=True`` is no longer passed together with ``sampler=``, which
        ``DataLoader`` rejects; shuffling is done by the sampler;
      * a process group created here is destroyed on exit, so a later call
        can initialise a new one.

    Args:
        model: Module to train; moved to this process's GPU in place.
        optimizer: Optimizer already built over ``model.parameters()``.
        criterion: Callable invoked as ``criterion(outputs, targets)``.
        dataloader: Source of ``dataset``, ``batch_size`` and ``num_workers``
            for the per-process loader built here.
        device: Ignored; the device is always this process's CUDA device.
            Kept so the signature matches the other training functions.
    """
    import os

    # Only create (and later tear down) the process group if nobody else did.
    started_here = not torch.distributed.is_initialized()
    if started_here:
        torch.distributed.init_process_group(backend='nccl')

    try:
        local_rank = os.environ.get("LOCAL_RANK")
        if local_rank is None:
            # Launcher did not export LOCAL_RANK: fall back to the global rank
            # folded onto the GPUs visible to this process.
            local_rank = torch.distributed.get_rank() % max(torch.cuda.device_count(), 1)
        local_rank = int(local_rank)

        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)

        # Moving the module keeps its Parameter objects, so the optimizer
        # passed in still refers to the parameters being trained.
        model = DistributedDataParallel(model.to(device), device_ids=[local_rank], output_device=local_rank)

        # Each process gets its own shard of the dataset; the sampler shuffles.
        sampler = torch.utils.data.distributed.DistributedSampler(dataloader.dataset, shuffle=True)
        dataloader = torch.utils.data.DataLoader(dataloader.dataset, batch_size=dataloader.batch_size,
                                                 sampler=sampler, num_workers=dataloader.num_workers,
                                                 pin_memory=True)

        # Training loop
        model.train()
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    finally:
        if started_here:
            torch.distributed.destroy_process_group()

# Function to train the model using Fully Sharded Data Parallel (FSDP)
def train_fsdp(model, optimizer, criterion, dataloader, device):
    """Run one pass over the data with FullyShardedDataParallel on this process's GPU.

    Meant to run once per process under a distributed launcher (for example
    ``torchrun``) that provides the rendezvous environment variables.

    Fixes relative to the previous body:
      * ``FullyShardedDataParallel`` is imported (it was an undefined name);
      * FSDP is given ``device_id``; it has no ``device_ids`` /
        ``output_device`` arguments;
      * the optimizer is built *after* wrapping, because FSDP replaces the
        module's parameters with sharded ones;
      * the ``model`` argument is used instead of a freshly created one;
      * the GPU index comes from ``LOCAL_RANK`` rather than the global rank;
      * ``shuffle=True`` is no longer passed together with ``sampler=``;
      * a process group created here is destroyed on exit.

    Args:
        model: Module to shard and train.
        optimizer: Template optimizer. Its class and default hyper-parameters
            are reused to build a new optimizer over the sharded parameters;
            per-group overrides are not carried over, and the object passed in
            is not stepped.
        criterion: Callable invoked as ``criterion(outputs, targets)``.
        dataloader: Source of ``dataset``, ``batch_size`` and ``num_workers``
            for the per-process loader built here.
        device: Ignored; the device is always this process's CUDA device.
            Kept so the signature matches the other training functions.
    """
    import inspect
    import os

    from torch.distributed.fsdp import FullyShardedDataParallel

    # Only create (and later tear down) the process group if nobody else did.
    started_here = not torch.distributed.is_initialized()
    if started_here:
        torch.distributed.init_process_group(backend='nccl')

    try:
        local_rank = os.environ.get("LOCAL_RANK")
        if local_rank is None:
            # Launcher did not export LOCAL_RANK: fall back to the global rank
            # folded onto the GPUs visible to this process.
            local_rank = torch.distributed.get_rank() % max(torch.cuda.device_count(), 1)
        local_rank = int(local_rank)

        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)

        # FSDP moves the module to ``device_id`` and shards its parameters.
        model = FullyShardedDataParallel(model, device_id=local_rank)

        # Rebuild the optimizer over the sharded parameters. Only defaults
        # that the optimizer's constructor accepts are passed back, since
        # ``defaults`` may contain internal entries.
        optimizer_cls = type(optimizer)
        accepted = inspect.signature(optimizer_cls.__init__).parameters
        hyperparams = {name: value for name, value in optimizer.defaults.items() if name in accepted}
        optimizer = optimizer_cls(model.parameters(), **hyperparams)

        # Each process gets its own shard of the dataset; the sampler shuffles.
        sampler = torch.utils.data.distributed.DistributedSampler(dataloader.dataset, shuffle=True)
        dataloader = torch.utils.data.DataLoader(dataloader.dataset, batch_size=dataloader.batch_size,
                                                 sampler=sampler, num_workers=dataloader.num_workers,
                                                 pin_memory=True)

        # Training loop
        model.train()
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    finally:
        if started_here:
            torch.distributed.destroy_process_group()

# Sample dataset and dataloader
class SampleDataset(torch.utils.data.Dataset):
    """Synthetic dataset of random integer sequences.

    ``data`` and ``targets`` are independent ``(num_samples, seq_length)``
    tensors of integers drawn from ``[0, 50257)``.

    Args:
        num_samples: Number of sequences.
        seq_length: Length of every sequence.
    """

    def __init__(self, num_samples=100, seq_length=10):
        shape = (num_samples, seq_length)
        # Inputs are drawn first, then targets, each with its own randint call.
        self.data = torch.randint(50257, shape)
        self.targets = torch.randint(50257, shape)

    def __len__(self):
        """Return the number of sequences."""
        return self.data.size(0)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.data[index]
        label = self.targets[index]
        return sample, label

import os

# Sample data: random token ids served in shuffled batches of 4.
dataset = SampleDataset()
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)

# Define loss criterion
criterion = nn.CrossEntropyLoss()

# Choose the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training on a single GPU
model_single_gpu, optimizer_single_gpu = create_model_optimizer()
train_single_gpu(model_single_gpu, optimizer_single_gpu, criterion, dataloader, device)

# DDP and FSDP use the NCCL backend and the default environment-variable
# rendezvous, so they can only run on CUDA inside a process started by a
# distributed launcher (e.g. ``torchrun``). Running them unconditionally made
# a plain notebook/script run crash in ``init_process_group``.
launched_distributed = (
    torch.cuda.is_available()
    and "RANK" in os.environ
    and "WORLD_SIZE" in os.environ
)

if launched_distributed:
    # Training using Distributed Data Parallel (DDP)
    model_ddp, optimizer_ddp = create_model_optimizer()
    train_ddp(model_ddp, optimizer_ddp, criterion, dataloader, device)

    # Training using Fully Sharded Data Parallel (FSDP)
    model_fsdp, optimizer_fsdp = create_model_optimizer()
    train_fsdp(model_fsdp, optimizer_fsdp, criterion, dataloader, device)
else:
    print("Skipping DDP and FSDP training: requires CUDA and a distributed launcher (RANK/WORLD_SIZE not set).")
