Transformer (Multi-head Attention Class)
EC523 Project
Team 2

In [1]:
# Import Libraries

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from torch.utils.data import DataLoader, TensorDataset


Transformer Class

In [9]:
class Transformer(nn.Module):
  def __init__(self, input_dim, hidden_dim, num_heads, output_dim):
    super(Transformer, self).__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.num_heads = num_heads
    self.output_dim = output_dim

    self.multihead_attn = nn.MultiheadAttention(embed_dim=self.input_dim, num_heads=self.num_heads, batch_first=True)


  def forward(self,x):
    attn_output, attn_weights = self.multihead_attn(x,x,x)

    return attn_output, attn_weights


Generate Random X and Y

In [10]:
Width, Height = 10, 10
Images = 30

X = np.random.rand(Images, Width, Height).reshape(-1, Width * Height)
Y = np.random.rand(Images, Width, Height).reshape(-1, Width * Height)
print(f'Shape of Flattened Image:', X.shape)

X_tensor = torch.tensor(X, dtype=torch.float32)
Y_tensor = torch.tensor(Y, dtype=torch.float32)

dataset = TensorDataset(X_tensor, Y_tensor)
dataloader = DataLoader(dataset, batch_size=20, shuffle=True)

Shape of Flattened Image: (30, 100)


Instaniate Class (Parameters)

In [11]:
# Instantiate Model Class
transformer_model = Transformer(input_dim=100, hidden_dim=120, num_heads=4, output_dim=100)

# Set Optimizer and Training Loss
loss = nn.MSELoss()
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

# Training Parameters
num_epochs = 100
batch_size = 20

In [12]:
# Print Model Parameters
for name, param in transformer_model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"Shape: {param.shape}")

Parameter name: multihead_attn.in_proj_weight
Shape: torch.Size([300, 100])
Parameter name: multihead_attn.in_proj_bias
Shape: torch.Size([300])
Parameter name: multihead_attn.out_proj.weight
Shape: torch.Size([100, 100])
Parameter name: multihead_attn.out_proj.bias
Shape: torch.Size([100])


Train Model

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
transformer_model.to(device)

cuda


Transformer(
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
)

In [14]:
# Model Training

for epoch in range(num_epochs):
    transformer_model.train()
    epoch_loss = 0.0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        inputs = inputs.view(inputs.size(0), -1, 100)  # Reshape for input_dim=100

        optimizer.zero_grad()
        outputs, _ = transformer_model(inputs)
        loss_value = loss(outputs, targets)
        loss_value.backward()
        optimizer.step()

        epoch_loss += loss_value.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(dataloader):.4f}")



  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/100], Loss: 0.3330
Epoch [2/100], Loss: 0.2551
Epoch [3/100], Loss: 0.1953
Epoch [4/100], Loss: 0.1530
Epoch [5/100], Loss: 0.1268
Epoch [6/100], Loss: 0.1110
Epoch [7/100], Loss: 0.1034
Epoch [8/100], Loss: 0.1009
Epoch [9/100], Loss: 0.0987
Epoch [10/100], Loss: 0.1001
Epoch [11/100], Loss: 0.0997
Epoch [12/100], Loss: 0.1001
Epoch [13/100], Loss: 0.0972
Epoch [14/100], Loss: 0.0948
Epoch [15/100], Loss: 0.0930
Epoch [16/100], Loss: 0.0916
Epoch [17/100], Loss: 0.0917
Epoch [18/100], Loss: 0.0909
Epoch [19/100], Loss: 0.0903
Epoch [20/100], Loss: 0.0900
Epoch [21/100], Loss: 0.0898
Epoch [22/100], Loss: 0.0892
Epoch [23/100], Loss: 0.0879
Epoch [24/100], Loss: 0.0870
Epoch [25/100], Loss: 0.0875
Epoch [26/100], Loss: 0.0863
Epoch [27/100], Loss: 0.0874
Epoch [28/100], Loss: 0.0864
Epoch [29/100], Loss: 0.0864
Epoch [30/100], Loss: 0.0865
Epoch [31/100], Loss: 0.0859
Epoch [32/100], Loss: 0.0856
Epoch [33/100], Loss: 0.0861
Epoch [34/100], Loss: 0.0853
Epoch [35/100], Loss: 0

Model Testing