In [1]:
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from torchinfo import summary
from torchvision import transforms, datasets
from tqdm import tqdm

In [2]:
# Load the dataset table from disk
data = pd.read_csv("fer2013.csv")

# Each 'pixels' entry is a whitespace-separated string; parse it into a 1D integer array
pixels = data['pixels'].apply(lambda x: np.asarray(x.split(), dtype=int))
emotions = data['emotion']
usage = data['Usage']  # split marker; only 'Training' and 'PrivateTest' rows are used below

# Fold every flat pixel array into a 48x48 grid
pixels_2d = pixels.apply(lambda x: x.reshape(48, 48))

# Stack all grids into one array and insert a channel axis -> (num_images, 1, 48, 48)
# NOTE(review): pixel values are passed on unscaled (presumably 0-255) - confirm whether
# the model should receive normalised inputs.
X = np.expand_dims(np.array(pixels_2d.tolist()), axis=1)

# Select the training rows and the held-out test rows via the 'Usage' column
X_train = X[usage.eq('Training')]
y_train = emotions[usage.eq('Training')]

X_test = X[usage.eq('PrivateTest')]
y_test = emotions[usage.eq('PrivateTest')]

# Convert to PyTorch tensors: float32 images, int64 class labels
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.int64)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.int64)

In [3]:
# Display the training tensor's dimensions
X_train_tensor.size()

torch.Size([28709, 1, 48, 48])

In [4]:
# Mini-batch size shared by both loaders
batch_size = 64

# Pair each image tensor with its label tensor
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are reshuffled on every pass; test batches keep the dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [5]:
# Pull the first mini-batch from a fresh pass over the training loader
image_batch, label_batch = next(iter(train_loader))

# Keep the first sample of that batch
image = image_batch[0]
label = label_batch[0]

# Show the single image's shape and its label
print(image.shape, label)


torch.Size([1, 48, 48]) tensor(4)


In [6]:
class PatchEmbedding(nn.Module):
    """Turns a 2D input image into a 1D sequence of learnable patch embeddings.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size`` patches by a
    convolution whose kernel size and stride both equal ``patch_size``; every patch
    becomes one ``embedding_dim``-sized vector.

    Args:
        in_channels (int): Number of color channels for the input images. Defaults to 1 for grayscale.
        patch_size (int): Side length of the square patches. Defaults to 4.
        embedding_dim (int): Size of the embedding produced for each patch. Defaults to 16.
        img_size (int): Side length of the square images used to pre-compute
            ``num_patches``. Defaults to 48. It only affects the ``num_patches``
            attribute; ``forward`` works from the size of the tensor it receives.

    Attributes:
        patch_size (int): The patch side length given at construction.
        num_patches (int): ``(img_size // patch_size) ** 2``.
    """
    def __init__(self,
                 in_channels: int = 1,  # Adjusted for grayscale images
                 patch_size: int = 4,  # Adjusted patch size for 48x48 image
                 embedding_dim: int = 16,
                 img_size: int = 48):
        super().__init__()

        self.patch_size = patch_size

        # A conv with kernel == stride == patch_size visits each patch exactly once
        self.patcher = nn.Conv2d(in_channels=in_channels,
                                 out_channels=embedding_dim,
                                 kernel_size=patch_size,
                                 stride=patch_size,
                                 padding=0)

        # Number of patches for an img_size x img_size input (previously hard-coded to 48)
        self.num_patches = (img_size // patch_size) ** 2

    def forward(self, x):
        """Embed a batch of images.

        Args:
            x: Tensor of shape [batch_size, in_channels, height, width].

        Returns:
            Tensor of shape [batch_size, patches_per_image, embedding_dim].
        """
        # Conv output: [batch_size, embedding_dim, height // patch_size, width // patch_size]
        x_patched = self.patcher(x)
        # Merge the two spatial axes, then move the embedding axis last
        x_flattened = x_patched.flatten(2).transpose(1, 2)
        return x_flattened


In [7]:
# Let's test it on a single image
patch_size = 4

# Set seeds
def set_seeds(seed: int = 42):
    """Seeds the Python, NumPy and torch random number generators.

    Args:
        seed (int): Value used to seed every generator. Defaults to 42.
    """
    # Python's and NumPy's generators are seeded as well, so code that draws from
    # them is repeatable too; the torch random stream is unaffected by these calls.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible CUDA device rather than only the current one
    torch.cuda.manual_seed_all(seed)

# Reset the RNGs so the random image and the layer's initial weights are repeatable
set_seeds()

# Overwrite `image` with a random stand-in: one single-channel 48x48 image, batch dimension included
image = torch.randn((1, 1, 48, 48))

# Build the patch-embedding layer: 1 input channel, 4x4 patches, 16-dim embeddings
patchify = PatchEmbedding(in_channels=1, patch_size=4, embedding_dim=16)

# Embed the image, then report the shapes before and after
patch_embedded_image = patchify(image)
print(f"Input image shape: {image.shape}")
print(f"Output patch embedding shape: {patch_embedded_image.shape}")


Input image shape: torch.Size([1, 1, 48, 48])
Output patch embedding shape: torch.Size([1, 144, 16])


In [8]:
# Print the embedded patches followed by a one-line description of their shape
print(patch_embedded_image,
      f"Patch embedding shape: {patch_embedded_image.shape} -> [batch_size, number_of_patches, embedding_dimension]",
      sep="\n")

tensor([[[-0.6472,  0.4289, -0.4764,  ...,  0.1373,  1.3341, -0.4216],
         [ 0.8679, -0.5651,  0.5903,  ..., -0.0661,  0.4061,  0.3041],
         [-0.5880, -0.8610, -0.2638,  ...,  0.4933,  1.2292, -0.0019],
         ...,
         [-0.6974,  0.2374,  0.3840,  ..., -0.6328,  0.6245, -0.8937],
         [-0.6851,  0.2170, -0.4709,  ...,  1.2021,  1.0993,  0.1823],
         [-0.5813,  0.4791,  0.5753,  ...,  0.2897, -0.0834, -0.2073]]],
       grad_fn=<TransposeBackward0>)
Patch embedding shape: torch.Size([1, 144, 16]) -> [batch_size, number_of_patches, embedding_dimension]


In [9]:
# From start to positional encoding: All in 1 cell

# Re-seed with the set_seeds() helper defined in the earlier patch-embedding test cell.
# It is not redefined here: a second identical `def` would silently shadow the first,
# and this cell already needs that earlier cell to have run (it reads `image` from it).
set_seeds()

# 1. Set patch size
patch_size = 4

# 2. Print shape of the image tensor and read its spatial dimensions
#    (`image` is indexed as a 4-D [batch, channels, height, width] tensor here)
print(f"Image tensor shape: {image.shape}")
height, width = image.shape[2], image.shape[3]

# 3. `image` already carries a leading batch dimension, so it is used as-is
x = image
print(f"Input image with batch dimension shape: {x.shape}")

# 4. Create patch embedding layer
patch_embedding_layer = PatchEmbedding(in_channels=1,  # Grayscale image has 1 channel
                                       patch_size=patch_size,
                                       embedding_dim=16)

# 5. Pass image through patch embedding layer
patch_embedding = patch_embedding_layer(x)
print(f"Patching embedding shape: {patch_embedding.shape}")

# 6. Create class token embedding
#    The count is stored as `num_images` so the notebook-level `batch_size` used for
#    the DataLoaders is not overwritten by this demo.
num_images = patch_embedding.shape[0]
embedding_dimension = patch_embedding.shape[-1]
#    One learnable token of shape [1, 1, D]; it is expanded across the batch when used
#    instead of storing a separate parameter row per image.
class_token = nn.Parameter(torch.ones(1, 1, embedding_dimension),
                           requires_grad=True)  # make sure it's learnable
print(f"Class token embedding shape: {class_token.shape}")

# 7. Prepend class token embedding to patch embedding
patch_embedding_class_token = torch.cat((class_token.expand(num_images, -1, -1), patch_embedding), dim=1)
print(f"Patch embedding with class token shape: {patch_embedding_class_token.shape}")

# 8. Create position embedding (integer division keeps the patch count an exact int)
number_of_patches = (height * width) // patch_size ** 2
position_embedding = nn.Parameter(torch.ones(1, number_of_patches + 1, embedding_dimension),
                                  requires_grad=True)  # make sure it's learnable

# 9. Add position embedding to patch embedding with class token
#    (the leading 1 of position_embedding broadcasts over the batch)
patch_and_position_embedding = patch_embedding_class_token + position_embedding
print(f"Patch and position embedding shape: {patch_and_position_embedding.shape}")

print(patch_embedding_class_token)

Image tensor shape: torch.Size([1, 1, 48, 48])
Input image with batch dimension shape: torch.Size([1, 1, 48, 48])
Patching embedding shape: torch.Size([1, 144, 16])
Class token embedding shape: torch.Size([1, 1, 16])
Patch embedding with class token shape: torch.Size([1, 145, 16])
Patch and position embedding shape: torch.Size([1, 145, 16])
tensor([[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [-0.1089,  0.3940, -0.1063,  ...,  0.1169, -0.6876, -0.1364],
         [-0.0791, -0.5349,  0.1944,  ..., -0.6428, -0.1077,  0.1746],
         ...,
         [ 0.6388,  0.6083, -0.0979,  ...,  0.4054,  0.7176,  0.3195],
         [ 0.6316, -0.2121,  0.3735,  ...,  0.3643, -0.5640, -1.0396],
         [-0.3404, -1.1636, -0.0343,  ..., -0.7795, -0.1963, -0.7098]]],
       grad_fn=<CatBackward0>)


In [10]:
class MultiheadSelfAttentionBlock(nn.Module):
    """Layer normalization followed by multi-head self-attention ("MSA block" for short).

    The residual connection is not applied here; the caller adds the input back.

    Args:
        embedding_dim (int): Size of each token embedding. Defaults to 16.
        num_heads (int): Number of attention heads. Defaults to 16.
        attn_dropout (float): Dropout passed to the attention layer. Defaults to 0.
    """
    def __init__(self,
                 embedding_dim:int=16,
                 num_heads:int=16,
                 attn_dropout:float=0):
        super().__init__()

        # Normalizes over the embedding axis before attention is applied
        self.layer_norm = nn.LayerNorm(embedding_dim)

        # batch_first=True -> tensors are laid out as [batch, sequence, embedding]
        self.multihead_attn = nn.MultiheadAttention(embedding_dim,
                                                    num_heads,
                                                    dropout=attn_dropout,
                                                    batch_first=True)

    def forward(self, x):
        """Return the attention output for `x`; attention weights are not computed."""
        normed = self.layer_norm(x)
        # Self-attention: the same tensor serves as query, key and value
        attended = self.multihead_attn(normed, normed, normed, need_weights=False)
        return attended[0]

In [11]:
class MLPBlock(nn.Module):
    """Layer normalization followed by a two-layer perceptron ("MLP block" for short).

    The residual connection is not applied here; the caller adds the input back.

    Args:
        embedding_dim: Size of each token embedding (input and output width). Defaults to 16.
        mlp_size: Width of the hidden layer. Defaults to 3072.
        dropout: Dropout probability applied after each linear layer. Defaults to 0.1.
    """
    def __init__(self, embedding_dim=16, mlp_size=3072, dropout=0.1):
        super().__init__()

        # Normalizes over the embedding axis before the MLP is applied
        self.layer_norm = nn.LayerNorm(embedding_dim)

        # expand -> GELU -> dropout -> project back -> dropout
        layers = [
            nn.Linear(embedding_dim, mlp_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_size, embedding_dim),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Normalize `x`, then run it through the MLP."""
        return self.mlp(self.layer_norm(x))


In [12]:
class TransformerEncoderBlock(nn.Module):
    """One Transformer encoder block: an MSA block and an MLP block, each with a residual connection.

    Args:
        embedding_dim: Size of each token embedding. Defaults to 16.
        num_heads: Number of attention heads. Defaults to 16.
        mlp_size: Hidden width of the MLP block. Defaults to 3072.
        mlp_dropout: Dropout probability inside the MLP block. Defaults to 0.1.
        attn_dropout: Dropout passed to the attention layer. Defaults to 0.
    """
    def __init__(self, embedding_dim=16, num_heads=16, mlp_size=3072, mlp_dropout=0.1, attn_dropout=0):
        super().__init__()

        # Attention sub-block
        self.msa_block = MultiheadSelfAttentionBlock(
            embedding_dim=embedding_dim,
            num_heads=num_heads,
            attn_dropout=attn_dropout,
        )

        # Feed-forward sub-block
        self.mlp_block = MLPBlock(
            embedding_dim=embedding_dim,
            mlp_size=mlp_size,
            dropout=mlp_dropout,
        )

    def forward(self, x):
        """Apply attention then the MLP, adding each sub-block's input back to its output."""
        x = x + self.msa_block(x)
        return x + self.mlp_block(x)


In [13]:
# Instantiate a TransformerEncoderBlock with its default hyperparameters
transformer_encoder_block = TransformerEncoderBlock()

# Print an input and output summary of the block.
# `summary` is torchinfo's; it is imported with the other dependencies in the
# notebook's first cell rather than in the middle of the notebook.
summary(model=transformer_encoder_block,
        input_size=(1, 145, 16),  # (batch_size, num_patches + 1 class token, embedding_dimension)
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])


Layer (type (var_name))                            Input Shape          Output Shape         Param #              Trainable
TransformerEncoderBlock (TransformerEncoderBlock)  [1, 145, 16]         [1, 145, 16]         --                   True
├─MultiheadSelfAttentionBlock (msa_block)          [1, 145, 16]         [1, 145, 16]         --                   True
│    └─LayerNorm (layer_norm)                      [1, 145, 16]         [1, 145, 16]         32                   True
│    └─MultiheadAttention (multihead_attn)         --                   [1, 145, 16]         1,088                True
├─MLPBlock (mlp_block)                             [1, 145, 16]         [1, 145, 16]         --                   True
│    └─LayerNorm (layer_norm)                      [1, 145, 16]         [1, 145, 16]         32                   True
│    └─Sequential (mlp)                            [1, 145, 16]         [1, 145, 16]         --                   True
│    │    └─Linear (0)                     

In [14]:
class ViT(nn.Module):
    """Vision Transformer classifier assembled from this notebook's building blocks.

    Pipeline: patch embedding -> prepend a learnable class token -> add a learnable
    position embedding -> dropout -> stack of Transformer encoder blocks ->
    LayerNorm + Linear head applied to the class-token position.

    Args:
        img_size: Side length of the square input images. Defaults to 48.
        in_channels: Number of image channels. Defaults to 1.
        patch_size: Side length of the square patches. Defaults to 4.
        num_transformer_layers: Number of encoder blocks. Defaults to 12.
        embedding_dim: Size of each token embedding. Defaults to 16.
        mlp_size: Hidden width of each MLP block. Defaults to 3072.
        num_heads: Attention heads per encoder block. Defaults to 16.
        attn_dropout: Dropout passed to the attention layers. Defaults to 0.
        mlp_dropout: Dropout probability inside the MLP blocks. Defaults to 0.1.
        embedding_dropout: Dropout probability applied to the embedded sequence. Defaults to 0.1.
        num_classes: Number of output logits. Defaults to 7.
    """
    def __init__(self,
                 img_size: int = 48,
                 in_channels: int = 1,
                 patch_size: int = 4,
                 num_transformer_layers: int = 12,
                 embedding_dim: int = 16,
                 mlp_size: int = 3072,
                 num_heads: int = 16,
                 attn_dropout: float = 0,
                 mlp_dropout: float = 0.1,
                 embedding_dropout: float = 0.1,
                 num_classes: int = 7):
        super().__init__()

        # The image must split into a whole number of patches per side
        assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}."

        # Patches per image
        self.num_patches = (img_size * img_size) // (patch_size ** 2)

        # Learnable class token, shared by every image in a batch
        self.class_embedding = nn.Parameter(torch.randn(1, 1, embedding_dim), requires_grad=True)

        # Learnable position embedding: one row per patch plus one for the class token
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embedding_dim), requires_grad=True)

        # Dropout applied to the embedded sequence
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)

        # Image -> sequence of patch embeddings
        self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                              patch_size=patch_size,
                                              embedding_dim=embedding_dim)

        # Stack of identical encoder blocks, applied in order
        encoder_blocks = []
        for _ in range(num_transformer_layers):
            encoder_blocks.append(
                TransformerEncoderBlock(embedding_dim=embedding_dim,
                                        num_heads=num_heads,
                                        mlp_size=mlp_size,
                                        mlp_dropout=mlp_dropout,
                                        attn_dropout=attn_dropout)
            )
        self.transformer_encoder = nn.Sequential(*encoder_blocks)

        # Classification head: normalize, then project to class logits
        self.classifier = nn.Sequential(
            nn.LayerNorm(normalized_shape=embedding_dim),
            nn.Linear(in_features=embedding_dim, out_features=num_classes),
        )

    def forward(self, x):
        """Return class logits of shape [batch_size, num_classes] for a batch of images."""
        # Embed the patches and build one class token per image in the batch
        patches = self.patch_embedding(x)
        class_token = self.class_embedding.expand(x.shape[0], -1, -1)

        # Class token goes first along the sequence axis
        tokens = torch.cat((class_token, patches), dim=1)

        # Add positions (broadcast over the batch), then apply embedding dropout
        tokens = self.embedding_dropout(self.position_embedding + tokens)

        # Run the encoder stack
        encoded = self.transformer_encoder(tokens)

        # Classify from the class-token position (index 0 of the sequence)
        return self.classifier(encoded[:, 0])




In [15]:
# Build the model with 7 output logits; every other hyperparameter uses the ViT class defaults
vit = ViT(num_classes=7)

In [17]:
# Setup the optimizer over all ViT parameters.
# NOTE(review): these values are taken from the ViT paper's large-scale setup and are
# untuned for this model. torch.optim.Adam applies weight_decay as an L2 penalty added
# to the gradients, so weight_decay=0.3 with lr=3e-3 may be too aggressive here -
# TODO confirm / tune (e.g. a smaller decay).
optimizer = Adam(params=vit.parameters(),
                 lr=3e-3,
                 betas=(0.9, 0.999),
                 weight_decay=0.3)

# Loss function for multi-class classification (takes raw logits and integer labels)
loss_fn = F.cross_entropy

# Reset the seeds. `vit` was built before this point, so this does not affect its
# initial weights, only random draws made from here on.
set_seeds()

# Set number of epochs
num_epochs = 2


# Training loop
for epoch in range(num_epochs):
    # Set model to training mode (enables dropout)
    vit.train()

    # Running totals for this epoch
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    # Iterate over training data
    for batch_idx, (inputs, labels) in enumerate(train_loader):

        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass -> logits
        outputs = vit(inputs)

        # Compute loss with the loss function configured above
        loss = loss_fn(outputs, labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Per-batch statistics, computed once and reused below
        batch_loss = loss.item()
        batch_count = labels.size(0)
        _, predicted = torch.max(outputs, 1)
        batch_correct = (predicted == labels).sum().item()

        # loss is the batch mean, so weight it by the batch size for an exact epoch mean
        running_loss += batch_loss * inputs.size(0)
        correct_predictions += batch_correct
        total_predictions += batch_count

        # Print batch statistics if batch number is a multiple of 100
        if (batch_idx + 1) % 100 == 0:
            batch_accuracy = batch_correct / batch_count
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(train_loader)}:")
            print(f"  Loss: {batch_loss:.4f}, Accuracy: {batch_accuracy:.4f}")

    # Calculate average loss and accuracy for the epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct_predictions / total_predictions

    # Print epoch statistics
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")



Epoch 1/2, Batch 100/449:
  Loss: 1.8893, Accuracy: 0.1875
Epoch 1/2, Batch 200/449:
  Loss: 1.8770, Accuracy: 0.2188
Epoch 1/2, Batch 300/449:
  Loss: 1.8435, Accuracy: 0.3125
Epoch 1/2, Batch 400/449:
  Loss: 1.8785, Accuracy: 0.1875


NameError: name 'train_dataloader' is not defined

In [23]:
# Evaluate the model on the test data
# Set model to evaluation mode (disables dropout)
vit.eval()

# Running totals for loss and accuracy on the test data
test_running_loss = 0.0
test_correct_predictions = 0
test_total_predictions = 0

# Gradients are not needed for evaluation; disabling autograd avoids building a
# computation graph for every test batch.
with torch.no_grad():
    # Iterate over test data
    for inputs, labels in test_loader:

        # Forward pass -> logits
        outputs = vit(inputs)

        # Compute loss
        loss = F.cross_entropy(outputs, labels)

        # loss is the batch mean, so weight it by the batch size
        test_running_loss += loss.item() * inputs.size(0)

        # Count correct predictions (class with the highest logit)
        _, predicted = torch.max(outputs, 1)
        test_correct_predictions += (predicted == labels).sum().item()
        test_total_predictions += labels.size(0)

# Calculate average loss and accuracy on test data
test_loss = test_running_loss / len(test_loader.dataset)
test_accuracy = test_correct_predictions / test_total_predictions

# Print test statistics
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Loss: 1.8541, Test Accuracy: 0.2449


In [22]:
# Write the model's parameters (its state dict) to disk
torch.save(obj=vit.state_dict(), f='vit_model.pth')