In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn

In [3]:
class PatchEmbed(nn.Module):
    """Split an image into non-overlapping patches and embed each of them.

    Parameters
    ----------
    img_size: int
        Size of the image (it is a square).

    patch_size: int
        Size of the patch (it is a square).

    in_chans: int
        Number of the input channels.

    embed_dim: int
        The embedding dimension, it determines how big of an embedding each
        patch is going to get.

    Attributes
    ----------
    img_size: int
        The `img_size` passed to the constructor.

    patch_size: int
        The `patch_size` passed to the constructor.

    n_patches: int
        Number of patches inside the image, `(img_size // patch_size) ** 2`.

    proj: nn.Conv2d
        Convolution layer that does both the splitting into patches and
        their embedding.
    """
    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        # kernel_size == stride: every kernel application sees exactly one
        # patch and consecutive applications never overlap.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches, embed_dim)`.
        """
        x = self.proj(x)  # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        x = x.flatten(2)  # (n_samples, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (n_samples, n_patches, embed_dim)

        return x


# The class was first written under the misspelled name `PatchEmbde`; keep that
# name bound to the same class so code using either spelling works.
PatchEmbde = PatchEmbed
    

In [4]:
class Attention(nn.Module):
    """Multi-head self-attention mechanism.

    Parameters
    ----------
    dim: int
        The input and output dimension of per token features. Must be
        divisible by `n_heads`.

    n_heads: int
        Number of attention heads.

    qkv_bias: bool
        If True then we include bias to the query, key and value projections.

    attn_p: float
        Dropout probability applied to the attention weights (the softmax
        output).

    proj_p: float
        Dropout probability applied to the output tensor.

    Attributes
    ----------
    n_heads, dim, head_dim: int
        Number of heads, total feature dimension and `dim // n_heads`.

    scale: float
        Normalizing constant for the dot product, `head_dim ** -0.5`.

    qkv: nn.Linear
        Linear projection for the query, key, value.

    proj: nn.Linear
        Linear mapping that takes in the concatenated output of all attention
        heads and maps it into a new space.

    attn_drop, proj_drop: nn.Dropout
        Dropout layers.

    Raises
    ------
    ValueError
        If `dim` is not divisible by `n_heads`.
    """
    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        if dim % n_heads != 0:
            # Otherwise the reshape into (n_heads, head_dim) in `forward` fails.
            raise ValueError(
                f"dim ({dim}) must be divisible by n_heads ({n_heads})"
            )

        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Raises
        ------
        ValueError
            If the last dimension of `x` differs from `dim`.
        """
        n_samples, n_tokens, dim = x.shape

        if dim != self.dim:
            raise ValueError(
                f"Expected last dimension of size {self.dim}, got {dim}"
            )

        qkv = self.qkv(x)  # (n_samples, n_patches + 1, 3 * dim)
        qkv = qkv.reshape(
            n_samples, n_tokens, 3, self.n_heads, self.head_dim
        )  # (n_samples, n_patches + 1, 3, n_heads, head_dim)
        qkv = qkv.permute(
            2, 0, 3, 1, 4
        )  # (3, n_samples, n_heads, n_patches + 1, head_dim)

        q, k, v = qkv[0], qkv[1], qkv[2]
        k_t = k.transpose(-2, -1)  # (n_samples, n_heads, head_dim, n_patches + 1)
        dp = (
            q @ k_t
        ) * self.scale  # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = dp.softmax(dim=-1)  # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v  # (n_samples, n_heads, n_patches + 1, head_dim)
        weighted_avg = weighted_avg.transpose(
            1, 2
        )  # (n_samples, n_patches + 1, n_heads, head_dim)
        # Concatenate the heads back into a single feature vector per token.
        weighted_avg = weighted_avg.flatten(2)  # (n_samples, n_patches + 1, dim)

        # Project the attention result (not the input) and apply dropout to it.
        x = self.proj(weighted_avg)  # (n_samples, n_patches + 1, dim)
        x = self.proj_drop(x)  # (n_samples, n_patches + 1, dim)

        return x

In [5]:
class MLP(nn.Module):
    """Multilayer perceptron with a single hidden layer.

    Parameters
    ----------
    in_features: int
        Number of input features.

    hidden_features: int
        Number of nodes in the hidden layer.

    out_features: int
        Number of output features.

    p: float
        Dropout probability.

    Attributes
    ----------
    fc1: nn.Linear
        The first linear layer.

    act: nn.GELU
        GELU activation function.

    fc2: nn.Linear
        The second linear layer.

    drop: nn.Dropout
        Dropout layer, applied after the activation and after `fc2`.
    """
    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features=in_features, out_features=hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(in_features=hidden_features, out_features=out_features)
        self.drop = nn.Dropout(p=p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape `(n_samples, n_patches + 1, in_features)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, out_features)`.
        """
        # Expand to the hidden width, apply the non-linearity, then dropout.
        hidden = self.drop(
            self.act(self.fc1(x))
        )  # (n_samples, n_patches + 1, hidden_features)

        # Map to the output width; the same dropout layer is applied again.
        return self.drop(
            self.fc2(hidden)
        )  # (n_samples, n_patches + 1, out_features)

In [6]:
class Block(nn.Module):
    """Transformer block.

    Applies layer-normalized self-attention followed by a layer-normalized
    MLP, each wrapped in a residual connection.

    Parameters
    ----------
    dim: int
        Embedding dimension.

    n_heads: int
        Number of attention heads.

    mlp_ratio: float
        Determines the hidden dimension size of the `MLP` module with respect
        to `dim`.

    qkv_bias: bool
        If True then we include bias to the query, key and value projections.

    p: float
        Dropout probability, forwarded to `Attention` as `proj_p` and to
        `MLP` as `p`.

    attn_p: float
        Dropout probability forwarded to `Attention` as `attn_p`.

    Attributes
    ----------
    norm1, norm2: LayerNorm
        Layer normalization.

    attn: Attention
        Attention module.

    mlp: MLP
        MLP module.
    """
    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
            dim,
            n_heads=n_heads,
            qkv_bias=qkv_bias,
            attn_p=attn_p,
            proj_p=p
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
            in_features=dim,
            hidden_features=hidden_features,
            out_features=dim,
            # Without this the MLP dropout silently stays at 0 regardless of `p`.
            p=p,
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        # Normalize before each sub-layer and add its output back onto the input.
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x

In [7]:
class VisionTransformer(nn.Module):
    """Simplified implementation of the vision transformer.

    Parameters
    ----------
    img_size: int
        Size of the image (it is a square).

    patch_size: int
        Size of the patch (it is a square).

    in_chans: int
        Number of the input channels.

    n_classes: int
        Number of classes.

    embed_dim: int
        Dimensionality of the token/patch embeddings.

    depth: int
        Number of blocks.

    n_heads: int
        Number of attention heads.

    mlp_ratio: float
        Determines the hidden dimension size of the `MLP` module with respect
        to `embed_dim`.

    qkv_bias: bool
        If True then we include bias to the query, key and value projections.

    p, attn_p: float
        Dropout probability.

    Attributes
    ----------
    patch_embed: PatchEmbde
        Instance of the patch-embedding layer.

    cls_token: nn.Parameter
        Learnable parameter that will represent the first token in the
        sequence. It has `embed_dim` elements.

    pos_embed: nn.Parameter
        Positional embedding of the cls token + all the patches.
        It has `(n_patches + 1) * embed_dim` elements.

    pos_drop: nn.Dropout
        Dropout layer.

    blocks: nn.ModuleList
        List of `Block` modules.

    norm: nn.LayerNorm
        Layer normalization.

    head: nn.Linear
        Linear layer mapping the final cls token to `n_classes` logits.
    """
    def __init__(
        self,
        img_size=384,
        patch_size=16,
        in_chans=3,
        n_classes=1000,
        embed_dim=768,
        depth=12,
        n_heads=12,
        mlp_ratio=4.,
        qkv_bias=True,
        p=0.,
        attn_p=0.,
    ):
        super().__init__()

        # `PatchEmbde` is the name the patch-embedding class is available
        # under in this notebook.
        self.patch_embed = PatchEmbde(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)

    # `forward` must be a method of the class (not a function nested in
    # `__init__`) for `nn.Module.__call__` to find it.
    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x: torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        logits: torch.Tensor
            Logits over all the classes - `(n_samples, n_classes)`.
        """
        n_samples = x.shape[0]
        x = self.patch_embed(x)

        # Share the single learnable cls token across the batch.
        cls_token = self.cls_token.expand(
            n_samples, -1, -1
        )  # (n_samples, 1, embed_dim)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)
        x = x + self.pos_embed  # (n_samples, 1 + n_patches, embed_dim)
        x = self.pos_drop(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]  # just the CLS token
        x = self.head(cls_token_final)

        return x