In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from einops import rearrange
import time
import torch.optim as optim
import numpy as np

In [2]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x


class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim)
        )

    def forward(self, x):
        return self.net(x)

# Attention and Performer Variants

## Softmax Attention

The definition/computation can be found in Attention Is All You Need Paper

In [3]:
class Attention(nn.Module):
    def __init__(self, dim, heads=8):
        super().__init__()
        self.heads = heads
        self.scale = dim ** -0.5 # 1/sqrt(m)
        self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
        self.to_out = nn.Linear(dim, dim)


    def forward(self, x, mask = None):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x)
        q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv=3, h=h)
        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale

        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            mask = mask[:, None, :] * mask[:, :, None]
            dots.masked_fill_(~mask, float('-inf'))
            del mask

        attn = dots.softmax(dim=-1)
        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)
        return out

## Performer Attention

feature_map essentially defines performer variants. i.e. when the project says Performer-ReLU, it means to change the feature_map here to ReLU. rom is the random orthogonal matrix generator. How this is generated is described in later sections.

feature_map are1-D (random) feature map

In [4]:
class PerformerAttention(nn.Module):
    def __init__(self, dim, heads, r, feature_map, projection_method='gaussian', redraw=False):
        super().__init__()
        self.heads = heads
        self.scale = r ** -0.5 # 1/sqrt(m)
        self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
        self.to_out = nn.Linear(dim, dim)

        
        self.r = r # random feature map dimension
        if projection_method == 'gaussian':
            self.create_projection_matrix = create_projection_gaussian
        elif projection_method == 'HD':
            self.create_projection_matrix = create_projection_HD
        self.projection_matrix = self.create_projection_matrix(r, dim//heads)

        self.feature_map = feature_map # this is only 1-D random feature map
        self.redraw = redraw


    def forward(self, x, mask = None):
        # x shape: (batch x n x (head * dim))
        b, n, hxd, h = *x.shape, self.heads
        d = hxd // h

        qkv = self.to_qkv(x)
        # qkv shape: (batch x n x 3 * dim)
        q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv=3, h=h)
        # q, k, v shape: (batch x head x n x dim)
        
        
        
        
        # Dimension Reduction
        # self.projection_matrix shape (r x dim)
        k_ = self.feature_map(torch.einsum('rd,bhnd->bhnr', self.projection_matrix, k)) * self.scale
        if self.redraw:
            self.projection_matrix = self.create_projection_matrix(r, d)
        q_ = self.feature_map(torch.einsum('rd,bhnd->bhnr', self.projection_matrix, q)) * self.scale
        if self.redraw:
            self.projection_matrix = self.create_projection_matrix(r, d)

        k_Tv = torch.einsum('bhir,bhid->bhrd', k_, v)
        q_k_Tv = torch.einsum('bhnr,bhrd->bhnd', q_, k_Tv)
    
        
        
        
        # Compute diagnal D Matrix (strictly positive diagnol values)
        k_T1_L = torch.sum(k_, dim=2)
        d = torch.einsum('bhnr,bhr->bhn', q_, k_T1_L)
#         assert torch.all(torch.all(d >= 0)), torch.sum(d < 0)
        d[d==0] = 1e-6
        d = torch.diag_embed(torch.pow(d, -1))
        out = torch.einsum('bhmn,bhnd->bhmd', d, q_k_Tv)

        # Output
        out = rearrange(out, 'b h n d -> b n (h d)')
        out =  self.to_out(out)
        return out

### Orthogonal Random Features

Two approaches are implemented here for generating Random Orthogonal matrices. i.e. random feature

### HD Block Approach

We here use a single HD block here H is a Hadamard Matrix and D is a random diagonal matrix. More details can be found in the PNG Kernal paper. The catch is that dim (hyperparameter) has to be a power of 2. This is noted in the hyperparameter section.

Hadamard Transformation code provided by: https://github.com/HazyResearch/structured-nets/blob/master/pytorch/structure/hadamard.py It makes use of the NumPy library. Please be careful and try NOT to use NumPy functions anywhere else.

In [5]:
def hadamard_transform(u, normalize=False):
    """Multiply H_n @ u where H_n is the Hadamard matrix of dimension n x n.
    n must be a power of 2.
    Parameters:
        u: Tensor of shape (..., n)
        normalize: if True, divide the result by 2^{m/2} where m = log_2(n).
    Returns:
        product: Tensor of shape (..., n)
    """
    _, n = u.shape
    m = int(np.log2(n))
    # assert n == 1 << m, 'n must be a power of 2'
    x = u[..., np.newaxis]
    for d in range(m)[::-1]:
        x = torch.cat((x[..., ::2, :] + x[..., 1::2, :], x[..., ::2, :] - x[..., 1::2, :]), dim=-1)
    return x.squeeze(-2) / 2**(m / 2) if normalize else x.squeeze(-2)

def create_projection_HD(m, d, seed=0, manual_seed=False):
    if manual_seed:
        torch.manual_seed(current_seed)
    
    nb_full_blocks = int(m / d)
    block_list = []
    current_seed = seed
    for _ in range(nb_full_blocks):
        if manual_seed:
            torch.manual_seed(current_seed)
        random_diag = torch.randint(-1, 2, (d,))
        hd_matrix = hadamard_transform(torch.diag(random_diag)).type(torch.FloatTensor)
        block_list.append(hd_matrix)
        current_seed += 1

    remaining_rows = m - nb_full_blocks * d
    if remaining_rows > 0:
        if manual_seed:
            torch.manual_seed(current_seed)
        random_diag = torch.randint(-1, 2, (d,))
        hd_matrix = hadamard_transform(torch.diag(random_diag)).type(torch.FloatTensor)
        block_list.append(hd_matrix[0:remaining_rows])


    return torch.vstack(block_list)

### Gaussian Matrix Approach

Translated and simplified his original code written in TensorFlow to Torch. This can also be found in his PNG Kernel paper labels as G_ORT. The idea is to generate random matrices using Gaussian and do Gram-Schmidt Orthogonalization (linalg.qr) 

In [6]:
def create_projection_gaussian(m, d, seed=0, manual_seed=False, scaling=0):

    nb_full_blocks = int(m / d)
    block_list = []
    current_seed = seed
    for _ in range(nb_full_blocks):
        if manual_seed:
            torch.manual_seed(current_seed)
        unstructured_block = torch.randn(d, d)
        q, _ = torch.linalg.qr(unstructured_block)
        q = torch.transpose(q, 0, 1)
        block_list.append(q)
        current_seed += 1

    remaining_rows = m - nb_full_blocks * d
    if remaining_rows > 0:
        if manual_seed:
            torch.manual_seed(current_seed)
        unstructured_block = torch.randn(d, d)
        q, _ = torch.linalg.qr(unstructured_block)
        q = torch.transpose(q, 0, 1)
        block_list.append(q[0:remaining_rows])
    
    final_matrix = torch.vstack(block_list)
    current_seed += 1

    if scaling == 0:
        if manual_seed:
            torch.manual_seed(current_seed)
        multiplier = torch.norm(torch.randn((m, d)), dim=1)
    elif scaling == 1:
        multiplier = torch.sqrt(torch.Tensor([float(d)]) * torch.ones((m)))
    else:
        raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling)

    final_matrix  = torch.matmul(torch.diag(multiplier), final_matrix)
    assert final_matrix.shape[-1] == d
    return final_matrix

# Image Classifier Model

AN IMAGE IS WORTH 16X16 WORDS: TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE Paper

## Transformer Encoder

In [7]:
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, mlp_dim, performer=False):
        super().__init__()

        if performer is False:
            self.layers = nn.ModuleList([
                nn.ModuleList([
                    Residual(PreNorm(dim, Attention(dim, heads=heads))),
                    Residual(PreNorm(dim, FeedForward(dim, mlp_dim)))
                ])
                for _ in range(depth)
            ])
        else:
            r, feature_map, projection_method, redraw = performer
            self.layers = nn.ModuleList([
                nn.ModuleList([
                    Residual(PreNorm(dim, PerformerAttention(
                        dim, heads, r, feature_map, projection_method, redraw
                    ))),
                    Residual(PreNorm(dim, FeedForward(dim, mlp_dim)))
                ])
                for _ in range(depth)
            ])

    def forward(self, x, mask=None):
        for attn, ff in self.layers:
            x = attn(x, mask=mask)
            x = ff(x)
        return x

## Vision Transformer

In [8]:
class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, 
                 dim, depth, heads, mlp_dim, channels=3, performer=False):
        super().__init__()
        assert image_size % patch_size == 0, 'image dimensions must be divisible by the patch size'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        self.patch_size = patch_size
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.transformer = Transformer(dim, depth, heads, mlp_dim, performer=performer)
        self.to_cls_token = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, num_classes)
        )

    def forward(self, img, mask=None):
        p = self.patch_size
        
        x = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p)
        # x shape: (batch x n x p^2c)
        
        x = self.patch_to_embedding(x)
        # linear map from p^2c to dim
        # x shape: (batch x n x dim)

        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        # x shape (batch x n' x dim)

        x += self.pos_embedding
        x = self.transformer(x, mask)
        x = self.to_cls_token(x[:, 0])
        return self.mlp_head(x)

# Training/Evaluation

In [9]:
def train_epoch(model, optimizer, data_loader, loss_history):
    total_samples = len(data_loader.dataset)
    model.train()

    epoch_timestamp = time.time()
    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        output = F.log_softmax(model(data), dim=1)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            epoch_time = time.time() - epoch_timestamp
            epoch_timestamp = time.time()
            print('[' +  '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)]  Loss: ' +
                  '{:6.4f}'.format(loss.item()) + '  Epoch execution time: {:5.2f}'.format(epoch_time), 'seconds')
            loss_history.append((time.time(), loss.item()))

In [10]:
def evaluate(model, data_loader, loss_history):
    model.eval()
    
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0

    with torch.no_grad():
        for data, target in data_loader:
            output = F.log_softmax(model(data), dim=1)
            loss = F.nll_loss(output, target, reduction='sum')
            _, pred = torch.max(output, dim=1)
            
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()

    avg_loss = total_loss / total_samples
    loss_history.append((time.time(), avg_loss))

    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          '  Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')

# Data

In [11]:
torch.manual_seed(42)
BATCH_SIZE_TRAIN = 100
BATCH_SIZE_TEST = 1000

In [12]:
DOWNLOAD_PATH = r'C:\Users\zhang\OneDrive\Desktop\Data Mining Project'

transform_mnist = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,))
])

train_set = torchvision.datasets.MNIST(DOWNLOAD_PATH, train=True, download=True, transform=transform_mnist)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_set = torchvision.datasets.MNIST(DOWNLOAD_PATH, train=False, download=True, transform=transform_mnist)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE_TEST, shuffle=True)

# Misc.

In [13]:
import matplotlib.pyplot as plt
import numpy as np

def format_history(train_loss_history, test_loss_history):
    train_loss_history = np.array(train_loss_history).T
    test_loss_history = np.array(test_loss_history).T
    
    start_time = train_loss_history[0, 0]
    train_loss_history[0, :] = train_loss_history[0, :] - start_time
    test_loss_history[0, :] = test_loss_history[0, :] - start_time

    return train_loss_history, test_loss_history

# Model Training

## Hyperparameters

### Data Related

In [14]:
image_size = 28
patch_size = 4
num_classes = 10
channels = 1

### Model Related (ONLY CHANGE THIS PART)

In [15]:
N_EPOCHS = 10
depth = 5
dim = 32 # must be a power of 2 for HD projection
heads = 4 # need to divide dim
mlp_dim = 40
r = 20
projection_method = 'gaussian'
# projection_method = 'HD' 
redraw = True

## (a) Softmax

In [16]:
start_time = time.time()

model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_sm, test_loss_history_sm = [], []


for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_sm)
    evaluate(model, test_loader, test_loss_history_sm)

    
train_loss_history_sm, test_loss_history_sm = format_history(train_loss_history_sm, test_loss_history_sm)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.2157  Accuracy: 9318/10000 (93.18%)

Epoch: 2

Average test loss: 0.1449  Accuracy: 9544/10000 (95.44%)

Epoch: 3

Average test loss: 0.1162  Accuracy: 9633/10000 (96.33%)

Epoch: 4

Average test loss: 0.1010  Accuracy: 9692/10000 (96.92%)

Epoch: 5

Average test loss: 0.0962  Accuracy: 9700/10000 (97.00%)

Epoch: 6

Average test loss: 0.1011  Accuracy: 9680/10000 (96.80%)

Epoch: 7

Average test loss: 0.1142  Accuracy: 9647/10000 (96.47%)

Epoch: 8

Average test loss: 0.1150  Accuracy: 9645/10000 (96.45%)

Epoch: 9

Average test loss: 0.0921  Accuracy: 9716/10000 (97.16%)

Epoch: 10

Average test loss: 0.1064  Accuracy: 9686/10000 (96.86%)

Execution time: 593.75 seconds


## (b) Performer-ReLu

In [17]:
start_time = time.time()

numerical_stabilizer=0.001
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, 
    performer=(r, lambda x: nn.ReLU()(x) + numerical_stabilizer, 'gaussian', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_relu, test_loss_history_relu  = [], []

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_relu)
    evaluate(model, test_loader, test_loss_history_relu)
    
train_loss_history_relu, test_loss_history_relu = format_history(train_loss_history_relu, test_loss_history_relu)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.6976  Accuracy: 7589/10000 (75.89%)

Epoch: 2

Average test loss: 0.3367  Accuracy: 8928/10000 (89.28%)

Epoch: 3

Average test loss: 0.2458  Accuracy: 9209/10000 (92.09%)

Epoch: 4

Average test loss: 0.2060  Accuracy: 9329/10000 (93.29%)

Epoch: 5

Average test loss: 0.1722  Accuracy: 9471/10000 (94.71%)

Epoch: 6

Average test loss: 0.1525  Accuracy: 9530/10000 (95.30%)

Epoch: 7

Average test loss: 0.1720  Accuracy: 9452/10000 (94.52%)

Epoch: 8

Average test loss: 0.1253  Accuracy: 9592/10000 (95.92%)

Epoch: 9

Average test loss: 0.1301  Accuracy: 9606/10000 (96.06%)

Epoch: 10

Average test loss: 0.1175  Accuracy: 9615/10000 (96.15%)

Execution time: 714.73 seconds


In [18]:
start_time = time.time()

numerical_stabilizer=0.001
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim, 
    performer=(r, lambda x: nn.ReLU()(x) + numerical_stabilizer, 'HD', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_relu, test_loss_history_relu  = [], []

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_relu)
    evaluate(model, test_loader, test_loss_history_relu)
    
train_loss_history_relu, test_loss_history_relu = format_history(train_loss_history_relu, test_loss_history_relu)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.6641  Accuracy: 7761/10000 (77.61%)

Epoch: 2

Average test loss: 0.3929  Accuracy: 8698/10000 (86.98%)

Epoch: 3

Average test loss: 0.3427  Accuracy: 8845/10000 (88.45%)

Epoch: 4

Average test loss: 0.2519  Accuracy: 9203/10000 (92.03%)

Epoch: 5

Average test loss: 0.2393  Accuracy: 9236/10000 (92.36%)

Epoch: 6

Average test loss: 0.1689  Accuracy: 9475/10000 (94.75%)

Epoch: 7

Average test loss: 0.1570  Accuracy: 9483/10000 (94.83%)

Epoch: 8

Average test loss: 0.1633  Accuracy: 9491/10000 (94.91%)

Epoch: 9

Average test loss: 0.1381  Accuracy: 9547/10000 (95.47%)

Epoch: 10

Average test loss: 0.1650  Accuracy: 9495/10000 (94.95%)

Execution time: 724.04 seconds


## (c) Performer Quad

In [19]:
start_time = time.time()
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim,
    performer=(r, lambda x: torch.pow(x, 2), 'gaussian', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_x2, test_loss_history_x2  = [], []


for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_x2)
    evaluate(model, test_loader, test_loss_history_x2)

train_loss_history_x2, test_loss_history_x2 = format_history(train_loss_history_x2, test_loss_history_x2)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.2637  Accuracy: 9147/10000 (91.47%)

Epoch: 2

Average test loss: 0.1824  Accuracy: 9413/10000 (94.13%)

Epoch: 3

Average test loss: 0.1202  Accuracy: 9592/10000 (95.92%)

Epoch: 4

Average test loss: 0.1029  Accuracy: 9672/10000 (96.72%)

Epoch: 5

Average test loss: 0.1026  Accuracy: 9673/10000 (96.73%)

Epoch: 6

Average test loss: 0.1056  Accuracy: 9672/10000 (96.72%)

Epoch: 7

Average test loss: 0.1014  Accuracy: 9678/10000 (96.78%)

Epoch: 8

Average test loss: 0.0848  Accuracy: 9740/10000 (97.40%)

Epoch: 9

Average test loss: 0.0774  Accuracy: 9746/10000 (97.46%)

Epoch: 10

Average test loss: 0.0789  Accuracy: 9757/10000 (97.57%)

Execution time: 726.11 seconds


In [20]:
start_time = time.time()
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim,
    performer=(r, lambda x: torch.pow(x, 2), 'HD', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_x2, test_loss_history_x2  = [], []


for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_x2)
    evaluate(model, test_loader, test_loss_history_x2)

train_loss_history_x2, test_loss_history_x2 = format_history(train_loss_history_x2, test_loss_history_x2)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.2578  Accuracy: 9193/10000 (91.93%)

Epoch: 2

Average test loss: 0.1969  Accuracy: 9376/10000 (93.76%)

Epoch: 3

Average test loss: 0.1390  Accuracy: 9565/10000 (95.65%)

Epoch: 4

Average test loss: 0.1291  Accuracy: 9590/10000 (95.90%)

Epoch: 5

Average test loss: 0.1665  Accuracy: 9484/10000 (94.84%)

Epoch: 6

Average test loss: 0.1090  Accuracy: 9656/10000 (96.56%)

Epoch: 7

Average test loss: 0.0911  Accuracy: 9701/10000 (97.01%)

Epoch: 8

Average test loss: 0.1081  Accuracy: 9666/10000 (96.66%)

Epoch: 9

Average test loss: 0.0936  Accuracy: 9708/10000 (97.08%)

Epoch: 10

Average test loss: 0.0987  Accuracy: 9695/10000 (96.95%)

Execution time: 750.61 seconds


## (d) Performer x^4

In [21]:
start_time = time.time()
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim,
    performer=(r, lambda x: torch.pow(x, 4), 'gaussian', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_x4, test_loss_history_x4  = [], []

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_x4)
    evaluate(model, test_loader, test_loss_history_x4)

train_loss_history_x4, test_loss_history_x4 = format_history(train_loss_history_x4, test_loss_history_x4)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1

Average test loss: 0.2611  Accuracy: 9162/10000 (91.62%)

Epoch: 2

Average test loss: 0.1755  Accuracy: 9460/10000 (94.60%)

Epoch: 3

Average test loss: 0.1414  Accuracy: 9561/10000 (95.61%)

Epoch: 4

Average test loss: 0.1306  Accuracy: 9592/10000 (95.92%)

Epoch: 5

Average test loss: 0.1115  Accuracy: 9658/10000 (96.58%)

Epoch: 6

Average test loss: 0.1113  Accuracy: 9662/10000 (96.62%)

Epoch: 7

Average test loss: 0.1213  Accuracy: 9631/10000 (96.31%)

Epoch: 8

Average test loss: 0.0944  Accuracy: 9704/10000 (97.04%)

Epoch: 9

Average test loss: 0.0952  Accuracy: 9711/10000 (97.11%)

Epoch: 10

Average test loss: 0.0914  Accuracy: 9731/10000 (97.31%)

Execution time: 784.93 seconds


In [22]:
start_time = time.time()
model = ViT(
    image_size=image_size, patch_size=patch_size, num_classes=num_classes, 
    channels=channels, dim=dim, depth=depth, heads=heads, mlp_dim=mlp_dim,
    performer=(r, lambda x: torch.pow(x, 4),'HD', redraw)
)
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_loss_history_x4, test_loss_history_x4  = [], []

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    train_epoch(model, optimizer, train_loader, train_loss_history_x4)
    evaluate(model, test_loader, test_loss_history_x4)

train_loss_history_x4, test_loss_history_x4 = format_history(train_loss_history_x4, test_loss_history_x4)
print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')

Epoch: 1


KeyboardInterrupt: 

In [None]:
import pandas as pd
train_loss = pd.DataFrame({
    ('softmax', 'time'): train_loss_history_sm[0, :],
    ('softmax', 'loss'): train_loss_history_sm[1, :],
    ('ReLU', 'time'): train_loss_history_relu[0, :],
    ('ReLU', 'loss'): train_loss_history_relu[1, :],
    ('x^2', 'time'): train_loss_history_x2[0, :],
    ('x^2', 'loss'): train_loss_history_x2[1, :],
    ('x^4', 'time'): train_loss_history_x4[0, :], 
    ('x^4', 'loss'): train_loss_history_x4[1, :]
}).T

test_loss = pd.DataFrame({
    ('softmax', 'time'): test_loss_history_sm[0, :],
    ('softmax', 'loss'): test_loss_history_sm[1, :],
    ('ReLU', 'time'): test_loss_history_relu[0, :],
    ('ReLU', 'loss'): test_loss_history_relu[1, :],
    ('x^2', 'time'): test_loss_history_x2[0, :],
    ('x^2', 'loss'): test_loss_history_x2[1, :],
    ('x^4', 'time'): test_loss_history_x4[0, :], 
    ('x^4', 'loss'): test_loss_history_x4[1, :]
}).T

In [None]:
train_loss 

In [None]:
test_loss

In [None]:
train_loss.to_csv('train loss.csv')

In [None]:
test_loss.to_csv('test loss.csv')