In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import numpy as np
from torch.utils.data import Subset
import matplotlib.pyplot as plt

In [None]:
def attention(query, key, value, dropoutLayer = None):
    """Scaled dot-product attention.

    Args:
        query, key, value: Tensors whose last two dimensions are
            (sequence, features). ``query`` and ``key`` must share the
            feature size; ``key`` and ``value`` must share the sequence size.
        dropoutLayer: Optional callable applied to the softmax-normalised
            attention weights before they are multiplied with ``value``.

    Returns:
        A tuple ``(context, weights)`` where ``weights`` are the (possibly
        dropped-out) attention weights and ``context = weights @ value``.
    """
    # Scale by sqrt of the feature size so the logits do not grow with it.
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    weights = F.softmax(logits, dim=-1)
    if dropoutLayer is not None:
        weights = dropoutLayer(weights)

    context = torch.matmul(weights, value)
    return context, weights


In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` of ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        # deepcopy so that every copy owns its own parameters
        copies.append(copy.deepcopy(module))
    return copies

In [None]:
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch tokens with a class token.

    A strided convolution projects the image into ``embed_dim`` channels, the
    spatial grid is flattened into a token sequence, a learnable class token is
    prepended, positional information is added and dropout is applied.

    Args:
        embed_dim: Size of every output token.
        dropout: Dropout probability applied to the final token sequence.
        positional_encode: Module called on the token sequence to add
            positional information.
        patch_size: Side length of the (square) convolution kernel.
        in_channels: Number of channels of the input images.
        stride: Step between neighbouring patches. ``None`` (default) keeps the
            historical value ``patch_size - 1``, i.e. patches that overlap by
            one pixel; pass ``stride=patch_size`` for non-overlapping patches.
    """

    def __init__(self, embed_dim, dropout, positional_encode, patch_size = 4, in_channels = 3, stride = None):
        super(PatchEmbedding, self).__init__()

        if stride is None:
            # Historical default; clamped so that patch_size == 1 stays valid.
            stride = max(patch_size - 1, 1)

        self.conv = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=stride)
        self.flatten = nn.Flatten(2)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.dropout = nn.Dropout(p=dropout)
        self.positional_encoding = positional_encode

    def forward(self, x):
        """Embed ``x`` of shape (batch, in_channels, H, W).

        Returns:
            Tensor of shape (batch, 1 + num_patches, embed_dim); index 0 along
            the sequence dimension is the class token.
        """
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)

        x = self.conv(x)          # (batch, embed_dim, H', W')
        x = self.flatten(x)       # (batch, embed_dim, H' * W')
        x = x.permute(0, 2, 1)    # (batch, H' * W', embed_dim)

        x = torch.cat([cls_token, x], dim=1)
        # NOTE(review): when `positional_encoding` already returns `x + pe`
        # (as PositionalEncoding in this notebook does) this residual counts the
        # tokens twice. Left unchanged because altering it would change the
        # outputs of checkpoints trained with this behaviour.
        x = self.positional_encoding(x) + x
        return self.dropout(x)

In [None]:
class Encoder(nn.Module):
    """A stack of ``N`` deep copies of ``layer`` followed by a LayerNorm.

    ``layer`` must expose a ``d_model`` attribute; it sizes the final norm.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.d_model)

    def forward(self, x):
        """Run ``x`` through every layer in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)


In [None]:
class EncoderLayer(nn.Module):
    """One transformer encoder block: self-attention, then feed-forward.

    Each of the two sub-layers is wrapped in a ``SubLayerConnection``.

    Args:
        d_model: Token feature size.
        heads: Number of attention heads.
        d_ff: Hidden size of the position-wise feed-forward network.
        dropout: Dropout probability used by the attention weights, the
            feed-forward network and both sub-layer connections.
    """

    def __init__(self, d_model, heads, d_ff, dropout = 0.1):
        super(EncoderLayer, self).__init__()
        # Forward `dropout` so the attention weights honour the configured rate
        # instead of MultiHeadAttention's built-in default.
        self.multihead_attention = MultiHeadAttention(heads, d_model, dropout)
        self.feedforward = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.sublayer = clones(SubLayerConnection(d_model, dropout), 2)
        self.d_model = d_model  # read by Encoder to size its final norm

    def forward(self, x):
        """Apply self-attention and then the feed-forward network to ``x``."""
        # Self-attention: query, key and value are all the same tensor.
        x = self.sublayer[0](x, lambda normed: self.multihead_attention(normed, normed, normed))
        return self.sublayer[1](x, self.feedforward)

In [None]:
class Gen(nn.Module):
    """Classification head that maps a ``d_model`` feature vector to class scores.

    Args:
        d_model: Size of the incoming feature vector.
        labels: Number of output classes.
        d_ff: Hidden size of the MLP.
        dropout: Dropout probability forwarded to the MLP (default 0.1, the
            value that used to be hard-coded).

    The output is raw (un-normalised) scores; no softmax is applied.
    """

    def __init__(self, d_model, labels, d_ff, dropout = 0.1):
        super(Gen, self).__init__()
        self.to_latent = nn.Identity()
        self.mlp = MLP(d_ff, d_model, labels, dropout = dropout)

    def forward(self, x):
        """Return class scores for ``x``."""
        return self.mlp(self.to_latent(x))

In [None]:
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learnable scale and shift.

    The statistic used is ``Tensor.std`` with its default settings, and
    ``eps`` is added to the standard deviation (not to the variance).

    Args:
        embed_dim: Size of the last dimension of the inputs.
        eps: Constant added to the standard deviation for numerical safety.
    """

    def __init__(self, embed_dim, eps = 1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(embed_dim))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(embed_dim))   # learnable shift
        self.eps = eps

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` along the last dim."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: ``d_model -> d_ff -> labels`` with a GELU in between.

    Args:
        d_ff: Hidden layer width.
        d_model: Input feature size.
        labels: Output size (number of classes).
        dropout: Dropout probability applied to the hidden activation.

    Dropout is applied to the hidden activation only. The output of ``fc2`` is
    returned untouched: dropping entries of the final scores would randomly
    zero class logits during training and distort the loss.
    """

    def __init__(self, d_ff, d_model, labels, dropout):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(d_ff, labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return un-normalised class scores for ``x``."""
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.fc2(hidden)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        h: Number of heads; must divide ``d_model``.
        d_model: Token feature size.
        dropout: Dropout probability applied to the attention weights.

    Attributes:
        attn: Attention weights of the most recent forward pass
            (``None`` before the first call).
    """

    def __init__(self, h = 12, d_model = 768, dropout = 0.1):
        super().__init__()
        assert d_model % h == 0
        self.h = h
        self.d_k = d_model // h  # feature size handled by each head

        self.attn = None
        # Three input projections (query, key, value) plus one output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected, nbatches):
        """Reshape (batch, seq, h * d_k) into (batch, h, seq, d_k)."""
        return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, query, key, value):
        """Attend from ``query`` over ``key``/``value``; output matches the input feature size."""
        nbatches = query.size(0)

        query = self._split_heads(self.linears[0](query), nbatches)
        key = self._split_heads(self.linears[1](key), nbatches)
        value = self._split_heads(self.linears[2](value), nbatches)

        x, self.attn = attention(query, key, value, dropoutLayer=self.dropout)

        # Merge the heads back into a single feature dimension.
        merged = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](merged)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a token sequence.

    Args:
        d_model: Token feature size (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions precomputed; inputs must not have more
            tokens than this.

    The table is stored as the non-trainable buffer ``pe`` of shape
    ``(1, max_len, d_model)``: even feature indices hold sines, odd ones cosines.
    """

    def __init__(self, d_model, dropout, max_len = 256):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even ones, so trim
        # the angles to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (batch, seq, d_model)."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Feed-forward network applied to every token: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden layer width.
        dropout: Dropout probability applied to the hidden activation.
    """

    def __init__(self, d_model, d_ff, dropout = 0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Transform ``x``; the output has the same shape as the input."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [None]:
class SubLayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    Args:
        size: Feature size used by the internal LayerNorm.
        dropout: Dropout probability applied to the sub-layer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the callable ``sublayer`` to the normalised input and add the residual."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [None]:
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Images are embedded into patch tokens, passed through two stacked
    ``Encoder`` modules, normalised, and the class token (sequence index 0) is
    mapped to class scores by ``Gen``.

    Args:
        d_model: Token feature size.
        d_ff: Hidden size of the feed-forward networks and of the head MLP.
        heads: Number of attention heads.
        labels: Number of output classes.
        dropout: Dropout probability for the embedding and encoder layers.
        num_layers: Number of ``EncoderLayer`` copies inside each ``Encoder``.
            ``None`` (default) keeps the historical behaviour of using
            ``heads`` as the layer count.
    """

    def __init__(self, d_model, d_ff, heads, labels, dropout, num_layers = None):
        super(VisionTransformer, self).__init__()
        if num_layers is None:
            # Historical coupling: depth per encoder used to equal `heads`.
            num_layers = heads

        # `encoderlayer` and `encoder` are prototypes that get deep-copied into
        # `encoders`; forward() only uses `encoders`. They stay registered as
        # sub-modules so that the state_dict keys remain the same and existing
        # checkpoints keep loading.
        self.encoderlayer = EncoderLayer(d_model, heads, d_ff, dropout)
        self.encoder = Encoder(self.encoderlayer, num_layers)
        self.encoders = clones(self.encoder, 2)
        self.gen = Gen(d_model, labels, d_ff)
        self.positional_encode = PositionalEncoding(d_model, dropout)
        self.embedding = PatchEmbedding(d_model, dropout, self.positional_encode)
        self.norm = LayerNorm(d_model)

    def forward(self, x):
        """Return class scores of shape (batch, labels) for an image batch ``x``."""
        x = self.embedding(x)

        for encoder in self.encoders:
            x = encoder(x)

        # Each Encoder already ends with its own norm; this is an additional
        # model-level norm applied before the head.
        x = self.norm(x)
        return self.gen(x[:, 0])

In [None]:
import os
# train() writes its checkpoints to /results, so make sure the directory exists
# before training starts (no error if it is already there).
os.makedirs('/results', exist_ok=True)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
# Earlier preprocessing variant (256x256 resize, 0.5/0.5 normalisation); unused.
# transform = transforms.Compose([
#     transforms.Resize((256, 256)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

# ])

# Training-time preprocessing: random 32x32 crop (after 4 px padding) and random
# horizontal flip as augmentation, then tensor conversion and per-channel
# normalisation (constants presumably are the CIFAR-10 channel mean/std).
# NOTE(review): Resize(32) right after RandomCrop(32) looks like a no-op — confirm.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.Resize(32),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation-time preprocessing: no augmentation, same normalisation constants.
transform_test = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])




# CIFAR-10 train/test splits, downloaded into ./data when not already present.
train_set = datasets.CIFAR10(root = 'data', transform = transform_train, download = True, train = True)
test_set = datasets.CIFAR10(root = 'data', transform = transform_test, download = True, train = False)
print(f"Train dataset size: {len(train_set)}")
print(f"Test dataset size: {len(test_set)}")







# NOTE(review): the test loader is shuffled; test() sums over the whole set, so
# the order should not matter for its metrics — shuffle=False would be clearer.
train_dataloader = DataLoader(train_set, batch_size = 64, shuffle = True)
test_dataloader = DataLoader(test_set, batch_size = 1000, shuffle = True)

# Pull one batch to sanity-check the tensor shape produced by the pipeline.
train_item = next(iter(train_dataloader))
print("DataLoader",train_item[0].shape)

# Module-level logging state shared by the training/testing functions below.
log_interval = 10   # log every `log_interval` batches
train_losses = []   # training loss values recorded while training
train_counter = []  # number of samples seen at each recorded training loss
test_losses = []    # average test loss, one entry per test() call
# NOTE(review): this covers 10 epochs (11 entries) while the training loop in
# this notebook runs 35 epochs — confirm before plotting against test_losses.
test_counter = [i*len(train_dataloader.dataset) for i in range(10 + 1)]

def train_epoch(model, dataloader, criterion, optimizer, device, epoch = None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module to optimise.
        dataloader: Iterable of ``(images, labels)`` batches; must expose
            ``.dataset`` and ``len()``.
        criterion: Loss callable ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.
        epoch: 1-based epoch number used for logging and for the sample
            counter. When ``None``, a notebook-level ``epoch`` variable is
            used if one exists (the previous implicit behaviour), else 1.

    Returns:
        ``(mean_batch_loss, accuracy)`` over the epoch; both are 0.0 for an
        empty dataloader.

    Side effects:
        Prints progress every ``log_interval`` batches and appends every
        batch's loss / sample count to the notebook-level ``train_losses`` and
        ``train_counter`` lists.
    """
    if epoch is None:
        epoch = globals().get('epoch', 1)

    model.train()
    dataset_size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss, correct, seen = 0.0, 0, 0

    # Iterate the loader that was passed in, not the notebook-level one.
    for batch_idx, (images, labels) in enumerate(dataloader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(images), dataset_size,
                100. * batch_idx / num_batches, batch_loss))
        train_losses.append(batch_loss)
        # `seen` is the number of samples processed before this batch, which
        # replaces the former hard-coded `batch_idx * 64`.
        train_counter.append(seen + (epoch - 1) * dataset_size)

        seen += len(images)
        total_loss += batch_loss
        correct += (outputs.argmax(1) == labels).sum().item()

    return total_loss / max(num_batches, 1), correct / max(dataset_size, 1)


def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module to evaluate (switched to eval mode).
        dataloader: Iterable of ``(images, labels)`` batches exposing
            ``.dataset`` and ``len()``.
        criterion: Loss callable ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean_batch_loss, accuracy)`` where the loss is averaged over
        batches and the accuracy over ``len(dataloader.dataset)`` samples.
    """
    model.eval()
    loss_sum = 0
    hits = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()
            hits += (logits.argmax(1) == labels).sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = hits / len(dataloader.dataset)
    return mean_loss, accuracy


# NOTE(review): 'Hello4' looks like a leftover debug marker — consider removing.
print('Hello4')
# Same device selection as the earlier cell, repeated so this cell runs on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Model hyper-parameters.
d_model = 384         # token feature size
d_ff = d_model * 4    # hidden width of the feed-forward networks and head MLP
heads = 8             # attention heads; VisionTransformer also uses it as layers per Encoder
labels = 10           # number of output classes
dropout = 0.1


model = VisionTransformer(d_model, d_ff, heads, labels, dropout).to(device)
print("Model architecture:")
print(model)
# The model is already on `device` from the line above; this second move is a no-op.
model = model.to(device)

# Cross-entropy on the raw class scores; Adam with weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4, weight_decay=1e-4)

def train(epoch):
    """Train the notebook-level ``model`` for one epoch over ``train_dataloader``.

    Args:
        epoch: 1-based epoch number, used for logging and for the sample
            counter recorded in ``train_counter``.

    Side effects:
        Every ``log_interval`` batches prints progress and appends the current
        loss / sample count to ``train_losses`` / ``train_counter``. At the end
        of the epoch the model and optimizer states are saved to
        ``/results/model.pth`` and ``/results/optimizer.pth``.
    """
    model.train()
    dataset_size = len(train_dataloader.dataset)
    num_batches = len(train_dataloader)
    seen = 0  # samples processed before the current batch

    for batch_idx, (data, target) in enumerate(train_dataloader):
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), dataset_size,
                100. * batch_idx / num_batches, loss.item()))
            train_losses.append(loss.item())
            # Running sample count instead of a hard-coded `batch_idx * 64`, so
            # the counter stays right if the loader's batch size changes.
            train_counter.append(seen + (epoch - 1) * dataset_size)

        seen += len(data)

    # Checkpoint once per epoch. Serialising the full model and optimizer state
    # every `log_interval` batches rewrote the same two files many times per
    # epoch for no benefit beyond the final write.
    torch.save(model.state_dict(), '/results/model.pth')
    torch.save(optimizer.state_dict(), '/results/optimizer.pth')


def test():
    """Evaluate the notebook-level ``model`` on ``test_dataloader``.

    Computes the per-sample average cross-entropy loss and the number of
    correct predictions over the whole test set, appends the average loss to
    ``test_losses`` and prints a one-line summary.
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_dataloader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            # Sum (not mean) per batch so dividing by the dataset size below
            # gives a true per-sample average. `reduction='sum'` replaces the
            # deprecated `size_average=False`.
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            # `.item()` keeps `correct` a plain Python int rather than a tensor.
            correct += (output.argmax(dim=1) == target).sum().item()

    num_samples = len(test_dataloader.dataset)
    test_loss /= num_samples
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, num_samples,
        100. * correct / num_samples))




# Main training run: 35 epochs, evaluating on the test set after every epoch.
# The loop variable must stay named `epoch`: train_epoch() reads it as a global.
for epoch in range(1, 35+ 1):
  train(epoch)
  test()
# Shape smoke test on random ImageNet-sized input (disabled):
# input = torch.randn(3,3,224,224)
# model = VisionTransformer(d_model = 768, d_ff = 3072, heads = 12, labels = 10, dropout = 0.1)
# output = model(input)
# print(output.shape)


# # training

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:13<00:00, 12.9MB/s]


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified
Train dataset size: 50000
Test dataset size: 10000
DataLoader torch.Size([64, 3, 32, 32])
Hello4
cuda
Model architecture:
VisionTransformer(
  (encoderlayer): EncoderLayer(
    (multihead_attention): MultiHeadAttention(
      (linears): ModuleList(
        (0-3): 4 x Linear(in_features=384, out_features=384, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (feedforward): PositionWiseFeedForward(
      (w_1): Linear(in_features=384, out_features=1536, bias=True)
      (w_2): Linear(in_features=1536, out_features=384, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (sublayer): ModuleList(
      (0-1): 2 x SubLayerConnection(
        (norm): LayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-7): 8 x EncoderLayer(
        (multihead_attention): MultiHeadAttention(
          (l




Test set: Avg. loss: 1.6346, Accuracy: 3740/10000 (37%)


Test set: Avg. loss: 1.2538, Accuracy: 5420/10000 (54%)


Test set: Avg. loss: 1.1707, Accuracy: 5751/10000 (58%)


Test set: Avg. loss: 1.0768, Accuracy: 6153/10000 (62%)


Test set: Avg. loss: 0.9945, Accuracy: 6413/10000 (64%)


Test set: Avg. loss: 0.9159, Accuracy: 6761/10000 (68%)


Test set: Avg. loss: 0.8756, Accuracy: 6939/10000 (69%)


Test set: Avg. loss: 0.8495, Accuracy: 7012/10000 (70%)


Test set: Avg. loss: 0.8068, Accuracy: 7199/10000 (72%)


Test set: Avg. loss: 0.8089, Accuracy: 7165/10000 (72%)


Test set: Avg. loss: 0.7504, Accuracy: 7379/10000 (74%)


Test set: Avg. loss: 0.7242, Accuracy: 7511/10000 (75%)


Test set: Avg. loss: 0.7282, Accuracy: 7499/10000 (75%)


Test set: Avg. loss: 0.7170, Accuracy: 7534/10000 (75%)


Test set: Avg. loss: 0.7457, Accuracy: 7406/10000 (74%)


Test set: Avg. loss: 0.6638, Accuracy: 7703/10000 (77%)


Test set: Avg. loss: 0.6774, Accuracy: 7693/10000 (77%)


Test set: Avg

In [None]:
2

2

In [None]:
from google.colab import files

# Download the optimizer checkpoint written by train() (Colab-only helper).
# NOTE(review): only optimizer.pth is downloaded here; train() also writes
# /results/model.pth — confirm whether that file should be downloaded as well.
files.download("/results/optimizer.pth")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Test with first model

In [None]:
import torch


# Rebuild the architecture with the same hyper-parameters and restore the
# trained weights from Drive.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict holds, and avoids executing arbitrary pickled code.
model = VisionTransformer(d_model, d_ff, heads, labels, dropout).to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/results2/model .pth',
                                 map_location=device, weights_only=True))
model.eval()

# The optimizer state is restored as well so training could be resumed from
# here; it is not used by test() itself.
optimizer = optim.Adam(model.parameters(), lr=2e-4, weight_decay=1e-4)
optimizer.load_state_dict(torch.load('/content/drive/MyDrive/results2/optimizer.pth',
                                     map_location=device, weights_only=True))


test()


  model.load_state_dict(torch.load('/content/drive/MyDrive/results2/model .pth'))
  optimizer.load_state_dict(torch.load('/content/drive/MyDrive/results2/optimizer.pth'))



Test set: Avg. loss: 0.6233, Accuracy: 8001/10000 (80%)

