In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [2]:
#Library
from torch.torch_version import Version
import torch
import torch.nn as nn
from torch.nn import functional as F

#I am using variable names defined in "Attention Is All You Need" for simplicity.

#Define a single Scaled Dot-Product Attention head. I am simplifying it by enforcing d_k = d_v
class Attention(nn.Module):

  def __init__(self, d_k, mask, context_len):
    super().__init__()
    self.mask = mask
    self.d_k = d_k
    self.register_buffer('lowerTriangle', torch.tril(torch.ones(context_len, context_len)))

  def forward(self, Q, K, V):
    #Takes in vectors of Queries, Keys & Values. Queries, Keys & Values have dimension seq x d_k
    if (Q.shape[1] != self.d_k or K.shape[1] != self.d_k or V.shape[1] != self.d_k):
      raise Exception('Invalid Query, Key or Value Dimensions')
    seq = Q.shape[0]
    #First take the dot product of Queries & Keys. Weight has dimensions seq x seq
    weight = Q @ K.transpose(0, 1)
    #Now scale by 1/sqrt(d_k)
    weight = weight  * (self.d_k**-0.5)
    #Mask everything not in the lower triangular of weight
    if (self.mask):
      weight = weight.masked_fill(self.lowerTriangle[:seq, :seq]== 0, float('-inf'))
    #Now apply softmax in the dimension of the rows
    weight = weight.softmax(1)
    #Finally, multiply values by weights to get the outputs
    out = weight @ V # [seq, seq] x [seq, d_k] = [seq, d_k]
    return out

#Define a Multi-Head Attention layer. As in the paper, I am setting d_k = d_model/h
class MultiHeadAttention(nn.Module):

  def __init__(self, d_model, h, mask, context_len):
    super().__init__()
    self.mask = mask
    self.d_model = d_model
    self.h = h
    self.d_k = int(d_model/h)
    if (self.d_k != d_model/h):
      raise Exception('Invalid Dimensions Provided') #Ensure valid dimensions
    self.W_O = nn.Linear(h*self.d_k, d_model, bias=False) #Output linear layer
    self.W_Q = nn.ModuleList() #The h different Query linear layers
    self.W_K = nn.ModuleList() #The h different Key linear layers
    self.W_V = nn.ModuleList() #The h different Value linear layers
    self.Att = Attention(self.d_k, mask, context_len)
    for i in range(h):
      #Initialize all h linear layers for Q, K, V
      self.W_Q.append(nn.Linear(d_model, self.d_k, bias=False))
      self.W_K.append(nn.Linear(d_model, self.d_k, bias=False))
      self.W_V.append(nn.Linear(d_model, self.d_k, bias=False)) #TODO: Instead of Linear layers, just create Matrices

  def forward(self, Q, K, V):
    #The inputs are Queries, Keys and Vectors which each have size seq x d_model
    if (Q.shape[1] != self.d_model or K.shape[1] != self.d_model or V.shape[1] != self.d_model):
      raise Exception('Invalid Query, Key or Value Dimensions')
    heads = []
    for i in range(self.h):
      queries = self.W_Q[i](Q)
      keys = self.W_K[i](K)
      values = self.W_V[i](V)
      #At this point, queries keys & values have dimensions seq x d_k
      heads.append(self.Att.forward(queries, keys, values))
    out = torch.cat(heads, 1) #The output has the same dimension as all the inputs: seq x d_model
    out = self.W_O(out)
    return out

#Define a Feed Forward Network
class FeedForward(nn.Module):

  def __init__(self, d_model, d_hidden):
    super().__init__()
    self.network = nn.Sequential(
        nn.Linear(d_model, d_hidden, bias=True),
        nn.ReLU(),
        nn.Linear(d_hidden, d_model, bias=True)
    )

  def forward(self, x):
    x = self.network(x)
    return x

#Define Layer Normalization:
class LayerNorm(nn.Module):

  def __init__(self, d_model):
    super().__init__()
    self.epsilon = 1e-5
    self.gamma = nn.Parameter(torch.ones(d_model))
    self.beta = nn.Parameter(torch.zeros(d_model))

  def __call__(self, x):
    mean = x.mean(1, keepdim=True) #Mean across the layer i.e. the column
    variance = x.var(1, keepdim=True) #Mean across the layer i.e. column
    norm = (x - mean) / torch.sqrt(variance + self.epsilon) #Normalize
    out = self.gamma * norm + self.beta #Scale by gamma, add beta to achieve var= gamma, mean = beta
    return out

#Define batch normalization
class BatchNorm(nn.Module):
    
  def __init__(self, d_model):
    super().__init__()
    self.epsilon = 1e-5
    self.gamma = nn.Parameter(torch.ones(d_model))
    self.beta = nn.Parameter(torch.zeros(d_model))

  def __call__(self, x):
    mean = x.mean(0, keepdim=True) #Mean across the batch i.e. row
    variance = x.var(0, keepdim=True) #Mean across the batch i.e. row
    norm = (x - mean) / torch.sqrt(variance + self.epsilon) #Normalize
    out = self.gamma * norm + self.beta #Scale by gamma, add beta to achieve var= gamma, mean = beta
    return out

#Define an Block of Multi Head Self-Attention with a Residual Connection & Layer Normalization
class NormalizedSelfAttention(nn.Module):

  def __init__(self, d_model, h, mask, context_len):
    super().__init__()
    self.MHA = MultiHeadAttention(d_model, h, mask, context_len)
    self.norm = BatchNorm(d_model)

  def forward(self, x):
    x = x + self.MHA(x, x, x)
    x = self.norm(x)
    return x

#Define a block of a Feed Forward Network with a Residual Connection & Layer Normalization
class NormalizedFeedForward(nn.Module):

  def __init__(self, d_model, d_hidden):
    super().__init__()
    self.FF = FeedForward(d_model, d_hidden)
    self.norm = BatchNorm(d_model)

  def forward(self, x):
    x = x + self.FF(x)
    x = self.norm(x)
    return x

#Define a stand-alone Encoder block with N-self attention and feed forward blocks (without embeddings or softmax)
#This is identical to StandAloneDecoder except that masking is set to false, so attention is bi-directional
class StandAloneEncoder(nn.Module):

  def __init__(self, N, d_model, h, d_hidden, context_len):
    super().__init__()
    self.network = nn.Sequential()
    for i in range(N):
      self.network.append(NormalizedSelfAttention(d_model, h, False, context_len))
      self.network.append(NormalizedFeedForward(d_model, d_hidden))

  def forward(self, x):
    out = self.network(x)
    return out

#Define a stand-alone Decoder block with N self-attention and feed forward blocks (without embeddings or softmax)
#Note that the self attention is Masked because this is a Decoder
class StandAloneDecoder(nn.Module):

  def __init__(self, N, d_model, h, d_hidden, context_len):
    super().__init__()
    self.network = nn.Sequential()
    for i in range(N):
      self.network.append(NormalizedSelfAttention(d_model, h, True, context_len))
      self.network.append(NormalizedFeedForward(d_model, d_hidden))

  def forward(self, x):
    out = self.network(x)
    return out

In [3]:
#Set up pytorch to run on GPU if available
if torch.cuda.is_available():
    processor = 'cuda'
else:
    processor = 'cpu'
device = torch.device(processor)

In [4]:
#Load data, split into 90% train, 10% test
test = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test = test.sample(frac=1).reset_index(drop=True)
n1 = int(len(test) * 0.9)

#parse dataset as tensors
label = torch.tensor(test.loc[:,'label'].to_numpy(), dtype=torch.int64)
print(label.size())
digits = torch.tensor(test.iloc[:, 1:].to_numpy(), dtype=torch.float32) / 255
print(digits.size())
print(digits.dtype)

#Training, dev & test splits
Xtr = digits
Ytr = label
Xtest = digits[n1:]
Ytest = label[n1:]

#Move all to gpu if possible
Xtr = Xtr.to(device)
Ytr = Ytr.to(device)
Xtest = Xtest.to(device)
Ytest = Ytest.to(device)

#Print a single image embedding
print(Xtr[0])

torch.Size([42000])
torch.Size([42000, 784])
torch.float32
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.000

In [5]:
#Choose hyperparameters
num_layers = 8 #Number of encoder layers
model_dim = 4 #We will project every (scalar) greyscale pixture into this many dimensions, and use that number in our transformer
parallel_heads = 2 #Number of parallel attention heads
hidden_dim = 784 #Output dimension of internal feed forward layers
max_context = 784 #Max length of an input sequence (there are 784 pixels in each image)

#Create model
class VisionEncoder(nn.Module):
    
    def __init__(self, num_layers, model_dim, parallel_heads, hidden_dim, max_context):
        super().__init__()
        self.projection = nn.Linear(1, model_dim) #Learned projection from 1-d greyscale pixels to 4-d representation
        self.encoder = StandAloneEncoder(num_layers, model_dim, parallel_heads, hidden_dim, max_context)
        self.classifier1 = nn.Linear(max_context*model_dim, 10)
    
    def forward(self, x, target):
        x = self.projection(x)
        x = self.encoder(x).view(1, max_context*model_dim)
        x = self.classifier1(x)
        x = x.view(10)
        test_probs = F.softmax(x, dim=0)
        prediction = torch.argmax(test_probs, dim=0)
        if target is None:
            loss = None
        else:
            onehot = F.one_hot(target, num_classes=10).type(torch.float32).view(10).to(device)
            loss = F.cross_entropy(test_probs, onehot)
            
        return prediction, loss

model = VisionEncoder(num_layers, model_dim, parallel_heads, hidden_dim, max_context).to(device)
                      
#Sample forward pass
#print("Input shape:")
test_input = Xtr[0].view(784, 1)
#print(test_input.shape)
#print("Input:")
#print(test_input)
output, loss = model(test_input, Ytr[0])
print("Output:")
print(output)
print("Loss")
print(loss)

Output:
tensor(2, device='cuda:0')
Loss
tensor(2.1688, device='cuda:0', grad_fn=<DivBackward1>)


In [6]:
def get_example():
    #Choose a random image from the dataset.
    image_index = torch.randint(Xtr.shape[0], (1,))
    x = Xtr[image_index].view(784, 1)
    y = Ytr[image_index]
    return x,y

x_train, y_train = get_example()
print(x_train.shape)
print(y_train.shape)

@torch.no_grad()
def estimate_loss(eval_iters):
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_example()
        prediction, loss = model(X, Y)
        losses[k] = loss.item()
    out = losses.mean()
    model.train()
    return out

@torch.no_grad()
def validation_loss():
    model.eval()
    losses = torch.zeros(Xtest.shape[0])
    for x, y, iteration in zip(Xtest, Ytest, range(Xtest.shape[0])):
        x = x.view(784, 1)
        prediction, loss = model(x, y)
        losses[iteration] = loss.item()
    out = losses.mean()
    model.train()
    return out

torch.Size([784, 1])
torch.Size([1])


In [7]:
def train(training_iters, eval_interval, eval_iters, optimizer):
    for iter in range(training_iters):
        #Evaluate training loss
        if iter % eval_interval == 0 or iter == training_iters - 1:
            losses = estimate_loss(eval_iters)
            print(f"step {iter}: train loss {losses.item():.4f}")

        #Get a training example
        x,y = get_example()

        #evaluate loss
        prediction, loss = model(x, y)
        
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [8]:
#Setup optimizer, train
epoch_size = Xtr.shape[0]
training_program = [(1, 1e-3), (1, 1e-3), (1, 1e-3), (1, 1e-4)] #Each tuple is (# epochs, learning rate)
evaluation_interval = round(epoch_size / 20)
evaluation_iterations = round(epoch_size / 500)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for epochs, learning_rate in training_program:
    print("-------New Epoch--------")
    optimizer.param_groups[0]['lr'] = learning_rate
    train(round(epoch_size * epochs), evaluation_interval, evaluation_iterations, optimizer)
    print("Validation loss: ", validation_loss())
    

-------New Epoch--------
step 0: train loss 2.3174
step 2100: train loss 1.7100
step 4200: train loss 1.7903
step 6300: train loss 1.6586
step 8400: train loss 1.5966
step 10500: train loss 1.6615
step 12600: train loss 1.5564
step 14700: train loss 1.6014
step 16800: train loss 1.6167
step 18900: train loss 1.5696
step 21000: train loss 1.5901
step 23100: train loss 1.5808
step 25200: train loss 1.5594
step 27300: train loss 1.5757
step 29400: train loss 1.5747
step 31500: train loss 1.5607
step 33600: train loss 1.5776
step 35700: train loss 1.6637
step 37800: train loss 1.5021
step 39900: train loss 1.6076
step 41999: train loss 1.5435
Validation loss:  tensor(1.5735)
-------New Epoch--------
step 0: train loss 1.5232
step 2100: train loss 1.5551
step 4200: train loss 1.6628
step 6300: train loss 1.5789
step 8400: train loss 1.6017
step 10500: train loss 1.5415
step 12600: train loss 1.5918
step 14700: train loss 1.6127
step 16800: train loss 1.5906
step 18900: train loss 1.5225
ste

Val loss: 1.5400, N=4, 1 layer classifier    
Val loss: 1.5371, N=8, 1 layer classifier

In [9]:
#Get percentage success rate on test data
success = 0
fail = 0
model.eval()
for x, y in zip(Xtest, Ytest):
    x = x.view(784, 1)
    prediction, loss = model(x, y)
    #print("Prediction: ", prediction)
    #print("Expected: ", y.item())
    if prediction == y.item():
        success += 1
    else:
        fail += 1
model.train()
success_rate = success / Xtest.shape[0]
print("Success rate: ", success_rate)
print("Successes: ", success)
print("Failures: ", fail)

Success rate:  0.925952380952381
Successes:  3889
Failures:  311


In [10]:
#Load test data, run inference and save the output for submission.
test_data = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
print("Number of test images: ", test_data.shape[0])

#parse dat as tensor
test_digits = torch.tensor(test_data.to_numpy(), dtype=torch.float32) / 255

#Move all to gpu if possible
test_digits = test_digits.to(device)


#Run prediction
results = []
for x in test_digits:
    x = x.view(784, 1)
    prediction, _ = model(x, None)
    results.append(prediction.item())

Number of test images:  28000


In [11]:
#Save results in dataframe
submission = pd.DataFrame({'ImageId': range(1, len(results) + 1), 'Label': results})
print("Number of results: ", len(results))
submission

Number of results:  28000


Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,2
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [12]:
submission.to_csv('submission.csv', index=False)