In [1]:
# Let's import all the necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import random
import numpy as np

In [2]:
# Before starting the workflow, pick the device we are currently using:
# the model and every batch are moved to it later on.
# Default to the CPU and switch to the GPU only when CUDA is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# <font size= 6>Importing the dataset</font>

This time I will use my own `Dataset` wrapper around the Omniglot dataset that ships with torchvision.

In [3]:
class SiameseOmniglot(Dataset):
    """Omniglot wrapped as a dataset of image pairs for Siamese training.

    Every item is a triple ``(img1, img2, target)``: ``img1`` is the sample at
    the requested index, ``img2`` is a randomly drawn partner, and ``target``
    is a float32 scalar tensor that is ``1.0`` when both images share a class
    label and ``0.0`` otherwise.  The partner is re-drawn on every access, so
    the same index can yield different pairs.

    Args:
        root: Directory handed to ``datasets.Omniglot`` (downloaded if missing).
        background: Forwarded to ``datasets.Omniglot`` to choose the split.
        transform: Optional transform applied by the wrapped dataset to each image.
    """

    def __init__(self, root, background=True, transform=None):
        self.dataset = datasets.Omniglot(root=root, background=background, download=True, transform=transform)

        # Map each class label to the indices of its samples so that partners
        # can be drawn by label in __getitem__.
        # NOTE(review): reading self.dataset[idx] loads (and transforms) every
        # image once just to obtain its label, which makes construction slow.
        self.cls_to_indices = {}
        for idx in range(len(self.dataset)):
            _, label = self.dataset[idx]
            self.cls_to_indices.setdefault(label, []).append(idx)
        self.labels = list(self.cls_to_indices.keys())

    def __getitem__(self, index):
        """Return ``(img1, img2, target)`` for the anchor sample at ``index``.

        Raises:
            ValueError: If neither a same-class nor a different-class partner
                exists for this sample (e.g. a one-sample dataset).
        """
        img1, label1 = self.dataset[index]

        # Coin flip: 1 -> same-class (positive) pair, 0 -> different-class pair.
        target = np.random.randint(0, 2)

        # The rejection loops below only terminate when a valid partner exists,
        # so check that up front and fall back to the other pair type if needed.
        can_pair_same = len(self.cls_to_indices[label1]) > 1
        can_pair_diff = len(self.labels) > 1
        if not (can_pair_same or can_pair_diff):
            raise ValueError(
                f"Cannot build a pair for index {index}: its class has a single "
                "sample and the dataset has no other class."
            )
        if target == 1 and not can_pair_same:
            target = 0
        elif target == 0 and not can_pair_diff:
            target = 1

        if target == 1:
            # Draw another sample of the same class, never the anchor itself.
            siamese_index = index
            while siamese_index == index:
                siamese_index = random.choice(self.cls_to_indices[label1])
        else:
            # Draw a different class first, then any sample of that class.
            label2 = random.choice(self.labels)
            while label2 == label1:
                label2 = random.choice(self.labels)
            siamese_index = random.choice(self.cls_to_indices[label2])

        img2, _ = self.dataset[siamese_index]

        return img1, img2, torch.tensor(target, dtype=torch.float32)

    def __len__(self):
        """Number of anchor samples (same as the wrapped dataset)."""
        return len(self.dataset)

<b>I don't know a lot about data preparation, so I followed the documentation on how to prepare the data before loading the whole dataset.</b>

In [4]:
# Preprocessing applied, in order, to every image returned by the dataset.
preprocessing_steps = [
    # Resize to 28x28: this is the resolution the CNN in this notebook is built
    # for (its first Linear layer expects 128 * 5 * 5 features).
    # NOTE(review): 28x28 is a downscale, not Omniglot's native size
    # (the originals should be 105x105 - confirm against the dataset).
    transforms.Resize((28, 28)),
    # Convert the image to a tensor.
    transforms.ToTensor(),
    # Invert the intensities (x -> 1 - x).
    transforms.Lambda(lambda x: 1.0 - x),
]
transform = transforms.Compose(preprocessing_steps)

In [5]:
batch_size = 64  # number of image pairs per mini-batch (my usual default)

# Both splits share the same root folder and preprocessing; only the
# `background` flag passed on to Omniglot differs between them.
dataset_kwargs = dict(root="data", transform=transform)
train_ds = SiameseOmniglot(background=True, **dataset_kwargs)
test_ds = SiameseOmniglot(background=False, **dataset_kwargs)

# Shuffle only the training pairs; the test loader keeps the dataset order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

100%|██████████| 9.46M/9.46M [00:00<00:00, 111MB/s]
100%|██████████| 6.46M/6.46M [00:00<00:00, 61.7MB/s]


In [6]:
# Sanity check: draw a single batch and print its tensor shapes.
batch_iterator = iter(train_loader)
examples = next(batch_iterator)  # (img1 batch, img2 batch, targets)
print(f"Batch shape: {examples[0].shape}") # Should be [64, 1, 28, 28]
print(f"Target shape: {examples[2].shape}") # Should be [64]

Batch shape: torch.Size([64, 1, 28, 28])
Target shape: torch.Size([64])


# <font size= 6>Creating the Model</font>

<b>I will just create my own simple CNN, since the images are small.</b>

In [7]:
class SiameseNet(nn.Module):
    """Siamese embedding network: one shared CNN applied to both images of a pair.

    The Linear layer after ``Flatten`` expects ``128 * 5 * 5`` features, which
    is what two (3x3 conv, 2x2 max-pool) stages produce from a 28x28 input
    (28 -> 26 -> 13 -> 11 -> 5).  Each image is mapped to a 128-dimensional vector.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor: 1 -> 64 -> 128 channels.
        feature_layers = [
            nn.Conv2d(1, 64, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        # Fully connected head that produces the final embedding.
        embedding_layers = [
            nn.Flatten(),
            nn.Linear(128 * 5 * 5, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
        ]
        # Kept as a single Sequential named `cnn`, with layers in this order.
        self.cnn = nn.Sequential(*feature_layers, *embedding_layers)

    def forward(self, x1, x2):
        """Embed both inputs with the same weights and return ``(out1, out2)``."""
        return self.cnn(x1), self.cnn(x2)

# Instantiate the network and move its parameters to the selected device.
model = SiameseNet().to(device)

In [8]:
# Let's inspect the model's layers. `model.eval()` is the last expression of
# the cell, so the notebook displays the module it returns (the layer listing
# shown in the output); the call also switches the model to evaluation mode.
model.eval()

SiameseNet(
  (cnn): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=3200, out_features=256, bias=True)
    (8): ReLU()
    (9): Linear(in_features=256, out_features=128, bias=True)
  )
)

<font size = 6>Creating a custom loss function</font>

I found out that the training and validation data of Omniglot are completely different: we train on alphabets such as Latin and Greek, but our test set contains other alphabets such as Tibetan and Sanskrit (the exact alphabet lists are worth double-checking against the dataset folders). Because of this, using cross-entropy as the loss function does not work here: the accuracy of the final model was only 0.1%. So I looked through the documentation and found another kind of loss function, the contrastive loss ("ContrastiveLoss").

In [9]:
class ContrastiveLoss(nn.Module):
    """Contrastive loss over a batch of embedding pairs.

    With ``d`` the Euclidean distance between the two embeddings of a pair:

    * ``label == 1`` pairs contribute ``d ** 2``,
    * ``label == 0`` pairs contribute ``max(margin - d, 0) ** 2``.

    The per-pair values are averaged over the batch.

    Args:
        margin: Distance beyond which a ``label == 0`` pair adds no loss.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean contrastive loss for the given embeddings and labels."""
        dist = F.pairwise_distance(output1, output2)

        # Term that is active when label is 1: penalise any distance.
        pull_term = label * dist.pow(2)
        # Term that is active when label is 0: penalise only distances inside the margin.
        push_term = (1 - label) * (self.margin - dist).clamp(min=0.0).pow(2)

        return (pull_term + push_term).mean()

In [10]:
# Step size used by the Adam optimizer.
learning_rate = 0.0005

# Contrastive loss with its default margin, and Adam over all model parameters.
loss_fn = ContrastiveLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# <font size = 6>Training the Model</font>

In [11]:
def train_loop(dataloader, model, loss_fn, optimizer, log_every=100):
    """Run one training pass of ``model`` over ``dataloader``.

    Args:
        dataloader: Iterable of ``(img1, img2, target)`` batches.
        model: Module called as ``model(img1, img2)`` that returns two outputs.
        loss_fn: Callable ``loss_fn(out1, out2, target)`` returning a scalar loss.
        optimizer: Optimizer that updates ``model``'s parameters.
        log_every: Print the batch loss every this many batches
            (``0`` or ``None`` disables the printing).
    """
    # Move each batch to wherever the model's parameters live rather than
    # relying on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for batch, (img1, img2, target) in enumerate(dataloader):
        img1 = img1.to(model_device)
        img2 = img2.to(model_device)
        target = target.to(model_device)

        out1, out2 = model(img1, img2)
        loss = loss_fn(out1, out2, target)

        # Standard update step: clear old gradients, backpropagate, apply.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and batch % log_every == 0:
            print(f"Train Loss: {loss.item():>7f}")

def test_loop(dataloader, model):
    model.eval()
    correct = 0
    total = 0
    # Threshold: If distance < 0.5, we predict they are the SAME character
    threshold = 0.5

    with torch.no_grad():
        for img1, img2, target in dataloader:
            img1, img2, target = img1.to(device), img2.to(device), target.to(device)
            out1, out2 = model(img1, img2)

            # Calculate distance
            # since the contrastive loss doesn't categorize images into buckets. It acts like a ruler.
            distance = F.pairwise_distance(out1, out2)

            # Prediction: 1 if dist < threshold else 0
            prediction = (distance < threshold).float()

            correct += (prediction == target).sum().item()
            total += target.size(0)

    print(f"Validation Accuracy (Pairwise Verification): {(100*correct/total):>0.1f}%")

In [12]:
# Number of full passes over the training pairs.
num_epochs = 20

print("Starting Siamese Training...")
for t in range(num_epochs):
    print(f"Epoch {t+1}")
    # One optimisation pass over the training pairs, then a validation pass.
    train_loop(train_loader, model, loss_fn, optimizer)
    test_loop(test_loader, model)
print("Done!")

Starting Siamese Training...
Epoch 1
Train Loss: 0.365128
Train Loss: 0.127335
Train Loss: 0.118228
Train Loss: 0.117928
Validation Accuracy (Pairwise Verification): 84.6%
Epoch 2
Train Loss: 0.094132
Train Loss: 0.117458
Train Loss: 0.082212
Train Loss: 0.093559
Validation Accuracy (Pairwise Verification): 86.1%
Epoch 3
Train Loss: 0.089289
Train Loss: 0.098420
Train Loss: 0.092595
Train Loss: 0.084694
Validation Accuracy (Pairwise Verification): 87.5%
Epoch 4
Train Loss: 0.087108
Train Loss: 0.087044
Train Loss: 0.098247
Train Loss: 0.083399
Validation Accuracy (Pairwise Verification): 87.8%
Epoch 5
Train Loss: 0.086476
Train Loss: 0.089016
Train Loss: 0.101684
Train Loss: 0.094478
Validation Accuracy (Pairwise Verification): 88.1%
Epoch 6
Train Loss: 0.104392
Train Loss: 0.096088
Train Loss: 0.094255
Train Loss: 0.077756
Validation Accuracy (Pairwise Verification): 88.2%
Epoch 7
Train Loss: 0.072830
Train Loss: 0.074937
Train Loss: 0.085536
Train Loss: 0.096734
Validation Accuracy (