<a href="https://colab.research.google.com/github/Markmei123/Landmark-recognition-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%cd ..

/content


In [None]:
!ls

google-landmark  sample_data  train.csv


In [None]:
# Clone only when the checkout is missing so the cell survives Restart & Run All.
![ -d google-landmark ] || git clone https://github.com/cvdfoundation/google-landmark.git

Cloning into 'google-landmark'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 109 (delta 6), reused 10 (delta 6), pack-reused 93 (from 1)[K
Receiving objects: 100% (109/109), 30.66 KiB | 3.41 MiB/s, done.
Resolving deltas: 100% (33/33), done.


In [None]:
%cd google-landmark/

/content/google-landmark


In [None]:
# -p makes the cell idempotent: no error when the directory already exists.
!mkdir -p train

In [None]:
%cd train

/content/google-landmark/train


In [None]:
!bash ../download-dataset.sh train 5

Downloading images_000.tar and its md5sum...
Downloading images_001.tar and its md5sum...
Downloading images_002.tar and its md5sum...
Downloading images_003.tar and its md5sum...
Downloading images_004.tar and its md5sum...
Downloading images_005.tar and its md5sum...
images_005.tar extracted!
images_001.tar extracted!
images_000.tar extracted!
images_002.tar extracted!
images_003.tar extracted!
images_004.tar extracted!


In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one this notebook was executed with so results stay reproducible.
%pip install -q einops==0.8.0

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.2 kB[0m [31m982.7 kB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m741.6 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


Define ViT model

In [None]:
import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

# helpers

def pair(t):
    """Return ``t`` as-is when it is a tuple, otherwise the 2-tuple ``(t, t)``."""
    if isinstance(t, tuple):
        return t
    return (t, t)

# classes

class FeedForward(nn.Module):
    """Position-wise MLP: LayerNorm -> Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: size of the last dimension of the input and of the output.
        hidden_dim: width of the intermediate linear layer.
        dropout: dropout probability applied after the activation and after
            the output projection.
    """

    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        # Stages are listed in execution order. Their positions inside the
        # Sequential determine the state-dict keys (net.0, net.1, net.4).
        stages = [
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``dim``."""
        transformed = self.net(x)
        return transformed

class Attention(nn.Module):
    """Multi-head self-attention with a LayerNorm applied to the input first.

    Args:
        dim: size of the last dimension of the input (and of the output).
        heads: number of attention heads.
        dim_head: width of each head.
        dropout: dropout probability for the attention weights and for the
            output projection.
    """

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        qkv_width = heads * dim_head
        # A single head whose width already equals `dim` needs no projection
        # back to `dim`; every other configuration does.
        needs_projection = heads != 1 or dim_head != dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.norm = nn.LayerNorm(dim)

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One matmul produces queries, keys and values side by side.
        self.to_qkv = nn.Linear(dim, qkv_width * 3, bias = False)

        if needs_projection:
            self.to_out = nn.Sequential(
                nn.Linear(qkv_width, dim),
                nn.Dropout(dropout)
            )
        else:
            self.to_out = nn.Identity()

    def forward(self, x):
        """Attend over the token axis of a ``(batch, tokens, dim)`` tensor."""
        normed = self.norm(x)

        # Split the fused projection into q, k, v and move heads to their own axis.
        q, k, v = (
            rearrange(part, 'b n (h d) -> b h n d', h = self.heads)
            for part in self.to_qkv(normed).chunk(3, dim = -1)
        )

        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.dropout(self.attend(scores))

        # Weighted sum of values, then merge the heads back into one feature axis.
        merged = rearrange(torch.matmul(weights, v), 'b h n d -> b n (h d)')
        return self.to_out(merged)

class Transformer(nn.Module):
    """Stack of ``depth`` (attention, feed-forward) pairs with residual
    connections, followed by a final LayerNorm.

    Args:
        dim: feature size of the tokens.
        depth: number of (attention, feed-forward) pairs.
        heads: attention heads per layer.
        dim_head: width of each attention head.
        mlp_dim: hidden width of each feed-forward block.
        dropout: dropout probability forwarded to both sub-blocks.
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        # Within each pair the attention block is built before the
        # feed-forward block, matching the order they run in forward().
        self.layers = nn.ModuleList([
            nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout),
                FeedForward(dim, mlp_dim, dropout = dropout),
            ])
            for _ in range(depth)
        ])

    def forward(self, x):
        """Apply every layer with skip connections, then the final norm."""
        for block in self.layers:
            attention, feed_forward = block
            residual = x
            x = attention(residual) + residual
            residual = x
            x = feed_forward(residual) + residual

        return self.norm(x)

class ViT(nn.Module):
    """Vision Transformer classifier.

    The image is cut into non-overlapping patches, each patch is linearly
    embedded, a learned class token and learned position embeddings are added,
    the sequence goes through a Transformer encoder, and a linear head maps the
    pooled representation to ``num_classes`` logits.

    Args (keyword-only):
        image_size: int or (height, width) of the input images.
        patch_size: int or (height, width) of one patch; must divide the image.
        num_classes: number of output logits.
        dim: token embedding size.
        depth: number of Transformer layers.
        heads: attention heads per layer.
        mlp_dim: hidden width of the feed-forward blocks.
        pool: 'cls' to read the class token, 'mean' to average all tokens.
        channels: number of image channels.
        dim_head: width of each attention head.
        dropout: dropout probability inside the Transformer.
        emb_dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        img_h, img_w = pair(image_size)
        patch_h, patch_w = pair(patch_size)

        assert img_h % patch_h == 0 and img_w % patch_w == 0, 'Image dimensions must be divisible by the patch size.'

        grid_rows = img_h // patch_h
        grid_cols = img_w // patch_w
        num_patches = grid_rows * grid_cols
        patch_dim = channels * patch_h * patch_w
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Flatten each patch into a vector, then normalise -> project -> normalise.
        patch_stages = [
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_h, p2 = patch_w),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        ]
        self.to_patch_embedding = nn.Sequential(*patch_stages)

        # One extra position for the class token that is prepended in forward().
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        """Return class logits for a batch of images laid out as (b, c, h, w)."""
        tokens = self.to_patch_embedding(img)
        batch, patch_count, _ = tokens.shape

        # Prepend one copy of the class token per sample.
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = batch)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens = tokens + self.pos_embedding[:, :(patch_count + 1)]
        tokens = self.dropout(tokens)

        encoded = self.transformer(tokens)

        if self.pool == 'mean':
            pooled = encoded.mean(dim = 1)
        else:
            pooled = encoded[:, 0]

        return self.mlp_head(self.to_latent(pooled))

Data preprocessing and training

In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
# from Visiontransformer import ViT
from torch import nn, optim


class CustomImageDataset(Dataset):
    """Image dataset backed by a CSV index and a sharded image directory.

    The first CSV column holds the image id and the ``landmark_id`` column
    holds the class. The image for id ``abc...`` is expected at
    ``<root_dir>/a/b/c/abc....jpg``. Rows whose image file is missing are
    dropped at construction time, and the remaining landmark ids are remapped
    to contiguous indices ``0..K-1`` in order of first appearance.

    Args:
        csv_file: path to the CSV index.
        root_dir: directory containing the sharded image tree.
        transform: optional callable applied to each loaded PIL image.

    Attributes:
        labels_df: the filtered index, re-indexed from 0.
        label_to_index: mapping from original landmark id to class index.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        index_df = pd.read_csv(csv_file)

        # Keep only rows whose image exists on disk. The mask is built as an
        # explicit boolean Series so an empty CSV still filters correctly
        # (row-wise DataFrame.apply does not return a Series in that case).
        on_disk = pd.Series(
            [os.path.exists(self._image_path(img_id)) for img_id in index_df.iloc[:, 0]],
            index=index_df.index,
            dtype=bool,
        )
        self.labels_df = index_df.loc[on_disk].reset_index(drop=True)

        # Map each remaining landmark id to a dense class index.
        self.label_to_index = {label: idx for idx, label in enumerate(self.labels_df['landmark_id'].unique())}

    def _image_path(self, img_id):
        """Build the on-disk path for ``img_id`` (sharded by its first three characters)."""
        img_id = str(img_id)
        return os.path.join(self.root_dir, img_id[0], img_id[1], img_id[2], f"{img_id}.jpg")

    def _check_image_exists(self, row):
        """Return True when the image referenced by the first field of ``row`` exists."""
        # .iloc makes the positional lookup explicit; plain row[0] on a
        # label-indexed Series is deprecated in pandas.
        return os.path.exists(self._image_path(row.iloc[0]))

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for row ``idx``."""
        img_id = self.labels_df.iloc[idx, 0]
        # Read the label by column name, consistent with how the mapping was built.
        original_label = self.labels_df['landmark_id'].iloc[idx]
        label = self.label_to_index[original_label]

        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block ends.
        with Image.open(self._image_path(img_id)) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label


print("--------------start-----------------")
# The pipeline gets its own name. Rebinding `transforms` would shadow the
# torchvision module and make this cell fail with AttributeError on re-run.
# Images are resized to the 224x224 input the model is built for and
# normalised per channel (these are the commonly used ImageNet statistics).
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# adjust the path of file above before running
# NOTE(review): both paths are relative to the current working directory,
# which the %cd cells above change -- confirm the cwd before running.
train_dataset = CustomImageDataset(csv_file='../train.csv', root_dir='./train', transform=train_transform)

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)

used_labels = train_dataset.labels_df['landmark_id']

# get all unique label
unique_used_labels = used_labels.unique()

# get the number of labels
num_unique_used_labels = len(unique_used_labels)

print(f"there are  {num_unique_used_labels} different labels in the dataset")

# The classifier head is sized to the labels actually present after filtering.
model = ViT(
    image_size=224,
    patch_size=16,
    num_classes=num_unique_used_labels,
    dim=128,
    depth=8,
    heads=6,
    mlp_dim=236,
    dropout=0.1,
    emb_dropout=0.1
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def train(model, train_loader, criterion, optimizer, epochs=20, device=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: the ``nn.Module`` to optimise; it is switched to train mode.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter, so the function no longer relies on a
            global ``device`` variable.

    Returns:
        A list with one dict per epoch: ``{"epoch", "loss", "accuracy"}``.
        ``loss`` is the mean of the per-batch losses and ``accuracy`` is the
        fraction of the samples seen in that epoch that were predicted
        correctly.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    history = []
    shapes_reported = False
    model.train()
    for epoch in range(epochs):
        print(f"epoch:{epoch} in {epochs}")
        total_loss = 0
        correct = 0
        seen = 0
        batches = 0
        for imgs, labels in train_loader:
            if not shapes_reported:
                # One-time sanity print of the batch shapes, taken from the
                # first real training batch instead of a separate pass.
                print(imgs.shape, labels.shape)
                shapes_reported = True

            imgs, labels = imgs.to(device), labels.to(device).long()

            outputs = model(imgs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            # Count what was actually processed so accuracy stays correct even
            # when the loader drops or subsamples part of the dataset.
            seen += labels.size(0)
            batches += 1

        if seen == 0:
            raise ValueError("train_loader yielded no samples; nothing to train on")

        avg_loss = total_loss / batches
        accuracy = correct / seen
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}, Accuracy: {accuracy}")
        history.append({"epoch": epoch + 1, "loss": avg_loss, "accuracy": accuracy})

    return history

# Kick off training with the objects configured above (default epoch count).
train(model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer)





--------------start-----------------


  img_id = row[0]


there are  6366 different labels in the dataset
torch.Size([32, 3, 224, 224]) torch.Size([32])
epoch:0 in 20
Epoch 1/20, Loss: 8.769519344452888, Accuracy: 0.0
epoch:1 in 20
Epoch 2/20, Loss: 8.901951662406393, Accuracy: 0.0
epoch:2 in 20
Epoch 3/20, Loss: 8.909409914148569, Accuracy: 0.0007224389539083947
epoch:3 in 20
Epoch 4/20, Loss: 8.793317684929491, Accuracy: 0.0015893656985984685
epoch:4 in 20
Epoch 5/20, Loss: 8.626118796212333, Accuracy: 0.002167316861725184
epoch:5 in 20
Epoch 6/20, Loss: 8.436985138923891, Accuracy: 0.0014448779078167894
epoch:6 in 20
Epoch 7/20, Loss: 8.19995519532586, Accuracy: 0.0017338534893801473
epoch:7 in 20
Epoch 8/20, Loss: 7.892975130388813, Accuracy: 0.0020228290709435053
epoch:8 in 20
Epoch 9/20, Loss: 7.590631430050195, Accuracy: 0.0024562924432885423
epoch:9 in 20
Epoch 10/20, Loss: 7.328610316948956, Accuracy: 0.0015893656985984685
epoch:10 in 20
Epoch 11/20, Loss: 7.09875695057179, Accuracy: 0.004334633723450368
epoch:11 in 20
Epoch 12/20, L