## Run the 10 cells below only if you are running this notebook in Colab

In [1]:
# !rm -r test

In [2]:
# !rm -r train

In [3]:
# !rm dog-breed-identification.zip

In [4]:
# !rm labels.csv

In [5]:
# !rm sample_submission.csv

In [6]:
# ! mkdir ~/.kaggle 

In [7]:
# ! cp kaggle.json ~/.kaggle/

In [8]:
# ! chmod 600 ~/.kaggle/kaggle.json

In [9]:
# !kaggle competitions download -c dog-breed-identification

In [10]:
# ! unzip -qq dog-breed-identification

# Importing libraries

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import  Dataset
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from PIL import Image
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

# Data preprocessing

In [13]:
# Root directory of the extracted Kaggle files; later cells read labels.csv and
# the train/ and test/ image folders relative to it.
data_folder="Data"

In [14]:
# Image folders inside the data directory.
# NOTE(review): both datasets built further down read images from
# train_data_path; test_data_path is not referenced by the cells shown here.
train_data_path=f'{data_folder}/train'
test_data_path=f'{data_folder}/test'

In [15]:
# Read the label table and derive each image's file name from its id.
df = pd.read_csv(f'{data_folder}/labels.csv').assign(
    file_name=lambda frame: frame['id'] + '.jpg'
)

# Turn the breed names into integer class ids for the classifier.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['breed'])

In [16]:
# Class names indexed by their encoded id.  The fitted LabelEncoder keeps its
# classes in `classes_`, in the same order as the integers it wrote to
# `label_encoded`, so position i here is the breed whose target id is i.
# (`df['breed'].unique()` yields order of first appearance instead, which would
# pair the model's id2label/label2id entries with the wrong breeds.)
labels = le.classes_

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; the fixed random_state keeps the split
# identical from run to run.
df_train, df_test = train_test_split(
    df[['file_name', 'label_encoded']],
    test_size=0.20,
    random_state=42,
)

In [18]:
# Drop the row labels inherited from the full table and renumber each split from 0.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
# Print the (rows, columns) shape of each split, then display the first three hold-out rows.
print(df_train.shape,df_test.shape)
df_test.head(3)

(8177, 2) (2045, 2)


Unnamed: 0,file_name,label_encoded
0,475171ec5847e4df41847e295b8cfca2.jpg,111
1,70c412a3ce894c617cbe707f24adbb82.jpg,56
2,c13427fc3db9194dac02b1721bd10fe4.jpg,97


# Fine tuning pretrained model / Training

In [20]:
# !pip install -q transformers datasets pytorch-lightning torch_xla   #Run only once

In [21]:
# Training hyperparameters.
# EPOCHS: intended number of passes over the training data.
EPOCHS = 3
# BATCH_SIZE: samples per training batch; the data loaders below are sized from it.
BATCH_SIZE = 32
# NOTE(review): ViTLightningModule.configure_optimizers sets its own learning
# rate (5e-5) rather than reading LEARNING_RATE — confirm which value is intended.
LEARNING_RATE = 2e-5

In [22]:
# Image preprocessing built with torchvision.transforms.
# - The normalisation statistics and the target size are read from the
#   pretrained ViT image processor.
# - Training pipeline: RandomResizedCrop and RandomHorizontalFlip, followed by
#   ToTensor and Normalize.
# - Evaluation pipeline: Resize and CenterCrop (no random operations), followed
#   by ToTensor and Normalize.

from transformers import ViTImageProcessor
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

# Shared final step of both pipelines.
normalize = Normalize(mean=image_mean, std=image_std)

train_transform = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize])

test_transform = Compose([Resize(size), CenterCrop(size), ToTensor(), normalize])

In [23]:
# Custom dataset class for PyTorch.
# It subclasses Dataset and implements __len__ and __getitem__ so that it can be
# handed to a PyTorch DataLoader, which then serves the samples in batches.


class CustomDataLoader(Dataset):
    """Map-style dataset that pairs image files with integer labels.

    Despite its name this is a ``Dataset``, not a ``DataLoader``.

    Args:
        dataset: pandas DataFrame read positionally: column 0 holds the image
            file name and column 1 holds the label.
        img_dir: directory that the file names are joined onto.
        transform: optional callable applied to the loaded PIL image.  When it
            is ``None`` the RGB PIL image is returned unchanged.
    """

    def __init__(self, dataset, img_dir, transform=None):
        self.img_labels = dataset
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the label table."""
        return len(self.img_labels)

    def _load_image(self, img_path):
        """Read the file at ``img_path`` and return it as an RGB PIL image."""
        # The context manager closes the file handle; convert() returns a fully
        # loaded copy, and forcing RGB keeps grayscale/RGBA files compatible
        # with a three-channel normalisation step.
        with Image.open(img_path) as img:
            return img.convert("RGB")

    def __getitem__(self, idx):
        """Return ``{'pixel_values': image, 'labels': label}`` for row ``idx``."""
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        label = self.img_labels.iloc[idx, 1]

        image = self._load_image(img_path)
        # transform defaults to None, so only call it when one was supplied.
        if self.transform is not None:
            image = self.transform(image)

        return {'pixel_values': image, 'labels': label}

In [24]:
# Both datasets read their images from the train folder: the hold-out rows in
# df_test were split off the labelled training table.
train_dataset = CustomDataLoader(dataset=df_train, img_dir=train_data_path, transform=train_transform)
test_dataset = CustomDataLoader(dataset=df_test, img_dir=train_data_path, transform=test_transform)

# shuffle=True draws a new sample order every epoch; without it the model sees
# the batches in exactly the same order on every pass.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation keeps a fixed order and uses a batch 16 samples smaller than training.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE-16)

In [25]:
# Fetch one training batch and print the shape of every tensor it contains.
batch = next(iter(train_loader))
for name, value in batch.items():
    if not isinstance(value, torch.Tensor):
        continue
    print(name, value.shape)

pixel_values torch.Size([32, 3, 224, 224])
labels torch.Size([32])


In [26]:
# Sanity checks on the batch fetched above: a full batch of 3-channel 224x224
# images and one label per image.
assert batch['pixel_values'].shape == (BATCH_SIZE, 3, 224, 224)
assert batch['labels'].shape == (BATCH_SIZE,)

In [28]:
# Display the image-tensor shape of the first batch from each loader (evaluation, training).
next(iter(test_loader))['pixel_values'].shape,next(iter(train_loader))['pixel_values'].shape

(torch.Size([16, 3, 224, 224]), torch.Size([32, 3, 224, 224]))

In [29]:
# Custom PyTorch Lightning model class.

import pytorch_lightning as pl
from transformers import ViTForImageClassification, AdamW
import torch.nn as nn

class ViTLightningModule(pl.LightningModule):
    """LightningModule that fine-tunes ``google/vit-base-patch16-224`` as a breed classifier.

    The classification head is sized from the module-level ``labels`` array,
    which also provides the ``id2label`` / ``label2id`` mappings stored in the
    model config.  Batches are dicts with ``'pixel_values'`` and ``'labels'``.

    Args:
        lr: learning rate passed to the optimizer.  Defaults to 5e-5, the value
            that was previously hard-coded in ``configure_optimizers``.
    """

    def __init__(self, lr=5e-5):
        super().__init__()
        self.lr = lr
        self.vit = ViTForImageClassification.from_pretrained(
            'google/vit-base-patch16-224',
            num_labels=len(labels),
            id2label={i: c for i, c in enumerate(labels)},
            label2id={c: i for i, c in enumerate(labels)},
            # The checkpoint's head has a different number of classes, so let
            # the classifier weights be re-initialised instead of raising.
            ignore_mismatched_sizes=True,
        )

    def forward(self, pixel_values):
        """Return the classification logits for a batch of images."""
        outputs = self.vit(pixel_values=pixel_values)
        return outputs.logits

    def common_step(self, batch, batch_idx):
        """Compute ``(loss, accuracy)`` for one batch.

        Returns:
            loss: cross-entropy loss tensor (mean over the batch).
            accuracy: fraction of correct predictions in the batch, as a float.
        """
        pixel_values = batch['pixel_values']
        targets = batch['labels']
        logits = self(pixel_values)

        # Functional form: no loss module has to be rebuilt on every step.
        loss = F.cross_entropy(logits, targets)
        correct = (logits.argmax(-1) == targets).sum().item()
        accuracy = correct / pixel_values.shape[0]

        return loss, accuracy

    def training_step(self, batch, batch_idx):
        """Run one optimisation step and log the training metrics."""
        loss, accuracy = self.common_step(batch, batch_idx)
        # Logged with Lightning's default settings for training_step.
        self.log("training_loss", loss)
        self.log("training_accuracy", accuracy)

        return loss

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch and log epoch-level metrics."""
        loss, accuracy = self.common_step(batch, batch_idx)
        # "validation_loss" is the key that early stopping monitors.
        self.log("validation_loss", loss, on_epoch=True)
        self.log("validation_accuracy", accuracy, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch and log its metrics so ``trainer.test`` can report them."""
        loss, accuracy = self.common_step(batch, batch_idx)
        self.log("test_loss", loss, on_epoch=True)
        self.log("test_accuracy", accuracy, on_epoch=True)

        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters, using ``self.lr``."""
        # torch.optim.AdamW replaces the deprecated transformers.AdamW.  eps and
        # weight_decay are pinned to the values the transformers class used by
        # default, so the optimisation settings stay the same.
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=1e-6, weight_decay=0.0)

    def train_dataloader(self):
        """Return the module-level training loader."""
        return train_loader

    def val_dataloader(self):
        """Return the hold-out loader, which the notebook also passes to ``trainer.fit`` for validation."""
        return test_loader

    def test_dataloader(self):
        """Return the module-level hold-out loader."""
        return test_loader

#### Training in google colab

In [None]:
## Using Trainer to train model on our dataset

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
# The monitored key must match the name logged in
# ViTLightningModule.validation_step ("validation_loss").  strict=True makes a
# missing metric an error instead of silently disabling early stopping.
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=True,
    verbose=False,
    mode='min'
)

model = ViTLightningModule()
# The hold-out loader doubles as the validation set during fitting.
trainer = Trainer(accelerator="auto", callbacks=[early_stop_callback], max_epochs=EPOCHS)
trainer.fit(model, train_loader, test_loader)

In [None]:
# Run the Lightning test loop for the fitted model over the hold-out loader.
trainer.test(model,test_loader)

#### Saving trained model in google drive

In [None]:
# torch.save(model.state_dict(), '/content/drive/MyDrive/Notes/Dog-breed-Identification/Dogbreed_PreTrained_Pytorch_Lightning.pth')

# Testing trained model
## This model was trained on Google Colab; the trained weights are loaded and used here

In [34]:
# Build a fresh module (pretrained backbone, newly initialised classifier head)
# and place it on the same device the evaluation cells send their batches to;
# otherwise the model would stay on the CPU while the inputs go to the GPU.
model = ViTLightningModule()
model.to(device)
model.eval()
# Presumably here so the cell's last expression is not the module repr.
print()

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([120, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([120]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





In [35]:
### Testing model performance before loading model_state_dict

# The inputs are moved to `device`, so the model has to live there too.
model.to(device)
model.eval()

correct=0
total=0
# Inference only: no_grad stops autograd from recording the forward passes,
# which saves memory and time.
with torch.no_grad():
    for test_batch in tqdm(test_loader,total=len(test_loader)):
        pixel_values=test_batch['pixel_values'].to(device)
        targets=test_batch['labels'].to(device)
        logits=model(pixel_values)
        predicted=logits.argmax(dim=1)
        correct += (predicted == targets).sum().item()
        total += pixel_values.shape[0]
print(total,correct)
print("Accuracy is:",round((correct/total)*100,2),"%")

100%|█████████████████████████████████████████| 128/128 [06:58<00:00,  3.27s/it]

2045 11
Accuracy is: 0.54 %





In [36]:
### Testing model performance after loading model_state_dict

# map_location remaps the stored tensors onto the device in use here, so a
# checkpoint saved on a GPU machine also loads on a CPU-only one.
model.load_state_dict(
    torch.load('Dogbreed_PreTrained_Pytorch_Lightning.pth', map_location=device)
)
model.to(device)
model.eval()

correct=0
total=0
# Inference only: no_grad stops autograd from recording the forward passes,
# which saves memory and time.
with torch.no_grad():
    for test_batch in tqdm(test_loader,total=len(test_loader)):
        pixel_values=test_batch['pixel_values'].to(device)
        targets=test_batch['labels'].to(device)
        logits=model(pixel_values)
        predicted=logits.argmax(dim=1)
        correct += (predicted == targets).sum().item()
        total += pixel_values.shape[0]
print(total,correct)
print("Accuracy is:",round((correct/total)*100,2),"%")

100%|█████████████████████████████████████████| 128/128 [06:30<00:00,  3.05s/it]

2045 1758
Accuracy is: 85.97 %





# Now we can use an API for further development