In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!nvcc --version
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Looking in indexes: https://download.pytorch.org/whl/cu118, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import sys
import cv2 # Pour utiliser opencv-python, il faut la version de python est 3.7
import os
import csv

import numpy as np 
import pandas as pd 
import math

import torch 
from torch.utils.data import Dataset, DataLoader
import torchvision 
from torchvision.io import read_image
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim

np.random.seed(0)

In [3]:
# Constant. Should be the path to the folder named JPEGImages, containing the 33K images in its subfolders.
DATA_FOLDER_PATH = '/content/drive/MyDrive/IFT3710/Animals_with_Attributes2/'
JPEGIMAGES_FOLDER_PATH = '/content/drive/MyDrive/IFT3710/Animals_with_Attributes2/JPEGImages/'

In [4]:
def find_num_images_per_label(img_dir = JPEGIMAGES_FOLDER_PATH): #-> tuple[dict,dict]: 
    """ 
    USEFUL FOR SAMPLING.
    Return a dict with keys as the 50 labels, and values being the number of images in each subdirectory corresponding to label
    and a second dict with the relative numbers (proportion) for every label compared to the total number of images (useful for sampling)"""
    labels_dirs = os.listdir(img_dir)
    num_images_per_label = dict.fromkeys(labels_dirs)
    proportions_images_per_label = dict.fromkeys(labels_dirs)
    total_num_images = 0

    # Update absolute number of images per label
    for i, label in enumerate(labels_dirs) : 
        specific_label_path = os.path.join(img_dir, labels_dirs[i])
        num_images_label = len(os.listdir(specific_label_path))
        total_num_images += num_images_label
        num_images_per_label[label] = num_images_label

    # Update relative number of images per label (proportion)
    for i, label in enumerate(labels_dirs) : 
        num_images_label = num_images_per_label[label]
        proportion_label = round(num_images_label / total_num_images, 4)
        proportions_images_per_label[label] = proportion_label

    return num_images_per_label, proportions_images_per_label

num_images_per_label, proportions_images_per_label = find_num_images_per_label()
print(num_images_per_label)
print(proportions_images_per_label)

FileNotFoundError: ignored

In [None]:
ANNOTATIONS_FILENAME = 'annotations.csv'

def create_annotations_csv_file(annotations_filename = ANNOTATIONS_FILENAME, img_dir = JPEGIMAGES_FOLDER_PATH): 
    """ 
    Create a csv annotations_file, annotations.csv, with two columns, in the format : 
                        path/to/image, label
    
    The annotation csv is necessary for DataLoader.
    """
    
    labels_dirs:list = os.listdir(img_dir)
   
    if os.path.exists(annotations_filename):
        os.remove(annotations_filename)
        print(f'Deleted existent {ANNOTATIONS_FILENAME} file.\n ---------------------------')
    
    with open(annotations_filename, 'w', newline='') as file :
        writer = csv.writer(file, dialect='excel', delimiter=',')

        for i, label in enumerate(labels_dirs) : 

            specific_label_path = os.path.join(img_dir, label)
            images_names = os.listdir(specific_label_path)

            for j, image_name in enumerate(images_names):
                full_path_to_img= os.path.join(specific_label_path, image_name)
                full_path_to_img= os.path.join(label, image_name)

                row = [full_path_to_img, label]
                writer.writerow(row)

    print(f'Sucessfully created {ANNOTATIONS_FILENAME} file.')

#
create_annotations_csv_file()

Sucessfully created annotations.csv file.


In [None]:
# labels_in_number = pd.read_csv(DATA_FOLDER_PATH+"classes.txt", delim_whitespace=True,header=None)
labels_dict = {}
with open(DATA_FOLDER_PATH+"classes.txt") as f:
    for line in f:
        # print(line.split())
        (key,val) = line.split()
        labels_dict[val] = int(key)-1
print(labels_dict)

{'antelope': 0, 'grizzly+bear': 1, 'killer+whale': 2, 'beaver': 3, 'dalmatian': 4, 'persian+cat': 5, 'horse': 6, 'german+shepherd': 7, 'blue+whale': 8, 'siamese+cat': 9, 'skunk': 10, 'mole': 11, 'tiger': 12, 'hippopotamus': 13, 'leopard': 14, 'moose': 15, 'spider+monkey': 16, 'humpback+whale': 17, 'elephant': 18, 'gorilla': 19, 'ox': 20, 'fox': 21, 'sheep': 22, 'seal': 23, 'chimpanzee': 24, 'hamster': 25, 'squirrel': 26, 'rhinoceros': 27, 'rabbit': 28, 'bat': 29, 'giraffe': 30, 'wolf': 31, 'chihuahua': 32, 'rat': 33, 'weasel': 34, 'otter': 35, 'buffalo': 36, 'zebra': 37, 'giant+panda': 38, 'deer': 39, 'bobcat': 40, 'pig': 41, 'lion': 42, 'mouse': 43, 'polar+bear': 44, 'collie': 45, 'walrus': 46, 'raccoon': 47, 'cow': 48, 'dolphin': 49}


In [None]:
from torchvision.io import read_image, ImageReadMode
from PIL import Image


class AWA2Dataset(Dataset): # Dataset class to serve as input for the DataLoader.
    """ 
    Dataset class to serve as input for the DataLoader.
    Implements all the required methods and more. 
    """

    def __init__(self, annotations_file=ANNOTATIONS_FILENAME, img_dir=JPEGIMAGES_FOLDER_PATH, 
                transform=None, target_transform=None):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

        numbers_infos_dicts: tuple[dict,dict] = find_num_images_per_label(img_dir=JPEGIMAGES_FOLDER_PATH)
        self.num_images_per_label = numbers_infos_dicts[0]
        self.proportions_images_per_label = numbers_infos_dicts[1]

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        # img_path = self.img_labels.iloc[idx, 0]
        key = self.img_labels.iloc[idx, 1]

        # Mapping the labels from string to tensor
        label = labels_dict[key]

        image = read_image(path = img_path, mode = ImageReadMode.RGB)
        # with open(img_path, 'rb') as f:
        #     image = Image.open(f)
        #     image = image.convert('RGB')  # convert to RGB


        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label


class Subset_(AWA2Dataset) : 
    def __init__(self, dataset, indices, transform=None):
        super().__init__()
        self.dataset = dataset
        self.indices = indices
        self.transform = transform
    
    def __len__(self):
        return len(self.indices)

    def __getitem__(self, index):
        original_index_in_AWA2Dataset = self.indices[index]
        image, label = self.dataset[original_index_in_AWA2Dataset]
        if self.transform is not None:
            image = self.transform(image)
        return image, label
    

In [None]:
# dataset = AWA2Dataset()

# # print(type(dataset))

# random_index = np.random.randint(0, len(dataset))
# image, label = dataset[random_index]

# ## TODO : Change transforms. Currently this is not useful.
# dataset.transform = transforms.Compose([
#                         transforms.ToPILImage(),
#                         transforms.Resize((224, 224)),
#                         # transforms.RandomHorizontalFlip(),
#                         transforms.Grayscale(num_output_channels=3),
#                         transforms.ToTensor(),
#                         transforms.Normalize((0.485, 0.456, 0.406), 
#                                              (0.229, 0.224, 0.225))])

# # print(dataset[0][0].shape)

# train_size =  10
# test_size = 5
# # valid_size = 2

# train_list, list_1 = torch.utils.data.random_split(dataset, [train_size,len(dataset)-train_size])
# test_list, list_2 = torch.utils.data.random_split(list_1, [test_size,len(list_1)-test_size])
# # train_list, valid_list = torch.utils.data.random_split(train_list, [train_size-valid_size,valid_size])

# # print(type(train_list))
# # valid_list, list_ =  torch.utils.data.random_split(list_2, [valid_size,len(list_2)-valid_size])
# # Testing. All good
# # train_list = []
# # test_list = []

# # for i in range(10):
# #     random_index = np.random.randint(0, len(dataset))
# #     train_list.append(dataset[random_index])

# # for i in range(5):
# #     random_index = np.random.randint(0, len(dataset))
# #     test_list.append(dataset[random_index])
    



# from sklearn.model_selection import train_test_split

# train_list, valid_list = train_test_split(train_list, 
#                                           test_size=0.5)

# print(f"Train Data: {len(train_list)}")
# print(f"Test Data: {len(test_list)}")
# print(f"Validation Data: {len(valid_list)}")


In [None]:
transforms_pipeline_train = transforms.Compose([
                    ## Input size
                    transforms.ToPILImage(),
                    transforms.Resize((256,256)),
                    # transforms.Grayscale(num_output_channels=3),
                    
                    ## Data augmentation 
                    transforms.RandomRotation(15),
                    transforms.RandomHorizontalFlip(p=0.4),
                    # transforms.RandomApply(transforms.RandAugment(), p=0.4), # 40% of the time, apply a random additional combo of transformations #https://sebastianraschka.com/blog/2023/data-augmentation-pytorch.html
                    transforms.ColorJitter(brightness=0.2,
                                            contrast=0.2,
                                            saturation=0.2,
                                            hue=0.1),
                    transforms.RandomCrop((224,224)),  # transforms.RandomResizedCrop(size=(224,224), scale=(0.6, 0.9), ratio=(0.5, 1.08,))
                    ## Normalize
                    transforms.ToTensor(), # Already a tensor as implemented in Dataset class with the 
                    transforms.Normalize(mean = [0.4643, 0.4640, 0.3985] , std=[0.2521, 0.2425, 0.2538]) # real mean and std of AwA2
                ])


transforms_pipeline_test = transforms.Compose([
                    ## Input size
                    transforms.ToPILImage(),
                    transforms.Resize((256,256)),
                    # transforms.Grayscale(num_output_channels=3),
                    transforms.CenterCrop((224,224)),   
                    ## Normalize
                    transforms.ToTensor(), # Already a tensor as implemented in Dataset class with the 
                    transforms.Normalize(mean = [0.4643, 0.4640, 0.3985] , std=[0.2521, 0.2425, 0.2538]) # real mean and std of AwA2
                ])




# Initialize dataset and train/valid/test split 
from sklearn.model_selection import train_test_split

dataset = AWA2Dataset()
n_images = len(dataset)
# Split all indices into training/testing sets
train_indices, test_indices = train_test_split(range(n_images), test_size=0.2, random_state=1)
# Split training indices into training/validation sets.
train_indices, valid_indices = train_test_split(train_indices, test_size=0.2, random_state=1)


# Initialize the 3 DataSet objects (as Subset_) and apply the relevant Transforms to each subset (train/test/valid)
train_data = Subset_(dataset, train_indices, transform = transforms_pipeline_train)
valid_data = Subset_(dataset, valid_indices, transform = transforms_pipeline_test)
test_data  = Subset_(dataset, test_indices, transform = transforms_pipeline_test) 

# Initalize DataLoaders
batch_size = 32
train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True)
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)

In [None]:
# train_list.transforms = transforms.Compose(
#     [
#         transforms.Resize((224, 224)),
#         transforms.RandomResizedCrop(224),
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#     ]
# )

# val_transforms = transforms.Compose(
#     [
#         transforms.Resize(256),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#     ]
# )


# test_transforms = transforms.Compose(
#     [
#         transforms.Resize(256),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#     ]
# )
# train_data = AWA2Dataset(dataset)
# valid_data = AWA2Dataset(valid_list, transform=test_transforms)
# test_data = AWA2Dataset(test_list, transform=test_transforms)


# train_data = train_list
# valid_data = train_list
# test_data = train_list

# train_data.transforms = transforms.Compose(
#     [
#         transforms.Resize((224, 224)),
#         transforms.RandomResizedCrop(224),
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#     ]
# )

# valid_data.transforms = transforms.Compose(
#     [
#         transforms.Resize(256),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#     ]
# )


# test_data.transforms = transforms.Compose(
#     [
#         transforms.Resize(256),
#         transforms.CenterCrop(224),
#         transforms.ToTensor(),
#     ]
# )

# print(train_list[1][0].shape)

# train_dataloader = DataLoader(train_data, batch_size=4, shuffle= True)
# test_dataloader = DataLoader(test_data, batch_size=4, shuffle= True)
# valid_dataloader = DataLoader(valid_data, batch_size=4, shuffle= True)

In [None]:
!pip install vit-pytorch
# !pip install Linformer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vit-pytorch
  Downloading vit_pytorch-1.2.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting einops>=0.6.0
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: einops, vit-pytorch
Successfully installed einops-0.6.0 vit-pytorch-1.2.0


In [None]:
import torch.nn as nn
# import timm
from vit_pytorch import ViT
from vit_pytorch import SimpleViT
# from vit_pytorch.efficient import ViT
# from linformer import Linformer

# efficient_transformer = Linformer(
#     dim=128,
#     seq_len=49+1,  # 7x7 patches + 1 cls-token
#     depth=12,
#     heads=8,
#     k=64
# )
# vit_model =  ViT(
#     dim=128,
#     image_size=224,
#     patch_size=32,
#     num_classes=50,
#     transformer=efficient_transformer,
#     channels=3,
# )

vit_model = SimpleViT(
    image_size = 224,
    patch_size = 32,
    num_classes = 50,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1 ### ADDED DROP OUT 
)



vit_model.eval

<bound method Module.eval of ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=32, p2=32)
    (1): LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=3072, out_features=1024, bias=True)
    (3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (transformer): Transformer(
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (attend): Softmax(dim=-1)
            (dropout): Dropout(p=0.0, inplace=False)
            (to_qkv): Linear(in_features=1024, out_features=3072, bias=False)
            (to_out): Sequential(
              (0): Linear(in_features=1024, out_features=1024, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((10

### ViT-ZSL ###

In [None]:
device = torch.device("cuda:0")
device

device(type='cuda', index=0)

In [None]:
torch.cuda.is_available()

True

In [None]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 10546660679508753187
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 13854638080
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1909215606463573021
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [None]:
import sys
print(sys.version)

3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]


In [None]:
vit_model.to(device)

ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=32, p2=32)
    (1): LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=3072, out_features=1024, bias=True)
    (3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (transformer): Transformer(
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (attend): Softmax(dim=-1)
            (dropout): Dropout(p=0.0, inplace=False)
            (to_qkv): Linear(in_features=1024, out_features=3072, bias=False)
            (to_out): Sequential(
              (0): Linear(in_features=1024, out_features=1024, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_

### Question: check dimension de dataloader   ###

### TEST Split training data and test data ###

In [None]:
# batch_size = 8

# train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True )
# valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True)

# dataset = test_train
# dataloader = DataLoader(dataset = test_train, batch_size=batch_size, shuffle=True)
# train_size = int(0.8*len(dataset))
# test_size = len(dataset) - train_size

# train_dataset, test_dataset = torch.utils.data.random_split(dataset,[train_size,test_size])
# train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=False, num_workers=2, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle=False, num_workers=2, pin_memory=True)

###  Split training data and test data ###

In [None]:
# batch_size = 128

In [None]:
# dataloader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle=True)
# train_size = int(0.8*len(dataset))
# test_size = len(dataset) - train_size

In [None]:
# train_dataset, test_dataset = torch.utils.data.random_split(dataset,[train_size,test_size])

In [None]:
# train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=False, num_workers=12, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle=False, num_workers=12, pin_memory=True)

###   transfomer labels  ###

In [None]:
path_class = DATA_FOLDER_PATH +"classes.txt"
class_animal = pd.read_table(path_class,header= None)
# class_animal

In [None]:
animals = class_animal[1]
dict_label_animal = {}
n = 0
for i in range(0,len(animals)):
    dict_label_animal[animals[i]] = n
    n+=1
def label_to_num(tuple_labels):
    list_labels =[]
    for tuple_label in tuple_labels:
        list_labels.append(dict_label_animal[tuple_label])
    return torch.tensor(list_labels) 



###   Loss function  ###

In [None]:
lr = 1e-3

In [None]:
criterion = torch.nn.CrossEntropyLoss()

# vit_pytorch
# optimizer = optim.SGD(vit_model.parameters(), lr= lr, momentum=0.9)

# optimizer = optim.Adam(vit_model.parameters(), lr=lr)

# ViT-ZSL
# optimizer = torch.optim.Adam([{"params": vit_model.layers_dict.vit.parameters(), "lr": 0.00001, "weight_decay": 0.0001},
#                               {"params": vit_model.layers_dict.mlp_g.parameters(), "lr": 0.001, "weight_decay": 0.00001}])

In [None]:
import torch.nn as nn
from vit_pytorch import ViT
from vit_pytorch import SimpleViT

vit_model = ViT(
    image_size = 224,
    patch_size = 32,
    num_classes = 50,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1 ### ADDED DROP OUT RATE
    # emb_dropout = 0.1 ### ADDED Embedding dropout rate 
)
vit_model.to(device)

###  Training  ###

In [None]:
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm


def collate_fn(batch):
   batch = list(filter(lambda x: x is not None, batch))
   return torch.utils.data.dataloader.default_collate(batch) 





batch_size = 64

# train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn) #suggested_max_workers_for_colab_env = 6
# valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn)
# test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn)

train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6) #suggested_max_workers_for_colab_env = 6
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True, num_workers=6)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True, num_workers=6)



# loss function
criterion = nn.CrossEntropyLoss()
# optimizer
optimizer = optim.Adam(vit_model.parameters(), lr=3e-5, weight_decay=1e-5) ### Added weight decay 
# scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

for epoch in range(20):
    epoch_loss = 0
    epoch_accuracy = 0

    for data, label in tqdm(train_loader):
        data = data.to(device)
        label = label.to(device)

        output = vit_model(data)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(dim=1) == label).float().mean()
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss / len(train_loader)

    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in valid_loader:
            data = data.to(device)
            label = label.to(device)

            val_output = vit_model(data)
            val_loss = criterion(val_output, label)

            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc / len(valid_loader)
            epoch_val_loss += val_loss / len(valid_loader)

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 1 - loss : 2.8478 - acc: 0.2455 - val_loss : 2.7508 - val_acc: 0.2615



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 2 - loss : 2.7460 - acc: 0.2700 - val_loss : 2.6805 - val_acc: 0.2887



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 3 - loss : 2.6652 - acc: 0.2877 - val_loss : 2.5692 - val_acc: 0.3111



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 4 - loss : 2.5663 - acc: 0.3075 - val_loss : 2.5200 - val_acc: 0.3215



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 5 - loss : 2.5145 - acc: 0.3203 - val_loss : 2.5082 - val_acc: 0.3247



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 6 - loss : 2.4505 - acc: 0.3342 - val_loss : 2.3837 - val_acc: 0.3556



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 7 - loss : 2.4097 - acc: 0.3441 - val_loss : 2.3538 - val_acc: 0.3648



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 8 - loss : 2.3636 - acc: 0.3541 - val_loss : 2.3309 - val_acc: 0.3693



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 9 - loss : 2.3146 - acc: 0.3647 - val_loss : 2.3349 - val_acc: 0.3720



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 10 - loss : 2.2730 - acc: 0.3757 - val_loss : 2.2564 - val_acc: 0.3838



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 11 - loss : 2.2486 - acc: 0.3815 - val_loss : 2.2430 - val_acc: 0.3863



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 12 - loss : 2.2009 - acc: 0.3912 - val_loss : 2.2180 - val_acc: 0.3938



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 13 - loss : 2.1743 - acc: 0.4012 - val_loss : 2.2164 - val_acc: 0.3904



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 14 - loss : 2.1390 - acc: 0.4062 - val_loss : 2.1780 - val_acc: 0.4047



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 15 - loss : 2.1061 - acc: 0.4121 - val_loss : 2.1271 - val_acc: 0.4186



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 16 - loss : 2.0825 - acc: 0.4202 - val_loss : 2.1240 - val_acc: 0.4194



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 17 - loss : 2.0597 - acc: 0.4257 - val_loss : 2.1320 - val_acc: 0.4206



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 18 - loss : 2.0172 - acc: 0.4358 - val_loss : 2.1083 - val_acc: 0.4195



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 19 - loss : 1.9961 - acc: 0.4403 - val_loss : 2.0828 - val_acc: 0.4355



  0%|          | 0/374 [00:00<?, ?it/s]

Epoch : 20 - loss : 1.9739 - acc: 0.4463 - val_loss : 2.0506 - val_acc: 0.4327



In [None]:
import torch.nn as nn
from vit_pytorch import ViT
from vit_pytorch import SimpleViT

vit_model = ViT(
    image_size = 224,
    patch_size = 32,
    num_classes = 50,
    dim = 1024,
    depth = 12, # NEW 
    heads = 10, # NEW
    mlp_dim = 4096, # NEW
    dropout = 0.1 ### ADDED DROP OUT RATE
    # emb_dropout = 0.1 ### ADDED Embedding dropout rate 
)
vit_model.to(device)

In [None]:
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm


def collate_fn(batch):
   batch = list(filter(lambda x: x is not None, batch))
   return torch.utils.data.dataloader.default_collate(batch) 


batch_size = 128

# train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn) #suggested_max_workers_for_colab_env = 6
# valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn)
# test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True, num_workers=6, collate_fn=collate_fn)

train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6) #suggested_max_workers_for_colab_env = 6
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True, num_workers=6)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True, num_workers=6)



# loss function
criterion = nn.CrossEntropyLoss()
# optimizer
optimizer = optim.Adam(vit_model.parameters(), lr=1e-4, weight_decay=1e-5) ### Added weight decay 
# scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

for epoch in range(20):
    epoch_loss = 0
    epoch_accuracy = 0

    for data, label in tqdm(train_loader):
        data = data.to(device)
        label = label.to(device)

        output = vit_model(data)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(dim=1) == label).float().mean()
        epoch_accuracy += acc / len(train_loader)
        epoch_loss += loss / len(train_loader)

    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in valid_loader:
            data = data.to(device)
            label = label.to(device)

            val_output = vit_model(data)
            val_loss = criterion(val_output, label)

            acc = (val_output.argmax(dim=1) == label).float().mean()
            epoch_val_accuracy += acc / len(valid_loader)
            epoch_val_loss += val_loss / len(valid_loader)

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 1 - loss : 3.4936 - acc: 0.1035 - val_loss : 3.1666 - val_acc: 0.1803



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 2 - loss : 3.0671 - acc: 0.1903 - val_loss : 2.9338 - val_acc: 0.2308



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 3 - loss : 2.8620 - acc: 0.2356 - val_loss : 2.6769 - val_acc: 0.2828



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 4 - loss : 2.7101 - acc: 0.2736 - val_loss : 2.5961 - val_acc: 0.2973



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 5 - loss : 2.5938 - acc: 0.2996 - val_loss : 2.5093 - val_acc: 0.3261



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 6 - loss : 2.4952 - acc: 0.3215 - val_loss : 2.4045 - val_acc: 0.3510



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 7 - loss : 2.4082 - acc: 0.3456 - val_loss : 2.3712 - val_acc: 0.3540



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 8 - loss : 2.3431 - acc: 0.3574 - val_loss : 2.2974 - val_acc: 0.3722



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 9 - loss : 2.2630 - acc: 0.3778 - val_loss : 2.2478 - val_acc: 0.3856



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 10 - loss : 2.1966 - acc: 0.3926 - val_loss : 2.2248 - val_acc: 0.3919



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 11 - loss : 2.1372 - acc: 0.4086 - val_loss : 2.1773 - val_acc: 0.4014



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 12 - loss : 2.0718 - acc: 0.4209 - val_loss : 2.1017 - val_acc: 0.4194



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 13 - loss : 2.0157 - acc: 0.4326 - val_loss : 2.0482 - val_acc: 0.4433



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 14 - loss : 1.9378 - acc: 0.4525 - val_loss : 2.0737 - val_acc: 0.4327



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 15 - loss : 1.8736 - acc: 0.4656 - val_loss : 2.0300 - val_acc: 0.4420



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 16 - loss : 1.8219 - acc: 0.4817 - val_loss : 1.9892 - val_acc: 0.4528



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 17 - loss : 1.7619 - acc: 0.4956 - val_loss : 1.9643 - val_acc: 0.4623



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 18 - loss : 1.6947 - acc: 0.5127 - val_loss : 1.9486 - val_acc: 0.4727



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 19 - loss : 1.6405 - acc: 0.5265 - val_loss : 1.9268 - val_acc: 0.4698



  0%|          | 0/187 [00:00<?, ?it/s]

Epoch : 20 - loss : 1.5800 - acc: 0.5363 - val_loss : 1.8905 - val_acc: 0.4871



In [None]:
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="test",
    
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 1e-3,
#     "architecture": "ViT",
#     "dataset": "AWA2",
#     "epochs": 50,
#     }
# )

# Initalize DataLoaders
batch_size = 64
train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True)
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=False, num_workers=6, pin_memory=True)

vit_model.train()

for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0].to(device)
        # print(inputs)
        labels = data[1].to(device)
        # print(labels)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = vit_model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, i + 1, running_loss ))
        
        # acc
        print('current batch acc', (labels == output.argmax(dim=1)[0]).int().sum() / labels.size(0))
        
        # log metrics to wandb
        # wandb.log({"loss": running_loss, "acc": (labels == output.argmax(dim=1)[0]).int().sum() / labels.size(0)})

        running_loss = 0.0

        


# [optional] finish the wandb run, necessary in notebooks
# wandb.finish()

print('Finished Training of ViT')

[1,     1] loss: 3.146
current batch acc tensor(0.0156, device='cuda:0')
[1,     2] loss: 3.036
current batch acc tensor(0.0312, device='cuda:0')
[1,     3] loss: 3.065
current batch acc tensor(0.0312, device='cuda:0')
[1,     4] loss: 2.908
current batch acc tensor(0.0938, device='cuda:0')
[1,     5] loss: 2.842
current batch acc tensor(0.0312, device='cuda:0')
[1,     6] loss: 3.196
current batch acc tensor(0.0469, device='cuda:0')
[1,     7] loss: 2.814
current batch acc tensor(0., device='cuda:0')
[1,     8] loss: 2.901
current batch acc tensor(0.0469, device='cuda:0')
[1,     9] loss: 3.097
current batch acc tensor(0.0156, device='cuda:0')
[1,    10] loss: 3.056
current batch acc tensor(0.0156, device='cuda:0')
[1,    11] loss: 2.712
current batch acc tensor(0., device='cuda:0')
[1,    12] loss: 3.145
current batch acc tensor(0.0156, device='cuda:0')


KeyboardInterrupt: ignored