In [1]:
from __future__ import print_function

import glob
from itertools import chain
import os
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from linformer import Linformer
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm

from vit_pytorch.efficient import ViT

In [2]:
print(f"Torch: {torch.__version__}")

Torch: 1.10.0+cu113


In [3]:
# Training settings
batch_size = 64
epochs = 20
lr = 3e-5
gamma = 0.7
seed = 42

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed)

In [5]:
device = 'cuda'

Load data

In [6]:
train_dir = 'data/train'
test_dir = 'data/test'

In [7]:
with zipfile.ZipFile('train.zip') as train_zip:
    train_zip.extractall('data')
    
with zipfile.ZipFile('test.zip') as test_zip:
    test_zip.extractall('data')

In [8]:
train_list = glob.glob(os.path.join(train_dir,'*.jpg'))
test_list = glob.glob(os.path.join(test_dir, '*.jpg'))

In [9]:
print(f"Train Data: {len(train_list)}")
print(f"Test Data: {len(test_list)}")

Train Data: 25000
Test Data: 12500


In [10]:
train_list, valid_list = train_test_split(train_list, 
                                          test_size=0.2,
                                          random_state=seed)

In [11]:
print(f"Train Data: {len(train_list)}")
print(f"Validation Data: {len(valid_list)}")
print(f"Test Data: {len(test_list)}")

Train Data: 20000
Validation Data: 5000
Test Data: 12500


Image Augumentation

In [12]:
train_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)

val_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)


test_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)

Load Datasets

In [13]:
class CatsDogsDataset(Dataset):
    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        self.filelength = len(self.file_list)
        return self.filelength

    def __getitem__(self, idx):
        img_path = self.file_list[idx]
        img = Image.open(img_path)
        img_transformed = self.transform(img)

        return img_transformed 

In [14]:
train_data = CatsDogsDataset(train_list, transform=train_transforms)
valid_data = CatsDogsDataset(valid_list, transform=test_transforms)
test_data = CatsDogsDataset(test_list, transform=test_transforms)

In [15]:
train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True )
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True)

In [16]:
print(len(train_data), len(train_loader))

20000 313


In [17]:
print(len(valid_data), len(valid_loader))

5000 79


Visual transformer

In [18]:
import torch
from vit_pytorch import ViT, MAE

model = ViT(
    image_size = 224,
    patch_size = 32,
    num_classes = 2,
    dim = 128,
    depth = 6,
    heads = 8,
    mlp_dim = 256,
    channels=3,
)

mae = MAE(
    encoder = model,
    masking_ratio = 0.75,   # the paper recommended 75% masked patches
    decoder_dim = 512,      # paper showed good results with just 512
    decoder_depth = 6       # anywhere from 1 to 8
).to(device)

In [19]:
for epoch in range(epochs):
    epoch_loss = 0
    epoch_accuracy = 0

    for data in tqdm(train_loader):
        data = data.to(device)
        
        loss = mae(data)
        loss.backward()
        
        epoch_loss += loss / len(train_loader)

    with torch.no_grad():
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data in valid_loader:
            data = data.to(device)

            val_loss = mae(data)
            
            epoch_val_loss += val_loss / len(valid_loader)

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - val_loss : {epoch_val_loss:.4f} \n"
    )

  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 1 - loss : 1.0774 - val_loss : 1.0772 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 2 - loss : 1.0775 - val_loss : 1.0778 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 3 - loss : 1.0773 - val_loss : 1.0775 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 4 - loss : 1.0776 - val_loss : 1.0785 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 5 - loss : 1.0769 - val_loss : 1.0787 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 6 - loss : 1.0776 - val_loss : 1.0777 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 7 - loss : 1.0775 - val_loss : 1.0777 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 8 - loss : 1.0777 - val_loss : 1.0780 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 9 - loss : 1.0773 - val_loss : 1.0783 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 10 - loss : 1.0776 - val_loss : 1.0776 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 11 - loss : 1.0789 - val_loss : 1.0778 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 12 - loss : 1.0782 - val_loss : 1.0780 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 13 - loss : 1.0785 - val_loss : 1.0772 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 14 - loss : 1.0776 - val_loss : 1.0772 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 15 - loss : 1.0773 - val_loss : 1.0770 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 16 - loss : 1.0769 - val_loss : 1.0775 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 17 - loss : 1.0777 - val_loss : 1.0773 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 18 - loss : 1.0788 - val_loss : 1.0773 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 19 - loss : 1.0771 - val_loss : 1.0769 



  0%|          | 0/313 [00:00<?, ?it/s]

Epoch : 20 - loss : 1.0778 - val_loss : 1.0778 



In [25]:
model.eval()
    
with torch.no_grad():
    for x in test_loader:
        x = x.to(device)

        scores = model(x)
        print(scores)
        _, predictions = scores.max(1)
        print(predictions)
        break


model.train()

tensor([[-0.0737, -0.5167],
        [-0.0925, -0.5394],
        [-0.0754, -0.5350],
        [-0.0374, -0.5253],
        [-0.0780, -0.5283],
        [-0.0675, -0.5199],
        [-0.1139, -0.5358],
        [-0.0761, -0.5262],
        [-0.1062, -0.5210],
        [-0.0661, -0.5136],
        [-0.0783, -0.5236],
        [-0.0785, -0.5191],
        [-0.0887, -0.5377],
        [-0.0870, -0.5290],
        [-0.0481, -0.5116],
        [-0.0812, -0.5424],
        [-0.0784, -0.5312],
        [-0.0627, -0.5272],
        [-0.0966, -0.5441],
        [-0.0731, -0.5670],
        [-0.0531, -0.5081],
        [-0.0689, -0.5207],
        [-0.1218, -0.5489],
        [-0.0805, -0.4975],
        [-0.0628, -0.5165],
        [-0.0499, -0.5219],
        [-0.0560, -0.5188],
        [-0.0735, -0.5307],
        [-0.0978, -0.5534],
        [-0.0588, -0.5061],
        [-0.0702, -0.5275],
        [-0.1125, -0.5469],
        [-0.0839, -0.5373],
        [-0.0943, -0.5242],
        [-0.0807, -0.5191],
        [-0.0880, -0

ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=32, p2=32)
    (1): Linear(in_features=3072, out_features=128, bias=True)
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (transformer): Transformer(
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (attend): Softmax(dim=-1)
            (to_qkv): Linear(in_features=128, out_features=1536, bias=False)
            (to_out): Sequential(
              (0): Linear(in_features=512, out_features=128, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (fn): FeedForward(
            (net): Sequential(
              (0): Linear(in_features=128, out_features=256, bias=True)
              (1): GELU()
              (2