In [1]:
!nvidia-smi

Fri Feb 19 07:17:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
#get CIFAR-10 images in jpg format from this repository
!git clone https://github.com/YoongiKim/CIFAR-10-images

Cloning into 'CIFAR-10-images'...
remote: Enumerating objects: 60027, done.[K
remote: Total 60027 (delta 0), reused 0 (delta 0), pack-reused 60027[K
Receiving objects: 100% (60027/60027), 19.94 MiB | 13.90 MiB/s, done.
Resolving deltas: 100% (59990/59990), done.
Checking out files: 100% (60001/60001), done.


In [3]:
import os
from tqdm.auto import tqdm
from PIL import Image

#get labels of data
cifar_dir = '/content/CIFAR-10-images'
label_list = []
for dir in os.listdir(cifar_dir):
    if dir == 'train':
        for label in os.listdir(os.path.join(cifar_dir, dir)):
            label_list.append(label)

#make train and validation folders
for f in ['train', 'val']:
    for label in label_list:
        os.makedirs(f'/content/data/{f}/{label}', exist_ok=True)

#resize images to 224 by 224
for f_in in ['train', 'test']:
    print(f'{f_in} folder')
    for label in tqdm(label_list):
        dir = f'{cifar_dir}/{f_in}/{label}'
        for image in os.listdir(dir):
            path_in = os.path.join(dir, image)
            I = Image.open(path_in).resize((224, 224))
            if f_in == 'train':
                f_out = 'train'
            else:
                f_out = 'val'
            path_out = f'/content/data/{f_out}/{label}/{image}'
            I.save(path_out)

train folder


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


test folder


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [4]:
#calculate the means and standard deviations of train data
import cv2
import numpy as np
from pathlib import Path

path_images = Path('/content/data/train')
num_images = len(list(path_images.glob('**/*.jpg')))
num_channels = 3
num_pixels = 0
channel_sum = np.zeros(num_channels)
channel_square_sum = np.zeros(num_channels)

for image in tqdm(path_images.glob('**/*.jpg'), total=num_images):
    img = cv2.imread(str(image))
    img = img / 255.0
    num_pixels += img.size / num_channels
    channel_sum += np.sum(img, axis=(0, 1))
    channel_square_sum += np.sum(np.square(img), axis=(0, 1))
mean = list(channel_sum / num_pixels)
std = list(np.sqrt(channel_square_sum / num_pixels - np.square(mean)))

mean[::-1], std[::-1]

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




([0.4913320721179005, 0.4819964753151303, 0.4466457658532243],
 [0.2440862706333845, 0.24137420221502995, 0.25950901797186166])

In [7]:
#install timm and clone T2T-ViT repository
!pip install timm
!git clone https://github.com/yitu-opensource/T2T-ViT

fatal: destination path 'T2T-ViT' already exists and is not an empty directory.


In [9]:
#train T2T-ViT on CIFAR-10
import os
os.chdir('/content/T2T-ViT')

PATH_TO_DATA = '/content/data'

!bash distributed_train.sh 1 "$PATH_TO_DATA"\
    --model T2t_vit_14\
    --batch-size 32\
    --num-classes 10\
    --img-size 224\
    --mean 0.491 0.482 0.447\
    --std 0.244 0.241 0.260\
    --lr 1e-4\
    --epochs 40\

Training with a single process on 1 GPUs.
adopt performer encoder for tokens-to-token
Model T2t_vit_14 created, param count: 21164400
Data processing configuration for current model + dataset:
	input_size: (3, 224, 224)
	interpolation: bicubic
	mean: (0.491, 0.482, 0.447)
	std: (0.244, 0.241, 0.26)
	crop_pct: 0.9
AMP not enabled. Training in float32.
Scheduled epochs: 50
Test: [   0/312]  Time: 0.825 (0.825)  Loss:  2.3371 (2.3371)  Acc@1:  0.0000 ( 0.0000)  Acc@5: 56.2500 (56.2500)
Test: [  50/312]  Time: 0.087 (0.112)  Loss:  2.2087 (2.2058)  Acc@1:  9.3750 ( 2.0221)  Acc@5: 71.8750 (70.8333)
Test: [ 100/312]  Time: 0.090 (0.102)  Loss:  2.1602 (2.2009)  Acc@1:  9.3750 ( 5.8787)  Acc@5: 65.6250 (69.0285)
Test: [ 150/312]  Time: 0.091 (0.099)  Loss:  2.2575 (2.2055)  Acc@1:  0.0000 ( 4.6978)  Acc@5: 65.6250 (67.0944)
Test: [ 200/312]  Time: 0.089 (0.097)  Loss:  1.8667 (2.1675)  Acc@1: 62.5000 (10.9919)  Acc@5: 87.5000 (72.3725)
Test: [ 250/312]  Time: 0.087 (0.096)  Loss:  2.1202 (2.