In [1]:
#show the GPU attached to this runtime (model, driver/CUDA version, memory usage)
!nvidia-smi

Sat Feb 20 10:15:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#download Food-101 data
!wget http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz
!gunzip /content/food-101.tar.gz
!tar -xvf /content/food-101.tar

In [3]:
#get classes, and file names of train and validation set, then write resized
#copies of every image to /content/data/{train,val}/<class>/<name>.jpg
import os
from tqdm.notebook import tqdm
from PIL import Image

meta_dir = '/content/food-101/meta'
base_dir = '/content/food-101/images'
data_dir = '/content/data'
image_size = (224, 224)  #(width, height) every image is resized to


def read_lines(path):
    """Return the lines of the text file at `path` with the trailing newline removed."""
    with open(path, 'r') as handle:
        return [line.replace('\n', '') for line in handle]


def prepare_split(images, split):
    """Resize every image of one split and save it under `data_dir`.

    images: entries of the form '<class>/<name>' (no extension), as listed in
        the Food-101 meta files.
    split: name of the output sub-directory, 'train' or 'val'.
    """
    for image in tqdm(images):
        path_in = f'{base_dir}/{image}.jpg'
        path_out = f'{data_dir}/{split}/{image}.jpg'
        #the context manager closes the source file as soon as the copy is written
        with Image.open(path_in) as source:
            source.resize(image_size).save(path_out)


classes = read_lines(f'{meta_dir}/classes.txt')
print(f'num classes {len(classes)}')
train = read_lines(f'{meta_dir}/train.txt')
#the official Food-101 test list is used as the validation set
val = read_lines(f'{meta_dir}/test.txt')

#prepare train and validation data
#one output directory per split and class, created up front
for split in ['train', 'val']:
    for c in classes:
        os.makedirs(f'{data_dir}/{split}/{c}', exist_ok=True)

print('preparing train data')
prepare_split(train, 'train')
print('preparing validation data')
prepare_split(val, 'val')

num classes 101
preparing train data


HBox(children=(FloatProgress(value=0.0, max=75750.0), HTML(value='')))


preparing validation data


HBox(children=(FloatProgress(value=0.0, max=25250.0), HTML(value='')))




In [4]:
#calculate the per-channel means and standard deviations of the train data
#(pixel values scaled to [0, 1]); the results feed --mean / --std of the training script
import cv2
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

path_images = Path('/content/data/train')
#glob once and reuse the list for both the count and the loop
image_paths = list(path_images.glob('**/*.jpg'))
num_images = len(image_paths)
if num_images == 0:
    raise FileNotFoundError(f'no .jpg files found under {path_images}')

num_channels = 3
num_pixels = 0
channel_sum = np.zeros(num_channels)
channel_square_sum = np.zeros(num_channels)

for image in tqdm(image_paths):
    img = cv2.imread(str(image))
    #cv2.imread signals an unreadable file by returning None instead of raising
    if img is None:
        raise IOError(f'could not read image {image}')
    img = img / 255.0
    num_pixels += img.size / num_channels
    #accumulate sum and sum of squares per channel so the statistics need one pass
    channel_sum += np.sum(img, axis=(0, 1))
    channel_square_sum += np.sum(np.square(img), axis=(0, 1))

#Var[x] = E[x^2] - E[x]^2, evaluated per channel
mean = list(channel_sum / num_pixels)
std = list(np.sqrt(channel_square_sum / num_pixels - np.square(mean)))

#cv2 loads channels as BGR; reverse so the displayed values are in RGB order
mean[::-1], std[::-1]

HBox(children=(FloatProgress(value=0.0, max=75750.0), HTML(value='')))




([0.5447903194476672, 0.44346403124059985, 0.3440857279029884],
 [0.2707449881924057, 0.2728363337042646, 0.2766531068472587])

In [5]:
#install timm and clone T2T-ViT repository
!pip install timm
!git clone https://github.com/yitu-opensource/T2T-ViT

Collecting timm
[?25l  Downloading https://files.pythonhosted.org/packages/22/c6/ba02d533cec7329323c7d7a317ab49f673846ecef202d4cc40988b6b7786/timm-0.3.4-py3-none-any.whl (244kB)
[K     |█▍                              | 10kB 24.1MB/s eta 0:00:01[K     |██▊                             | 20kB 16.1MB/s eta 0:00:01[K     |████                            | 30kB 13.9MB/s eta 0:00:01[K     |█████▍                          | 40kB 12.8MB/s eta 0:00:01[K     |██████▊                         | 51kB 9.0MB/s eta 0:00:01[K     |████████                        | 61kB 9.6MB/s eta 0:00:01[K     |█████████▍                      | 71kB 9.6MB/s eta 0:00:01[K     |██████████▊                     | 81kB 10.7MB/s eta 0:00:01[K     |████████████                    | 92kB 9.8MB/s eta 0:00:01[K     |█████████████▍                  | 102kB 8.6MB/s eta 0:00:01[K     |██████████████▊                 | 112kB 8.6MB/s eta 0:00:01[K     |████████████████                | 122kB 8.6MB/s eta 0:00

In [6]:
#train T2T-ViT on Food-101
#I modified the line 43 in the code of token_performer.py as follows
#before : return torch.exp(wtx - xd) / math.sqrt(self.m)
#after : return torch.exp((wtx - xd) - torch.max((wtx - xd), dim=-1, keepdim=True).values + self.epsilon) / math.sqrt(self.m)
import os
os.chdir('/content/T2T-ViT')

PATH_TO_DATA = '/content/data'

!bash distributed_train.sh 1 "$PATH_TO_DATA"\
    --model T2t_vit_14\
    --batch-size 32\
    --num-classes 101\
    --img-size 224\
    --mean 0.545 0.443 0.344\
    --std 0.271 0.273 0.277\
    --lr 1e-4\
    --epochs 10\

Training with a single process on 1 GPUs.
adopt performer encoder for tokens-to-token
Model T2t_vit_14 created, param count: 21199435
Data processing configuration for current model + dataset:
	input_size: (3, 224, 224)
	interpolation: bicubic
	mean: (0.545, 0.443, 0.344)
	std: (0.271, 0.273, 0.277)
	crop_pct: 0.9
Using native Torch AMP. Training in mixed precision.
Scheduled epochs: 20
Test: [   0/789]  Time: 1.020 (1.020)  Loss:  4.7305 (4.7305)  Acc@1:  0.0000 ( 0.0000)  Acc@5:  0.0000 ( 0.0000)
Test: [  50/789]  Time: 0.095 (0.123)  Loss:  4.5391 (4.6152)  Acc@1:  0.0000 ( 0.0000)  Acc@5:  9.3750 ( 0.9191)
Test: [ 100/789]  Time: 0.101 (0.112)  Loss:  4.5234 (4.5856)  Acc@1: 40.6250 ( 4.0532)  Acc@5: 56.2500 ( 8.1993)
Test: [ 150/789]  Time: 0.105 (0.108)  Loss:  4.5898 (4.5867)  Acc@1:  0.0000 ( 2.8560)  Acc@5:  3.1250 ( 9.3129)
Test: [ 200/789]  Time: 0.103 (0.106)  Loss:  4.4883 (4.5869)  Acc@1:  0.0000 ( 2.3010)  Acc@5: 53.1250 (11.1007)
Test: [ 250/789]  Time: 0.099 (0.105)  L