# Fine tuning Vision Transformer (ViT)

In [1]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from transformers import AutoImageProcessor
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from transformers import DefaultDataCollator
from torch.utils.data import DataLoader
import numpy as np


training_data = datasets.Flowers102(
    root="data",
    split = "train",
    download=True,
    #transform=ToTensor()
)

val_data = datasets.Flowers102(
    root="data",
    split = "val",
    download=True,
    #transform=ToTensor()
)

test_data = datasets.Flowers102(
    root="data",
    split = "test",
    download=True,
    #transform=ToTensor()
)


class Flowers102Transformed(Dataset):
    def __init__(self, original_dataset, transforms):
        self.original_dataset = original_dataset
        self.transforms = transforms

    def __len__(self):
        return len(self.original_dataset)

    def __getitem__(self, idx):
        img, label = self.original_dataset[idx]
        pixel_values = self.transforms(img.convert("RGB"))
        return {'pixel_values': pixel_values, 'labels': label}



# Define the checkpoint
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Define the transformations
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])


train_data = Flowers102Transformed(training_data, _transforms)
val_data = Flowers102Transformed(val_data, _transforms)
test_data = Flowers102Transformed(test_data, _transforms)

# Data collator
data_collator = DefaultDataCollator()

from transformers import AutoImageProcessor
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from transformers import DefaultDataCollator
import evaluate
import numpy as np


accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

2023-08-14 18:49:04.802760: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print("training_data_size:", len(training_data))
print("val_data_size:", len(val_data))
print("test_data_size:", len(test_data))

training_data, test_data = test_data, training_data

print("training_data_size:", len(training_data))
print("val_data_size:", len(val_data))
print("test_data_size:", len(test_data))

training_data_size: 1020
val_data_size: 1020
test_data_size: 6149
training_data_size: 6149
val_data_size: 1020
test_data_size: 1020


In [3]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels= 102 #len(labels),
    #id2label=id2label,
    #label2id=label2id,
)


training_args = TrainingArguments(
    output_dir="vit_fulldataset",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8, # memory error with 16
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/192 [00:00<?, ?it/s]

{'loss': 4.6284, 'learning_rate': 2.5e-05, 'epoch': 0.31}
{'loss': 4.5942, 'learning_rate': 5e-05, 'epoch': 0.62}
{'loss': 4.5291, 'learning_rate': 4.709302325581396e-05, 'epoch': 0.94}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 4.447311878204346, 'eval_accuracy': 0.27450980392156865, 'eval_runtime': 21.4513, 'eval_samples_per_second': 47.55, 'eval_steps_per_second': 5.967, 'epoch': 1.0}
{'loss': 4.4148, 'learning_rate': 4.418604651162791e-05, 'epoch': 1.25}
{'loss': 4.3267, 'learning_rate': 4.127906976744187e-05, 'epoch': 1.56}
{'loss': 4.2447, 'learning_rate': 3.837209302325582e-05, 'epoch': 1.88}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 4.18092679977417, 'eval_accuracy': 0.7598039215686274, 'eval_runtime': 21.9573, 'eval_samples_per_second': 46.454, 'eval_steps_per_second': 5.829, 'epoch': 2.0}
{'loss': 4.1564, 'learning_rate': 3.5465116279069774e-05, 'epoch': 2.19}
{'loss': 4.0352, 'learning_rate': 3.2558139534883724e-05, 'epoch': 2.5}
{'loss': 3.9991, 'learning_rate': 2.9651162790697678e-05, 'epoch': 2.81}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 3.975513219833374, 'eval_accuracy': 0.884313725490196, 'eval_runtime': 22.2832, 'eval_samples_per_second': 45.774, 'eval_steps_per_second': 5.744, 'epoch': 3.0}
{'loss': 3.9439, 'learning_rate': 2.674418604651163e-05, 'epoch': 3.12}
{'loss': 3.8316, 'learning_rate': 2.3837209302325582e-05, 'epoch': 3.44}
{'loss': 3.8242, 'learning_rate': 2.0930232558139536e-05, 'epoch': 3.75}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 3.8248324394226074, 'eval_accuracy': 0.9303921568627451, 'eval_runtime': 21.9121, 'eval_samples_per_second': 46.55, 'eval_steps_per_second': 5.842, 'epoch': 4.0}
{'loss': 3.7498, 'learning_rate': 1.802325581395349e-05, 'epoch': 4.06}
{'loss': 3.702, 'learning_rate': 1.5116279069767441e-05, 'epoch': 4.38}
{'loss': 3.6716, 'learning_rate': 1.2209302325581395e-05, 'epoch': 4.69}
{'loss': 3.6785, 'learning_rate': 9.302325581395349e-06, 'epoch': 5.0}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 3.7415974140167236, 'eval_accuracy': 0.9323529411764706, 'eval_runtime': 22.0515, 'eval_samples_per_second': 46.255, 'eval_steps_per_second': 5.805, 'epoch': 5.0}
{'loss': 3.613, 'learning_rate': 6.395348837209303e-06, 'epoch': 5.31}
{'loss': 3.6073, 'learning_rate': 3.488372093023256e-06, 'epoch': 5.62}
{'loss': 3.5874, 'learning_rate': 5.813953488372093e-07, 'epoch': 5.94}


  0%|          | 0/128 [00:00<?, ?it/s]

{'eval_loss': 3.7175724506378174, 'eval_accuracy': 0.9303921568627451, 'eval_runtime': 21.8259, 'eval_samples_per_second': 46.733, 'eval_steps_per_second': 5.865, 'epoch': 6.0}
{'train_runtime': 468.7241, 'train_samples_per_second': 13.057, 'train_steps_per_second': 0.41, 'train_loss': 4.003167939682801, 'epoch': 6.0}


TrainOutput(global_step=192, training_loss=4.003167939682801, metrics={'train_runtime': 468.7241, 'train_samples_per_second': 13.057, 'train_steps_per_second': 0.41, 'train_loss': 4.003167939682801, 'epoch': 6.0})