In [1]:
import os
import numpy as np
import random
import shutil
from PIL import Image

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from transformers import ViTForImageClassification, pipeline, ViTFeatureExtractor, Trainer, TrainingArguments
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

- load dataset 

In [2]:
raw_data_dir = 'data/raw'       
processed_data_dir = 'data/processed'

train_ratio = 0.8
val_ratio = 0.1 # and test ratio

In [None]:
for category in ['bird', 'cat', 'dog']:
    category_path = os.path.join(raw_data_dir, category)
    files = os.listdir(category_path) #ls
    random.shuffle(files)  

    total_files = len(files)
    train_count = int(total_files * train_ratio)
    val_count = int(total_files * val_ratio)

    train_files = files[:train_count]
    val_files = files[train_count:train_count + val_count]
    test_files = files[train_count + val_count:]

    for file in train_files:
        shutil.move(os.path.join(category_path, file), os.path.join(processed_data_dir, 'train', category, file))
    for file in val_files:
        shutil.move(os.path.join(category_path, file), os.path.join(processed_data_dir, 'validation', category, file))
    for file in test_files:
        shutil.move(os.path.join(category_path, file), os.path.join(processed_data_dir, 'test', category, file))

- prepare dataset

In [3]:
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None):
        self.dataset = datasets.ImageFolder(root=root_dir)
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, label = self.dataset[idx]
        
        if self.transform:
            image = self.transform(image)

        return {
            'pixel_values': image,  
            'labels': torch.tensor(label, dtype=torch.long) 
        }


In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),           
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])

In [5]:
train_dataset = CustomImageDataset(root_dir=os.path.join(processed_data_dir, 'train'), transform=transform)
val_dataset = CustomImageDataset(root_dir=os.path.join(processed_data_dir, 'validation'), transform=transform)
test_dataset = CustomImageDataset(root_dir=os.path.join(processed_data_dir, 'test'), transform=transform)

In [7]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [8]:
print(f'Training batches: {len(train_loader)}')
print(f'Validation batches: {len(val_loader)}')
print(f'Test batches: {len(test_loader)}')

Training batches: 334
Validation batches: 42
Test batches: 42


In [10]:
# Example: Print the number of images in each dataset
print(f'Training images: {len(train_dataset)}')
print(f'Validation images: {len(val_dataset)}')
print(f'Test images: {len(test_dataset)}')

Training images: 10675
Validation images: 1333
Test images: 1336


- evaluation function

In [11]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy}

- testing model before fine tuning

In [12]:
image_paths = [
    "./data/processed/test/bird/0.jpg",
    "./data/processed/test/bird/15.jpg",
    "./data/processed/test/bird/41.jpg"
]

# Initialize the image classification pipeline
classifier = pipeline(task="image-classification", model="akahana/vit-base-cats-vs-dogs", device=0)

# Loop through each image path
for img_path in image_paths:
    # Load and preprocess the image
    image = Image.open(img_path).convert("RGB")
    # Run inference
    predictions = classifier(image)
    
    # Print out the image path and predicted label
    print(f"{img_path} - Predicted Label: {predictions[0]['label']} with confidence {predictions[0]['score']:.2f}")

./data/processed/test/bird/0.jpg - Predicted Label: dog with confidence 0.94
./data/processed/test/bird/15.jpg - Predicted Label: dog with confidence 0.92
./data/processed/test/bird/41.jpg - Predicted Label: dog with confidence 0.91


# Model

- load  model

In [13]:
feature_extractor = ViTFeatureExtractor.from_pretrained("akahana/vit-base-cats-vs-dogs")
model = ViTForImageClassification.from_pretrained("akahana/vit-base-cats-vs-dogs", num_labels=3,ignore_mismatched_sizes=True)
print(model.config)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at akahana/vit-base-cats-vs-dogs and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTConfig {
  "_name_or_path": "akahana/vit-base-cats-vs-dogs",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "finetuning_task": "image-classification",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.44.0"
}



- setup labels

In [14]:
id2label = {
    "0": "bird", 
    "1": "cat", 
    "2": "dog" 
  }
label2id = {
    "bird": "0",
    "cat": "1",
    "dog": "2"
  }



model.config.id2label = id2label
model.config.label2id = label2id
print(model.config)

ViTConfig {
  "_name_or_path": "akahana/vit-base-cats-vs-dogs",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "finetuning_task": "image-classification",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "bird",
    "1": "cat",
    "2": "dog"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "bird": "0",
    "cat": "1",
    "dog": "2"
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.44.0"
}



- move model to gpu

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

- setup the model

In [16]:
batch_size = 32
num_epochs = 1

training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=batch_size,              
    per_device_eval_batch_size=batch_size,               
    num_train_epochs=num_epochs,                         
    eval_strategy="epoch",                  
    save_strategy="epoch",
    load_best_model_at_end=True,                                  
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.135551,{'accuracy': 0.9639909977494373}


TrainOutput(global_step=334, training_loss=0.19999324775741487, metrics={'train_runtime': 514.8141, 'train_samples_per_second': 20.736, 'train_steps_per_second': 0.649, 'total_flos': 8.272344033096192e+17, 'train_loss': 0.19999324775741487, 'epoch': 1.0})

- test new model

In [19]:
#create preprocessor_config.json
feature_extractor.save_pretrained("./model/checkpoint-334")

['./model/checkpoint-334/preprocessor_config.json']

In [20]:
from transformers import pipeline
from PIL import Image

# Define the path to your fine-tuned model
model_path = "./model/checkpoint-334"

# Initialize the image classification pipeline with your fine-tuned model
classifier = pipeline(task="image-classification", model=model_path, device=0)  # Use device=-1 for CPU

# List of image paths to classify
image_paths = [
    "./data/processed/test/bird/0.jpg",
    "./data/processed/test/bird/15.jpg",
    "./data/processed/test/bird/41.jpg"
]

# Loop through each image path
for img_path in image_paths:
    # Load and preprocess the image
    image = Image.open(img_path).convert("RGB")
    # Run inference
    predictions = classifier(image)
    
    # Print out the image path and predicted label
    print(f"{img_path} - Predicted Label: {predictions[0]['label']} with confidence {predictions[0]['score']:.2f}")



./data/processed/test/bird/0.jpg - Predicted Label: bird with confidence 0.94
./data/processed/test/bird/15.jpg - Predicted Label: bird with confidence 0.93
./data/processed/test/bird/41.jpg - Predicted Label: bird with confidence 0.93
