In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights

In [2]:
images_dir = '/kaggle/input/nhapmoncv/data/images'
classes = [d for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}

In [3]:
data = []
for cls in classes:
    cls_folder = os.path.join(images_dir, cls)
    for fname in os.listdir(cls_folder):
        if fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            file_path = os.path.join(cls_folder, fname)
            label = label_map[cls]
            data.append((file_path, label))

classes = [d.split("-")[-1] for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]
label_map = {cls: idx for idx, cls in enumerate(classes)}

In [4]:
df = pd.DataFrame(data, columns=['filepath', 'label'])
print(df.head())
print("Number of images:", len(df))
print("Number of classes:", len(classes))

label_map = {v: k for k, v in label_map.items()}
df["breed"] = df["label"].map(label_map)

print(df.head())

                                            filepath  label
0  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
1  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
2  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
3  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
4  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
Number of images: 20580
Number of classes: 120
                                            filepath  label       breed
0  /kaggle/input/nhapmoncv/data/images/n02091635-...      0  otterhound
1  /kaggle/input/nhapmoncv/data/images/n02091635-...      0  otterhound
2  /kaggle/input/nhapmoncv/data/images/n02091635-...      0  otterhound
3  /kaggle/input/nhapmoncv/data/images/n02091635-...      0  otterhound
4  /kaggle/input/nhapmoncv/data/images/n02091635-...      0  otterhound


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
vit = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
vit.heads = nn.Identity()
vit.to(device)
vit.eval()

transform = ViT_B_16_Weights.IMAGENET1K_V1.transforms()

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:01<00:00, 215MB/s]


In [7]:
def compute_vit_features(img_path):
    img = Image.open(img_path).convert("RGB")
    img_t = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = vit(img_t)
    return features.cpu().numpy().flatten()

In [8]:
features_list = Parallel(n_jobs=4, backend='loky')(
    delayed(compute_vit_features)(p) for p in df["filepath"]
)

valid_mask = [f is not None for f in features_list]
X = np.vstack([f for f in features_list if f is not None])
y = df.loc[valid_mask, "label"].values

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print("Accuracy (ViT features):", accuracy_score(y_test, y_pred) * 100)

Accuracy (ViT features): 94.26627793974733
