This pipeline reloads the pretrained ResNet18, EfficientNet_B0, and DINOv2 (ViT-B/14) models and runs inference with each of them on the CIFAR-10 test set.

The goal is to obtain the top-1 accuracy, the confusion matrix, the per-class accuracy, and a UMAP visualization for each model. Running this pipeline on a **GPU** is strongly recommended, because DINOv2 is a large model and generating the UMAP visualizations is resource-intensive.

At the end of the pipeline there is an optional section for testing a model on a single input image. This part only requires loading the pretrained weights and does not involve any additional steps.

# Download and load model weights

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import datasets, transforms, models
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
from torch.utils.data import random_split, DataLoader
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
!pip install -q gdown
import gdown
import zipfile
import timm
!pip install -q transformers
from transformers import AutoModelForImageClassification
from transformers.models.dinov2.modeling_dinov2 import Dinov2ForImageClassification

In [None]:
# Download the zipped checkpoints from Google Drive and unpack them.
# The archive is fetched by its Drive file id and saved as "cifar-10.zip"
# in the current working directory.
file_id = "1--vYxuc0fRE7539StX1Ts9RkAw00_XiZ"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="cifar-10.zip", quiet=False)

with zipfile.ZipFile("cifar-10.zip", 'r') as zip_ref:
    for member in zip_ref.namelist():
        # Express the member path relative to the Drive folder prefix stored in
        # the archive; members outside that prefix yield a path starting with
        # ".." and are filtered out by the startswith() check below.
        filename = os.path.relpath(member, start="content/drive/MyDrive/Model/")
        if filename.startswith("cifar-10"):
            # Extract under the local "model" folder (keeps the full archive path) ...
            zip_ref.extract(member, "model") #Save to local "model" folder
            # ... then move the entry up to model/cifar-10/...; os.renames creates
            # any missing destination directories and prunes the emptied source
            # directories afterwards.
            src_path = os.path.join("model", member)
            dst_path = os.path.join("model", filename)
            os.renames(src_path, dst_path)

# Loading the test set

In [None]:
# Seed the NumPy, Python and PyTorch random generators (for reproducibility)
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# CIFAR-10 test set: convert to tensors and normalise with mean 0.5 / std 0.5
# (the single-element tuples broadcast over the three colour channels)
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

test_set = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
test_loader = DataLoader(
    test_set,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

# Define a general evaluation function

Define a function that reports the top-1 accuracy, the confusion matrix, and the per-class accuracy, and then projects the model outputs with UMAP.

In [None]:
def _run_inference(model, dataloader, device):
    """Run ``model`` over ``dataloader`` once and collect its outputs.

    Returns:
        A ``(preds, labels, logit_batches)`` tuple: two flat lists holding the
        predicted and the true class index of every sample, and a list with
        one CPU logits tensor per batch.
    """
    preds = []
    labels = []
    logit_batches = []

    with torch.no_grad():
        for images, batch_labels in dataloader:
            outputs = model(images.to(device))
            # If it is a huggingface model, there is a .logits attribute
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs
            preds.extend(logits.argmax(dim=1).cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
            logit_batches.append(logits.cpu())

    return preds, labels, logit_batches


def evaluate_model(model, dataloader, class_names, device):
    """Evaluate a classifier and report its metrics and plots.

    Prints the top-1 accuracy, shows the confusion matrix as a heatmap, prints
    the per-class accuracy and, when the ``umap`` package can be imported,
    shows a 2-D UMAP projection of the model's output logits.

    Args:
        model: Callable model returning either a logits tensor or an object
            with a ``.logits`` attribute (Hugging Face style output).
        dataloader: Iterable yielding ``(images, labels)`` tensor batches.
        class_names: Sequence of class names; position ``i`` names class ``i``.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        None. All results are printed or plotted.
    """
    model.eval()

    # A single inference pass: the logits are kept so the UMAP step can reuse
    # them instead of running the whole dataset through the model again.
    all_preds, all_labels, logit_batches = _run_inference(model, dataloader, device)

    # Top-1 Accuracy
    acc = accuracy_score(all_labels, all_preds)
    print(f"Top-1 Accuracy: {acc * 100:.2f}%")

    # Confusion Matrix
    # Passing `labels` pins the matrix to one row/column per entry of
    # class_names, even when a class never occurs in the targets or the
    # predictions; otherwise the tick labels and the per-class loop below
    # would be misaligned (or raise an IndexError).
    cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    # Per-class Accuracy: correct predictions (diagonal) over samples per true class (row sum)
    cm_diagonal = cm.diagonal()
    cm_counts = cm.sum(axis=1)
    print("\nPer-Class Accuracy:")
    for i, class_name in enumerate(class_names):
        class_acc = cm_diagonal[i] / cm_counts[i] if cm_counts[i] > 0 else 0
        print(f"{class_name:15s}: {class_acc * 100:.2f}%")

    # UMAP Visualization (optional dependency)
    try:
        import umap
    except ImportError:
        print("UMAP not installed, skipping feature visualization.")
        return

    print("\nProjecting features using UMAP...")

    # Reuse the logits and labels gathered during the inference pass above
    features = torch.cat(logit_batches, dim=0).numpy()
    labels = np.array(all_labels)

    reducer = umap.UMAP(n_components=2, random_state=42)
    proj = reducer.fit_transform(features)

    plt.figure(figsize=(10, 8))
    for class_idx in np.unique(labels):
        idxs = labels == class_idx
        plt.scatter(proj[idxs, 0], proj[idxs, 1], label=class_names[class_idx], s=10)
    plt.legend(markerscale=2)
    plt.title("UMAP Projection of Model Output Features")
    plt.grid(True)
    plt.show()

# Load and evaluate three models

In [None]:
# CIFAR-10 has ten target classes
number_classes = 10

In [None]:
# Build ResNet-18 with a CIFAR-10 sized head and restore the saved weights.
# NOTE: torch.load unpickles the checkpoint; only load files from a trusted source.
model = models.resnet18(num_classes=number_classes)
model.load_state_dict(
    torch.load(
        "/content/model/cifar-10/best_resnet18.pth",
        map_location=device,
    )
)
model.to(device)

# Class names come from the currently loaded test set
class_names = test_set.classes
evaluate_model(model, test_loader, class_names, device)

In [None]:
# Build EfficientNet-B0 through timm (no pretrained download) with a CIFAR-10
# sized head, then restore the saved weights.
# NOTE: torch.load unpickles the checkpoint; only load files from a trusted source.
model = timm.create_model(
    "efficientnet_b0",
    pretrained=False,
    num_classes=number_classes,
)
model.load_state_dict(
    torch.load(
        "/content/model/cifar-10/efficientnetb0_best.pth",
        map_location=device,
    )
)
model.to(device)

# Class names come from the currently loaded test set
class_names = test_set.classes
evaluate_model(model, test_loader, class_names, device)

In [None]:
# Define a new transform manually (mimics the DINOv2 image processor).
# DINOv2 was fine-tuned on images resized to 224x224, so the test images
# must be resized the same way.
#
# NOTE: running this cell on a GPU is strongly recommended.

transform_dino = transforms.Compose([
    transforms.Resize((224, 224)),  # DINOv2 expects 224×224
    transforms.ToTensor(),
    # ImageNet channel statistics
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Reload test set with this transform
from torchvision import datasets
from torch.utils.data import DataLoader

# NOTE: this rebinds the global test_set / test_loader. Re-run the
# "Loading the test set" cell before evaluating ResNet-18 or EfficientNet-B0
# again, otherwise they would receive the DINOv2 preprocessing.
test_set = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_dino)
# Worker processes keep the 224x224 resize off the main thread, matching the
# loader configuration used for the other models.
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=2)

# Load model: DINOv2 backbone with a classification head sized for CIFAR-10
model = AutoModelForImageClassification.from_pretrained(
    "facebook/dinov2-base",
    num_labels=number_classes,
    ignore_mismatched_sizes=True
)

# Restore the fine-tuned weights. strict=False tolerates key mismatches, so
# print them to make any silently skipped parameter visible.
# NOTE: torch.load unpickles the checkpoint; only load files from a trusted source.
state_dict = torch.load("/content/model/cifar-10/dinov2_finetuned_cifar10.pth", map_location="cpu")
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)

model.to(device)
model.eval()

# CIFAR-10 class names
class_names = test_set.classes
evaluate_model(model, test_loader, class_names, device)

# Optional: Single image input to get results

This optional section tests a model on a single input image. It only requires loading the pretrained weights and does not involve any additional steps.

In [None]:
# Loading model
def load_model(model_name, device, num_classes=10, weights_dir="/content/model/cifar-10"):
    """Build one of the supported classifiers and restore its saved weights.

    Args:
        model_name: One of ``"resnet18"``, ``"efficientnet_b0"`` or ``"dinov2"``.
        device: Device the checkpoint is mapped to and the model is moved to.
        num_classes: Size of the classification head.
        weights_dir: Directory containing the checkpoint files.

    Returns:
        The model, moved to ``device`` and switched to evaluation mode.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    checkpoint_files = {
        "resnet18": "best_resnet18.pth",
        "efficientnet_b0": "efficientnetb0_best.pth",
        "dinov2": "dinov2_finetuned_cifar10.pth",
    }
    # Validate first so an unsupported name fails before any model is built
    if model_name not in checkpoint_files:
        raise ValueError("Unsupported model type.")

    if model_name == "resnet18":
        model = models.resnet18(num_classes=num_classes)
    elif model_name == "efficientnet_b0":
        model = timm.create_model("efficientnet_b0", pretrained=False, num_classes=num_classes)
    else:  # "dinov2"
        model = AutoModelForImageClassification.from_pretrained(
            "facebook/dinov2-base",
            num_labels=num_classes,
            ignore_mismatched_sizes=True
        )

    # NOTE: torch.load unpickles the checkpoint; only load files from a trusted source.
    weights_path = os.path.join(weights_dir, checkpoint_files[model_name])
    state_dict = torch.load(weights_path, map_location=device)

    if model_name == "dinov2":
        # The DINOv2 checkpoint is loaded non-strictly; report any mismatch
        # instead of silently leaving parameters at their initial values.
        load_result = model.load_state_dict(state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            print("Missing keys:", load_result.missing_keys)
            print("Unexpected keys:", load_result.unexpected_keys)
    else:
        model.load_state_dict(state_dict)

    model.to(device)
    model.eval()
    return model

# Distinguish image processing methods for different models
def get_transform(model_name):
    """Return the single-image preprocessing pipeline for ``model_name``.

    The preprocessing mirrors what each model receives during the test-set
    evaluation in this notebook:

    * ``"dinov2"``: resize to 224x224, ImageNet mean/std normalisation.
    * any other name (ResNet-18, EfficientNet-B0): resize to the native
      CIFAR-10 resolution of 32x32, mean 0.5 / std 0.5 normalisation.

    Args:
        model_name: Name of the model the image will be fed to.

    Returns:
        A ``transforms.Compose`` mapping a PIL image to a normalised tensor.
    """
    if model_name == "dinov2":
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])

    # ResNet-18 and EfficientNet-B0 are evaluated on un-resized 32x32 CIFAR-10
    # images, so an arbitrary input image is brought to that same resolution
    # rather than to 224x224.
    return transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])

# Predicting a single image
def predict_single_image(model, image_path, device, class_names, transform):
    """Classify one image file, display it with the prediction, and return the result.

    Args:
        model: Callable model returning a logits tensor or an object with a
            ``.logits`` attribute.
        image_path: Path of the image file to open.
        device: Device the input tensor is moved to.
        class_names: Sequence mapping a class index to its name.
        transform: Callable converting the RGB PIL image into a tensor.

    Returns:
        A ``(class_name, confidence)`` tuple, where ``confidence`` is the
        softmax probability of the predicted class.
    """
    model.eval()

    # Force three channels so grayscale / RGBA files go through the same path
    image = Image.open(image_path).convert("RGB")
    batch = transform(image)
    batch = batch.unsqueeze(0)  # add the batch dimension
    batch = batch.to(device)

    with torch.no_grad():
        outputs = model(batch)
        # Hugging Face outputs carry the tensor in .logits; plain models return it directly
        logits = getattr(outputs, 'logits', outputs)
        probs = logits.softmax(dim=1)
        pred = probs.argmax(dim=1).item()
        confidence = probs[0, pred].item()

    plt.imshow(image)
    plt.title(f"Prediction: {class_names[pred]} ({confidence*100:.2f}%)")
    plt.axis("off")
    plt.xlabel("Single Image Inference")
    plt.show()

    return class_names[pred], confidence


In [None]:
from PIL import Image

# Model to query: "dinov2", "resnet18" or "efficientnet_b0"
model_name = "efficientnet_b0"
# Path of the image to classify
image_path = "/content/sphynx04.jpg"
# Class names, looked up by the predicted class index
class_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

transform = get_transform(model_name)
model = load_model(model_name, device)

predict_single_image(model, image_path, device, class_names, transform)
