# In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision import datasets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import ViTForImageClassification, ViTFeatureExtractor
import timm

# Evaluation-time preprocessing: resize every image to 224x224, convert it to a
# tensor, then normalize each channel with timm's ImageNet default mean/std.
# NOTE(review): these statistics should match whatever preprocessing was used
# when the checkpoint was fine-tuned -- confirm against the training script.
trans_ = T.Compose(
    [
        T.Resize(size=(224, 224)),
        T.ToTensor(),
        T.Normalize(
            timm.data.IMAGENET_DEFAULT_MEAN,
            timm.data.IMAGENET_DEFAULT_STD,
        ),
    ]
)

# Load the dataset: every image under data_dir is read through ImageFolder and
# passed through the trans_ preprocessing pipeline.
# NOTE(review): ImageFolder derives label indices from the sub-directory names;
# confirm that ordering matches the label ids the checkpoint was trained with.
# NOTE(review): './data/Images' looks like the complete image folder; if the
# checkpoint was fine-tuned on part of it, the metrics include training images
# -- confirm that a held-out split is not intended here.
data_dir = './data/Images'
dataset = datasets.ImageFolder(data_dir, transform=trans_)

# Fixed order (no shuffling) in batches of 128 samples.
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=128,
    shuffle=False,
)


# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the fine-tuned ViT classifier from its local checkpoint directory,
# place it on the chosen device and switch it to inference mode.
model_path = 'vit_finetuned_StanfordDogs_ep5'
model = ViTForImageClassification.from_pretrained(model_path)
model.to(device)
model.eval()

def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and score its class predictions.

    Args:
        model: Classifier module whose call result exposes a ``.logits``
            tensor; the predicted class is the arg-max along dimension 1.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are computed with ``average='weighted'``.
    """
    # Follow the device the model actually lives on rather than a module-level
    # global, so the function works for any model it is handed.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    y_true = []
    y_pred = []

    # Score in eval mode regardless of how the caller left the model, and put
    # the caller's mode back afterwards so the call has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in dataloader:
                logits = model(inputs.to(run_device)).logits
                predicted = logits.argmax(dim=1)

                # Targets are only compared on the host, so they are never
                # copied to the accelerator.
                y_true.extend(targets.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())
    finally:
        model.train(was_training)

    # NOTE(review): zero_division=1 scores a class whose metric is undefined
    # (e.g. a class that is never predicted) as 1 instead of 0, which can raise
    # the weighted averages. Kept unchanged so reported numbers stay
    # comparable -- confirm this is intended.
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)

    return acc, precision, recall, f1

# Evaluate once over the whole loader, then report each metric on its own line
# with four decimal places.
acc, precision, recall, f1 = evaluate_model(model, dataloader)

print(
    f"Accuracy: {acc:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


# Recorded output of the cell above:
# Accuracy: 0.9503
# Precision: 0.9548
# Recall: 0.9503
# F1 Score: 0.9477
