In [9]:
import os
import json
import torch
import torch.nn as nn
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from transformers import CLIPModel, CLIPProcessor

# Runtime configuration: compute device plus the locations of the evaluation
# images, the ground-truth annotations and the fine-tuned checkpoint.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
image_folder = "cn"
ground_truth_file = "ground_truth.json"
model_ckpt = "clip_finetuned.pth"

# The ground-truth JSON is consumed later as {image file name: class name}.
with open(ground_truth_file, "r", encoding="utf-8") as gt_handle:
    ground_truth = json.load(gt_handle)

# Class ids are assigned by the alphabetical position of each distinct class name.
# NOTE(review): these ids must agree with the mapping used when the checkpoint
# was trained — confirm the training script builds its label map the same way.
label_names = sorted(set(ground_truth.values()))
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
NUM_CLASSES = len(label_names)

In [10]:
class CLIPFineTuner(nn.Module):
    """Image classifier: a CLIP image encoder followed by one linear layer.

    The attribute names ``clip`` and ``classifier`` are part of the checkpoint
    format (they prefix the state-dict keys), so they must not be renamed.
    """

    def __init__(self, clip_model, num_classes):
        """Wrap ``clip_model`` and attach a ``num_classes``-way linear head.

        The head's input width is read from ``clip_model.config.projection_dim``.
        """
        super().__init__()
        embed_dim = clip_model.config.projection_dim
        self.clip = clip_model
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, pixel_values):
        """Return the classification logits for a batch of ``pixel_values``."""
        image_embeds = self.clip.get_image_features(pixel_values=pixel_values)
        logits = self.classifier(image_embeds)
        return logits
    
# Rebuild the architecture used for fine-tuning, then restore its weights.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# The original cell assigned `.requires_grad = True` on two sub-modules here.
# On an nn.Module that only creates a plain Python attribute and leaves the
# parameters untouched (the in-place API is `requires_grad_()`). This notebook
# only evaluates the model under torch.no_grad(), so no gradient flags are set.

model = CLIPFineTuner(clip_model, num_classes=NUM_CLASSES)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider passing weights_only=True where the installed torch
# version supports it).
state_dict = torch.load(model_ckpt, map_location=device)
# Strict loading: raises if the checkpoint keys or tensor shapes do not match,
# e.g. when the classifier was trained with a different number of classes.
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode for the evaluation cells

# Image preprocessing that matches the pretrained CLIP backbone.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [11]:
# Run the classifier over every annotated image and collect top-k predictions.
Ks = [1, 2, 3, 4, 5]
topk_correct = {k: 0 for k in Ks}   # k -> number of images whose true label is in the top-k
topk_y_pred = {k: [] for k in Ks}   # k -> top-1 predicted label per image (same list for every k)
topk_y_true = {k: [] for k in Ks}   # k -> true label per image

all_topk_preds = []

# Number of ranked predictions to keep per image. Derived from Ks and capped at
# the class count, because torch.topk raises when asked for more entries than exist.
max_k = min(max(Ks), NUM_CLASSES)

for fname, label_name in ground_truth.items():
    image_path = os.path.join(image_folder, fname)
    if not os.path.exists(image_path):
        print(f"Warning: image not found → {fname}")
        continue

    # The context manager releases the file handle deterministically;
    # convert() returns an independent in-memory image that outlives it.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(inputs["pixel_values"])
        probs = logits.softmax(dim=1)[0]   # single image -> take row 0 of the batch
        top5 = torch.topk(probs, max_k)

    top5_ids = top5.indices.cpu().numpy()
    top5_scores = top5.values.cpu().numpy()
    top5_labels = [id2label[int(idx)] for idx in top5_ids]

    # The "top5_*" keys are kept for the downstream cells that read them.
    all_topk_preds.append({
        "image": fname,
        "true_label": label_name,
        "top5_labels": top5_labels,
        "top5_scores": top5_scores
    })

    # Accumulate the accuracy counters and classification inputs for each K.
    for k in Ks:
        topk = top5_labels[:k]
        topk_y_true[k].append(label_name)
        # Always the top-1 label: the classification metrics are top-1 for every k.
        # Top-k hits are tracked separately in topk_correct.
        topk_y_pred[k].append(topk[0])
        if label_name in topk:
            topk_correct[k] += 1

all_topk_preds[:3]

[{'image': 'animal_crossing_0.png',
  'true_label': 'animal crossing',
  'top5_labels': ['animal crossing',
   'dip',
   'round about',
   'narrow roads ahead',
   'road work ahead'],
  'top5_scores': array([0.17403619, 0.05432777, 0.05329341, 0.04961142, 0.04494555],
        dtype=float32)},
 {'image': 'animal_crossing_1.png',
  'true_label': 'animal crossing',
  'top5_labels': ['animal crossing',
   'dip',
   'road work ahead',
   'round about',
   'hospital'],
  'top5_scores': array([0.20573416, 0.0552538 , 0.05320269, 0.04386591, 0.0420169 ],
        dtype=float32)},
 {'image': 'animal_crossing_2.png',
  'true_label': 'animal crossing',
  'top5_labels': ['animal crossing',
   'road work ahead',
   'narrow roads ahead',
   'dip',
   'round about'],
  'top5_scores': array([0.18571277, 0.0512712 , 0.04867535, 0.04843718, 0.04366367],
        dtype=float32)}]

In [12]:
# Overall top-1 metrics: accuracy plus macro-averaged precision / recall / F1.
y_true = topk_y_true[1]
y_pred = topk_y_pred[1]

acc = accuracy_score(y_true, y_pred)
# With average='macro' the fourth return value (support) is None, so the
# number of evaluated images is reported as "Support" instead.
p, r, f1, s = precision_recall_fscore_support(
    y_true, y_pred, labels=label_names, average='macro', zero_division=0)

overall_metrics = {
    "Accuracy": [acc],
    "Macro Precision": [p],
    "Macro Recall": [r],
    "Macro F1": [f1],
    "Support": [len(y_true)]
}
df_overall_metrics = pd.DataFrame(overall_metrics)

# Create the results directory first so a fresh checkout survives "Run All";
# to_csv does not create missing parent directories.
os.makedirs("rs/finetuneRS", exist_ok=True)
df_overall_metrics.to_csv("rs/finetuneRS/overall_macro_metrics.csv", index=False)

df_overall_metrics

Unnamed: 0,Accuracy,Macro Precision,Macro Recall,Macro F1,Support
0,0.997768,0.998413,0.998333,0.998332,448


In [13]:
# Per-class top-1 precision / recall / F1 / support, one row per class in
# label_names order (average=None keeps the scores un-aggregated).
y_true, y_pred = topk_y_true[1], topk_y_pred[1]

per_class_scores = precision_recall_fscore_support(
    y_true,
    y_pred,
    labels=label_names,
    average=None,
    zero_division=0,
)
p_c, r_c, f1_c, s_c = per_class_scores

# Assemble the table column by column; "Class" goes first.
metric_columns = ["Precision", "Recall", "F1", "Support"]
df_per_class_metrics = pd.DataFrame(dict(zip(metric_columns, per_class_scores)))
df_per_class_metrics.insert(0, "Class", label_names)

df_per_class_metrics.to_csv("rs/finetuneRS/per_class_macro_metrics.csv", index=False)
df_per_class_metrics.head()


Unnamed: 0,Class,Precision,Recall,F1,Support
0,animal crossing,1.0,1.0,1.0,10
1,bumpy road,1.0,1.0,1.0,10
2,cross road,1.0,1.0,1.0,10
3,cycle crossing,1.0,1.0,1.0,20
4,dip,1.0,1.0,1.0,10


In [14]:
# Overall top-k accuracy: the share of evaluated images whose true label is
# among the k highest-scoring predictions, for each k in Ks.
n_evaluated = len(all_topk_preds)
overall_acc = {
    "Top-K": [f"Top-{k}" for k in Ks],
    "Accuracy": [topk_correct[k] / n_evaluated for k in Ks],
}
df_overall_acc = pd.DataFrame(overall_acc)
df_overall_acc.to_csv("rs/finetuneRS/overall_top5_accuracy.csv", index=False)

df_overall_acc.head()

Unnamed: 0,Top-K,Accuracy
0,Top-1,0.997768
1,Top-2,0.997768
2,Top-3,1.0
3,Top-4,1.0
4,Top-5,1.0


In [15]:
# Per-class top-k accuracy table.
#
# For each class and each k in Ks: the fraction of that class's images whose
# true label appears among the k highest-scoring predictions.
#
# This must be computed from the ranked label lists in all_topk_preds.
# topk_y_pred[k] stores only the top-1 label for every k, so comparing it with
# the true labels would report top-1 accuracy in every "Top-k" column.
class_support = {name: 0 for name in label_names}
class_hits = {name: {k: 0 for k in Ks} for name in label_names}
for item in all_topk_preds:
    true_label = item["true_label"]
    ranked_labels = item["top5_labels"]
    class_support[true_label] += 1
    for k in Ks:
        if true_label in ranked_labels[:k]:
            class_hits[true_label][k] += 1

per_class_acc_table = []
for class_name in label_names:
    support = class_support[class_name]
    row = [class_name]
    for k in Ks:
        # Classes with no evaluated image are reported as 0.0.
        row.append(class_hits[class_name][k] / support if support else 0.0)
    per_class_acc_table.append(row)

header = ["Class"] + [f"Top-{k} Acc" for k in Ks]
df_per_class_acc = pd.DataFrame(per_class_acc_table, columns=header)
df_per_class_acc.to_csv("rs/finetuneRS/per_class_top5_accuracy.csv", index=False)

df_per_class_acc.head()

Unnamed: 0,Class,Top-1 Acc,Top-2 Acc,Top-3 Acc,Top-4 Acc,Top-5 Acc
0,animal crossing,1.0,1.0,1.0,1.0,1.0
1,bumpy road,1.0,1.0,1.0,1.0,1.0
2,cross road,1.0,1.0,1.0,1.0,1.0
3,cycle crossing,1.0,1.0,1.0,1.0,1.0
4,dip,1.0,1.0,1.0,1.0,1.0


In [16]:
# Per-image table: the ranked labels with their probabilities, plus the rank
# (1-5) at which the true label appears, or -1 when it is not in the top five.
top5_result_rows = []
accurate_top = []
for item in all_topk_preds:
    true_label = item["true_label"]
    ranked_labels = item["top5_labels"]

    row = {"Image": item["image"], "True Label": true_label}
    for position, (label, score) in enumerate(zip(ranked_labels, item["top5_scores"]), start=1):
        row.update({f"Top{position}_Label": label, f"Top{position}_Prob": score})
    top5_result_rows.append(row)

    # First 1-based position of the true label within the top five, else -1.
    matches = (
        position
        for position, label in enumerate(ranked_labels[:5], start=1)
        if label == true_label
    )
    accurate_top.append(next(matches, -1))

df_top5_per_image = pd.DataFrame(top5_result_rows)
df_top5_per_image["Rank"] = accurate_top
df_top5_per_image.to_csv("rs/finetuneRS/per_image_top5.csv", index=False)
df_top5_per_image.head()

Unnamed: 0,Image,True Label,Top1_Label,Top1_Prob,Top2_Label,Top2_Prob,Top3_Label,Top3_Prob,Top4_Label,Top4_Prob,Top5_Label,Top5_Prob,Rank
0,animal_crossing_0.png,animal crossing,animal crossing,0.174036,dip,0.054328,round about,0.053293,narrow roads ahead,0.049611,road work ahead,0.044946,1
1,animal_crossing_1.png,animal crossing,animal crossing,0.205734,dip,0.055254,road work ahead,0.053203,round about,0.043866,hospital,0.042017,1
2,animal_crossing_2.png,animal crossing,animal crossing,0.185713,road work ahead,0.051271,narrow roads ahead,0.048675,dip,0.048437,round about,0.043664,1
3,animal_crossing_3.png,animal crossing,animal crossing,0.154638,round about,0.061076,road work ahead,0.042727,width limit,0.03711,no parking,0.037018,1
4,animal_crossing_4.png,animal crossing,animal crossing,0.258182,round about,0.042905,dip,0.039452,road work ahead,0.039142,narrow roads ahead,0.037962,1
