## Install packages

In [None]:
!pip install torch torchvision

In [None]:
!pip install transformers==4.44.0

In [None]:
!pip install kaggle

In [None]:
# Install the Kaggle API token: create ~/.kaggle if needed, copy kaggle.json
# from the current working directory into it, and restrict the copied file to
# owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the traffic-sign dataset archive with the Kaggle CLI, then extract
# it into ./data.
!kaggle datasets download ahemateja19bec1025/traffic-sign-dataset-classification
!unzip traffic-sign-dataset-classification.zip -d ./data

## Import packages

In [None]:
import os

# Number CUDA devices by PCI bus id and expose only device 0 to this process.
# Both variables are set here, before the cell that imports torch.
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES='0',
)

In [None]:
import random
import matplotlib.pyplot as plt
from pathlib import Path

import torch
from PIL import Image

import pandas as pd
from dataset_utils import val_transform
from clip_model_utils import load_model_and_processor
import numpy as np

In [None]:
# Seed Python's `random` module so the shuffle of the test image paths further
# down produces the same order on every run.
random.seed(42)

## Load label name table

In [None]:
# Read the label CSV and build a lookup from each 'ClassId' value to its
# 'Name' value.
label_csv = './data/traffic_Data/corrected_labels.csv'
label_map = pd.read_csv(label_csv)
label_dict = {
    class_id: name
    for class_id, name in zip(label_map['ClassId'], label_map['Name'])
}

## Load test image paths

In [None]:
# Dataset root directory and its TEST subdirectory.
data_root = './data/traffic_Data'
test_data_dir = Path(data_root).joinpath('TEST')

In [None]:
# Collect, as strings, the paths of all entries directly inside the test
# directory whose suffix is exactly '.png' (case-sensitive), in sorted order.
test_image_paths = [
    str(file_path)
    for file_path in sorted(test_data_dir.iterdir())
    if file_path.suffix == '.png'
]

In [None]:
# Shuffle the path list in place so the prediction preview further down, which
# only looks at the first entries, sees a random sample of the test images.
random.shuffle(test_image_paths)

## Load fine-tuned CLIP model

In [None]:
# Model name passed to the project loader, and the fine-tuned checkpoint to
# restore. NOTE: model_path is an absolute, machine-specific path.
model_name = 'ViT-B-16'
model_path = '/home/vincentwu/clip_hw/HW1/results/ViT-B-16-openclip/best_model.pt'

# Run on the first visible GPU when CUDA is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model, processor, backend = load_model_and_processor(model_name, device=device)

# The checkpoint is consumed as a state dict, so restrict unpickling to
# tensors and plain containers (weights_only=True) rather than allowing
# arbitrary pickled objects to be executed on load.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
# Generate one prompt per class, ordered by ascending class id, so that the
# position of a prompt in the list can be mapped back through class_names.
class_ids_sorted = sorted(label_dict)
class_names = [label_dict[class_id] for class_id in class_ids_sorted]

all_text_prompts = [f"a photo of {name} traffic sign" for name in class_names]

# Encode prompts. This is inference only, so autograd is disabled to avoid
# building a graph that text_features would otherwise keep alive.
with torch.no_grad():
    text_inputs = processor(all_text_prompts, return_tensors="pt", padding=True).to(device)
    if backend == 'huggingface':
        text_features = model.get_text_features(**text_inputs)
    else:
        # Any non-HuggingFace backend is expected to expose encode_text().
        text_features = model.encode_text(text_inputs)
    # L2-normalise along the last dimension so that a dot product with
    # normalised image features is a cosine similarity.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

In [None]:
# Classify and display the first entries of the (shuffled) test image list.
# NOTE(review): the cut-off was previously written as `i > 30`, which shows 31
# images (indices 0-30). That count is preserved here; confirm whether 30 was
# intended.
num_preview_images = 31

for image_path in test_image_paths[:num_preview_images]:
    # Release the file handle as soon as the pixels are loaded; convert()
    # returns an independent in-memory image that stays valid afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Inference only: disable autograd so no graph is built per image.
    with torch.no_grad():
        image_tensor = val_transform(image)  # project validation transform
        # Add a batch dimension -> presumably (1, 3, 224, 224); TODO confirm.
        image_tensor = image_tensor.unsqueeze(0).to(device)
        if backend == 'huggingface':
            image_features = model.get_image_features(pixel_values=image_tensor)
        else:
            image_features = model.encode_image(image_tensor)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # Cosine similarity between the image and every class prompt (both
        # sides are L2-normalised); the best-scoring prompt is the prediction.
        similarity = (image_features @ text_features.T).squeeze()
        pred_id = similarity.argmax().item()
    pred_label = class_names[pred_id]

    # Show results
    plt.figure(figsize=(3, 3))
    plt.imshow(image)
    plt.title(f"Prediction: {pred_label}")
    plt.axis('off')
    plt.show()