In [None]:
# Run-time configuration — replace these with values appropriate for your environment.
# pretrained_path: checkpoint passed as `pretrained=` when the MobileCLIP model is created below.
# NOTE(review): the placeholder filename mentions "mobileclip2_s0" while the model is created
# as 'mobileclip_s0' — confirm the checkpoint matches that architecture.
pretrained_path = "/path/to/mobileclip2_s0.pt"
# dtd_path: root directory handed to the DTD dataset, which is opened with download=False,
# so the data presumably has to be present there already.
dtd_path = "/path/to/dtd_dataset"

import torch, torchvision
import numpy as np
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
from PIL import Image, ImageDraw
from torchvision.transforms.functional import to_tensor, to_pil_image
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F

# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Class index that triggered images are scored against in 'target' evaluation mode.
target_label = 0

import mobileclip

# Only the first returned value (the model) is kept; the other two are discarded.
model_clip, _, _ = mobileclip.create_model_and_transforms(
    'mobileclip_s0', pretrained=pretrained_path
)
model_clip = model_clip.to(device)
model_clip = model_clip.eval()

# Test-time preprocessing: resize every image to 256x256, then convert it to a tensor.
transform_test = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ]
)

def add_trigger(img, location=(192, 192), size=(20, 20)):
    """Return a 256x256 resized copy of *img* with a checkerboard patch drawn on it.

    Args:
        img: Image object exposing ``resize`` and ``load`` (PIL-style). The patch
            is written as 3-tuples, so an RGB-like pixel format is assumed.
        location: ``(x, y)`` of the patch's top-left corner in the resized image.
        size: ``(rows, cols)`` extent of the patch; ``size[0]`` steps along y and
            ``size[1]`` steps along x.

    Returns:
        The object returned by ``img.resize((256, 256))`` with the patch applied.
    """
    white = (255, 255, 255)
    black = (0, 0, 0)

    canvas = img.resize((256, 256))
    px = canvas.load()
    for row in range(size[0]):
        for col in range(size[1]):
            # Alternate colours so that cells with an even (row + col) are white.
            colour = black if (row + col) % 2 else white
            px[location[0] + col, location[1] + row] = colour
    return canvas

# Clean DTD test split and its loader (fixed order, batches of 128).
testset = datasets.DTD(
    root=dtd_path,
    split='test',
    download=False,
    transform=transform_test,
)
testloader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=8)

# Keep only the samples whose label differs from target_label, so an image that
# already belongs to the target class cannot count as a trigger success.
# NOTE(review): this indexes every sample, which presumably loads and transforms
# each image just to read its label — confirm whether labels are available directly.
non_target_indices = [
    idx for idx, sample in enumerate(testset) if sample[1] != target_label
]
backdoor_testset = torch.utils.data.Subset(testset, non_target_indices)
def make_backdoor_batch(images):
    """Stamp the trigger patch on every image of a batch and re-stack the result.

    Each element of *images* is converted with ``to_pil_image``, passed through
    ``add_trigger`` with its default arguments, and converted back with ``to_tensor``.

    Args:
        images: Iterable of image tensors accepted by ``to_pil_image``.

    Returns:
        A tensor produced by ``torch.stack`` over the triggered images.
    """
    stamped = []
    for img in images:
        pil_img = to_pil_image(img)
        stamped.append(to_tensor(add_trigger(pil_img)))
    return torch.stack(stamped)
# Loader over the non-target subset; images are yielded clean (no trigger applied here).
backdoor_loader = DataLoader(backdoor_testset, batch_size=128, shuffle=False, num_workers=8)

# prepare model and text input
from tqdm import tqdm
tokenizer = mobileclip.get_tokenizer('mobileclip_s0')
# One tokenized prompt per class name, concatenated into a single batch on the device.
text_inputs = torch.cat([tokenizer(f"a photo of something with {c} texture") for c in testset.classes]).to(device)
# Inference only: without no_grad the text features would track gradients and keep
# the text encoder's autograd graph alive for the rest of the session.
with torch.no_grad():
    text_features = model_clip.encode_text(text_inputs)
    # L2-normalise each embedding along the last dimension.
    text_features /= text_features.norm(dim=-1, keepdim=True)

# classification and watermark evaluation
def calculate_zero_shot_topk_accuracy(model, dataloader, text_features, label_type='original'):
    """Score zero-shot predictions of *model* over *dataloader*.

    Despite the name, only the single best match (argmax) is scored. The function
    reads the module-level ``device`` and, in 'target' mode, ``target_label``.

    Args:
        model: Object exposing ``encode_image(images)``.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        text_features: Matrix of text embeddings; its transpose is multiplied with
            the normalised image features to obtain per-class similarities.
        label_type: ``'original'`` compares predictions with the batch labels;
            ``'target'`` stamps the trigger patch on every image first and counts
            predictions equal to ``target_label``.

    Returns:
        Fraction (0.0-1.0) of samples counted as correct.

    Raises:
        ValueError: If *label_type* is not one of the two modes, or if the
            dataloader yields no samples (the ratio would be undefined).
    """
    # Validate up front so a bad mode fails before any batch is loaded or encoded.
    if label_type not in ('original', 'target'):
        raise ValueError(
            f"Invalid label_type: {label_type}, must be 'original' (clean testset) "
            "or 'target' (watermarked testset)."
        )
    use_trigger = label_type == 'target'

    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(dataloader):
            if use_trigger:
                # Stamp the trigger before the device transfer: the patch is drawn
                # through PIL, so moving the batch to the device first would only
                # force an extra copy back to host memory.
                images = torch.stack([to_tensor(add_trigger(to_pil_image(img))) for img in images])
            images = images.to(device)

            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            similarity = (100.0 * image_features @ text_features.T)
            predictions = similarity.argmax(dim=1)

            if use_trigger:
                correct += (predictions == target_label).sum().item()
            else:
                correct += (predictions == labels.to(device)).sum().item()
            total += images.size(0)

    if total == 0:
        raise ValueError("dataloader yielded no samples; accuracy is undefined.")

    return correct / total

# Clean accuracy: predictions on the full test split compared with the true labels.
acc = calculate_zero_shot_topk_accuracy(
    model_clip, testloader, text_features, label_type='original'
)
# Trigger success: share of triggered non-target images predicted as target_label.
vsr = calculate_zero_shot_topk_accuracy(
    model_clip, backdoor_loader, text_features, label_type='target'
)
print(f"Zero-shot classification accuracy on DTD: {acc * 100:.2f}%")
print(f"Zero-shot watermark VSR on DTD: {vsr * 100:.2f}%")

ViT-B-16


100%|██████████| 15/15 [00:01<00:00,  8.50it/s]
100%|██████████| 15/15 [00:04<00:00,  3.43it/s]

Zero-shot classification accuracy on DTD: 54.84%
Zero-shot watermark VSR on DTD: 3.42%



