In [None]:
#demo
import torch
import clip
from PIL import Image

# Device selection: keep using GPU 3 when the host has it (as the original
# notebook did), otherwise fall back to GPU 0, and to the CPU when CUDA is
# unavailable. A bare "cuda:3" fails on machines with fewer than four GPUs.
PREFERRED_GPU = 3
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if torch.cuda.device_count() > PREFERRED_GPU else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# One image (batch dimension added with unsqueeze) and three candidate captions.
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
print(image.size()) #(1,3,224,224)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
print(text.size()) #(3,77)

with torch.no_grad():
    image_features = model.encode_image(image) #(1,512)
    print(image_features.size())
    text_features = model.encode_text(text) #(3,512)
    print(text_features.size())

    # `similarity` and `similarity2` are raw (un-normalised) dot products and
    # are transposes of one another; `similarity3` is the cosine similarity,
    # i.e. the same dot products after L2-normalising both feature sets.
    similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
    similarity2 = image_features.cpu().numpy() @ text_features.cpu().numpy().T
    similarity3 = torch.cosine_similarity(text_features, image_features)

    print('similarity:',similarity)
    print('similarity2:',similarity2)
    print('similarity3:',similarity3)

    # Full forward pass: the model returns image->text and text->image logits.
    logits_per_image, logits_per_text = model(image, text)
    print(logits_per_image.size()) #(1,3)
    print(logits_per_text.size())  #(3,1)
    # Softmax over the captions turns the logits into per-caption probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

In [1]:
#Zero-Shot Prediction
import os
import clip
import torch
from torchvision.datasets import CIFAR100
import joblib
from clip.model import Gauss_model

#from clip.model import CLIP
from torch import nn

# Load the model
# Keep using GPU 3 when the host has it (as the original notebook did),
# otherwise fall back to GPU 0, and to the CPU when CUDA is unavailable.
# A bare "cuda:3" fails on machines with fewer than four GPUs.
PREFERRED_GPU = 3
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if torch.cuda.device_count() > PREFERRED_GPU else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"
# Only `preprocess` is used below; `model` is kept for the plain-CLIP
# alternative shown in the comments of the feature step.
model, preprocess = clip.load('ViT-B/32', device)
# Gauss_model comes from this project's modified `clip.model`; its internals
# are not visible from this notebook.
model_g = Gauss_model().to(device)
#model_g = model_g.half()
#print(model_g)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)
#print(len(cifar100)) #10000 (test split)

# Prepare the inputs
image, class_id = cifar100[3637]
#print(class_id) #78
image_input = preprocess(image).unsqueeze(0).to(device)
# One tokenised prompt per class name, stacked into a single tensor.
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)
#print(cifar100.classes[class_id]) #snake

# Calculate features
with torch.no_grad():
    # Plain-CLIP alternative:
    # image_features = model.encode_image(image_input)
    # text_features = model.encode_text(text_inputs)
    image_features, text_features = model_g(image_input, text_inputs)

# Pick the top 5 most similar labels for the image
# L2-normalise out-of-place so re-running these lines does not operate on
# already-mutated tensors.
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
#print(image_features.size()) #torch.Size([1, 512])
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
#print(text_features.size()) #torch.Size([100, 512])
# Scaled cosine similarities -> probabilities over the class prompts.
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)
#print('values:',similarity[0].size()) # torch.Size([100])

#Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")

Files already downloaded and verified

Top predictions:

        dinosaur: 50.67%
      skyscraper: 25.78%
          orange: 7.04%
          spider: 5.23%
         leopard: 3.11%


In [None]:
#Linear-probe evaluation using a previously saved classifier (joblib_model.pkl)
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm
import joblib


# Pick the compute device and load CLIP together with its image transform.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# CIFAR-100 train/test splits, cached under ~/.cache and run through the
# CLIP preprocessing transform on access.
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, train=True, transform=preprocess, download=True)
test = CIFAR100(root, train=False, transform=preprocess, download=True)
#print(cifar100.classes)

def get_features(dataset, batch_size=1000):
    """Encode every image of `dataset` with the CLIP image encoder.

    Relies on the notebook-level `model` and `device` defined in the cell
    above.

    Args:
        dataset: Dataset yielding ``(image, label)`` pairs that a
            ``DataLoader`` can batch (in this notebook, CIFAR-100 with the
            CLIP ``preprocess`` transform applied).
        batch_size: Number of images encoded per forward pass. Defaults to
            1000, the value previously hard-coded here; lower it if the
            device runs out of memory.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, one row / entry per
        dataset item, in dataset order.
    """
    all_features = []
    all_labels = []

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=batch_size)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
# NOTE: only the test features are used below; the train features are needed
# only when the classifier is re-fitted (see the recipe in the comments).
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Classifier checkpoint in the current working directory.
joblib_file = "joblib_model.pkl"

# Recipe used to produce the checkpoint (uncomment to regenerate it):
#classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
#classifier.fit(train_features, train_labels)
#joblib.dump(classifier, joblib_file)

# Load from file
# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# loading; only load checkpoints you created yourself or otherwise trust.
joblib_model = joblib.load(joblib_file)

print(joblib_model)
classifier = joblib_model
# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features) # test_features (10000,512)
print('predictions:',len(predictions)) #(10000,)
print('test_label:',len(test_labels)) #(10000,)

# `np.float` was removed in NumPy 1.24; the builtin `float` is the documented
# replacement and yields the same float64 array.
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")

In [None]:
#Linear-probe evaluation
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Pick the compute device and load CLIP together with its image transform.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# CIFAR-100 train/test splits, cached under ~/.cache and run through the
# CLIP preprocessing transform on access.
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, train=True, transform=preprocess, download=True)
test = CIFAR100(root, train=False, transform=preprocess, download=True)


def get_features(dataset, batch_size=100):
    """Encode every image of `dataset` with the CLIP image encoder.

    Relies on the notebook-level `model` and `device` defined in the cell
    above.

    Args:
        dataset: Dataset yielding ``(image, label)`` pairs that a
            ``DataLoader`` can batch (in this notebook, CIFAR-100 with the
            CLIP ``preprocess`` transform applied).
        batch_size: Number of images encoded per forward pass. Defaults to
            100, the value previously hard-coded here.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, one row / entry per
        dataset item, in dataset order.
    """
    all_features = []
    all_labels = []

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=batch_size)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
# Fit a linear classifier on the frozen CLIP image features (linear probe).
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
# `np.float` was removed in NumPy 1.24; the builtin `float` is the documented
# replacement and yields the same float64 array.
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")