# Zero-Shot Transfer Learning with CLIP

In [1]:
%pip install git+https://github.com/openai/CLIP.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-yryz3lhx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-yryz3lhx
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [2]:
import clip
import torch
import torchvision.transforms as T
from torchvision.datasets import CIFAR10
from PIL import Image
from tqdm import tqdm

In [3]:
# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [6]:
# Load CIFAR-10 data
cifar10 = CIFAR10(root="data", train=True, download=True)
cifar10_classes = cifar10.classes

# Small subset of CIFAR-10 images
subset_size = 10
images = [cifar10[i][0] for i in range(subset_size)]
class_labels = [cifar10_classes[cifar10[i][1]] for i in range(subset_size)]

# Preprocess images
transform = T.Compose([
    T.Resize(224),  # Resize image to fit CLIP input dimensions
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
images = torch.stack([transform(image).to(device) for image in images])

# Prepare the text inputs
text_inputs = torch.cat([clip.tokenize(f"a photo of a {label}") for label in class_labels]).to(device)

Files already downloaded and verified


In [7]:
# Calculate features
with torch.no_grad():
    image_features = model.encode_image(images)
    text_features = model.encode_text(text_inputs)

# Normalize features to unit length
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Calculate cosine similarity
logits_per_image = image_features @ text_features.T

# Apply softmax to convert logits to probabilities
probs = logits_per_image.softmax(dim=-1)

# Display the top-1 predicted class for each image
print("Top predicted labels:")
for i, prob in enumerate(probs):
    top_class = cifar10_classes[prob.argmax().item()]
    print(f"Image {i + 1} (True label: {class_labels[i]}): {top_class}")

Top predicted labels:
Image 1 (True label: frog): airplane
Image 2 (True label: truck): automobile
Image 3 (True label: truck): automobile
Image 4 (True label: deer): cat
Image 5 (True label: automobile): deer
Image 6 (True label: automobile): deer
Image 7 (True label: bird): frog
Image 8 (True label: horse): horse
Image 9 (True label: ship): ship
Image 10 (True label: cat): truck


# Using MobilenetV2 instead of Bigtransfer; Bigtransfer used too much memory

In [1]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical




In [2]:
# Load CIFAR-10 data
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Preprocess the data
x_train = tf.image.resize(x_train, (96, 96))  # Resize images to match MobileNetV2 input size
x_test = tf.image.resize(x_test, (96, 96))
x_train = tf.keras.applications.mobilenet_v2.preprocess_input(x_train)
x_test = tf.keras.applications.mobilenet_v2.preprocess_input(x_test)

In [3]:
# Load MobileNetV2 model, pre-trained on ImageNet
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(96, 96, 3))

# Freeze the base model
base_model.trainable = False

# Create new model on top
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(10, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5


In [5]:
# Train the model
model.fit(x_train, y_train, batch_size=16, epochs=5, validation_data=(x_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print("Test accuracy:", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8622999787330627
