In [None]:
!pip install torch torchvision torchaudio
!pip install git+https://github.com/openai/CLIP.git
!pip install tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import os
import zipfile
import requests
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, random_split, Subset, ConcatDataset


import clip
import torch

from tqdm import tqdm

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archive is read from there and the
# trained weights are written back to it by later cells.
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Source archive on the mounted Drive and the local directory it is unpacked into
zip_path = '/content/drive/MyDrive/archive.zip'
extract_path = '/content/nabirds'

# Unpack every member of the archive; the context manager closes the file afterwards
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_path)

print("Dataset extracted successfully.")

Dataset extracted successfully.


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the model together with its matching image preprocessing function
model, preprocess = clip.load("ViT-B/32", device=device)


In [None]:
train_dir = '/content/nabirds/train'
test_dir = '/content/nabirds/test'

# Seed for the train/val/test split so the partition is identical after a kernel restart.
# Without it, a checkpoint saved in one session could be tested on images it was trained on.
SPLIT_SEED = 42

# Load both folders; CLIP's preprocessing is applied to every image on access
train_dataset = ImageFolder(train_dir, transform=preprocess)
test_dataset = ImageFolder(test_dir, transform=preprocess)

# Each ImageFolder builds its own label indices from its sub-directories. The two datasets are
# concatenated below, so their class lists must match or labels would silently disagree.
if train_dataset.classes != test_dataset.classes:
    raise ValueError(
        "train and test folders expose different class lists; "
        "their label indices cannot be mixed in one ConcatDataset"
    )

# Combine and split the datasets (70% train / 15% validation / remainder test)
total_dataset = ConcatDataset([train_dataset, test_dataset])
total_count = len(total_dataset)
train_count = int(0.7 * total_count)
val_count = int(0.15 * total_count)
test_count = total_count - train_count - val_count

# Split index positions (not the samples themselves); each result exposes `.indices`
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_indices, val_indices, test_indices = random_split(
    range(total_count),
    [train_count, val_count, test_count],
    generator=split_generator,
)


In [None]:
# Materialise the three splits as views over the combined dataset, one per index list
train_data, val_data, test_data = (
    Subset(total_dataset, split.indices)
    for split in (train_indices, val_indices, test_indices)
)

In [None]:
# Number of output classes for the classifier head, taken from the training ImageFolder's class list
num_classes = len(train_dataset.classes)

In [None]:
# One loader per split: identical batch size and worker count, only the training split is shuffled
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=32, shuffle=reshuffle, num_workers=4)
    for split, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)



In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CLIPClassifier(nn.Module):
    """Two-layer classification head on top of a CLIP image encoder.

    The encoder is used purely as a feature extractor: ``forward`` calls it inside
    ``torch.no_grad()``, so only ``fc1`` and ``fc2`` receive gradients.

    Args:
        clip_model: Object exposing ``visual.output_dim`` and ``encode_image(images)``.
        num_classes: Number of logits produced per image.
        hidden_dim: Width of the hidden layer between the two linear layers.
    """

    def __init__(self, clip_model, num_classes, hidden_dim=512):
        super().__init__()
        self.clip_model = clip_model
        output_dim = clip_model.visual.output_dim
        self.fc1 = nn.Linear(output_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, images):
        """Return class logits of shape ``(batch, num_classes)`` for a batch of images."""
        with torch.no_grad():
            image_features = self.clip_model.encode_image(images)
        # The encoder may emit a different floating dtype (e.g. float16) than the head's
        # weights; align them so the first linear layer does not fail on a dtype mismatch.
        image_features = image_features.to(self.fc1.weight.dtype)
        x = F.relu(self.fc1(image_features))
        logits = self.fc2(x)
        return logits

# Wrap the loaded CLIP model with the classification head and move the whole module to `device`
classifier_model = CLIPClassifier(model, num_classes).to(device)



In [None]:
def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Every batch is moved to ``device`` (images cast to float32), followed by a
    forward pass, backward pass and optimizer step.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for images, labels in tqdm(train_loader):
        # Inputs must sit on the same device as the model; images are fed as float32
        images = images.to(device).float()
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    return sum(batch_losses) / len(train_loader)

def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without updating any weights.

    Returns:
        A ``(loss, accuracy)`` tuple: the sum of per-batch losses divided by the
        number of batches, and the number of correct predictions divided by
        ``len(loader.dataset)``.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for images, labels in tqdm(loader):
            # Same device/dtype handling as during training
            images = images.to(device).float()
            labels = labels.to(device)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()

            # Predicted class = index of the largest logit in each row
            hits = logits.argmax(dim=1) == labels
            n_correct += hits.sum().item()
    accuracy = n_correct / len(loader.dataset)
    mean_loss = loss_sum / len(loader)
    return mean_loss, accuracy

# Cast every parameter to float32 so the model matches the `.float()` image batches that
# train() and evaluate() feed it. As the last expression of the cell this also displays
# the module repr.
classifier_model.to(device).float()  # Ensure model is in float32


NameError: name 'classifier_model' is not defined

In [None]:
# Optimizer and loss function
optimizer = torch.optim.Adam(classifier_model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

# File to save the model weights (on the mounted Drive so it outlives the Colab session)
drive_path = '/content/drive/MyDrive/clip_classifier_weights.pth'


# Training and validation; the test split is only touched after training is finished
num_epochs = 10
best_val_accuracy = 0

for epoch in range(num_epochs):
    train_loss = train(classifier_model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = evaluate(classifier_model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")

    # Checkpoint whenever validation accuracy improves, so `drive_path` always holds
    # the best-performing weights seen so far.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(classifier_model.state_dict(), drive_path)
        print(f"Saved improved model weights to {drive_path}")



  2%|▏         | 29/1909 [00:03<04:02,  7.75it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate on the test set with the best model.
# map_location remaps the stored tensors onto the current device, so a checkpoint written
# on a GPU runtime can still be loaded in a CPU-only session.
classifier_model.load_state_dict(torch.load(drive_path, map_location=device))
test_loss, test_accuracy = evaluate(classifier_model, test_loader, criterion, device)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")