In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
from transformers import CLIPModel, CLIPProcessor
import torch.optim as optim
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define a custom dataset for property images (same as before)
class PropertyImageDataset(Dataset):
    def __init__(self, image_folder, transform=None):
        self.image_folder = image_folder
        self.image_paths = [os.path.join(image_folder, img) for img in os.listdir(image_folder) if img.endswith(('.jpg', '.png'))]
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, img_path  # Return image and its path for reference


In [3]:
# Load pre-trained CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Freeze the text encoder if you're only interested in fine-tuning the image part
for param in model.text_model.parameters():
    param.requires_grad = False

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Fine-tune only the image encoder
def fine_tune_clip(model, dataloader, epochs=10, lr=3e-4):
    model.train()  # Set the model to training mode
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()  # Mean squared error to learn from your dataset

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        epoch_loss = 0
        for step, (images, img_paths) in enumerate(dataloader):
            images = images.to(device)

            # Forward pass through the image encoder
            outputs = model.get_image_features(pixel_values=images)

            # For unsupervised learning, you can implement contrastive loss, or MSE loss with pre-trained features.
            # Here, using MSE loss to fine-tune based on pre-trained features.
            loss = criterion(outputs, outputs.detach())  # Train using MSE against self as a simple form of fine-tuning
            epoch_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print training progress
            current_image_idx = step * len(images)
            print(f"Epoch [{epoch + 1}/{epochs}], Step [{step}/{len(dataloader)}], "
                  f"Image [{current_image_idx + 1}/{len(dataloader.dataset)}], Loss: {loss.item():.4f}")

        print(f"Epoch [{epoch + 1}/{epochs}] completed. Average Loss: {epoch_loss / len(dataloader):.4f}")

    return model

In [5]:
# Define transforms for images (as used in CLIP's processor)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Set up your dataset and dataloader
image_folder = "images"
dataset = PropertyImageDataset(image_folder=image_folder, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [6]:
# Fine-tune the model on your dataset
fine_tuned_clip_model = fine_tune_clip(model, dataloader, epochs=10)

Epoch [1/10], Step [0/255], Image [1/8158], Loss: 0.0000
Epoch [1/10], Step [1/255], Image [33/8158], Loss: 0.0000
Epoch [1/10], Step [2/255], Image [65/8158], Loss: 0.0000
Epoch [1/10], Step [3/255], Image [97/8158], Loss: 0.0000
Epoch [1/10], Step [4/255], Image [129/8158], Loss: 0.0000
Epoch [1/10], Step [5/255], Image [161/8158], Loss: 0.0000
Epoch [1/10], Step [6/255], Image [193/8158], Loss: 0.0000
Epoch [1/10], Step [7/255], Image [225/8158], Loss: 0.0000
Epoch [1/10], Step [8/255], Image [257/8158], Loss: 0.0000
Epoch [1/10], Step [9/255], Image [289/8158], Loss: 0.0000
Epoch [1/10], Step [10/255], Image [321/8158], Loss: 0.0000
Epoch [1/10], Step [11/255], Image [353/8158], Loss: 0.0000
Epoch [1/10], Step [12/255], Image [385/8158], Loss: 0.0000
Epoch [1/10], Step [13/255], Image [417/8158], Loss: 0.0000
Epoch [1/10], Step [14/255], Image [449/8158], Loss: 0.0000
Epoch [1/10], Step [15/255], Image [481/8158], Loss: 0.0000
Epoch [1/10], Step [16/255], Image [513/8158], Loss: 0.

In [None]:
# Save the fine-tuned model
torch.save(fine_tuned_clip_model.state_dict(), "clip_finetuned.pth")
print("Fine-tuning complete. Model saved as 'clip_finetuned.pth'.")