In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import skimage.io as io
import pandas as pd
from Dataset_classes.Flowers.Flowers_Dataset import Flowers102Dataset as Flowers_Dataset
import torchvision

import os
import json
import numpy as np

import matplotlib.pyplot as plt

from clip import clip

In [2]:
################################################# Files import  #################################################

def _load_json(path):
    """Read the JSON file at `path` (decoded as UTF-8) and return the parsed object."""
    # An explicit encoding keeps non-ASCII content from depending on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# NOTE: despite its name, `current_dir` is the PARENT of the working directory
# (dirname of the absolute cwd). Config.json is looked up there.
current_dir = os.path.dirname(os.path.abspath(os.getcwd()))

# Directory where Config.json is located (same path as `current_dir`; the name
# is kept because later cells may refer to it).
MLCOMP_dir = current_dir

# Config.json contains the paths to the input dataset and output dataset directories
config_path = os.path.join(MLCOMP_dir, 'Config.json')
config = _load_json(config_path)

# Statistics file whose "mean" / "std" entries are used for normalization
statistics_path = config['statistics_flowers102']
dataset_stats = _load_json(statistics_path)

# Directory containing the images
img_dir = config['output_dir_flowers102']

# Path to the labels file
labels_file = config['output_labels_flowers102']

# Path to the setid file
setid_file = config['output_setid_flowers102']

# JSON file with the flower names; later cells index it with the class id
# converted to a string.
flowers_names_file = config['flowers_name']
flowers_names = _load_json(flowers_names_file)


In [3]:
################################################# Dataset preparation  #################################################
dataset = Flowers_Dataset(csv_file = labels_file, root_dir= img_dir)

# Keep the original 7000-sample training split, but derive the test size from
# the dataset instead of hard-coding 1189, so the split stays valid if the
# number of samples changes. random_split still raises if the dataset has
# fewer than TRAIN_SIZE samples.
TRAIN_SIZE = 7000
# A fixed seed makes the split identical after "Restart Kernel -> Run All".
SPLIT_SEED = 42

train_set, test_set = torch.utils.data.random_split(
    dataset,
    [TRAIN_SIZE, len(dataset) - TRAIN_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=32, shuffle=True)

# Smoke-test item access with a fixed index
IDX = 1
image, label = dataset[IDX]
# Uncomment to visualise the sample:
#dataset.__viewitem__(index = IDX, mean = dataset_stats["mean"], std = dataset_stats["std"])

In [4]:
# Display the label of the sample fetched with IDX in the previous cell.
label

tensor(77)

In [5]:

################################################# DataLoader preparation  #################################################

def get_data(batch_size = 64, test_batch_size = 256, transform = None, num_workers = 0):
    """Build train / validation / test DataLoaders over the Flowers dataset.

    The dataset is created from the module-level `labels_file` and `img_dir`
    and split randomly 60% / 20% / remainder (train / val / test).

    Args:
        batch_size: Batch size of the train and validation loaders.
        test_batch_size: Batch size of the test loader.
        transform: Transform handed to the dataset; when None, a pipeline
            containing only `ToTensor()` is used.
        num_workers: Worker processes per DataLoader. Defaults to 0 (load in
            the main process). The original body referenced this name without
            defining it, which raised NameError on every call.

    Returns:
        Tuple `(train_loader, val_loader, test_loader)`.
    """
    if transform is None:
        transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

    # Dataset loader
    full_training_data = Flowers_Dataset(csv_file = labels_file, root_dir= img_dir, transform = transform)

    # Dataset split: the test set takes whatever is left after the two int()
    # truncations, so the three sizes always sum to the dataset length.
    num_samples = len(full_training_data)
    train_size = int(0.6*num_samples)
    val_size = int(0.2*num_samples)
    test_size = num_samples - train_size - val_size
    train_set, val_set, test_set = torch.utils.data.random_split(full_training_data, [train_size, val_size, test_size])

    train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(dataset=test_set, batch_size=test_batch_size, shuffle=True, num_workers=num_workers)

    return train_loader, val_loader, test_loader


torch.float32
torch.int64


In [None]:
# Choose the compute device: GPU when CUDA is available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the ViT-B/32 CLIP model (non-JIT) together with its image transform.
model, preprocess = clip.load("ViT-B/32", jit=False)

# Place the model on the chosen device and switch it to evaluation mode.
model = model.to(device)
model = model.eval()

In [None]:
def test_step_zero_shot_clip(net, data_loader, texts_z, device):
    samples = 0.0
    cumulative_accuracy = 0.0

    net.eval()

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(data_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            images_z = model.encode_image(inputs).float()
            outputs = (100 * images_z @ texts_z.T).softmax(dim=-1)

            samples+= inputs.size(0)
            _, predicted = outputs.max(1)

            cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_accuracy / samples * 100

In [6]:
# Display the model names reported by clip.available_models().
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [8]:
# Summarise the loaded CLIP model: total parameter count and key dimensions.
total_params = sum(int(np.prod(weights.shape)) for weights in model.parameters())

for caption, value in (
    ("Model parameters:", f"{total_params:,}"),
    ("Input resolution:", model.visual.input_resolution),
    ("Context length:", model.context_length),
    ("Vocab size:", model.vocab_size),
):
    print(caption, value)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [9]:
# Display the image transform pipeline returned by clip.load alongside the model.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x293c5cc20>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [10]:
# Tokenize a sample sentence to inspect the token-id tensor clip.tokenize produces.
clip.tokenize("tokenize me!")

tensor([[49406, 32634, 10885,   614,   256, 49407,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)

In [11]:
from collections import OrderedDict
from pathlib import Path

import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [12]:
# Load every sample into memory and record which class ids occur.
images = []
seen_class_ids = set()

for i in range(len(dataset)):
    image, label = dataset[i]
    # as_tensor accepts arrays and tensors alike and does not re-wrap an
    # existing tensor (torch.tensor(tensor) warns and copies).
    image = torch.as_tensor(image)
    images.append(image)
    seen_class_ids.add(int(label))

# Class names in ascending class-id order. The original collected the names in
# a `set`, whose iteration order is arbitrary, so prompts / text embeddings
# built from it could not be mapped back to a class. With a list, entry r is
# the name of the r-th smallest class id present in the dataset.
# Keys of the names JSON are the class ids as strings.
labels = [flowers_names[str(class_id)] for class_id in sorted(seen_class_ids)]

In [13]:
# Stack the per-sample tensors into one tensor with a new leading (sample) dimension.
images_tensor = torch.stack(images)

In [14]:
# Inspect the shape of the stacked tensor.
images_tensor.shape

torch.Size([8189, 500, 500, 3])

In [15]:
# Move the size-3 channel axis from last position (N, H, W, C) to position 1
# (N, C, H, W). The guard makes the cell idempotent: the name is reassigned in
# place, so without it a second run would permute the already-permuted tensor
# and scramble the axes.
if images_tensor.shape[-1] == 3 and images_tensor.shape[1] != 3:
    images_tensor = images_tensor.permute(0, 3, 1, 2)
images_tensor.shape


torch.Size([8189, 3, 500, 500])

In [16]:
def encode_data(images: torch.Tensor, texts: list[str], device: str, model, batch_size: int = 64):
    """Embed images and class descriptions with the given CLIP-style model.

    Args:
        images: Batched image tensor (first dimension indexes samples), or a
            sequence of equally-shaped image tensors, which is stacked first.
            NOTE(review): the images are passed to `model.encode_image`
            unchanged, so they presumably must already be in the layout, size
            and normalisation the model expects — confirm against the caller.
        texts: Class descriptions; each is prefixed with "A photo of " before
            tokenisation. Any iterable of strings is accepted.
        device: Device the tokens and each image batch are moved to. The
            original never moved the images, which fails when the model is on
            a different device from the data.
        model: Object exposing `encode_image` and `encode_text`.
        batch_size: Number of images encoded per forward pass, so memory use
            does not grow with the dataset size.

    Returns:
        Tuple `(images_z, texts_z)` of float tensors: one row per image and one
        row per text, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if not torch.is_tensor(images):
        images = torch.stack(list(images))

    text_tokens = clip.tokenize(["A photo of " + desc for desc in texts]).to(device)

    with torch.no_grad():
        texts_z = model.encode_text(text_tokens).float()

        image_chunks = []
        # max(..., 1) keeps one (empty) pass for empty input, so the result
        # has the same form as a single direct encode_image call.
        for start in range(0, max(len(images), 1), batch_size):
            batch = images[start:start + batch_size].to(device)
            image_chunks.append(model.encode_image(batch).float())
        images_z = torch.cat(image_chunks)

    return images_z, texts_z

In [17]:
def cosine_similarity(images_z: torch.Tensor, texts_z: torch.Tensor):
    """Return the cosine similarity between every text and every image embedding.

    Args:
        images_z: Tensor of image embeddings, one per row.
        texts_z: Tensor of text embeddings, one per row.

    Returns:
        CPU tensor of shape `(num_texts, num_images)`; entry `[t, i]` is the
        cosine similarity between text `t` and image `i`.

    The inputs are left untouched. The original normalised them in place with
    `/=`, which silently modified the caller's tensors and fails on integer
    tensors or leaf tensors that require grad.
    """
    # F.normalize divides by max(norm, eps), so an all-zero row gives 0
    # instead of NaN.
    images_unit = torch.nn.functional.normalize(images_z, dim=-1)
    texts_unit = torch.nn.functional.normalize(texts_z, dim=-1)

    similarity = texts_unit @ images_unit.T

    return similarity.cpu()

# `images_tensor` still holds the dataset pixels at 500x500 (see the shape
# printed above), while the model's input resolution is 224 and the CLIP
# `preprocess` pipeline also normalises with fixed statistics. Passing the raw
# stack to encode_image therefore cannot work; the tensor equivalent of
# `preprocess` is applied here, chunk by chunk so only one chunk of
# preprocessed pixels is alive at a time.
# NOTE(review): ConvertImageDtype rescales integer pixels to [0, 1] but leaves
# floating-point input as is — confirm the dataset does not yield floats in
# [0, 255].
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
ENCODE_CHUNK_SIZE = 256

input_resolution = model.visual.input_resolution
clip_tensor_preprocess = transforms.Compose([
    transforms.Resize(input_resolution, interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
    transforms.CenterCrop(input_resolution),
    transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(CLIP_MEAN, CLIP_STD),
])

# Freeze one iteration order of the class names so every chunk sees the same
# prompts in the same order.
class_prompts = list(labels)

image_feature_chunks = []
for raw_chunk in images_tensor.split(ENCODE_CHUNK_SIZE):
    clip_chunk = clip_tensor_preprocess(raw_chunk).to(device)
    # The text features are recomputed on every call but are identical each time.
    chunk_z, texts_z = encode_data(clip_chunk, class_prompts, device, model)
    image_feature_chunks.append(chunk_z)
images_z = torch.cat(image_feature_chunks)

similarity = cosine_similarity(images_z, texts_z)
print(similarity)


: 