In [1]:
from pathlib import Path
dataset_version = "lite"
photos_path = Path("../ML_data/unsplash-dataset") / dataset_version/ "photos"
photos_files = list(photos_path.glob("*.jpg"))
print(f"Photos found: {len(photos_files)}")

Photos found: 24996


In [3]:
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Using device:{device}')
model, prepocess = clip.load("ViT-B/32", device=device)

Using device:cuda
100%|████████████████████████████████████████| 354M/354M [16:08<00:00, 366kiB/s]


In [18]:
def compute_clip_features(photos_batch):
    photos = [Image.open(photo_file) for photo_file in photos_batch]
    photos_preprocessed = torch.stack([prepocess(photo) for photo in photos]).to(device)
    with torch.no_grad():
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)
    return photos_features.cpu().numpy()

In [53]:
import math
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

batch_size = 16
features_path = Path("../ML_data/unsplash-dataset") / dataset_version/ "features"
batches = math.ceil(len(photos_files) / batch_size)
print(f"Total batches number:{batches}")
for i in tqdm(range(batches)):
    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path  = features_path / f"{i:010d}.npy"

    if not batch_features_path.exists():
        try:
            # print(i, [photo_file for photo_file in batch_files])
            batch_files = photos_files[i*batch_size: min(len(photos_files), (i+1)*batch_size)]
            batch_features = compute_clip_features(batch_files)
            np.save(batch_features_path, batch_features)
            photo_ids = [photo_file.name.split(".")[0] for photo_file in batch_files]
            photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
            photo_ids_data.to_csv(batch_ids_path, index=False)
        except:
            pass
      

  0%|          | 0/1563 [00:00<?, ?it/s]Total batches number:1563
100%|██████████| 1563/1563 [14:11<00:00,  1.84it/s]


In [54]:
features_list = [np.load(feature_file) for feature_file in sorted(features_path.glob('*.npy'))]
features = np.concatenate(features_list)
np.save(features_path/"features.npy", features)
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in sorted(features_path.glob('*.csv'))])
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)

In [55]:
len(features), len(photo_ids)

(24756, 24756)