In [2]:
%%capture
# Install required packages
!pip install -U transformers faiss-gpu torch Pillay tqdm ipywidgets
!pip install -U git+https://github.com/360CVGroup/FG-CLIP.git


## Initializations

In [None]:
import hashlib
import json
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
from PIL import Image
from qdrant_client import QdrantClient, models
from tqdm.auto import tqdm
from transformers import AutoProcessor, AutoModel

# --- CONFIGURATION ---
IMAGE_ROOT = Path("debbiedebrauwer/Spanje")  # Update as needed
MODEL_NAME = "qihoo360/fg-clip-large"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 32  # images per forward pass when embedding
# The Qdrant endpoint can be overridden per environment; the default is the project cluster.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://cfd9b2d8-fc05-42a9-a872-8c3d26d0c400.eu-central-1-0.aws.cloud.qdrant.io:6333",
)
# Secrets must never be stored in the notebook. Export QDRANT_API_KEY before
# starting the kernel; a missing variable raises KeyError naming the variable.
# NOTE(review): a key was previously committed in this cell and should be rotated.
QDRANT_API_KEY = os.environ["QDRANT_API_KEY"]
COLLECTION_NAME = "film_locations"
JSON_PATH = "export_studioscott.json"

# --- LOAD FG-CLIP MODEL ---
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# eval() switches the model to inference mode (e.g. disables dropout).
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE).eval()

## Vector Embedder (turns images into vector embeddings)

In [6]:
# --- PARSE JSON: BUILD LOCATION-LEVEL METADATA MAP ---
def build_location_metadata(json_path):
    """Build a lookup of location metadata keyed by lower-cased location title.

    Args:
        json_path: Path to a JSON file whose top level is a list of location
            objects. Each object must have "id" and "title"; "url" and
            "children" are optional.

    Returns:
        dict mapping ``title.lower()`` to a dict with the keys
        "location_id", "location_title", "location_url" (``""`` when absent)
        and "location_rooms" (the "place" of every child, ``"unknown"`` when a
        child has no "place" key). If two locations share a lower-cased title,
        the later one replaces the earlier one.

    Raises:
        KeyError: If a location object lacks "id" or "title".
    """
    # JSON is UTF-8 by specification; being explicit keeps non-ASCII titles
    # from depending on the platform's default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Map location title (lower-cased) to id, title, url, and list of rooms
    location_meta = {}
    for loc in data:
        loc_id = loc["id"]
        loc_title = loc["title"]
        loc_url = loc.get("url", "")
        # `or []` also covers an explicit `"children": null` in the export.
        children = loc.get("children") or []
        rooms = [child.get("place", "unknown") for child in children]
        location_meta[loc_title.lower()] = {
            "location_id": loc_id,
            "location_title": loc_title,
            "location_url": loc_url,
            "location_rooms": rooms
        }
    return location_meta

# Lookup of location metadata, keyed by lower-cased location title.
location_meta = build_location_metadata(JSON_PATH)

# --- LOAD IMAGE PATHS AND ASSIGN LOCATION METADATA ---
# The two lists are filled in lockstep: payloads[i] describes image_paths[i].
image_paths = []
payloads = []

for loc_dir in IMAGE_ROOT.iterdir():
    if loc_dir.is_dir():
        # Folder names are matched case-insensitively against JSON titles.
        loc_name = loc_dir.name.lower()
        meta = location_meta.get(loc_name)
        if meta:
            for img_path in loc_dir.glob("*.*"):
                # Only PNG/JPEG files are indexed; every other file is ignored.
                if img_path.suffix.lower() in (".png", ".jpg", ".jpeg"):
                    payload = {"image_path": str(img_path)}
                    payload.update(
                        (key, meta[key])
                        for key in (
                            "location_id",
                            "location_title",
                            "location_url",
                            "location_rooms",
                        )
                    )
                    payloads.append(payload)
                    image_paths.append(img_path)
        else:
            print(f"Warning: Location folder '{loc_dir.name}' not found in JSON metadata")

# --- GENERATE EMBEDDINGS ---
def generate_embeddings(image_paths, batch_size=BATCH_SIZE):
    """Embed images with the loaded model's image encoder.

    Args:
        image_paths: Sequence of image file paths (must support ``len()`` and
            slicing).
        batch_size: Number of images sent through the model per forward pass.

    Returns:
        np.ndarray with one L2-normalised embedding row per input path, in
        input order.

    Raises:
        ValueError: If ``image_paths`` is empty or ``batch_size`` is not
            positive.
    """
    if len(image_paths) == 0:
        # Without this guard np.concatenate([]) fails with an unhelpful message.
        raise ValueError(
            "generate_embeddings() received no image paths; "
            "check IMAGE_ROOT and the JSON metadata"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size)):
        batch_paths = image_paths[i:i+batch_size]
        images = []
        for p in batch_paths:
            # Image.open() is lazy and keeps the file open. convert() decodes
            # the pixels into a new image, so the file can be closed right away.
            with Image.open(p) as img:
                images.append(img.convert("RGB"))
        with torch.no_grad():
            inputs = processor(images=images, return_tensors="pt", padding=True)
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            features = model.get_image_features(**inputs)
            # Unit-length vectors, matching the cosine distance used for search.
            features = torch.nn.functional.normalize(features, dim=-1)
            embeddings.append(features.cpu().numpy())
    return np.concatenate(embeddings)

embeddings = generate_embeddings(image_paths)

  0%|          | 0/30 [00:00<?, ?it/s]

## Store embeddings in the database (run only once, when first storing the vector embeddings)

In [7]:
# --- STORE EMBEDDINGS IN QDRANT ---
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Create the collection only if it does not exist yet; the vector size is
# taken from the embedding matrix so it always matches the model output.
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=embeddings.shape[1],
            distance=models.Distance.COSINE,
            on_disk=True
        )
    )

# zip() would silently truncate on a length mismatch (e.g. stale kernel state)
# and pair vectors with the wrong metadata, so fail loudly instead.
if len(embeddings) != len(payloads):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(payloads)} payloads; "
        "re-run the embedding cell"
    )

# The point ID is the MD5 hex digest of the image path: deterministic, so
# re-running this cell upserts the same IDs again.
points = [
    models.PointStruct(
        id=hashlib.md5(str(payload["image_path"]).encode()).hexdigest(),
        vector=emb.tolist(),
        payload=payload
    )
    for emb, payload in zip(embeddings, payloads)
]

# Dedicated name: reassigning the global BATCH_SIZE here would change the
# embedding batch size if the embedding cell were re-run afterwards.
UPSERT_BATCH_SIZE = 50
total_points = len(points)
uploaded = 0
print(f"Starting upsert of {total_points} points...")
for i in tqdm(range(0, total_points, UPSERT_BATCH_SIZE)):
    batch = points[i:i+UPSERT_BATCH_SIZE]
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=batch,
        wait=True
    )
    uploaded += len(batch)
    print(f"Progress: {uploaded}/{total_points} ({uploaded/total_points:.1%})")

print(f"Completed storing {total_points} embeddings in {COLLECTION_NAME}")
# client.count() returns a result object; print its integer `.count` field
# instead of the object's repr ("count=954").
print(f"Collection now contains {client.count(COLLECTION_NAME).count} points")

Starting upsert of 954 points...


  0%|          | 0/20 [00:00<?, ?it/s]

Progress: 50/954 (5.2%)
Progress: 100/954 (10.5%)
Progress: 150/954 (15.7%)
Progress: 200/954 (21.0%)
Progress: 250/954 (26.2%)
Progress: 300/954 (31.4%)
Progress: 350/954 (36.7%)
Progress: 400/954 (41.9%)
Progress: 450/954 (47.2%)
Progress: 500/954 (52.4%)
Progress: 550/954 (57.7%)
Progress: 600/954 (62.9%)
Progress: 650/954 (68.1%)
Progress: 700/954 (73.4%)
Progress: 750/954 (78.6%)
Progress: 800/954 (83.9%)
Progress: 850/954 (89.1%)
Progress: 900/954 (94.3%)
Progress: 950/954 (99.6%)
Progress: 954/954 (100.0%)
Completed storing 954 embeddings in film_locations
Collection now contains count=954 points
