In [1]:
from pathlib import Path
from collections import defaultdict
import sys

# Absolute path of the parent of the current working directory; the notebook
# treats this as the project root.
project_root_path = Path("..").resolve()

# Put the project root on the import path so the `src.*` imports below resolve.
# Guarded so that re-running this cell does not keep appending duplicates.
if str(project_root_path) not in sys.path:
    sys.path.append(str(project_root_path))

from src.data import get_all_data
from src.embeddings_database import AutoFaissIndex



In [2]:
# Data directory, located directly under the project root.
local_data_folder = project_root_path.joinpath("data")

# Metadata fields to take from the source_info.json file.
metadata_to_keep = ["name", "keywords"]

In [3]:
# Gather the dataset through the project helper, pointing it at the local data
# folder and telling it which metadata fields to keep.
combined_data_dict = get_all_data(
    main_data_folder=local_data_folder,
    metadata_to_keep=metadata_to_keep,
)

In [4]:
# Show the top-level keys of the loaded data (see the recorded output below).
combined_data_dict.keys()

dict_keys(['img_path', 'source_name', 'source_keywords'])

In [5]:
# When True, a random subset of the data is drawn further down instead of
# using every entry.
upload_sample = True

# Total number of entries, measured by the length of the 'img_path' list.
full_data_size = len(combined_data_dict['img_path'])

# Fraction of the full dataset to sample (0.01 == 1%).
sample_percentage = 0.01

# int() truncates, so small datasets would otherwise yield a sample of 0.
# Keep at least one item when data exists, and never more than is available
# (an empty dataset still gives 0).
sample_size = min(full_data_size, max(1, int(full_data_size * sample_percentage)))

In [6]:
# Display how many entries the sample will contain.
sample_size

72

In [7]:
import random

# Fixed seed so that Restart & Run All selects the same sample every time.
SAMPLE_SEED = 42

# Key in combined_data_dict -> key used in the dict handed to the index.
SOURCE_TO_SAMPLE_KEY = {
    'img_path': 'img_path',
    'source_name': 'name',
    'source_keywords': 'keywords',
}

if upload_sample:
    # A dedicated Random instance keeps the draw reproducible without
    # touching the global random state.
    sample_indices = random.Random(SAMPLE_SEED).sample(range(full_data_size), sample_size)
else:
    # No sampling requested: use every entry, so `sample_dict` is always
    # defined for the cells below instead of raising NameError (or silently
    # reusing a stale value from an earlier run).
    # NOTE(review): assumes the full dataset should be uploaded with the same
    # key layout as the sample -- confirm.
    sample_indices = range(full_data_size)

sample_dict = defaultdict(list)
for index in sample_indices:
    for source_key, sample_key in SOURCE_TO_SAMPLE_KEY.items():
        sample_dict[sample_key].append(combined_data_dict[source_key][index])

In [8]:
# Index location, anchored at the project root and converted to a plain string.
FAISS_INDEX_PATH = str(project_root_path.joinpath("embeddings_store"))

# Model locations for face detection and for embeddings.
# NOTE(review): unlike FAISS_INDEX_PATH these are relative paths, not anchored
# at the project root -- confirm how the consumer resolves them.
FACE_DETECT_MODEL = "models/face_detect/mediapipe/"
EMBEDDINGS_MODEL = "models/embeddings/deepface/"

In [9]:
from src.google_drive import get_drive_service, get_or_create_app_folder

In [10]:
print("Initializing Google Drive...")
drive_service = get_drive_service()
if not drive_service:
    # Fail fast: everything below is handed the Drive service, so continuing
    # with a falsy one would only fail later with a less helpful error.
    raise RuntimeError("Failed to connect to Google Drive.")

# Folder id used by the index for its Drive uploads.
drive_folder_id = get_or_create_app_folder(drive_service)

# Initialize AutoFaissIndex with the Drive service and folder.
print("Loading FAISS Index...")
faiss_index = AutoFaissIndex(
    index_path=FAISS_INDEX_PATH,
    face_detect_model=FACE_DETECT_MODEL,
    embeddings_model=EMBEDDINGS_MODEL,
    drive_service=drive_service,       # Critical for upload
    drive_folder_id=drive_folder_id    # Critical for upload
)

Initializing Google Drive...
Loading FAISS Index...


In [12]:
# Hand the prepared entries to the index with a batch size of 10.
# NOTE: the recorded run below was interrupted (KeyboardInterrupt) after 3 of
# 72 images at roughly 5 s/image, so the saved output is incomplete.
faiss_index.populate_images(sample_dict, batch_size=10)

Populating FAISS index:   4%|██▎                                                     | 3/72 [00:15<06:02,  5.26s/image]

KeyboardInterrupt

