In [1]:
from pathlib import Path
from collections import defaultdict
import sys

# Repository root, expressed relative to the kernel's working directory
# (assumes the notebook is run from a folder one level below the root).
project_root_path = Path("..")

# Make the root importable so the `src.*` imports below resolve. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# entries onto sys.path.
_project_root_str = str(project_root_path.resolve())
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)

from src.data import get_all_data
from src.embeddings_database import AutoFaissIndex

### Get all the data from the data folder

In [2]:
# Metadata fields to take from each source_info.json file.
metadata_to_keep = ["name", "keywords"]

# The "data" directory directly under the project root.
data_folder = project_root_path.joinpath("data")

In [3]:
# Gather the data found under data_folder, restricted to the selected metadata.
# Later cells read the 'img_path', 'source_name' and 'source_keywords' entries
# of the result and index them by position.
combined_data_dict = get_all_data(
    main_data_folder=data_folder,
    metadata_to_keep=metadata_to_keep,
)

In [4]:
# --- Sampling configuration ---
# Despite its name this is a fraction in [0, 1] (0.1 == 10% of the data).
sample_percentage = 0.1
# When True, the following cell draws a random subset of the data.
upload_sample = True

# Number of available records, measured by the image-path list.
full_data_size = len(combined_data_dict["img_path"])
# int() truncates toward zero, so the sample never exceeds the requested share.
sample_size = int(sample_percentage * full_data_size)

In [5]:
import random

# Fixed seed so that "Restart & Run All" selects the same subset every time.
# A private Random instance is used so the global RNG state is left untouched.
sample_seed = 42

if upload_sample:
    # random.sample accepts a range directly; no intermediate list is needed.
    sample_indices = random.Random(sample_seed).sample(range(full_data_size), sample_size)
else:
    # Take every record. `sample_dict` is still built below, so the populate
    # cell works for either setting of `upload_sample` instead of raising
    # NameError when sampling is switched off.
    sample_indices = list(range(full_data_size))

# Key in sample_dict -> key in combined_data_dict (the "source_" prefix is dropped).
key_mapping = {
    "img_path": "img_path",
    "name": "source_name",
    "keywords": "source_keywords",
}

# Column-wise copy of the selected rows; the lists stay aligned by position.
sample_dict = defaultdict(list)
for target_key, source_key in key_mapping.items():
    source_values = combined_data_dict[source_key]
    sample_dict[target_key] = [source_values[index] for index in sample_indices]

### Initialize the Faiss Index

In [6]:
# Model locations handed to AutoFaissIndex.
# NOTE(review): unlike data_save_path, these are not anchored to
# project_root_path — confirm how AutoFaissIndex resolves them.
embeddings_model = "models/embeddings/deepface/"
face_detect_model = "models/face_detect/mediapipe/"

# Passed as `index_path` when the index is created; kept as a plain string.
data_save_path = str(project_root_path.joinpath("embeddings_store"))

In [7]:
# Create the index object from the save path and the two model locations
# configured in the previous cell.
faiss_index = AutoFaissIndex(
    index_path=data_save_path,
    face_detect_model=face_detect_model,
    embeddings_model=embeddings_model,
)

### Populate the index

In [8]:
# Add the images described by sample_dict to the index. This is the slow step:
# the recorded progress bar below shows 721 images at roughly 4 images/s.
faiss_index.populate_images(sample_dict)

Populating FAISS index: 100%|██████████████████████████████████████████████████████████████████| 721/721 [02:51<00:00,  4.21image/s]
