# Core

> Core functionality for indexing and searching.

In [None]:
# | default_exp index.core

In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.index.utils import *
from dreamai_ray.index.df import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class write_index_cb(Callback):
    """A `Callback` that persists the current index to disk after each batch.

    The file is written inside `cls.index_folder` (created if missing) and is
    named `<cls.block_counter>.faiss`.
    """

    def after_batch(self, cls, **kwargs):
        # Re-sync the `index` attribute with the one stored in the udf kwargs.
        cls.index = cls.udf_kwargs["index"]
        out_dir = cls.index_folder
        os.makedirs(out_dir, exist_ok=True)
        index_path = str(Path(out_dir).joinpath(f"{cls.block_counter}.faiss"))
        # Log only when both the callback and `cls` are verbose.
        if self.verbose and cls.verbose:
            for line in (
                f"Writing Index to {index_path}",
                f"Index Size: {cls.index.ntotal}",
            ):
                msg.info(line)
        faiss.write_index(cls.index, index_path)


class reset_index_cb(Callback):
    """A `Callback` that empties the index after each batch.

    After the reset, the index is stored back into `cls.udf_kwargs` and
    `cls.udf` is re-bound with those kwargs.
    """

    def after_batch(self, cls, **kwargs):
        current_index = cls.index
        current_index.reset()
        # Log only when both the callback and `cls` are verbose.
        if self.verbose and cls.verbose:
            msg.info(f"Index Size Post Reset: {cls.index.ntotal}")
        cls.udf_kwargs["index"] = current_index
        # Re-bind the udf with the current udf kwargs (including the index).
        cls.udf = partial(cls.udf, **cls.udf_kwargs)


class IndexCreator(Mapper):
    """Creates indexes from embeddings.

    A new index of dimension `index_dim` is created via `create_index` and
    injected, together with `ems_col` and `udf_verbose`, into the kwargs that
    are handed to the udf. All constructor arguments are then forwarded to
    `Mapper.__init__` through `locals_to_params(locals())`.
    """

    def __init__(
        self,
        index_dim=3,  # The dimension of the index.
        index_folder="indexes",  # The folder to write the index to.
        ems_col="embedding",  # The column to use to create the index.
        udf=df_to_index,  # The function to use to create the index.
        cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
        verbose=True,  # Whether to print out information.
        udf_verbose=False,  # Whether to print out information in the udf.
        udf_kwargs={},  # Additional kwargs to pass to the udf. Never mutated; copied below.
        **kwargs,
    ):
        # Work on a private copy: the default `{}` is a single object shared by
        # every call, and a caller-supplied dict should not be modified either.
        # Without the copy, instances built with the default would share (and
        # overwrite) each other's "index" entry.
        # NOTE: the name is rebound rather than a new local introduced, because
        # `locals()` is forwarded to the parent constructor below.
        udf_kwargs = dict(udf_kwargs or {})
        self.index_folder = index_folder
        self.index = create_index(index_dim)
        udf_kwargs["index"] = self.index
        udf_kwargs["ems_col"] = ems_col
        udf_kwargs["verbose"] = udf_verbose
        self.verbose = verbose
        super().__init__(**locals_to_params(locals()))


def create_indexes(
    ems_folder="embeddings",  # The folder containing the embeddings.
    ems_col="embedding",  # The column to use to create the index.
    block_size=25,  # The number of embeddings per index.
    index_dim=768,  # The dimension of the index.
    index_folder="indexes",  # The folder to write the index to.
    udf=df_to_index,  # The function to use to create the index.
    cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
    verbose=True,  # Whether to print out information.
    udf_verbose=False,  # Whether to print out information in the udf.
    udf_kwargs={},  # Additional kwargs to pass to the udf. Never mutated; copied below.
    task_id=None,  # The task id to use. If `None`, a random one is generated per call.
    **kwargs,
):
    """Function to create indexes from embeddings.

    The embedding json files found in `ems_folder` are sorted by the integer
    after the last `_` in their file name, collected into a DataFrame with a
    single `ems_col` column of file paths, and fed to an `IndexCreator` in
    blocks of `block_size` rows. Afterwards `index_folder` is uploaded with
    `bucket_up` and the temporary task folder `/tmp/<task_id>` is removed.

    Returns the DataFrame of embedding file paths.
    """

    # A default of `gen_random_string(16)` in the signature would be evaluated
    # once at definition time, making every call share one task folder.
    if task_id is None:
        task_id = gen_random_string(16)
    # Copy so that neither the shared default nor a caller's dict is mutated
    # downstream. The name is rebound (no new local) because `locals()` is
    # forwarded to `IndexCreator` below.
    udf_kwargs = dict(udf_kwargs or {})
    task_folder = f"/tmp/{task_id}"
    try:
        ems_folder, _ = handle_input_path(ems_folder, local_path=task_folder)
        index_folder, index_bucket = get_local_path(
            index_folder, local_path=task_folder
        )

        m = IndexCreator(
            **locals_to_params(
                locals(),
                omit=["ems_folder", "ems_bucket", "index_bucket", "block_size"],
            )
        )
        em_files = sorted(
            get_files(ems_folder, extensions=[".json"]),
            key=lambda x: int(x.stem.split("_")[-1]),
        )
        df = pd.DataFrame({ems_col: em_files})
        if verbose:
            msg.info(f"Embeddings DF created of length: {len(df)}")
        # Each block of `block_size` rows is passed to the mapper on its own.
        for i in range(0, len(df), block_size):
            m(df.iloc[i : i + block_size])
        bucket_up(index_folder, index_bucket)
    finally:
        # Always clean up the temporary folder, even if a step above failed.
        shutil.rmtree(task_folder, ignore_errors=True)
    return df


def search_indexes(
    ems,  # Path to a json file holding the query embedding under the "embedding" key.
    index_folder,  # The remote folder containing the indexes.
    local_index_folder=None,  # The local folder containing the indexes. Not required if `index_folder` is local.
    k=2,  # The number of nearest neighbors to return.
    verbose=True,  # Whether to print out information.
    task_id=None,  # The task id to use. If `None`, a random one is generated per call.
):
    """Function to search an embedding against indexes.

    `ems` is fetched into the temporary task folder `/tmp/<task_id>` with
    `bucket_dl` and its "embedding" entry is loaded. Every index file found in
    the resolved `index_folder` is searched with `df_index_search`, and the
    per-index results are combined with `index_heap`. The task folder is
    removed before returning.

    Returns `(res, qdf)`: the combined result from `index_heap` and the
    per-index results.

    Raises `Exception` if the index folder does not exist or holds no files.
    """

    # A default of `gen_random_string(16)` in the signature would be evaluated
    # once at definition time, making every call share one task folder.
    if task_id is None:
        task_id = gen_random_string(16)
    task_folder = f"/tmp/{task_id}"
    try:
        index_folder, _ = handle_input_path(
            index_folder, local_path=local_index_folder, task_id=task_id
        )
        bucket_dl(ems, task_folder)
        ems_file = get_files(task_folder, extensions=[".json"])[0]
        with open(ems_file) as f:
            ems = json.load(f)["embedding"]
        # Check existence before listing so a missing folder yields the
        # intended error message instead of whatever `get_files` would do.
        if os.path.exists(index_folder):
            # Index files are named `<number>.faiss`; order them numerically.
            indexes = sorted(
                get_files(index_folder), key=lambda x: int(x.stem.split(".")[0])
            )
        else:
            indexes = []
        if not indexes:
            raise Exception(
                f"No indexes found in '{index_folder}' folder. Please create indexes first."
            )
        # One row per index file, each paired with the same query embedding.
        qdf = pd.DataFrame(
            {
                "index": indexes,
                "embedding": [ems] * len(indexes),
            }
        )

        qdf = qdf.apply(lambda x: df_index_search(x, k=k, verbose=verbose), axis=1)
        res = index_heap(qdf, k=k, verbose=verbose)
    finally:
        # Always clean up the temporary folder, including on the error path.
        shutil.rmtree(task_folder, ignore_errors=True)
    return res, qdf


## Usage Example

In [None]:
# | hide

# Seed NumPy's global RNG with a fixed value.
np.random.seed(42)
# Machine-specific local data directory.
data_path = Path("/media/hamza/data2/faiss_data")


In [None]:
# # | eval: false

# data_path = Path("")
# ems_folder = data_path / "ems"
# index_folder = data_path / "indexes"
# num_ems = 50
# block_size = 10
# ems_dim = 768
# random_ems(num_ems=num_ems, ems_dim=ems_dim, ems_folder=ems_folder)

In [None]:
# | eval: false

# Remote bucket that holds both the embeddings and the indexes.
bucket = "gs://gcsfuse-talentnet-dev"

# Remote folder the embeddings are read from.
ems_folder = f"{bucket}/ems_1"
# Remote folder the indexes are written to / read from.
index_folder = f"{bucket}/indexes_1"
# Number of embeddings per index (passed as `block_size` to `create_indexes`).
block_size = 4
# Embedding dimension (passed as `index_dim` to `create_indexes`).
ems_dim = 768


In [None]:
# | eval: false

# Build one index per block of `block_size` embeddings; the returned DataFrame
# lists the embedding file paths.
ems_df = create_indexes(
    ems_folder=ems_folder,
    index_folder=index_folder,
    block_size=block_size,
    index_dim=ems_dim,
    verbose=True,
)

# Show the first five rows.
ems_df[:5]



[38;5;4mℹ Downloading gs://gcsfuse-talentnet-dev/ems_1 to
/tmp/e07bc2edeb3f4f23/ems_1.[0m



Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_1.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_11.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_12.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_13.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_14.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_15.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_2.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_16.json...
Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_3.json...
Copying gs://gcsfuse-talentnet-de


[38;5;4mℹ BLOCK COUNTER: 0[0m

[38;5;4mℹ Embeddings DF created of length: 16[0m

[38;5;4mℹ DF BATCH SIZE: 4[0m


[38;5;4mℹ BLOCK COUNTER: 1[0m

[38;5;4mℹ Writing Index to /tmp/e07bc2edeb3f4f23/indexes_1/1.faiss[0m
[38;5;4mℹ Index Size: 4[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 4[0m


[38;5;4mℹ BLOCK COUNTER: 2[0m

[38;5;4mℹ Writing Index to /tmp/e07bc2edeb3f4f23/indexes_1/2.faiss[0m
[38;5;4mℹ Index Size: 4[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 4[0m


[38;5;4mℹ BLOCK COUNTER: 3[0m

[38;5;4mℹ Writing Index to /tmp/e07bc2edeb3f4f23/indexes_1/3.faiss[0m
[38;5;4mℹ Index Size: 4[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 4[0m


[38;5;4mℹ BLOCK COUNTER: 4[0m

[38;5;4mℹ Writing Index to /tmp/e07bc2edeb3f4f23/indexes_1/4.faiss[0m
[38;5;4mℹ Index Size: 4[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ Uploading /tmp/e07bc2edeb3f4f23/indexes_1 to
gs://gcsfuse-talentnet

Skipping existing item: gs://gcsfuse-talentnet-dev/indexes_1/1.faiss
Skipping existing item: gs://gcsfuse-talentnet-dev/indexes_1/4.faiss
Skipping existing item: gs://gcsfuse-talentnet-dev/indexes_1/3.faiss
Skipping existing item: gs://gcsfuse-talentnet-dev/indexes_1/2.faiss


Unnamed: 0,embedding
0,/tmp/e07bc2edeb3f4f23/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_1.json
1,/tmp/e07bc2edeb3f4f23/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_2.json
2,/tmp/e07bc2edeb3f4f23/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_3.json
3,/tmp/e07bc2edeb3f4f23/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_4.json
4,/tmp/e07bc2edeb3f4f23/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_5.json


In [None]:
# | eval: false

# Use one of the embedding files from `ems_folder` as the query.
qems = f"{ems_folder}/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json"
# Search the query against every index, keeping the `k` nearest neighbors.
res, qdf = search_indexes(
    qems,
    index_folder=index_folder,
    local_index_folder="/media/hamza/data2/faiss_data/indexes_1/",
    k=2,
    verbose=False,
)
print(f'\n\nFinal Results:\n\tDistances: {res["distances"]}\n\tIDs: {res["ids"]}')


[38;5;4mℹ Downloading gs://gcsfuse-talentnet-dev/indexes_1 to
/media/hamza/data2/faiss_data/indexes_1/indexes_1.[0m



Skipping existing item: file:///media/hamza/data2/faiss_data/indexes_1/indexes_1/1.faiss
Skipping existing item: file:///media/hamza/data2/faiss_data/indexes_1/indexes_1/2.faiss
Skipping existing item: file:///media/hamza/data2/faiss_data/indexes_1/indexes_1/3.faiss
Skipping existing item: file:///media/hamza/data2/faiss_data/indexes_1/indexes_1/4.faiss


/media/hamza/data2/faiss_data/indexes_1/indexes_1

[38;5;4mℹ Downloading
gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json
to /tmp/b59f9f97d6754852.[0m



Copying gs://gcsfuse-talentnet-dev/ems_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json...
/ [0/1 files][    0.0 B/ 16.7 KiB]   0% Done                                    



Final Results:
	Distances: [[0.0, 0.9240111708641052]]
	IDs: [[9, 11]]


/ [1/1 files][ 16.7 KiB/ 16.7 KiB] 100% Done                                    
Operation completed over 1 objects/16.7 KiB.                                     


In [None]:
# | hide
# | eval: false

# shutil.rmtree(index_folder, ignore_errors=True)
# shutil.rmtree(ems_folder, ignore_errors=True)

In [None]:
# | hide

import nbdev

# Run nbdev's export step for this project.
nbdev.nbdev_export()
