# Mappers

> Mapper classes for indexing and searching.

In [None]:
# | default_exp index.mappers

In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.index.utils import *
from dreamai_ray.index.df import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class write_index_cb(Callback):
    "A `Callback` that saves the mapper's current index to disk after every batch."

    def after_batch(self, cls, **kwargs):
        "Write `cls.index` to `<cls.index_folder>/<cls.block_counter>.faiss`."
        # Refresh the attribute from the udf kwargs before saving it.
        cls.index = cls.udf_kwargs["index"]
        out_dir = cls.index_folder
        os.makedirs(out_dir, exist_ok=True)
        target = str(Path(out_dir, f"{cls.block_counter}.faiss"))
        # Log only when both the callback and the mapper are verbose.
        chatty = self.verbose and cls.verbose
        if chatty:
            for line in (
                f"Writing Index to {target}",
                f"Index Size: {cls.index.ntotal}",
            ):
                msg.info(line)
        faiss.write_index(cls.index, target)


class reset_index_cb(Callback):
    "A `Callback` that empties the mapper's index after every batch."

    def after_batch(self, cls, **kwargs):
        "Reset `cls.index` and rebind `cls.udf` to the reset index."
        index = cls.index
        index.reset()
        if self.verbose and cls.verbose:
            msg.info(f"Index Size Post Reset: {index.ntotal}")
        # Store the reset index back in the udf kwargs, then re-wrap the udf
        # so its keyword arguments match `cls.udf_kwargs` again.
        cls.udf_kwargs.update(index=index)
        cls.udf = partial(cls.udf, **cls.udf_kwargs)


class IndexCreator(Mapper):
    """
    Creates indexes from embeddings.

    On construction a new index of dimension `index_dim` is created with
    `create_index` and handed to `udf` through `udf_kwargs`, together with
    `ems_col` and `udf_verbose` (passed as `verbose`). All arguments are then
    forwarded to `Mapper.__init__`.
    """

    def __init__(
        self,
        index_dim=3,  # The dimension of the index.
        index_folder="indexes",  # The folder to write the index to.
        ems_col="embedding",  # The column to use to create the index.
        udf=df_to_index,  # The function to use to create the index.
        cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
        verbose=True,  # Whether to print out information.
        udf_verbose=False,  # Whether to print out information in the udf.
        udf_kwargs={},  # Additional kwargs to pass to the udf.
        **kwargs,
    ):
        self.index_folder = index_folder
        self.index = create_index(index_dim)
        # Build a fresh dict instead of writing into the argument: the `{}`
        # default is shared by every call, and a caller-supplied dict should
        # not be modified behind the caller's back. `None` is accepted too.
        # Only the existing name is rebound, so the `locals()` snapshot
        # forwarded below contains exactly the same names as before.
        udf_kwargs = {
            **(udf_kwargs or {}),
            "index": self.index,
            "ems_col": ems_col,
            "verbose": udf_verbose,
        }
        self.verbose = verbose
        super().__init__(**locals_to_params(locals()))


def create_indexes(
    ems_folder="embeddings",  # The folder containing the embeddings.
    ems_col="embedding",  # The column to use to create the index.
    block_size=25,  # The number of embeddings per index.
    index_dim=768,  # The dimension of the index.
    index_folder="indexes",  # The folder to write the index to.
    udf=df_to_index,  # The function to use to create the index.
    cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
    verbose=True,  # Whether to print out information.
    udf_verbose=False,  # Whether to print out information in the udf.
    udf_kwargs={},  # Additional kwargs to pass to the udf.
    **kwargs,
):
    """
    Build indexes from the `.json` embedding files in `ems_folder`.

    The files are ordered by the integer after the last underscore in their
    stem, collected into a one-column `DataFrame` (column `ems_col`), and fed
    to an `IndexCreator` in consecutive blocks of `block_size` rows.
    Returns the `DataFrame` of embedding file paths.
    """

    # Keep this as the first statement: `locals()` is forwarded to
    # `IndexCreator`, so no other local may be bound before it.
    m = IndexCreator(**locals_to_params(locals(), omit=["ems_folder", "block_size"]))

    def _file_number(path):
        # Numeric suffix of the file stem, e.g. "em_12" -> 12.
        return int(path.stem.split("_")[-1])

    em_files = get_files(ems_folder, extensions=[".json"])
    em_files = sorted(em_files, key=_file_number)
    df = pd.DataFrame({ems_col: em_files})
    if verbose:
        msg.info(f"Embeddings DF created of length: {len(df)}")
    total = len(df)
    for start in range(0, total, block_size):
        stop = start + block_size
        m(df.iloc[start:stop])
    return df


def search_indexes(ems, index_folder="indexes", k=2, verbose=True):
    """
    Function to search an embedding against indexes.

    Every `.faiss` file in `index_folder` (ordered by its numeric stem) is
    paired with `ems` in one row of a query `DataFrame`; each row is passed
    to `df_index_search`, and the per-index results are merged by
    `index_heap`.

    ems: The query; stored unchanged in the "embedding" column of every row.
    index_folder: The folder containing the `.faiss` index files.
    k: Forwarded to `df_index_search` and `index_heap`.
    verbose: Forwarded to `df_index_search` and `index_heap`.

    Returns `(res, qdf)`: the output of `index_heap` and the per-index
    results frame.

    Raises `FileNotFoundError` if `index_folder` does not exist or holds no
    `.faiss` files.
    """

    no_indexes = (
        f"No indexes found in '{index_folder}' folder. Please create indexes first."
    )
    # Check the folder before listing it, so a missing folder always produces
    # the helpful message instead of whatever the listing helper would do.
    if not os.path.exists(index_folder):
        raise FileNotFoundError(no_indexes)
    # Only pick up index files: a stray file with a non-numeric stem would
    # otherwise crash the integer sort key below.
    indexes = sorted(
        get_files(index_folder, extensions=[".faiss"]),
        key=lambda x: int(x.stem.split(".")[0]),
    )
    if not indexes:
        raise FileNotFoundError(no_indexes)
    qdf = pd.DataFrame(
        {
            "index": indexes,
            "embedding": [ems] * len(indexes),
        }
    )

    qdf = qdf.apply(lambda x: df_index_search(x, k=k, verbose=verbose), axis=1)
    res = index_heap(qdf, k=k, verbose=verbose)
    return res, qdf

## Usage Example

In [None]:
# | hide

# Seed NumPy's global random generator so random values drawn later are repeatable.
np.random.seed(42)
# Machine-specific data directory; the next cell reassigns `data_path`.
data_path = Path("/media/hamza/data2/faiss_data")


In [None]:
# | eval: false

# Work relative to the current directory for this example.
data_path = Path("")
ems_folder = data_path / "ems"  # where the example embeddings live
index_folder = data_path / "indexes"  # where the indexes will be written
num_ems = 50  # number of example embeddings
block_size = 10  # embeddings per index
ems_dim = 768  # embedding dimension
# Presumably writes `num_ems` random embeddings of size `ems_dim` into
# `ems_folder` (helper defined elsewhere; confirm against its source).
random_ems(num_ems=num_ems, ems_dim=ems_dim, ems_folder=ems_folder)

In [None]:
# | eval: false

# Build one index per block of `block_size` embedding files; the returned
# frame lists the embedding file paths in the order they were indexed.
ems_df = create_indexes(
    ems_folder=ems_folder,
    index_folder=index_folder,
    block_size=block_size,
    index_dim=ems_dim,
    verbose=True,
)

# Show the first five rows.
ems_df[:5]



[38;5;4mℹ BLOCK COUNTER: 0[0m

[38;5;4mℹ Embeddings DF created of length: 50[0m

[38;5;4mℹ DF BATCH SIZE: 10[0m


[38;5;4mℹ BLOCK COUNTER: 1[0m

[38;5;4mℹ Writing Index to indexes/1.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 10[0m


[38;5;4mℹ BLOCK COUNTER: 2[0m

[38;5;4mℹ Writing Index to indexes/2.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 10[0m


[38;5;4mℹ BLOCK COUNTER: 3[0m

[38;5;4mℹ Writing Index to indexes/3.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 10[0m


[38;5;4mℹ BLOCK COUNTER: 4[0m

[38;5;4mℹ Writing Index to indexes/4.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Index Size Post Reset: 0[0m

[38;5;4mℹ DF BATCH SIZE: 10[0m


[38;5;4mℹ BLOCK COUNTER: 5[0m

[38;5;4mℹ Writing Index to indexes/5.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Index Siz

Unnamed: 0,embedding
0,ems/em_0.json
1,ems/em_1.json
2,ems/em_2.json
3,ems/em_3.json
4,ems/em_4.json


In [None]:
# | eval: false

# Use the embedding file at row 16 as the query. With blocks of 10 it sits at
# position 6 of the second index, which should report a distance of 0 for it.
qems = ems_df.iloc[16]["embedding"]
# `res` holds the merged results across all indexes, `qdf` the per-index results.
res, qdf = search_indexes(qems, index_folder=index_folder, k=2, verbose=True)
print(f'\n\nFinal Results:\n\tDistances: {res["distances"]}\n\tIDs: {res["ids"]}')


[38;5;4mℹ Index Col: indexes/1.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[5 4]], Distances: [[121.6236  122.51336]][0m
[38;5;4mℹ Index Col: indexes/2.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[6 0]], Distances: [[  0.      123.76789]][0m
[38;5;4mℹ Index Col: indexes/3.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[6 3]], Distances: [[120.51416 121.15578]][0m
[38;5;4mℹ Index Col: indexes/4.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[7 3]], Distances: [[118.380554 119.54465 ]][0m
[38;5;4mℹ Index Col: indexes/5.faiss[0m
[38;5;4mℹ Index Size: 10[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[5 2]], Distances: [[123.06287  125.081055]][0m
[38;5;4mℹ Adding Result: [[121.6236  122.51336]], [[5 4]][0m
[38;5;2m✔ Added Result: [[121.6236  122.51336]], [[5 4]][0m
[38;5;4mℹ Addin

In [None]:
# | hide
# | eval: false

# Remove the example folders; `ignore_errors=True` keeps this quiet if they are already gone.
shutil.rmtree(index_folder, ignore_errors=True)
shutil.rmtree(ems_folder, ignore_errors=True)

In [None]:
# | hide

import nbdev

# Run nbdev's notebook export (the cells marked `# | export` above target
# the module named by `# | default_exp`).
nbdev.nbdev_export()
