# Core

> Core functionality for indexing and searching.

In [None]:
# | default_exp index.core

In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.index.utils import *
from dreamai_ray.index.df import *

In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


class write_index_cb(Callback):
    "A `Callback` that saves the current index and the batch DataFrame to disk."

    def after_batch(self, cls, **kwargs):
        # The index lives in `udf_kwargs`; mirror it onto the mapper before
        # reading `ntotal` for the file name.
        cls.index = cls.udf_kwargs["index"]
        out_dir = cls.index_folder
        os.makedirs(out_dir, exist_ok=True)
        # Index file: `<block_counter>_<ntotal>.faiss`; rows file: `<block_counter>.csv`.
        faiss_file = str(
            Path(out_dir).joinpath(f"{cls.block_counter}_{cls.index.ntotal}.faiss")
        )
        csv_file = str(Path(out_dir).joinpath(f"{cls.block_counter}.csv"))
        if self.verbose and cls.verbose:
            for line in (
                f"Writing Index to {faiss_file}",
                f"Index Size: {cls.index.ntotal}",
                f"Writing DF to {csv_file}",
            ):
                msg.info(line)
        faiss.write_index(cls.index, faiss_file)
        # The batch DataFrame is handed to the callback as the `df` keyword.
        kwargs["df"].to_csv(csv_file, index=False)


class reset_index_cb(Callback):
    "A `Callback` that resets the index after a batch and re-binds it to the udf."

    def after_batch(self, cls, **kwargs):
        current = cls.index
        current.reset()
        should_log = self.verbose and cls.verbose
        if should_log:
            msg.info(f"Index Size Post Reset: {cls.index.ntotal}")
        # Store the reset index back in the udf kwargs and re-wrap the udf
        # with those kwargs for the next batch.
        cls.udf_kwargs["index"] = current
        cls.udf = partial(cls.udf, **cls.udf_kwargs)


class IndexCreator(Mapper):
    """Creates indexes from embeddings.

    Holds an index (given, or built with `create_index(index_dim)`) and injects
    it, together with `ems_col` and `udf_verbose`, into the kwargs passed to
    `udf`. All constructor arguments are then forwarded to `Mapper.__init__`
    through `locals_to_params(locals())`.
    """

    def __init__(
        self,
        index=None,  # An existing index to use. If None, one is created with `create_index(index_dim)`.
        index_dim=3,  # The dimension of the index.
        index_folder="indexes",  # The folder to write the index to.
        ems_col="embedding",  # The column to use to create the index.
        udf=df_to_index,  # The function to use to create the index.
        cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
        verbose=True,  # Whether to print out information.
        udf_verbose=False,  # Whether to print out information in the udf.
        udf_kwargs={},  # Additional kwargs to pass to the udf. Copied, never mutated.
        **kwargs,
    ):
        # Copy before adding keys: writing into the argument directly would
        # mutate the shared default dict (and any dict owned by the caller),
        # leaking one instance's index into the next.
        # NOTE: the copy deliberately rebinds the *same* local name. `locals()`
        # is handed to `locals_to_params` below, so introducing a new local
        # here could change what reaches `Mapper.__init__`.
        udf_kwargs = dict(udf_kwargs)
        self.index_folder = index_folder
        if index is None:
            self.index = create_index(index_dim)
        else:
            self.index = index
        udf_kwargs["index"] = self.index
        udf_kwargs["ems_col"] = ems_col
        udf_kwargs["verbose"] = udf_verbose
        self.verbose = verbose
        super().__init__(**locals_to_params(locals()))


def create_indexes(
    ems_folder="embeddings",  # The folder containing the embeddings.
    ems_col="embedding",  # The column to use to create the index.
    block_size=40000,  # The number of embeddings per index. If None, all embeddings will be used.
    index_dim=768,  # The dimension of the index.
    index_folder="indexes",  # The folder to write the index to.
    udf=df_to_index,  # The function to use to create the index.
    cbs=[write_index_cb, reset_index_cb],  # The `Callback`s to use.
    verbose=True,  # Whether to print out information.
    udf_verbose=False,  # Whether to print out information in the udf.
    udf_kwargs={},  # Additional kwargs to pass to the udf. Copied, never mutated.
    task_id=None,  # The task id to use. If None, a fresh random id is generated on every call.
    **kwargs,
):
    """Function to create indexes from embeddings.

    Resolves `ems_folder` and `index_folder` against a per-task scratch folder
    (`/tmp/<task_id>`), builds a DataFrame with the paths of the `.json`
    embedding files (sorted by the integer after the last `_` in the file
    stem), feeds it to an `IndexCreator` in blocks of `block_size` rows,
    uploads the index folder with `bucket_up` and removes the scratch folder.

    Returns the DataFrame of embedding file paths (column `ems_col`).
    """

    # A default of `gen_random_string(16)` in the signature would be evaluated
    # once at definition time, making every call share one scratch folder.
    if task_id is None:
        task_id = gen_random_string(16)
    # Work on a copy so neither the shared default dict nor the caller's dict
    # is modified downstream.
    udf_kwargs = dict(udf_kwargs)
    task_folder = f"/tmp/{task_id}"
    try:
        t1 = time()
        ems_folder, _ = handle_input_path(ems_folder, local_path=task_folder)
        t2 = time()
        if verbose:
            msg.info(
                f"Time taken to download embeddings: {t2-t1:.2f} seconds.", spaced=True
            )
        index_folder, index_bucket = get_local_path(index_folder, local_path=task_folder)
        # Halved count of objects already in the index bucket; it seeds the
        # block counter so new files continue the existing numbering.
        # NOTE(review): presumably halved because each block is stored as two
        # files (.faiss + .csv) - confirm against `bucket_count`.
        bucket_indexes = max(bucket_count(index_bucket), 0) // 2
        if verbose:
            msg.info(f"Bucket Indexes: {bucket_indexes}")
        cbs = [block_counter_cb(bucket_indexes)] + cbs
        # NOTE: every local defined so far (minus `omit`) is forwarded to
        # `IndexCreator`; do not introduce new locals above this call.
        m = IndexCreator(
            **locals_to_params(
                locals(),
                omit=["ems_folder", "ems_bucket", "index_bucket", "block_size"],
            ),
        )
        em_files = sorted(
            get_files(ems_folder, extensions=[".json"], make_str=True),
            key=lambda x: int(Path(x).stem.split("_")[-1]),
        )
        df = pd.DataFrame({ems_col: em_files})
        if verbose:
            msg.info(f"Embeddings DF created of length: {len(df)}")
        if block_size is None:
            # `max(..., 1)` keeps `range` from receiving a zero step when no
            # embedding files were found.
            block_size = max(len(df), 1)
        for start in range(0, len(df), block_size):
            # The mapper's callbacks write the index and CSV for each block.
            m(df.iloc[start : start + block_size])
        bucket_up(index_folder, index_bucket, only_new=False)
    finally:
        # Always drop the scratch folder, even when a step above failed.
        shutil.rmtree(task_folder, ignore_errors=True)
    return df


def search_indexes(
    ems,  # The embedding to search. As written, always treated as a path that `bucket_dl` fetches as a json file.
    index_folder,  # The remote folder containing the indexes.
    local_index_folder="/media/hamza/data2/faiss_data/saved_indexes",  # The local folder containing the indexes. Not required if `index_folder` is local.
    k=2,  # The number of nearest neighbors to return.
    verbose=True,  # Whether to print out information.
    task_id=None,  # The task id to use. If None, a fresh random id is generated on every call.
):
    """Function to search an embedding against indexes.

    Resolves the index folder (reusing a copy under `local_index_folder` when
    one already exists), downloads the query embedding into `/tmp/<task_id>`,
    searches every `.faiss` file in the folder, merges the per-index results
    with `index_heap` and attaches the matching rows of the concatenated
    `.csv` files as `res["meta_data"]`.

    Returns `(res, qdf)`: the merged result and the per-index search DataFrame.
    Raises `Exception` if the index folder is missing or holds no `.faiss` file.
    """

    # A default of `gen_random_string(16)` in the signature would be evaluated
    # once at definition time, making every call share one scratch folder.
    if task_id is None:
        task_id = gen_random_string(16)
    task_folder = f"/tmp/{task_id}"
    try:
        # Prefer an already-present local copy of the indexes over a new download.
        cached_index_folder = None
        if local_index_folder is not None:
            candidate, _ = get_local_path(index_folder, local_path=local_index_folder)
            if os.path.exists(candidate):
                cached_index_folder = candidate
        if cached_index_folder is not None:
            if verbose:
                msg.info(f"Cached Index Folder: {cached_index_folder}", spaced=True)
            index_folder = cached_index_folder
        else:
            index_folder, _ = handle_input_path(
                index_folder, local_path=local_index_folder, task_id=task_id
            )

        bucket_dl(ems, task_folder)
        ems_file = get_files(task_folder, extensions=[".json"])[0]
        with open(ems_file) as f:
            ems = json.load(f)["embedding"]

        # Check the folder before listing it, then require at least one index.
        indexes = []
        if os.path.exists(index_folder):
            # Index files are named `<block>_<ntotal>.faiss`; order by block number.
            indexes = sorted(
                get_files(index_folder, extensions=[".faiss"]),
                key=lambda x: int(x.stem.split("_")[0]),
            )
        if len(indexes) == 0:
            raise Exception(
                f"No indexes found in '{index_folder}' folder. Please create indexes first."
            )
        # One row per index file, each carrying the same query embedding.
        qdf = pd.DataFrame(
            {
                "index": indexes,
                "embedding": [ems] * len(indexes),
            }
        )

        qdf = qdf.apply(lambda x: df_index_search(x, k=k, verbose=verbose), axis=1)
        res = index_heap(qdf, k=k, verbose=verbose, with_offset=True)
        # Metadata files are named `<block>.csv`; concatenate them in block order.
        dfs = sorted(
            get_files(index_folder, extensions=[".csv"]), key=lambda x: int(x.stem)
        )
        if verbose:
            msg.info(f"Metadata files: {dfs}")
        df = pd.concat([pd.read_csv(csv_path) for csv_path in dfs]).reset_index(drop=True)
        # NOTE(review): assumes `res["ids"][0]` holds valid row positions in
        # `df`; a negative id would silently select a row from the end - confirm
        # what `index_heap` returns when fewer than `k` neighbors exist.
        res["meta_data"] = df.iloc[res["ids"][0]].to_dict(orient="records")
    finally:
        # Always drop the scratch folder, even when a step above failed.
        shutil.rmtree(task_folder, ignore_errors=True)
    return res, qdf

## Usage Example

In [None]:
# | hide

# Seed NumPy's global random number generator.
np.random.seed(42)
# Machine-specific local data directory.
data_path = Path("/media/hamza/data2/faiss_data")


In [None]:
# # | eval: false

# data_path = Path("")
# ems_folder = data_path / "ems"
# index_folder = data_path / "indexes"
# num_ems = 50
# block_size = 10
# ems_dim = 768
# random_ems(num_ems=num_ems, ems_dim=ems_dim, ems_folder=ems_folder)

In [None]:
# | eval: false

# Remote bucket that holds both the embeddings and the indexes for this example.
bucket = "gs://gcsfuse-talentnet-dev"

# Remote folder of embedding json files (input) and remote folder of indexes (output).
ems_folder = f"{bucket}/ems_1_1"
index_folder = f"{bucket}/indexes_1"
# Passed to `create_indexes` as `index_dim` below.
ems_dim = 768


In [None]:
# | eval: false

# Build the indexes from the remote embeddings; `block_size` is left at its default.
ems_df = create_indexes(
    ems_folder=ems_folder,
    index_folder=index_folder,
    index_dim=ems_dim,
    verbose=True,
)

# Preview the first rows of the returned DataFrame of embedding file paths.
ems_df[:5]



[38;5;4mℹ Downloading gs://gcsfuse-talentnet-dev/ems_1_1 to
/tmp/e324b71dd9114694/ems_1_1.[0m



Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_1.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_11.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_12.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_14.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_15.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_13.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_16.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_2.json...
Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_3.json...
Copying gs://


[38;5;4mℹ Time taken to download embeddings: 2.06 seconds.[0m



CommandException: One or more URLs matched no objects.


[38;5;4mℹ Bucket Indexes: 0[0m
[38;5;4mℹ Embeddings DF created of length: 16[0m

[38;5;4mℹ DF BATCH SIZE: 16[0m


[38;5;4mℹ Uploading /tmp/e324b71dd9114694/indexes_1 to
gs://gcsfuse-talentnet-dev/indexes_1.[0m



Copying file:///tmp/e324b71dd9114694/indexes_1/1_16.faiss [Content-Type=application/octet-stream]...
Copying file:///tmp/e324b71dd9114694/indexes_1/1.csv [Content-Type=text/csv]... 
- [2/2 files][ 49.3 KiB/ 49.3 KiB] 100% Done                                    
Operation completed over 2 objects/49.3 KiB.                                     


Unnamed: 0,embedding
0,/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_1.json
1,/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_2.json
2,/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_3.json
3,/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_4.json
4,/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_5.json


In [None]:
# | eval: false

# Query with one of the embedding files that was indexed above.
qems = f"{ems_folder}/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json"
# `res` is the merged result; `qdf` is the per-index search DataFrame.
res, qdf = search_indexes(
    qems,
    index_folder=index_folder,
    k=10,
    verbose=False,
)
print(f'\n\nFinal Results:\n\tDistances: {res["distances"]}\n\tIDs: {res["ids"]}')
print("\tMeta Data:")
# One metadata record per returned id.
for m in res["meta_data"]:
    print(f"\t\t{m}")


[38;5;4mℹ Downloading gs://gcsfuse-talentnet-dev/indexes_1 to
/media/hamza/data2/faiss_data/saved_indexes/indexes_1.[0m



Copying gs://gcsfuse-talentnet-dev/indexes_1/1.csv...
Copying gs://gcsfuse-talentnet-dev/indexes_1/1_16.faiss...                      
/ [2/2 files][ 49.3 KiB/ 49.3 KiB] 100% Done                                    
Operation completed over 2 objects/49.3 KiB.                                     



[38;5;4mℹ Downloading
gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json
to /tmp/0683652a48da4a1c.[0m



Copying gs://gcsfuse-talentnet-dev/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json...
/ [0/1 files][    0.0 B/ 16.7 KiB]   0% Done                                    

[Path('/media/hamza/data2/faiss_data/saved_indexes/indexes_1/1.csv')]


Final Results:
	Distances: [[0.0, 0.9240111708641052, 1.0372934341430664, 1.101623296737671, 1.1049132347106934, 1.1570426225662231, 1.209057092666626, 1.2113628387451172, 1.2239317893981934, 1.2323083877563477]]
	IDs: [[9, 11, 8, 13, 14, 1, 3, 7, 12, 4]]
	Meta Data:
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_10.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_12.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_9.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_14.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_15.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/resumes-4e2cdbeb-1e20-45ff-bded-a0a510350167_2.json'}
		{'embedding': '/tmp/e324b71dd9114694/ems_1_1/res

/ [1/1 files][ 16.7 KiB/ 16.7 KiB] 100% Done                                    
Operation completed over 1 objects/16.7 KiB.                                     


In [None]:
# | hide
# | eval: false

# shutil.rmtree(index_folder, ignore_errors=True)
# shutil.rmtree(ems_folder, ignore_errors=True)

In [None]:
# | hide

import nbdev

# Run nbdev's export step for the `# | export` cells.
nbdev.nbdev_export()
