# DF Processors

> Functions to be applied row-wise to DataFrames for adding embeddings to a FAISS index and searching it.

In [None]:
# | default_exp index.df


In [None]:
# | export

from dreamai_ray.imports import *
from dreamai_ray.utils import *
from dreamai_ray.mapper import *
from dreamai_ray.index.utils import *


In [None]:
#| hide

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
# | export


def df_to_index(
    df,
    index,
    ems_col="embedding",  # Column name of embeddings in df.
    ems_key="embedding",  # Key name of embeddings in json file.
    verbose=False,  # Whether to print out information.
):
    """Add the embeddings found in `df` to `index` and return the same `df`.

    The embeddings are obtained with `read_ems(df, ems_col=..., ems_key=...)`
    and handed to `index.add`, so the caller's `index` object is the one that
    grows. With `verbose=True` the embeddings' shape and the resulting
    `index.ntotal` are logged through `msg.info`.
    """

    vectors = read_ems(df, ems_col=ems_col, ems_key=ems_key)
    if verbose:
        msg.info(f"Ems Shape: {vectors.shape}")

    # The index passed in by the caller is updated; no new index is created.
    index.add(vectors)
    if verbose:
        msg.info(f"Index Size: {index.ntotal}")

    # Hand `df` back so the function can be used as a `DataFrame.apply` callback.
    return df


def df_index_search(
    df,
    ems_col="embedding",  # Column name of embeddings in df.
    ems_key="embedding",  # Key name of embeddings in json file.
    index_col="index",  # Column name of index in df.
    k=1,  # Number of nearest neighbors to return.
    verbose=False,  # Whether to print out information.
):
    """Look up the `k` nearest neighbors of the embeddings in `df`.

    `df[index_col]` is converted to `str` and passed to `faiss.read_index` to
    load the index; the embeddings come from
    `read_ems(df, ems_col=..., ems_key=...)`. The search results are written
    back into `df` under three keys before `df` is returned:

    - `"index_size"`: `ntotal` of the loaded index.
    - `"distances"`: the distances returned by `index.search`.
    - `"ids"`: the neighbor ids returned by `index.search`.

    With `verbose=True` each step is logged through `msg`.
    """

    if verbose:
        msg.info(f"Index Col: {df[index_col]}")
    # The index is loaded from the stored location on every call.
    faiss_index = faiss.read_index(str(df[index_col]))
    if verbose:
        msg.info(f"Index Size: {faiss_index.ntotal}")

    queries = read_ems(df, ems_col=ems_col, ems_key=ems_key)
    if verbose:
        msg.info(f"Ems Shape: {queries.shape}")

    distances, neighbor_ids = faiss_index.search(queries, k)
    if verbose:
        msg.good(f"IDs: {neighbor_ids}, Distances: {distances}")

    # The search outputs are stored exactly as returned (not flattened or
    # converted to lists). Insertion order is kept so the new entries appear
    # as index_size, distances, ids.
    results = {
        "index_size": faiss_index.ntotal,
        "distances": distances,
        "ids": neighbor_ids,
    }
    for column, value in results.items():
        df[column] = value
    return df

## Usage Example

In [None]:
# | eval: false

# Build a small index from reproducible random embeddings and save it to disk.
index_dim = 768
num_ems = 5
np.random.seed(42)
ems = [np.random.random(index_dim).tolist() for _ in range(num_ems)]
df = pd.DataFrame({"embedding": ems})
index = create_index(index_dim)

# Each row is handed to `df_to_index`, which adds that row's embedding to `index`.
df = df.apply(lambda row: df_to_index(row, index, verbose=True), axis=1)
faiss.write_index(index, "index.faiss")


[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;4mℹ Index Size: 1[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;4mℹ Index Size: 2[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;4mℹ Index Size: 3[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;4mℹ Index Size: 4[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;4mℹ Index Size: 5[0m


In [None]:
# | eval: false

# Query the saved index with the first embedding. That embedding was added to
# the index in the previous cell, so its closest match is itself at distance 0.
df = pd.DataFrame(
    {
        "index": ["index.faiss"],
        "embedding": [ems[0]],
    }
)
df = df.apply(lambda row: df_index_search(row, k=2, verbose=True), axis=1)
df

[38;5;4mℹ Index Col: index.faiss[0m
[38;5;4mℹ Index Size: 5[0m
[38;5;4mℹ Ems Shape: (1, 768)[0m
[38;5;2m✔ IDs: [[0 3]], Distances: [[  0.      128.68584]][0m


Unnamed: 0,index,embedding,index_size,distances,ids
0,index.faiss,"[0.3745401188473625, 0.9507143064099162, 0.7319939418114051, 0.5986584841970366, 0.15601864044243652, 0.15599452033620265, 0.05808361216819946, 0.8661761457749352, 0.6011150117432088, 0.7080725777960455, 0.020584494295802447, 0.9699098521619943, 0.8324426408004217, 0.21233911067827616, 0.18182496720710062, 0.18340450985343382, 0.3042422429595377, 0.5247564316322378, 0.43194501864211576, 0.2912291401980419, 0.6118528947223795, 0.13949386065204183, 0.29214464853521815, 0.3663618432936917, 0.45606998421703593, 0.7851759613930136, 0.19967378215835974, 0.5142344384136116, 0.5924145688620425, 0....",5,"[[0.0, 128.68584]]","[[0, 3]]"


In [None]:
# | hide

# Run nbdev's export step for this project (this notebook's target module is
# set by the `default_exp` directive at the top).
import nbdev

nbdev.nbdev_export()