# Mocker DB

`MockerDB` is a mock handler that simulates a vector database, designed primarily for testing and development.
It offers text embedding, hierarchical navigable small world (HNSW) search,
and basic data management in a simulated environment that resembles a vector database.


In [1]:
import sys
import numpy as np
sys.path.append('../')
from python_modules.mocker_db import MockerDB, SentenceTransformerEmbedder, MockerSimilaritySearch

## Usage examples

The examples contain:
1. Basic data insertion and retrieval
2. Text embedding and searching
3. Advanced filtering and removal
4. Testing the HNSW search algorithm
5. Simulating database connection and persistence


### 1. Basic Data Insertion and Retrieval

In [2]:
# Create the handler; the keyword arguments below are all marked optional.
handler = MockerDB(
    # Embedder class and the parameters supplied for it
    embedder_params={
        'model_name_or_path': 'paraphrase-multilingual-mpnet-base-v2',
        'processing_type': 'batch',
        'tbatch_size': 500,
    },
    embedder=SentenceTransformerEmbedder,
    # Optional settings for similarity search
    similarity_search_h=MockerSimilaritySearch,
    return_keys_list=[],
    search_results_n=3,
    similarity_search_type='linear',
    similarity_params={'space': 'cosine'},
    # Optional inputs that have defaults
    file_path="./mock_persist",
    persist=True,
    embedder_error_tolerance=0.0,
)
# Establish the connection; with nothing persisted yet the database starts
# empty (assumed - section 5 reuses this same call to reload saved data).
handler.establish_connection()

# Insert two records. The second argument is "text" - presumably the name of
# the field to embed; confirm against MockerDB.insert_values.
values_list = [
    {"text": "Sample text 1"},
    {"text": "Sample text 2"},
]
handler.insert_values(values_list, "text")

# Retrieve: filter on the "text" field, search the keys, then collect the
# matches as dicts limited to the "text" key.
handler.filter_keys(subkey="text", subvalue="Sample text 1")
handler.search_database_keys(query='text')
results = handler.get_dict_results(return_keys_list=["text"])
print(results)


[{'text': 'Sample text 1'}]


### 2. Text Embedding and Searching

In [3]:
# Standalone embedder instance, used directly in the cells that follow.
ste = SentenceTransformerEmbedder(
    # optional adaptor parameters
    processing_type='',
    tbatch_size=500,
    max_workers=2,
    # sentence transformer parameters
    model_name_or_path='paraphrase-multilingual-mpnet-base-v2',
)

In [4]:
# Embed one string; `processing_type` is optional and is passed as '' here.
query = "Sample query"
embedded_query = ste.embed(query, processing_type='')
print(embedded_query)

[-4.97358590e-02  9.52027068e-02 -1.21950833e-02  9.25386772e-02
 -2.30182875e-02 -2.72102132e-02  5.68394363e-02  9.71098617e-02
  1.06838785e-01  5.81228137e-02  1.32275507e-01  1.14283832e-02
 -6.95725083e-02  6.98074102e-02 -5.25936149e-02 -5.75598739e-02
  8.16179160e-03 -8.36839806e-03 -8.61259177e-03  1.44206854e-02
  1.18881613e-02 -9.50366855e-02  7.12573454e-02 -4.82778661e-02
  1.47316260e-02  1.08418241e-02 -1.04824871e-01  7.01252893e-02
 -4.72064875e-02  1.00300469e-01  4.45593484e-02  2.13189349e-02
  6.67915866e-03 -5.25918603e-02  6.82299361e-02 -9.52047110e-02
 -5.81367174e-03 -2.45188009e-02 -3.84987635e-03  2.75071915e-02
  6.96027279e-02  2.40137398e-01 -1.22002093e-02  5.89093380e-02
 -8.46866369e-02  1.13797039e-01 -3.59477289e-02 -5.65296523e-02
 -1.62181836e-02  9.54672769e-02  1.51199652e-02  2.33741164e-01
 -2.21628562e-01  1.07304186e-01  2.16172084e-01 -2.22787559e-02
 -4.15886305e-02 -4.08980586e-02  2.01920569e-01 -7.75575929e-04
  8.41789246e-02 -8.94407

In [5]:
# Embed a list of strings in one call; `processing_type` is optional and is
# set to 'batch' here.
queries = ["Sample query", "Sample query 2"]
embedded_query = ste.embed(queries, processing_type='batch')
print(embedded_query)

[array([-4.97358739e-02,  9.52026770e-02, -1.21950815e-02,  9.25386250e-02,
       -2.30182838e-02, -2.72101853e-02,  5.68394773e-02,  9.71098319e-02,
        1.06838770e-01,  5.81227541e-02,  1.32275537e-01,  1.14283515e-02,
       -6.95725083e-02,  6.98074028e-02, -5.25935926e-02, -5.75598925e-02,
        8.16177577e-03, -8.36837385e-03, -8.61255173e-03,  1.44207133e-02,
        1.18881436e-02, -9.50367153e-02,  7.12573156e-02, -4.82778512e-02,
        1.47316679e-02,  1.08418325e-02, -1.04824893e-01,  7.01252893e-02,
       -4.72064316e-02,  1.00300491e-01,  4.45593484e-02,  2.13188846e-02,
        6.67914608e-03, -5.25919013e-02,  6.82299435e-02, -9.52047110e-02,
       -5.81361959e-03, -2.45188419e-02, -3.84985283e-03,  2.75071766e-02,
        6.96028322e-02,  2.40137458e-01, -1.22002317e-02,  5.89093082e-02,
       -8.46866071e-02,  1.13796927e-01, -3.59476805e-02, -5.65296300e-02,
       -1.62181910e-02,  9.54673290e-02,  1.51199708e-02,  2.33741209e-01,
       -2.21628621e-01, 

In [6]:
# Search Database
# `query` is the "Sample query" string assigned in the single-embedding cell.
handler.search_database(query)
# Collect the results as a list of dicts restricted to the "text" key.
search_results = handler.get_dict_results(return_keys_list=["text"])

# Display Results
print(search_results)


[{'text': 'Sample text 1'}]


### 3. Advanced Filtering and Removal

In [7]:
# Advanced Filtering
# Criteria dict: the key "text" paired with the value to look for.
filter_criteria = {"text": "Sample text 1"}
handler.filter_database(filter_criteria)
# The filter result is read back from the handler's `filtered_data` attribute.
filtered_data = handler.filtered_data
print(filtered_data)

# Data Removal
# Reuses the same criteria for removal, then prints `handler.data`.
handler.remove_from_database(filter_criteria)
print(handler.data)


{'1faad290827464bcc5a2359fd802680c7dca91feb03604af4e929014955a5570': {'text': 'Sample text 1', 'embedding': array([-4.94664758e-02, -2.38676026e-01, -9.62016266e-03,  8.10620785e-02,
        3.10079753e-02, -1.32553905e-04,  2.23857164e-01,  1.36439845e-01,
        1.12349296e-03,  6.58008233e-02,  2.29892850e-01, -5.80043793e-02,
       -6.09662496e-02,  2.03570306e-01,  6.81651116e-04, -1.83527961e-01,
        7.52831623e-02, -7.59223476e-02, -3.35856862e-02,  5.57932705e-02,
        1.99186262e-02, -3.93868657e-03,  3.08720712e-02, -7.18625262e-02,
       -1.88894663e-02,  1.82226654e-02, -6.17382713e-02,  1.34449387e-02,
        5.45749255e-03,  2.80156974e-02,  4.40168716e-02,  1.43856227e-01,
        4.41634357e-02, -2.37560168e-01,  5.63159101e-02, -9.69012007e-02,
       -3.45691144e-02, -1.63727310e-02, -3.26099694e-02, -6.92172050e-02,
        2.26346269e-01,  8.00395161e-02, -6.81642676e-03,  9.40042734e-02,
       -1.26307502e-01,  1.34297997e-01,  1.36889428e-01,  1.187789

### 4. Testing the HNSW Search Algorithm

In [8]:
# Standalone similarity-search object; all three arguments are optional.
mss = MockerSimilaritySearch(
    search_results_n=3,
    similarity_params={'space': 'cosine'},
    similarity_search_type='linear',
)

In [9]:
# Embed two example strings with the embedder created earlier.
embeddings = [ste.embed(text) for text in ("example1", "example2")]

# Store the pre-calculated embeddings on the handler, one record each.
data_with_embeddings = {
    "record1": {"embedding": embeddings[0]},
    "record2": {"embedding": embeddings[1]},
}
handler.data = data_with_embeddings

# HNSW search: the first embedding serves as the example query, k=1.
query_embedding = embeddings[0]
labels, distances = mss.hnsw_search(query_embedding, np.array(embeddings), k=1)
print(labels, distances)


[0] [1.1920929e-07]


### 5. Simulating Database Connection and Persistence

In [10]:
# Establish Connection
handler.establish_connection()

# Change and Persist Data
# Insert one more record, then call save_data() to persist the change.
handler.insert_values([{"text": "New sample text"}], "text")
handler.save_data()

# Reload Data
# Call establish_connection() again and print handler.data to inspect
# what the handler holds after reconnecting.
handler.establish_connection()
print(handler.data)


{'1faad290827464bcc5a2359fd802680c7dca91feb03604af4e929014955a5570': {'text': 'Sample text 1', 'embedding': array([-4.94664758e-02, -2.38676026e-01, -9.62016266e-03,  8.10620785e-02,
        3.10079753e-02, -1.32553905e-04,  2.23857164e-01,  1.36439845e-01,
        1.12349296e-03,  6.58008233e-02,  2.29892850e-01, -5.80043793e-02,
       -6.09662496e-02,  2.03570306e-01,  6.81651116e-04, -1.83527961e-01,
        7.52831623e-02, -7.59223476e-02, -3.35856862e-02,  5.57932705e-02,
        1.99186262e-02, -3.93868657e-03,  3.08720712e-02, -7.18625262e-02,
       -1.88894663e-02,  1.82226654e-02, -6.17382713e-02,  1.34449387e-02,
        5.45749255e-03,  2.80156974e-02,  4.40168716e-02,  1.43856227e-01,
        4.41634357e-02, -2.37560168e-01,  5.63159101e-02, -9.69012007e-02,
       -3.45691144e-02, -1.63727310e-02, -3.26099694e-02, -6.92172050e-02,
        2.26346269e-01,  8.00395161e-02, -6.81642676e-03,  9.40042734e-02,
       -1.26307502e-01,  1.34297997e-01,  1.36889428e-01,  1.187789