In [1]:
from src.ingest_redis import *
from src.ingest_chroma import *
from src.ingest_milvus import * 
from src.search import *
import time
import pandas as pd

In [2]:
# set up dataframe to track experiment results
cols =  ['database', 'chunk_size', 'overlap', 'clean', 'embedding', 'chunks_processed', 'time_to_process', 'used_memory_mb', 'query_time']
results = pd.DataFrame(columns = cols)

In [3]:
with open('example_queries.txt', 'r') as file:

    # Skip lines that don't contain actual queries (headers, empty lines) and extract example queries
    queries = [line.strip() for line in file if line.strip() and not line.strip().startswith('#') and not line.strip().startswith('##')]
    queries = [q.split('. ', 1)[1] if '. ' in q else q for q in queries]
    
    # Print total count
    print(f"Total queries: {len(queries)}")

Total queries: 500


In [None]:
# test different chunk/overlap/clean combos for filling redis database
embedding_model = 'nomic-embed-text'
db = 'redis'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:

        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # clear store before starting
            clear_redis_store()

            # create and fill redis store
            start = time.time()
            create_hnsw_index()
            chunk_count = process_pdfs_redis("data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.time() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')


            # Get memory usage info
            memory_info = redis_client.info('memory')
            used_memory = memory_info['used_memory'] 
            used_memory_mb = used_memory / (1024 * 1024)

            # test retrieval speed
            start = time.time()
            for query in queries:
                query_redis(query)
            to_search = time.time()- start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

In [7]:
# clear store before starting
clear_redis_store()

# create and fill redis store
start = time.time()
create_hnsw_index()
chunk_count = process_pdfs_redis("data/", chunk_size=500, overlap=100, clean = True)

In [8]:
# test different chunk/overlap/clean combos for filling redis database
embedding_model = 'mxbai-embed-large'
embedding_size = 768
db = 'chroma'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # create and fill redis store
            start = time.time()
            collection = create_chroma_index(embedding_model)
            collection, chunk_count = process_pdfs_chroma(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.time() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

            num_vectors = len(collection.get()["ids"])  # Number of stored items
            embedding_size = 768  # Adjust based on your embedding model
            float_size = np.dtype(np.float32).itemsize  # 4 bytes per float

            memory_usage_bytes = num_vectors * embedding_size * float_size
            memory_usage_mb = memory_usage_bytes / (1024 * 1024)
            print(f"It uses {memory_usage_bytes} MB")

            # test searching speed
            start = time.time()
            query_chroma(collection, queries)
            to_search = time.time()-start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, memory_usage_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Index with chunk size 100 and overlap 0 created in 18.19 seconds
It uses 1274880 MB
Search with chunk size 100 and overlap 0 completed in 10.64 seconds


  results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)


Index with chunk size 100 and overlap 0 created in 24.29 seconds
It uses 1299456 MB
Search with chunk size 100 and overlap 0 completed in 10.88 seconds
Index with chunk size 100 and overlap 50 created in 30.07 seconds
It uses 1956864 MB
Search with chunk size 100 and overlap 50 completed in 10.88 seconds
Index with chunk size 100 and overlap 50 created in 30.17 seconds
It uses 2055168 MB
Search with chunk size 100 and overlap 50 completed in 11.04 seconds
Index with chunk size 300 and overlap 0 created in 19.06 seconds
It uses 930816 MB
Search with chunk size 300 and overlap 0 completed in 10.91 seconds
Index with chunk size 300 and overlap 0 created in 19.89 seconds
It uses 936960 MB
Search with chunk size 300 and overlap 0 completed in 9.19 seconds


KeyboardInterrupt: 

In [5]:
# test different chunk/overlap/clean combos for filling redis database
embedding_model = 'nomic-embed-text'
embedding_size = 768
db = 'milvus'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # create and fill redis store
            start = time.time()
            collection = create_milvus_collection(embed_model = embedding_model)
            collection, chunk_count = process_pdfs_milvus(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.time() - start
            print(f'Collection with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

            # get number
            num_vectors = collection.num_entities 
            embedding_size = 768  
            float_size = np.dtype(np.float32).itemsize  

            memory_usage_bytes = num_vectors * embedding_size * float_size
            memory_usage_mb = memory_usage_bytes / (1024 * 1024)
            print(f"It uses {memory_usage_bytes} MB")

            # test searching speed
            start = time.time()
            query_milvus(collection, queries)
            to_search = time.time()-start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Collection with chunk size 100 and overlap 0 created in 10.15 seconds
It uses 1274880 MB


ValidationError: 1 validation error for EmbeddingsRequest
prompt
  Input should be a valid string [type=string_type, input_value=['What is a relational da... parallelization work?'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type

In [3]:
embedding_model = 'mxbai-embed-large:latest'

collection = create_chroma_index(embedding_model)
collection, chunk_count = process_pdfs_chroma(collection, "data/", chunk_size=500, overlap=100, clean = True)

In [5]:
interactive_search("chroma", embed_model="mxbai-embed-large:latest", llm="mistral:latest")

🔍 RAG Search Interface
Type 'exit' to quit
[['193', '186', '187', '118', '116', '295', '117', '114', '253', '204']]
{'ids': [['193', '186', '187', '118', '116', '295', '117', '114', '253', '204']], 'distances': [[240.52713, 265.7791, 268.0335, 268.64575, 269.59592, 273.60944, 283.7926, 286.72388, 294.18234, 294.6806]], 'embeddings': None, 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'documents': [['ACID Properties Durability Once a transaction is completed and committed successfully its changes are permanent Even in the event of a system failure committed transactions are preserved For more info on Transactions see Kleppmann Book Chapter 7 12', 'ACID Properties Atomicity transaction is treated as an atomic unit it is fully executed or no parts of it are executed Consistency a transaction takes a database from one consistent state to another consistent state consistent state all data meets integrity constraints 5', 'ACID Properties Isolation Two transacti