In [1]:
from src.ingest_redis import *
from src.ingest_chroma import *
from src.ingest_milvus import * 
from src.search import *
import time
import pandas as pd

from pymilvus import connections, utility


In [2]:
# set up dataframe to track experiment results
cols =  ['database', 'chunk_size', 'overlap', 'clean', 'embedding', 'chunks_processed', 'time_to_process', 'used_memory_mb', 'query_time']
results = pd.DataFrame(columns = cols)

In [3]:
with open('example_queries.txt', 'r') as file:

    # Skip lines that don't contain actual queries (headers, empty lines) and extract example queries
    queries = [line.strip() for line in file if line.strip() and not line.strip().startswith('#') and not line.strip().startswith('##')]
    queries = [q.split('. ', 1)[1] if '. ' in q else q for q in queries]
    
    # Print total count
    print(f"Total queries: {len(queries)}")

Total queries: 500


In [4]:
# test different chunk/overlap/clean combos for filling redis database
embedding_models = {'nomic-embed-text':768, 'mxbai-embed-large':512, 'bge-m3':1024}
db = 'redis'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        for embedding_model, embed_size in embedding_models.items():

            if overlap >= chunk:
                continue
            
            for clean in [True, False]:

                # clear store before starting
                clear_redis_store()

                # create and fill redis store
                start = time.time()
                create_hnsw_index()
                chunk_count = process_pdfs_redis("data/", chunk_size=chunk, overlap=overlap, clean = clean, model=embedding_model)
                to_fill = time.time() - start
                print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')


                # Get memory usage info
                memory_info = redis_client.info('memory')
                used_memory = memory_info['used_memory'] 
                used_memory_mb = used_memory / (1024 * 1024)

                # test retrieval speed
                start = time.time()
                for query in queries:
                    query_redis(query)
                to_search = time.time()- start
                print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

                # add results to result dataframe
                new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
                results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Index with chunk size 100 and overlap 0 created in 10.43 seconds
Search with chunk size 100 and overlap 0 completed in 9.21 seconds


  results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)


Index with chunk size 100 and overlap 0 created in 10.19 seconds
Search with chunk size 100 and overlap 0 completed in 9.45 seconds
Index with chunk size 100 and overlap 0 created in 11.47 seconds
Search with chunk size 100 and overlap 0 completed in 11.94 seconds
Index with chunk size 100 and overlap 0 created in 15.37 seconds
Search with chunk size 100 and overlap 0 completed in 11.93 seconds
Index with chunk size 100 and overlap 0 created in 15.41 seconds
Search with chunk size 100 and overlap 0 completed in 11.49 seconds
Index with chunk size 100 and overlap 0 created in 15.34 seconds
Search with chunk size 100 and overlap 0 completed in 10.43 seconds
Index with chunk size 100 and overlap 50 created in 21.64 seconds
Search with chunk size 100 and overlap 50 completed in 12.12 seconds
Index with chunk size 100 and overlap 50 created in 25.04 seconds
Search with chunk size 100 and overlap 50 completed in 12.15 seconds
Index with chunk size 100 and overlap 50 created in 23.0 seconds
S

In [11]:
db = 'chroma'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        for embedding_model, embed_size in embedding_models.items():

            if embedding_model == 'bge-m3':
                continue

            if overlap >= chunk:
                continue
            for clean in [True, False]:

                # create and fill redis store
                start = time.time()
                collection = create_chroma_index(embedding_model)
                collection, chunk_count = process_pdfs_chroma(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
                to_fill = time.time() - start
                print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

                num_vectors = len(collection.get()["ids"])  # Number of stored items
                embedding_size = embed_size
                float_size = np.dtype(np.float32).itemsize  # 4 bytes per float

                memory_usage_bytes = num_vectors * embedding_size * float_size
                memory_usage_mb = memory_usage_bytes / (1024 * 1024)
                print(f"It uses {memory_usage_bytes} MB")

                # test searching speed
                start = time.time()
                query_chroma(collection, queries)
                to_search = time.time()-start
                print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

                # add results to result dataframe
                new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, memory_usage_mb, to_search]
                results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Index with chunk size 100 and overlap 0 created in 16.76 seconds
It uses 1274880 MB
Search with chunk size 100 and overlap 0 completed in 7.62 seconds
Index with chunk size 100 and overlap 0 created in 14.78 seconds
It uses 1299456 MB
Search with chunk size 100 and overlap 0 completed in 6.63 seconds
Index with chunk size 100 and overlap 0 created in 18.42 seconds
It uses 849920 MB
Search with chunk size 100 and overlap 0 completed in 7.7 seconds
Index with chunk size 100 and overlap 0 created in 18.42 seconds
It uses 866304 MB
Search with chunk size 100 and overlap 0 completed in 7.78 seconds
Index with chunk size 100 and overlap 50 created in 19.52 seconds
It uses 1956864 MB
Search with chunk size 100 and overlap 50 completed in 7.13 seconds
Index with chunk size 100 and overlap 50 created in 21.87 seconds
It uses 2055168 MB
Search with chunk size 100 and overlap 50 completed in 7.54 seconds
Index with chunk size 100 and overlap 50 created in 26.42 seconds
It uses 1304576 MB
Search w

In [None]:
db = 'milvus'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        for embedding_model, embed_size in embedding_models.items():

            if overlap >= chunk:
                continue
            for clean in [True, False]:

                # create and fill redis store
                start = time.time()
                collection = create_milvus_collection(embed_model = embedding_model)
                collection, chunk_count = process_pdfs_milvus(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
                to_fill = time.time() - start
                print(f'Collection with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

                # # get memory usage
                # stats = collection.get_collection_stats()
                # stats_dict = ast.literal_eval(stats)
                
                # memory_bytes = 0
                # if "segments_stat" in stats_dict:
                #     for segment in stats_dict["segments_stat"]:
                #         if "memory_usage" in segment:
                #             memory_bytes += int(segment["memory_usage"])
                
                # # Convert to MB
                # memory_mb = memory_bytes / (1024 * 1024)

                # index_info = collection.get_index_info()
    
                # # Calculate total index size
                # total_size_bytes = 0
                # for info in index_info:
                #     if "index_size" in info:
                #         total_size_bytes += int(info["index_size"])
                
                # # Convert to MB
                # total_size_mb = total_size_bytes / (1024 * 1024)
                print(f"It uses {total_size_mb} MB")

                # test searching speed
                start = time.time()
                query_milvus(collection, queries)
                to_search = time.time()-start
                print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

                # add results to result dataframe
                new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
                results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Collection with chunk size 100 and overlap 0 created in 13.7 seconds


AttributeError: 'Collection' object has no attribute 'get_collection_stats'

In [12]:
results.to_csv('results.csv', index = False)