In [1]:
from src.ingest_redis import *
from src.ingest_chroma import *
from src.search import *
import time
import pandas as pd

In [2]:
# set up dataframe to track experiment results
cols =  ['database', 'chunk_size', 'overlap', 'clean', 'embedding', 'chunks_processed', 'time_to_process', 'used_memory_mb', 'query_time']
results = pd.DataFrame(columns = cols)

In [3]:
with open('example_queries.txt', 'r') as file:

    # Skip lines that don't contain actual queries (headers, empty lines) and extract example queries
    queries = [line.strip() for line in file if line.strip() and not line.strip().startswith('#') and not line.strip().startswith('##')]
    queries = [q.split('. ', 1)[1] if '. ' in q else q for q in queries]
    
    # Print total count
    print(f"Total queries: {len(queries)}")

Total queries: 500


In [4]:
# test different chunk/overlap/clean combos for filling redis database
embedding_model = 'nomic-embed-text'
db = 'redis'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:

        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # clear store before starting
            clear_redis_store()

            # create and fill redis store
            start = time.time()
            create_hnsw_index()
            chunk_count = process_pdfs_redis("data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.time() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')


            # Get memory usage info
            memory_info = redis_client.info('memory')
            used_memory = memory_info['used_memory'] 
            used_memory_mb = used_memory / (1024 * 1024)

            # test retrieval speed
            start = time.time()
            for query in queries:
                query_redis(query)
            to_search = time.time()- start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Index with chunk size 100 and overlap 0 created in 11.88 seconds
Search with chunk size 100 and overlap 0 completed in 8.34 seconds


  results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)


Index with chunk size 100 and overlap 0 created in 12.46 seconds
Search with chunk size 100 and overlap 0 completed in 10.61 seconds
Index with chunk size 100 and overlap 50 created in 18.46 seconds
Search with chunk size 100 and overlap 50 completed in 9.71 seconds
Index with chunk size 100 and overlap 50 created in 20.06 seconds
Search with chunk size 100 and overlap 50 completed in 10.91 seconds
Index with chunk size 300 and overlap 0 created in 8.63 seconds
Search with chunk size 300 and overlap 0 completed in 10.68 seconds
Index with chunk size 300 and overlap 0 created in 9.0 seconds
Search with chunk size 300 and overlap 0 completed in 9.23 seconds
Index with chunk size 300 and overlap 50 created in 9.65 seconds
Search with chunk size 300 and overlap 50 completed in 10.83 seconds
Index with chunk size 300 and overlap 50 created in 8.61 seconds
Search with chunk size 300 and overlap 50 completed in 10.65 seconds
Index with chunk size 300 and overlap 100 created in 8.97 seconds
Se

In [5]:
# test different chunk/overlap/clean combos for filling redis database
embedding_model = 'nomic-embed-text'
embedding_size = 768
db = 'chroma'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # create and fill redis store
            start = time.time()
            collection = create_chroma_index(embedding_model)
            collection, chunk_count = process_pdfs_chroma(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.time() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

            num_vectors = len(collection.get()["ids"])  # Number of stored items
            embedding_size = 768  # Adjust based on your embedding model
            float_size = np.dtype(np.float32).itemsize  # 4 bytes per float

            memory_usage_bytes = num_vectors * embedding_size * float_size
            memory_usage_mb = memory_usage_bytes / (1024 * 1024)
            print(f"It uses {memory_usage_bytes} MB")

            # test searching speed
            start = time.time()
            query_chroma(collection, queries)
            to_search = time.time()-start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

Index with chunk size 100 and overlap 0 created in 17.22 seconds
It uses 1274880 MB
Search with chunk size 100 and overlap 0 completed in 8.22 seconds
Index with chunk size 100 and overlap 0 created in 14.8 seconds
It uses 1299456 MB
Search with chunk size 100 and overlap 0 completed in 7.7 seconds
Index with chunk size 100 and overlap 50 created in 20.74 seconds
It uses 1956864 MB
Search with chunk size 100 and overlap 50 completed in 8.38 seconds
Index with chunk size 100 and overlap 50 created in 21.66 seconds
It uses 2055168 MB
Search with chunk size 100 and overlap 50 completed in 6.94 seconds
Index with chunk size 300 and overlap 0 created in 12.79 seconds
It uses 930816 MB
Search with chunk size 300 and overlap 0 completed in 7.68 seconds
Index with chunk size 300 and overlap 0 created in 13.11 seconds
It uses 936960 MB
Search with chunk size 300 and overlap 0 completed in 8.36 seconds
Index with chunk size 300 and overlap 50 created in 13.26 seconds
It uses 967680 MB
Search wit

In [8]:
results[results['database']=='chroma'].head()

Unnamed: 0,database,chunk_size,overlap,clean,embedding,chunks_processed,time_to_process,used_memory_mb,query_time
0,chroma,100,0,True,nomic-embed-text,"(Collection(name=hnsw_index), 415)",17.220127,6.475861,8.222336
0,chroma,100,0,False,nomic-embed-text,"(Collection(name=hnsw_index), 423)",14.803474,6.475861,7.699875
0,chroma,100,50,True,nomic-embed-text,"(Collection(name=hnsw_index), 637)",20.736971,6.475861,8.375115
0,chroma,100,50,False,nomic-embed-text,"(Collection(name=hnsw_index), 669)",21.660947,6.475861,6.938353
0,chroma,300,0,True,nomic-embed-text,"(Collection(name=hnsw_index), 303)",12.789091,6.475861,7.682276


In [17]:
results.reset_index().loc[35, 'chunks_processed'][1]

293

In [7]:
results.to_csv('experiment_results.csv')