In [None]:
from src.ingest_redis import *
from src.ingest_chroma import *
from src.ingest_milvus import * 
from src.search import *
import time
import pandas as pd

In [None]:
# Experiment log schema: every benchmark run below contributes one row with these fields.
cols = [
    'database',
    'chunk_size',
    'overlap',
    'clean',
    'embedding',
    'chunks_processed',
    'time_to_process',
    'used_memory_mb',
    'query_time',
]

# Start with an empty frame; the benchmark cells append to it.
results = pd.DataFrame(columns=cols)

In [None]:
# Load the benchmark queries from example_queries.txt (one candidate per line).
with open('example_queries.txt', 'r') as file:
    queries = [
        # Keep only what follows the first '. ' when present
        # (presumably a leading list number such as "1. " -- verify against the file).
        entry.partition('. ')[2] if '. ' in entry else entry
        for entry in map(str.strip, file)
        # Blank lines and '#'-prefixed header lines are not queries.
        if entry and not entry.startswith('#')
    ]

# Print total count
print(f"Total queries: {len(queries)}")

In [None]:
# Benchmark Redis across chunk-size / overlap / clean combinations.
# For each combination: clear and refill the store, time the ingest, read the memory
# figure reported by Redis, time the full query set, and append one row to `results`.
embedding_model = 'nomic-embed-text'  # recorded in the results row; not passed to the Redis helpers in this cell
db = 'redis'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:

        # skip combinations where the overlap is not smaller than the chunk
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # clear store before starting
            clear_redis_store()

            # create and fill redis store (index creation is part of the timed section).
            # perf_counter() is a monotonic, high-resolution clock, so the interval cannot
            # be skewed by wall-clock adjustments the way time.time() can.
            start = time.perf_counter()
            create_hnsw_index()
            chunk_count = process_pdfs_redis("data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.perf_counter() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')


            # Get memory usage info (Redis reports used_memory in bytes; convert to MB)
            memory_info = redis_client.info('memory')
            used_memory = memory_info['used_memory']
            used_memory_mb = used_memory / (1024 * 1024)

            # test retrieval speed: total time to run every query once
            start = time.perf_counter()
            for query in queries:
                query_redis(query)
            to_search = time.perf_counter() - start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe (appended per run so partial results survive an interruption)
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, used_memory_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

In [None]:
# Benchmark Chroma across chunk-size / overlap / clean combinations.
# For each combination: build and fill a collection, time the ingest, estimate the
# size of the stored vectors, time the full query set, and append one row to `results`.
embedding_model = 'nomic-embed-text'
embedding_size = 768  # Adjust based on your embedding model
db = 'chroma'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        # skip combinations where the overlap is not smaller than the chunk
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # create and fill chroma collection (collection creation is part of the timed section).
            # perf_counter() is a monotonic, high-resolution clock suited to measuring intervals.
            start = time.perf_counter()
            collection = create_chroma_index(embedding_model)
            collection, chunk_count = process_pdfs_chroma(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.perf_counter() - start
            print(f'Index with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

            # Estimate memory as (number of stored vectors) x (dimensions) x (bytes per float32).
            # This counts raw vector payload only, so it is not directly comparable to the
            # used_memory figure Redis reports in the Redis cell.
            num_vectors = len(collection.get()["ids"])  # Number of stored items
            float_size = np.dtype(np.float32).itemsize  # 4 bytes per float

            memory_usage_bytes = num_vectors * embedding_size * float_size
            memory_usage_mb = memory_usage_bytes / (1024 * 1024)
            # report the MB value (previously the byte count was printed with an "MB" label)
            print(f"It uses {round(memory_usage_mb, 2)} MB")

            # test searching speed: all queries are passed to query_chroma in one call
            start = time.perf_counter()
            query_chroma(collection, queries)
            to_search = time.perf_counter() - start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe.
            # Store this run's own estimate (memory_usage_mb); the Redis cell's leftover
            # used_memory_mb variable must not be reused here.
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, memory_usage_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

In [None]:
# Benchmark Milvus across chunk-size / overlap / clean combinations.
# For each combination: build and fill a collection, time the ingest, estimate the
# size of the stored vectors, time the full query set, and append one row to `results`.
embedding_model = 'nomic-embed-text'
embedding_size = 768  # Adjust based on your embedding model
db = 'milvus'

# loop through different options
for chunk in [100,300,500,1000]:
    for overlap in [0, 50, 100]:
        # skip combinations where the overlap is not smaller than the chunk
        if overlap >= chunk:
            continue
        for clean in [True, False]:

            # create and fill milvus collection (collection creation is part of the timed section).
            # perf_counter() is a monotonic, high-resolution clock suited to measuring intervals.
            start = time.perf_counter()
            collection = create_milvus_collection(embedding_model)
            collection, chunk_count = process_pdfs_milvus(collection, "data/", chunk_size=chunk, overlap=overlap, clean = clean)
            to_fill = time.perf_counter() - start
            print(f'Collection with chunk size {chunk} and overlap {overlap} created in {round(to_fill, 2)} seconds')

            # Estimate memory as (number of stored vectors) x (dimensions) x (bytes per float32).
            # NOTE(review): .get()["ids"] mirrors the Chroma cell -- confirm that the object
            # returned by create_milvus_collection / process_pdfs_milvus supports this call.
            num_vectors = len(collection.get()["ids"])  # Number of stored items
            float_size = np.dtype(np.float32).itemsize  # 4 bytes per float

            memory_usage_bytes = num_vectors * embedding_size * float_size
            memory_usage_mb = memory_usage_bytes / (1024 * 1024)
            # report the MB value (previously the byte count was printed with an "MB" label)
            print(f"It uses {round(memory_usage_mb, 2)} MB")

            # test searching speed: all queries are passed to query_milvus in one call
            start = time.perf_counter()
            query_milvus(collection, queries)
            to_search = time.perf_counter() - start
            print(f'Search with chunk size {chunk} and overlap {overlap} completed in {round(to_search, 2)} seconds')

            # add results to result dataframe.
            # Store this run's own estimate (memory_usage_mb); the Redis cell's leftover
            # used_memory_mb variable must not be reused here.
            new_row = [db, chunk, overlap, clean, embedding_model, chunk_count, to_fill, memory_usage_mb, to_search]
            results = pd.concat([results, pd.DataFrame([new_row], columns=cols)], axis = 0)

In [None]:
# Preview the first rows logged for the Chroma runs.
results.loc[results['database'] == 'chroma'].head()

In [None]:
# Preview the first rows logged for the Chroma runs.
# NOTE(review): this filter is the same as the preceding cell's -- possibly another
# database was intended here; confirm.
results[results['database'].eq('chroma')].head()

In [None]:
# Spot-check: element [1] of the `chunks_processed` value in row 35 (0-based, after
# renumbering the rows positionally).
# NOTE(review): the row number is hard-coded and the stored value is assumed to be
# indexable -- looks like leftover debugging; confirm it is still needed.
results.reset_index(drop=True).loc[35, 'chunks_processed'][1]

In [None]:
# Persist the collected experiment log (the frame's index is written as the first column).
results.to_csv('experiment_results.csv', index=True)