In [1]:
import pandas as pd
import chromadb 
import os 
from tqdm import tqdm 

In [None]:
%time df = pd.read_csv('merged_data.csv')

In [None]:
df.head()

In [6]:
df.shape

(680132, 3)

In [2]:
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(
    name="collection_name",
    metadata={"hnsw:space": "cosine"}
)

In [8]:
import ast
batch_size = 1000

num_batches = (len(df) + batch_size - 1) // batch_size
df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

for i in tqdm(range(num_batches), desc="Adding data to ChromaDB"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    
    batch_df = df.iloc[start_idx:end_idx]
    
    collection.add(
        documents=batch_df['Text'].tolist(),
        embeddings=batch_df['Embedding'].tolist(),
        ids=batch_df['Id'].tolist()
    )

print("Data added to ChromaDB collection successfully.")

Adding data to ChromaDB: 100%|██████████████| 681/681 [1:18:08<00:00,  6.88s/it]

Data added to ChromaDB collection successfully.





In [3]:
coll = client.get_collection('collection_name')
coll.get(include=['documents'])

{'ids': ['id_0',
  'id_1',
  'id_10',
  'id_100',
  'id_1000',
  'id_10000',
  'id_100000',
  'id_100001',
  'id_100002',
  'id_100003',
  'id_100004',
  'id_100005',
  'id_100006',
  'id_100007',
  'id_100008',
  'id_100009',
  'id_10001',
  'id_100010',
  'id_100011',
  'id_100012',
  'id_100013',
  'id_100014',
  'id_100015',
  'id_100016',
  'id_100017',
  'id_100018',
  'id_100019',
  'id_10002',
  'id_100020',
  'id_100021',
  'id_100022',
  'id_100023',
  'id_100024',
  'id_100025',
  'id_100026',
  'id_100027',
  'id_100028',
  'id_100029',
  'id_10003',
  'id_100030',
  'id_100031',
  'id_100032',
  'id_100033',
  'id_100034',
  'id_100035',
  'id_100036',
  'id_100037',
  'id_100038',
  'id_100039',
  'id_10004',
  'id_100040',
  'id_100041',
  'id_100042',
  'id_100043',
  'id_100044',
  'id_100045',
  'id_100046',
  'id_100047',
  'id_100048',
  'id_100049',
  'id_10005',
  'id_100050',
  'id_100051',
  'id_100052',
  'id_100053',
  'id_100054',
  'id_100055',
  'id_100056'

In [4]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-mpnet-base-v2')
query = "I like to eat pizza"
query_embeddings = model.encode(query)
result = collection.query(
    query_embeddings = query_embeddings.tolist(),
    n_results=10,
    include = ['documents']
)

In [6]:
def transform_chunk(chunk):
    transformed_chunk = ' '.join(''.join(token.split('#')) for token in chunk.split())
    sentences = transformed_chunk.split('.')
    transformed_chunk = '. '.join(sentences) + '.'  
    return transformed_chunk

In [7]:
for doc_num, chunk_list in enumerate(result['documents'], start=1):
    print(f"Document {doc_num}:")
    for i, chunk in enumerate(chunk_list, start=1):
        transformed_chunk = transform_chunk(chunk)
        print(f"Document {i}:")
        print(transformed_chunk)
        print('-' * 50)  
    print('=' * 50)  


Document 1:
Document 1:
ed yeah that was me i could use a bread bowl right about now thanks for asking your e hungry you know tonight would be a good night for a cooking lesson all right yeah see you later so basically were making a fancy pizza ah ah no its mushroom rico tta gale tte its french mm dough cheese sauce that s a pizza it is a pizza isn t it yeah but you know and its the best pizza that you ll ever taste ill be the judge of that yeah you quick to judge imagine that judging me judge so how d you get into cooking tell you what im gonna cut mushrooms you sh red cheese and i will tell you the story about when my ex wife was in medical school and i was home a lot alone in the early years of our marriage and in order to keep myself entertained and a lot less lonely i started watching the cooking channel and i don t know i started challenging myself while making more elaborate meals and it sort of became a hobby what s a hobby okay i know your e joking but you know i haven t heard