In [82]:
import json
import re


with open("sahih_bukhari.json", "r") as file:
    data = json.load(file)

all_hadith = []

for volume in data:
    for book in volume["books"]:
        for hadith in book["hadiths"]:
            # hadith number is first number backwards in hadith['info'] string Eg "Volume 1, Book 1, Number 1 :"
            hadith_number = re.findall(r'\d+', hadith["info"])[-1]
            all_hadith.append({
                "id": f"{volume['name']}-Book-{book['name'].split()[0].strip('.')}-Hadith-{hadith_number}",
                "Volume": volume["name"],
                "Book": book["name"],
                "full_info_string": hadith["info"],
                "hadith_number": hadith_number,
                "text": hadith["text"],
                "narrated_by": hadith["by"],
                })
            

# print first 5 hadith
for hadith in all_hadith[:1]:
    print(f"ID: {hadith['id']}")
    print(f"Volume: {hadith['Volume']}")
    print(f"Book: {hadith['Book']}")
    print(f"Hadith Number: {hadith['hadith_number']}")
    print(f"Full Info String: {hadith['full_info_string']}")
    print(f"Narrated By: {hadith['narrated_by']}")
    print(f"Text: {hadith['text']}")
    
    print()

# print ids of first 100 hadith
for hadith in all_hadith[:100]:
    print(hadith['id'])


ID: Volume 1-Book-1-Hadith-1
Volume: Volume 1
Book: 1. Revelation
Hadith Number: 1
Full Info String:  Volume 1, Book 1, Number 1 :
Narrated By: Narrated by 'Umar bin Al-Khattab
Text: I heard Allah's Apostle saying, "The reward of deeds depends upon the intentions and every person will get the reward according to what he has intended. So whoever emigrated for worldly benefits or for a woman to marry, his emigration was for what he emigrated for."


Volume 1-Book-1-Hadith-1
Volume 1-Book-1-Hadith-2
Volume 1-Book-1-Hadith-3
Volume 1-Book-1-Hadith-4
Volume 1-Book-1-Hadith-5
Volume 1-Book-1-Hadith-6
Volume 1-Book-2-Hadith-7
Volume 1-Book-2-Hadith-8
Volume 1-Book-2-Hadith-9
Volume 1-Book-2-Hadith-10
Volume 1-Book-2-Hadith-11
Volume 1-Book-2-Hadith-12
Volume 1-Book-2-Hadith-13
Volume 1-Book-2-Hadith-14
Volume 1-Book-2-Hadith-15
Volume 1-Book-2-Hadith-16
Volume 1-Book-2-Hadith-17
Volume 1-Book-2-Hadith-18
Volume 1-Book-2-Hadith-19
Volume 1-Book-2-Hadith-20
Volume 1-Book-2-Hadith-21
Volume 1-Bo

In [None]:
import chromadb

client = chromadb.PersistentClient(path="sahih_bukhari_db")
collection = client.get_collection(name="sahih_bukhari_db")

print(f"Number of embeddings before emptying: {collection.count()}")

Number of embeddings before emptying: 0


In [None]:
import ollama
import chromadb


embeddings_list = []
metadatas_list = []
ids_list = []
documents_list = []

for hadith in all_hadith:
    response = ollama.embed(
        model="mxbai-embed-large",
        input=hadith["text"],
    )

    print(f"Response: {response}")
    embeddings_list.append(response["embeddings"][0])  # Extract the 'embedding' list
    metadatas_list.append({
        "Volume": hadith["Volume"],
        "Book": hadith["Book"],
        "hadith_number": hadith["hadith_number"],
        "narrated_by": hadith["narrated_by"]
    })
    ids_list.append(hadith["id"])
    documents_list.append(hadith["text"])
    print(f"ID: {hadith['id']}")

print(embeddings_list)

collection.add(
    embeddings=embeddings_list,  # Pass the list of embeddings directly
    metadatas=metadatas_list,
    ids=ids_list,
    documents=documents_list,
)

print("Embeddings added to ChromaDB.")

100
Response: model='mxbai-embed-large' created_at=None done=None done_reason=None total_duration=94802625 load_duration=9324833 prompt_eval_count=55 prompt_eval_duration=None eval_count=None eval_duration=None embeddings=[[-0.05402081, 0.010430036, -0.04812245, -0.007990584, -0.039170846, -0.026114943, 0.013774301, 0.04933454, 0.0076488843, -0.012067943, 0.04065354, 0.005512675, -0.0023703955, -0.005329825, -0.038968876, -0.049375515, -0.04037127, 0.011323509, -0.032330487, 0.03328555, -0.0024619054, 0.02212554, -0.045578327, 0.005769472, 0.026090566, -0.0031789881, 0.057307594, -0.008149664, 0.07607587, 0.07438157, -0.014405312, 0.02616309, 0.021642597, -0.07317127, 0.0029155274, -0.010955689, 0.0024354202, -0.10537089, 0.01725319, -0.050536472, -0.046994586, 0.0016634756, -0.020735193, 0.010001824, -0.050091885, -0.002652581, -0.023254965, -0.08175034, 0.043748725, -0.01959181, 0.022100724, -0.027480777, -0.007481333, -0.016800134, 0.06830394, 0.040307213, -0.002611858, 0.012183935,

In [None]:
import ollama
import chromadb

client = chromadb.PersistentClient(path="sahih_bukhari_db")
collection = client.get_collection(name="sahih_bukhari_db")
query = " intentions"

response = ollama.embed(
    model="mxbai-embed-large",
    input=query,
)

query_embedding = response["embeddings"][0]


results = collection.query(
    query_embeddings=query_embedding,
    n_results=1,
    include=["metadatas", "documents", "distances", "embeddings"] # Include embeddings
)
print("results", results)



results {'ids': [['Volume 1-Book-3-Hadith-57']], 'embeddings': [array([[ 0.0049017 , -0.01517427, -0.0224275 , ..., -0.02029693,
        -0.01141175,  0.01459433]], shape=(1, 1024))], 'documents': [['Once the Prophet remained behind us in a journey. He joined us while we were performing ablution for the prayer which was over-due. We were just passing wet hands over our feet (and not washing them properly) so the Prophet addressed us in a loud voice and said twice or thrice: "Save your heels from the fire." ']], 'uris': None, 'included': ['metadatas', 'documents', 'distances', 'embeddings'], 'data': None, 'metadatas': [[{'narrated_by': "Narrated by 'Abdullah bin 'Amr", 'Volume': 'Volume 1', 'Book': '3. Knowledge', 'hadith_number': '57'}]], 'distances': [[0.7138262987136841]]}
