In [71]:
import os
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
import json
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from html_to_markdown import convert, convert_with_inline_images

# Data Collection

In [3]:
# Confluence REST v2 endpoint for the single page used as the knowledge source.
url = "https://pokarnah.atlassian.net/wiki/api/v2/pages/65706"

# Fail fast when the key is missing: os.environ.get() would otherwise hand
# None to HTTPBasicAuth and the problem would only show up as a rejected request.
confluence_api_key = os.environ.get("CONFLUENCE_API_KEY")
if not confluence_api_key:
    raise RuntimeError(
        "CONFLUENCE_API_KEY is not set; export it or add it to the .env file"
    )

# Basic auth with the account e-mail as the user and the API key as the password.
auth = HTTPBasicAuth("pokarnah@gmail.com", confluence_api_key)

headers = {
    "Accept": "application/json"
}

# Request the page body in the "storage" format; the parsing cell reads
# data['body']['storage']['value'].
params = {
    "body-format": "storage"
}


In [4]:
# Fetch the page. A timeout keeps the cell from hanging indefinitely if the
# server never answers.
response = requests.get(
    url,
    headers=headers,
    auth=auth,
    params=params,
    timeout=30,
)

# Surface HTTP errors (bad credentials, unknown page, ...) here instead of as
# a KeyError later when the body is parsed.
response.raise_for_status()

In [31]:
# Decode the JSON payload and pull out the page body (storage format).
data = json.loads(response.text)
html = data['body']['storage']['value']

# Convert the body to markdown, then split on single spaces; the resulting
# list of tokens is what the chunking loop slices into fixed-size chunks.
dataset = convert(html=html).split(' ')

# Chunking

In [6]:
# NOTE(review): no cell visible here references this constant; embeddings are
# requested through InferenceClient with a different model name — confirm
# whether it is still needed.
EMBEDDING_URL = 'https://router.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2'

In [7]:
from huggingface_hub import InferenceClient

In [8]:
# Shared Hugging Face inference client, routed through the "nebius" provider
# and authenticated with the HF_API_KEY environment variable.
client = InferenceClient(provider="nebius", api_key=os.environ.get('HF_API_KEY'))

### Fixed Size Chunking

In [74]:
def create_embedding(chunk):
    """Embed one piece of text with the shared inference client.

    Args:
        chunk: Text to embed.

    Returns:
        The first element of the ``feature_extraction`` response.
        NOTE(review): presumably the response carries a leading batch axis,
        so this is the embedding vector for ``chunk`` — confirm against the
        provider's output shape.
    """
    response = client.feature_extraction(chunk, model="Qwen/Qwen3-Embedding-8B")
    return response[0]

In [75]:
# Number of space-separated tokens per chunk.
chunk_size = 64
# In-memory store of (chunk_text, embedding) pairs; rebuilt from scratch each
# time this cell runs.
VECTOR_DB = []

for start in range(0, len(dataset), chunk_size):
    # Re-join the token window into a single string before embedding it.
    chunk = ' '.join(dataset[start:start + chunk_size])
    VECTOR_DB.append((chunk, create_embedding(chunk=chunk)))

# Retrieval

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two one-dimensional vectors.

    Args:
        a: First vector (any sequence or array of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a NumPy float.

    Raises:
        ValueError: If either input is not one-dimensional, if the lengths
            differ, or if either vector has zero norm.
    """
    # Convert first so the checks below work on array shapes rather than on
    # len(), which only looks at the outermost dimension.
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    if vec_a.ndim != 1 or vec_b.ndim != 1:
        raise ValueError("Expected one-dimensional vectors")
    if vec_a.shape != vec_b.shape:
        raise ValueError("Vectors of unequal length")

    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        # Cosine similarity is undefined for a zero vector.
        raise ValueError("Zero in denominator")

    return np.dot(vec_a, vec_b) / (norm_a * norm_b)
    

In [82]:
# Quick sanity check of cosine_similarity on two small vectors.
cosine_similarity([1, 2], [2, 3])

0.9922778767136677

In [83]:
def retreive(query_embedding, top_n=3):
    """Return the stored chunks most similar to a query embedding.

    Args:
        query_embedding: Embedding of the query; converted to a NumPy array.
        top_n: Maximum number of results to return (default 3).

    Returns:
        A list of ``(chunk, similarity)`` tuples from the module-level
        ``VECTOR_DB``, sorted by descending similarity and cut to ``top_n``.
    """
    # Convert once up front; the query vector is the same for every chunk.
    query_vector = np.array(query_embedding)

    similarities = [
        (chunk, cosine_similarity(query_vector, np.array(embedding)))
        for chunk, embedding in VECTOR_DB
    ]

    # Highest similarity first.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return similarities[:top_n]

In [86]:
# Read the question interactively; input() is called without a prompt string.
query = input()
# Embed the query with the same helper used for the chunks, so both sides of
# the similarity comparison come from the same model.
query_embedding = create_embedding(query)

In [87]:
# Rank the stored chunks against the query embedding; top_n is left at the
# default declared by retreive.
retreived_knowledge = retreive(query_embedding=query_embedding)
# Prints the raw list of (chunk, similarity) tuples.
print(retreived_knowledge)


[('our embeddings, documents and metadata. Importantly, collections do not require a predefined schema, we can start storing data immediately which makes ChromaDB flexible for various use cases.\n\n- **Documents**: Documents are the raw chunks of text we store in ChromaDB. Each document is associated with an embedding (a numerical representation of its content). We can query these documents directly making retrieval efficient and intuitive.\n\n## Implementation\n\nLets', 0.6645738947387166), ('or custom model. For example, a sentence like "The cat is on the mat" can be transformed into a numerical vector using a model like BERT or SentenceTransformers.\n\n2. **Storing Embeddings:** The embeddings are stored in a ChromaDB collection, along with optional metadata like document ID, category or timestamp and unique identifiers.\n\n3. **Querying:** Users can query the database by providing a vector or raw data', 0.6631374242959193), ('back their details and identifiers which can be used for