In [27]:
import os
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
import json
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
from html_to_markdown import convert, convert_with_inline_images

# Data Collection

In [29]:
# Confluence REST v2 endpoint for the single page used as the knowledge source.
url = "https://pokarnah.atlassian.net/wiki/api/v2/pages/65706"

# Fail fast when the API token is missing: passing None on to HTTPBasicAuth
# would only surface later as an authentication error from the server.
confluence_api_key = os.environ.get("CONFLUENCE_API_KEY")
if not confluence_api_key:
    raise RuntimeError(
        "CONFLUENCE_API_KEY is not set; export it or add it to the .env file"
    )

auth = HTTPBasicAuth("pokarnah@gmail.com", confluence_api_key)

headers = {
  "Accept": "application/json"
}

# Request the page body in Confluence "storage" format.
params = {
    "body-format": "storage"
}


In [30]:
# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection (requests applies no timeout by default).
response = requests.get(
    url,
    headers=headers,
    auth=auth,
    params=params,
    timeout=30,
)
# Stop here on 4xx/5xx instead of handing an error payload to the parsing cell.
response.raise_for_status()

In [31]:
# Decode the JSON payload and pull out the page body markup.
data = json.loads(response.text)
html = data['body']['storage']['value']
# Convert the markup to Markdown, then split on single spaces so the text can
# be grouped into fixed-size chunks further down.
dataset = convert(html=html).split(' ')

# Chunking

In [32]:
# NOTE(review): no cell visible in this notebook reads EMBEDDING_URL; the
# embeddings below are requested through InferenceClient instead.
EMBEDDING_URL = "https://router.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"

In [33]:
from huggingface_hub import InferenceClient

In [34]:
# Client used for embeddings, routed through the "nebius" provider.
# The key is read with .get(), so a missing HF_API_KEY is passed on as None.
client = InferenceClient(provider="nebius", api_key=os.environ.get('HF_API_KEY'))

### Fixed Size Chunking

In [35]:
def create_embedding(chunk):
    """Embed ``chunk`` with the Qwen3 embedding model via the shared ``client``.

    Args:
        chunk: Text to embed.

    Returns:
        The first element of the ``feature_extraction`` response.
        NOTE(review): presumably the response is batch-shaped, so element 0
        is the embedding vector for ``chunk`` -- confirm against the
        provider's actual output shape.
    """
    features = client.feature_extraction(chunk, model="Qwen/Qwen3-Embedding-8B")
    return features[0]

In [36]:
chunk_size = 64   # number of space-separated tokens per chunk
VECTOR_DB = []    # list of (chunk_text, embedding) pairs; rebuilt on every run

# Walk the token list in strides of chunk_size; the last chunk may be shorter.
for start in range(0, len(dataset), chunk_size):
    chunk = ' '.join(dataset[start:start + chunk_size])
    embedding = create_embedding(chunk=chunk)
    VECTOR_DB.append((chunk, embedding))

# Retrieval

In [37]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector (a sequence or 1-D array of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms.

    Raises:
        ValueError: If the vectors differ in length, or if either vector has
            zero norm (the similarity is undefined in that case).
    """
    if len(a) != len(b):
        raise ValueError("Vectors of unequal length")

    a = np.array(a)
    b = np.array(b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # Guard the division: a zero vector has no direction to compare.
    if norm_a == 0 or norm_b == 0:
        raise ValueError("Zero in denominator")
    return np.dot(a, b) / (norm_a * norm_b)
    

In [38]:
# Quick sanity check of cosine_similarity on two small vectors.
cosine_similarity([1, 2], [2, 3])

0.9922778767136677

In [39]:
def retreive(query_embedding, top_n=3):
    """Return the ``top_n`` stored chunks most similar to ``query_embedding``.

    Args:
        query_embedding: Embedding vector of the query.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(chunk, similarity)`` tuples taken from ``VECTOR_DB``,
        ordered from most to least similar.
    """
    # Convert the query once; it is the same for every stored chunk.
    query_vector = np.array(query_embedding)
    scored_chunks = [
        (chunk, cosine_similarity(query_vector, np.array(embedding)))
        for chunk, embedding in VECTOR_DB
    ]
    scored_chunks.sort(key=lambda pair: pair[1], reverse=True)
    return scored_chunks[:top_n]

In [40]:
# Read the question interactively (the cell blocks until a line is entered),
# then embed it with the same function used for the stored chunks.
query = input()
query_embedding = create_embedding(query)

In [41]:
# Rank the stored chunks against the query, then keep only the chunk texts
# (similarity scores dropped), one per line, as context for the prompt.
retreived_knowledge = retreive(query_embedding=query_embedding)
prompt_knowledge = '\n'.join([chunk for chunk, _ in retreived_knowledge])

# Answer Generation Phase

In [42]:
# Separate client for answer generation; indexing os.environ raises KeyError
# right here if HF_API_KEY is not set.
generation_client = InferenceClient(api_key=os.environ["HF_API_KEY"])


In [43]:
# Build the grounded-answer prompt: retrieved context, then the question,
# then the instruction. The instruction pieces are deliberately plain strings
# (not f-strings) so that "{NONE}" reaches the model verbatim.
# The apostrophe in "doesn't" replaces a mis-encoded character sequence that
# was previously being sent to the model.
prompt = (
    f"DOCUMENT: {prompt_knowledge}\n"
    f"QUESTION: {query}\n\n\n"
    "Answer the users QUESTION using the DOCUMENT text above. "
    "Keep your answer ground in the facts of the DOCUMENT. "
    "If the DOCUMENT doesn't contain the facts to answer the QUESTION return {NONE}"
)

In [44]:
# Single-turn chat completion: the whole prompt goes in one user message.
# Note: this rebinds `response`, which earlier held the Confluence HTTP reply.
response = generation_client.chat.completions.create(
    model="google/gemma-3-27b-it",
    messages=[{"role": "user", "content": prompt}],
)


In [45]:
# Show the first choice's full message object (role, content and metadata fields).
print(response.choices[0].message)

ChatCompletionOutputMessage(role='assistant', content='Tenants in ChromaDB represent an organization or individual using ChromaDB. Each tenant logically groups together a set of databases, embeddings, documents, and metadata. A single tenant can manage multiple databases.', reasoning=None, tool_call_id=None, tool_calls=[], refusal=None, annotations=None, audio=None, function_call=None, reasoning_content=None)
