In [1]:
import os
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
import json
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from html_to_markdown import convert, convert_with_inline_images

# Data Collection

In [3]:
CONFLUENCE_SPACE_ID = '163973'
ALL_PAGES_IN_SPACE_URL = f"https://pokarnah.atlassian.net/wiki/api/v2/spaces/{CONFLUENCE_SPACE_ID}/pages"

auth = HTTPBasicAuth("pokarnah@gmail.com", os.environ.get("CONFLUENCE_API_KEY"))

headers = {
  "Accept": "application/json"
}



In [4]:
response = requests.request(
    "GET",
    ALL_PAGES_IN_SPACE_URL,
    headers=headers,
    auth=auth
)

confluence_page_ids = []
if response.status_code == 200:
    data = json.loads(response.text)
    results = data['results']
    for result in results:
        confluence_page_ids.append(result['id'])

print(confluence_page_ids)

['65706', '164078', '2555906', '2555913', '2555975', '2588685', '2654212', '2654222', '2719752', '2719770', '2719786', '2818118', '2883592', '2916355', '2981895', '3080227', '3112961', '3211265', '3211289', '3244043', '3276801', '3375105', '3375122']


In [5]:
params = {
    "body-format": "storage"
}

dataset = []

for page in confluence_page_ids:
    PAGE_URL = f"https://pokarnah.atlassian.net/wiki/api/v2/pages/{page}"
    
    response = requests.request(
        "GET",
        PAGE_URL,
        headers=headers,
        params=params,
        auth=auth
    )

    if response.status_code == 200:
        data = json.loads(response.text)
        html = data['body']['storage']['value']
        text = convert(html=html)
        text = text.split(' ')
        dataset.extend(text)


print(len(dataset))

25659


In [6]:
from openai import OpenAI

# Chunking

In [None]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### Fixed Size Chunking

In [None]:
def create_embedding(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding


In [11]:
chunk_size = 512
VECTOR_DB = []

for i in range(0, len(dataset), chunk_size):
    chunk = dataset[i:i+chunk_size]
    chunk = ' '.join(chunk)
    embedding = create_embedding(text=chunk)
    print(len(embedding))
    VECTOR_DB.append((chunk, embedding))

1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536
1536


# Retrival

In [12]:
def cosine_similarity(a, b):
    if len(a) != len(b):
        raise Exception("Vectors of unequal length")

    a = np.array(a)
    b = np.array(b)
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    if norm_a == 0 or norm_b == 0:
        raise Exception("Zero in denominator")
    similarity = dot_product / (norm_a * norm_b)
    return similarity
    

In [13]:
cosine_similarity([1, 2], [2, 3])

0.9922778767136677

In [14]:
def retreive(query_embedding, top_n=3):
    similarities = []
    for chunk, embedding in VECTOR_DB:
        query_embedding = np.array(query_embedding)
        embedding = np.array(embedding)
        similarity = cosine_similarity(query_embedding, embedding)
        similarities.append((chunk, similarity))

    similarities.sort(key=lambda x:x[1], reverse=True)
    return similarities[:top_n]

In [15]:
query = input()
query_embedding = create_embedding(query)

In [16]:
retreived_knowledge = retreive(query_embedding=query_embedding)
prompt_knowledge = '\n'.join(chunk for chunk, similarity in retreived_knowledge) 

# Answer Generation Phase

In [17]:
prompt = f"DOCUMENT: {prompt_knowledge}" + "\n" + f"QUESTION: {query}" + "\n\n\n" + "Answer the users QUESTION using the DOCUMENT text above. Keep your answer ground in the facts of the DOCUMENT. If the DOCUMENT doesnâ€™t contain the facts to answer the QUESTION return {NONE}"

In [18]:
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": prompt
        }
    ],
)


In [19]:
print(response.choices[0].message)

ChatCompletionMessage(content='Chroma DB is an open-source vector database designed for efficiently storing, searching, and managing vector embeddings, which are numeric representations used in AI and machine learning for tasks like semantic search and recommendation systems. It enables fast similarity search and offers a simple API for developers, making it well-suited for building and deploying AI-driven applications. Chroma DB supports features such as vector storage and querying, ease of use with a Python-based API, flexible storage options, and integration with popular embedding models from platforms like Hugging Face and OpenAI.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)
