In [1]:
%load_ext autoreload
%autoreload

In [2]:
import dotenv
import sys

# Root of the backend package: source of the .env file and of the
# `services` / `core` modules imported further down.
BACKEND_DIR = "/mnt/arrakis/sietch/projects/NavigAItor/backend"

dotenv.load_dotenv(f"{BACKEND_DIR}/.env")

# Guard the append so that re-running this cell does not keep growing sys.path.
if BACKEND_DIR not in sys.path:
    sys.path.append(BACKEND_DIR)

In [14]:
import asyncio
import time

import lancedb
import logfire
import openai
import pandas as pd
import pyarrow as pa
import tiktoken
from pydantic import BaseModel
from tqdm.notebook import tqdm
from unstructured.partition.pdf import partition_pdf

from core.config import settings
from services import mongo, docstore

# Async OpenAI client shared by every embedding request in this notebook.
openai_client = openai.AsyncOpenAI(api_key=settings.OPENAI_API_KEY)

# Configure Logfire with the Pydantic plugin recording all validations, then
# instrument the OpenAI client so embedding calls show up in the trace log.
logfire.configure(pydantic_plugin=logfire.PydanticPlugin(record="all"))
# Trailing semicolon: instrument_openai returns a context manager, and without
# the semicolon its repr is echoed as the cell's output.
logfire.instrument_openai(openai_client);

<contextlib._GeneratorContextManager at 0x790b9e0bba90>

In [4]:
class Chunk(BaseModel):
    """One embedded piece of a document, validated before it is written to the vector table.

    Instances are built in the ingestion loop from a chunk dict plus its
    embedding and immediately converted back to plain dicts with
    ``model_dump()`` to form DataFrame rows.
    """

    # Identifier of the source document (taken from the downloaded document's `id`).
    document_id: str
    # Name of the source document as stored in its metadata.
    document_name: str
    # Tags copied from the source document's metadata.
    tags: list[str]
    # Document type copied from the source document's metadata.
    document_type: str
    # Label for this chunk, built from the document name and the chunk index.
    title: str
    # Text content of the chunk; this is what gets embedded.
    text: str
    # Embedding of `text` returned by the embeddings API.
    vector: list[float]
    # Optional image payload; never set by the ingestion loop in this notebook.
    image: str | None = None


def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens ``text`` occupies under a tiktoken encoding.

    Args:
        text: The text to measure.
        encoding_name: Name of the tiktoken encoding to use. Defaults to
            ``"cl100k_base"``.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    # Document text can literally contain strings such as "<|endoftext|>".
    # With its default settings tiktoken raises on those; disabling the check
    # makes them count as ordinary text instead of aborting the count.
    return len(tokenizer.encode(text, disallowed_special=()))


async def get_embeddings(chunks, batch_size: int = 2048):
    """Embed the ``"text"`` of every chunk with ``text-embedding-3-small``.

    Args:
        chunks: Sequence of mappings, each with a ``"text"`` entry to embed.
        batch_size: Maximum number of texts sent per API request. The default
            of 2048 matches the embeddings endpoint's documented limit on the
            input array length.

    Returns:
        A list of embedding vectors, one per chunk, in the same order as
        ``chunks``. An empty ``chunks`` yields ``[]`` without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = [chunk["text"] for chunk in chunks]
    embeddings = []
    # One request per slice: zero requests for empty input, a single request
    # (the previous behaviour) whenever everything fits in one batch.
    for start in range(0, len(texts), batch_size):
        response = await openai_client.embeddings.create(
            input=texts[start:start + batch_size],
            model="text-embedding-3-small",
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings

In [5]:
# Owner of the documents; passed to the download helper together with each id.
user_id = "668340acd280347f6d24eb65"

# Identifiers of the 48 documents to download, chunk and embed.
doc_ids = [
    "66834372d280347f6d24ebd6",
    "66834374d280347f6d24ebd8",
    "66834375d280347f6d24ebda",
    "66834376d280347f6d24ebdc",
    "66834377d280347f6d24ebde",
    "66834378d280347f6d24ebe0",
    "66834379d280347f6d24ebe2",
    "6683437ad280347f6d24ebe4",
    "6683437bd280347f6d24ebe6",
    "6683437cd280347f6d24ebe9",
    "6683437dd280347f6d24ebec",
    "6683437ed280347f6d24ebee",
    "6683437fd280347f6d24ebf0",
    "66834380d280347f6d24ebf2",
    "66834381d280347f6d24ebf4",
    "66834382d280347f6d24ebf6",
    "66834384d280347f6d24ebf8",
    "66834385d280347f6d24ebfa",
    "66834386d280347f6d24ebfc",
    "66834387d280347f6d24ebfe",
    "66834388d280347f6d24ec00",
    "66834389d280347f6d24ec02",
    "6683438ad280347f6d24ec04",
    "6683438bd280347f6d24ec06",
    "6683438cd280347f6d24ec08",
    "6683438dd280347f6d24ec0a",
    "6683438ed280347f6d24ec0c",
    "6683438fd280347f6d24ec0e",
    "66834390d280347f6d24ec10",
    "66834392d280347f6d24ec12",
    "66834393d280347f6d24ec14",
    "66834394d280347f6d24ec16",
    "66834395d280347f6d24ec18",
    "66834396d280347f6d24ec1a",
    "66834397d280347f6d24ec1c",
    "66834398d280347f6d24ec1e",
    "66834399d280347f6d24ec20",
    "6683439ad280347f6d24ec22",
    "6683439bd280347f6d24ec24",
    "6683439cd280347f6d24ec26",
    "6683439ed280347f6d24ec28",
    "6683439fd280347f6d24ec2a",
    "668343a0d280347f6d24ec2c",
    "668343a1d280347f6d24ec2e",
    "668343a2d280347f6d24ec30",
    "668343a3d280347f6d24ec32",
    "668343a5d280347f6d24ec34",
    "668343a6d280347f6d24ec36",
]

db = await mongo.get_db(settings)
s3 = docstore.create_client(settings)

documents = []
for doc_id in tqdm(doc_ids, desc="Downloading documents"):
    doc, path = await docstore.download_doc(doc_id, user_id, db, s3, settings)
    documents.append({**doc.model_dump(), "path": path})

Downloading documents:   0%|          | 0/48 [00:00<?, ?it/s]

In [22]:
uri = "/home/muaddib/sietch/projects/NavigAItor/backend/data/user-documents"
db = await lancedb.connect_async(uri)

# DB_NAME = "8k_docs"
DB_NAME = "embedded_docs"
# await db.drop_table(DB_NAME)

# tbl = await db.create_table(
#     DB_NAME, 
#     schema=pa.schema([
#         pa.field("document_id", pa.string()),
#         pa.field("document_name", pa.string()),
#         pa.field("tags", pa.list_(pa.string())),
#         pa.field("document_type", pa.string()),
#         pa.field("title", pa.string()),
#         pa.field("text", pa.string()),
#         pa.field("vector", pa.list_(pa.float32(), list_size=1536)),
#         pa.field("image", pa.string()),
#     ])
# )

tbl = await db.open_table(DB_NAME)

In [23]:
for doc in tqdm(documents):
    doc_chunks = [
        {
            "document_id": doc['id'],
            "document_name": doc['metadata']['name'],
            "document_type": doc['metadata']['document_type'],
            "tags": doc['metadata']['tags'],
            "title": f"{doc['metadata']['name'].split()[0]} - Chunk {i}",
            "text": str(e)
        } for i, e in enumerate(partition_pdf(
            doc['path'], 
            chunking_strategy="by_title",
            include_metadata=True
        ))
    ]
    embeddings = await get_embeddings(doc_chunks)
 
    chunks = pd.DataFrame([
        Chunk(**chunk, vector=embedding).model_dump()
        for chunk, embedding in zip(doc_chunks, embeddings)
    ])
    await tbl.add(chunks)
    time.sleep(2)

  0%|          | 0/48 [00:00<?, ?it/s]

01:30:46.450 Embedding Creation with 'text-embedding-3-small' [LLM]
01:30:46.940 Pydantic Chunk validate_python
01:30:47.061 Pydantic Chunk validate_python
01:30:47.151 Pydantic Chunk validate_python
01:30:47.232 Pydantic Chunk validate_python
01:30:47.310 Pydantic Chunk validate_python
01:30:47.382 Pydantic Chunk validate_python
01:30:47.454 Pydantic Chunk validate_python
01:30:47.525 Pydantic Chunk validate_python
01:30:47.597 Pydantic Chunk validate_python
01:30:47.674 Pydantic Chunk validate_python
01:30:49.886 Embedding Creation with 'text-embedding-3-small' [LLM]
01:30:50.278 Pydantic Chunk validate_python
01:30:50.333 Pydantic Chunk validate_python
01:30:50.388 Pydantic Chunk validate_python
01:30:50.444 Pydantic Chunk validate_python
01:30:50.507 Pydantic Chunk validate_python
01:30:50.575 Pydantic Chunk validate_python
01:30:50.649 Pydantic Chunk validate_python
01:30:50.715 Pydantic Chunk validate_python
01:30:50.778 Pydantic Chunk validate_python
01:30:50.843 Pydantic Chunk 

In [24]:
TOP_K = 3

query = "how much did nvidia stock sell for in 2022"
r = await openai_client.embeddings.create(
    input=query, model="text-embedding-3-small"
)
q_embed = r.data[0].embedding

results = await tbl.vector_search(q_embed).limit(TOP_K).to_pandas()
for r in results.to_dict(orient='records'):
    print(">"*10)
    print(r['text'])

01:34:12.141 Embedding Creation with 'text-embedding-3-small' [LLM]
>>>>>>>>>>
On May 21, 2021, the board of directors of NVIDIA Corporation, or the Company, declared a four-for-one split of Company’s common stock in the form of a stock dividend, conditioned on obtaining stockholder approval at the Company’s 2021 Annual Meeting of Stockholders to be held on June 3, 2021, of an amendment to NVIDIA’s Amended and Restated Certificate of Incorporation to increase the number of authorized shares of common stock to 4 billion shares. The press release announcing the stock
>>>>>>>>>>
beginning April 1, 2021 and ending March 31, 2022 and (v) issue restricted stock units covering NVIDIA Stock with an aggregate value not to exceed $1.5 billion to employees of Arm following the Closing, in each case in accordance with the terms of the Purchase Agreement.
>>>>>>>>>>
closing price of NVIDIA Stock for the 30 trading days ending one trading day prior to the date of the Purchase Agreement; (iv) pay to 

In [21]:
# Show the retrieved rows as records, smallest `_distance` first.
results.sort_values("_distance").to_dict("records")



[{'document_id': '66834375d280347f6d24ebda',
  'document_name': '12.pdf',
  'tags': array([], dtype=object),
  'document_type': 'upload',
  'title': '12.pdf - Chunk 11',
  'text': 'beginning April 1, 2021 and ending March 31, 2022 and (v) issue restricted stock units covering NVIDIA Stock with an aggregate value not to exceed $1.5 billion to employees of Arm following the Closing, in each case in accordance with the terms of the Purchase Agreement.',
  'vector': array([-0.008442  , -0.00503848,  0.06550023, ...,  0.00671797,
         -0.0016445 ,  0.03025632], dtype=float32),
  'image': None,
  '_distance': 0.7756586670875549},
 {'document_id': '66834375d280347f6d24ebda',
  'document_name': '12.pdf',
  'tags': array([], dtype=object),
  'document_type': 'upload',
  'title': '12.pdf - Chunk 10',
  'text': 'closing price of NVIDIA Stock for the 30 trading days ending one trading day prior to the date of the Purchase Agreement; (iv) pay to the Sellers earn out payments of up to $5 billion