In [1]:
%load_ext autoreload
%autoreload

In [2]:
import dotenv
import sys

# Root of the backend checkout. Kept in one place so the .env location and the
# import path cannot drift apart.
BACKEND_DIR = "/mnt/arrakis/sietch/projects/NavigAItor/backend"

# Load environment variables from the backend's .env file.
dotenv.load_dotenv(f"{BACKEND_DIR}/.env")

# Make the backend importable. The membership check keeps re-runs of this cell
# from appending the same entry to sys.path over and over.
if BACKEND_DIR not in sys.path:
    sys.path.append(BACKEND_DIR)

In [3]:
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
from pydantic import BaseModel, Field, ValidationError
from instructor.retry import InstructorRetryException
from pdf2image import convert_from_path
from tqdm.notebook import tqdm
from PIL import Image
import pandas as pd
import pyarrow as pa
import instructor
import anthropic
import lancedb
import openai
import logfire
import asyncio
import rich

from io import BytesIO
import base64
import time

from services import mongo, docstore
from core.config import Settings, get_settings

# Project settings; they supply the API keys used for the two clients below.
settings: Settings = get_settings()

# Async SDK clients shared by the description and embedding helpers.
openai_client = openai.AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
anthropic_client = anthropic.AsyncAnthropic(api_key=settings.ANTHROPIC_API_KEY)

# Configure Logfire, then instrument both clients so their requests are logged.
logfire.configure(pydantic_plugin=logfire.PydanticPlugin(record="all"))
logfire.instrument_anthropic(anthropic_client)
# Trailing semicolon: the call returns a context manager whose repr would
# otherwise be echoed as this cell's output.
logfire.instrument_openai(openai_client);

<contextlib._GeneratorContextManager at 0x71233325b4c0>

In [4]:
class Chunk(BaseModel):
    """One slide of a document, ready to be stored as a row of the vector table."""

    # Identifier and display name of the source document.
    document_id: str
    document_name: str
    # Tags and document type, copied from the document's metadata.
    tags: list[str]
    document_type: str
    # Slide label (e.g. "Slide 3") and the model-generated description of the slide.
    title: str
    text: str
    # Embedding computed from `text`.
    vector: list[float]
    # Base64-encoded JPEG of the slide; optional.
    image: str | None = None


def image_to_b64(image: "Image.Image") -> str:
    """Encode a PIL image as a base64 JPEG string.

    Args:
        image: The image to encode. Images in a mode the JPEG encoder cannot
            write (for example ``RGBA`` or ``P``) are converted to ``RGB`` first.

    Returns:
        The JPEG bytes of ``image``, base64-encoded and returned as text.
    """
    # Modes Pillow's JPEG writer accepts as-is. Anything else (alpha channel,
    # palette, ...) makes `save` raise OSError, so flatten it to RGB beforehand.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def b64_to_image(b64: str) -> Image.Image:
    """Decode a base64 string and open the resulting bytes as a PIL image."""
    raw_bytes = base64.b64decode(b64)
    stream = BytesIO(raw_bytes)
    return Image.open(stream)


@retry(
    # The request below goes through the Anthropic SDK, so its transient
    # failures are the ones worth retrying. InstructorRetryException is kept
    # so the previous predicate still matches.
    retry=retry_if_exception_type((
        InstructorRetryException,
        anthropic.APIConnectionError,
        anthropic.RateLimitError,
        anthropic.InternalServerError,
    )),
    stop=stop_after_attempt(3),
    wait=wait_fixed(1),
    # After the last attempt, surface the underlying error instead of
    # tenacity's RetryError wrapper.
    reraise=True,
)
async def describe_slide(index: int, image64: str) -> dict:
    """Ask Claude to describe a single slide image.

    Args:
        index: Zero-based position of the slide within its document.
        image64: The slide as a base64-encoded JPEG.

    Returns:
        A dict with the keys ``"index"`` (the given index), ``"image"`` (the
        given base64 string), ``"title"`` (``"Slide <index + 1>"``) and
        ``"text"`` (the text of the first content block of the response).

    Raises:
        The last underlying exception once three attempts have failed; other
        exceptions propagate immediately without a retry.
    """
    response = await anthropic_client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        temperature=0.1,
        system="""\
Your task is to describe the content of a slide from a presentation.
""",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe the content of the slide."
                    },
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": image64,
                        },
                    }
                ]
            },
        ],
    )
    return {
        "index": index,
        "image": image64,
        # Titles are one-based for display.
        "title": f"Slide {index + 1}",
        "text": response.content[0].text
    }


async def rl_describe_slide(index, image64, sem):
    """Run ``describe_slide`` while holding ``sem``, which caps concurrent calls."""
    async with sem:
        description = await describe_slide(index, image64)
    return description


async def describe_slides(images, semaphore=None):
    """Describe every slide image concurrently.

    Args:
        images: Iterable of PIL images, one per slide; the position of an image
            becomes its slide index.
        semaphore: Optional semaphore limiting how many descriptions run at
            once. Defaults to the notebook-level ``sem``.

    Returns:
        A list of the dicts returned by ``describe_slide``, in completion order
        (not slide order); sort on ``"index"`` when order matters.

    Raises:
        Whatever a description request raises. Requests that are still
        outstanding at that point are cancelled before the error propagates.
    """
    limiter = sem if semaphore is None else semaphore

    # Encode everything before scheduling any request, so an encoding failure
    # cannot leave already-started requests behind.
    encoded = [image_to_b64(image) for image in images]
    pending = [
        asyncio.ensure_future(rl_describe_slide(i, image64, limiter))
        for i, image64 in enumerate(encoded)
    ]

    slide_descriptions = []
    try:
        for finished in tqdm(asyncio.as_completed(pending), total=len(pending), desc="Describing slides"):
            slide_descriptions.append(await finished)
    except BaseException:
        # One failure (or an interrupt) must not leave the remaining requests
        # running unattended in the background.
        for task in pending:
            task.cancel()
        await asyncio.gather(*pending, return_exceptions=True)
        raise
    return slide_descriptions


async def get_embeddings(chunks):
    """Embed the ``"text"`` of each chunk with ``text-embedding-3-small``.

    Args:
        chunks: Sequence of dicts, each with a ``"text"`` key.

    Returns:
        One embedding per chunk, in the same order as ``chunks``. An empty
        input yields an empty list without calling the API.
    """
    texts = [chunk["text"] for chunk in chunks]
    if not texts:
        # An embedding request with no inputs would be rejected; short-circuit.
        return []

    response = await openai_client.embeddings.create(input=texts, model="text-embedding-3-small")
    # Every returned item carries the position of its input. Ordering by it
    # keeps embeddings aligned with `chunks` whatever the order of `data`.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]


# Upper bound on slide-description requests that may be in flight at once.
MAX_CONCURRENT_DESCRIPTIONS = 2
sem = asyncio.Semaphore(MAX_CONCURRENT_DESCRIPTIONS)

In [5]:
user_id = "664ed36aea629e38d9631fb3"
doc_ids = [
    "6658646bb929e25ed3e7a8a6",
    "66586480b929e25ed3e7a8a8",
    "66586490b929e25ed3e7a8aa",
    "6658649bb929e25ed3e7a8ac",
    "665864a5b929e25ed3e7a8ae",
    "665864b2b929e25ed3e7a8b0",
    "665864bcb929e25ed3e7a8b2",
    "665864c5b929e25ed3e7a8b4",
]

db = await mongo.get_db(settings)
s3 = docstore.create_client(settings)

documents = []
for doc_id in tqdm(doc_ids, desc="Downloading documents"):
    doc, path = await docstore.download_doc(doc_id, user_id, db, s3, settings)
    documents.append({**doc.model_dump(), "path": path})

Downloading documents:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Render every downloaded document to per-page JPEG images, stored in a
# "<stem>_images" folder next to the source file.
for document in tqdm(documents, desc="Converting to images"):
    source_path = document["path"]
    image_dir = source_path.parent / f"{source_path.stem}_images"
    image_dir.mkdir(exist_ok=True)

    # Empty the folder first so files from an earlier run are not left behind.
    for stale_file in image_dir.glob("*"):
        stale_file.unlink()

    document["images"] = convert_from_path(source_path, output_folder=image_dir, fmt="jpeg")

Converting to images:   0%|          | 0/8 [00:00<?, ?it/s]

In [7]:
uri = "/home/muaddib/sietch/projects/NavigAItor/backend/data/user-documents"
db = await lancedb.connect_async(uri)

await db.drop_table("embedded_docs")

tbl = await db.create_table(
    "embedded_docs", 
    schema=pa.schema([
        pa.field("document_id", pa.string()),
        pa.field("document_name", pa.string()),
        pa.field("tags", pa.list_(pa.string())),
        pa.field("document_type", pa.string()),
        pa.field("title", pa.string()),
        pa.field("text", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), list_size=1536)),
        pa.field("image", pa.string()),
    ])
)

tbl = await db.open_table("embedded_docs")

In [8]:
for doc in tqdm(documents, desc="Processing documents"):
    images = doc["images"]
    doc_chunks = await describe_slides(images)

    doc_chunks = sorted(doc_chunks, key=lambda x: x["index"])
    embeddings = await get_embeddings(doc_chunks)

    chunks = pd.DataFrame([
        Chunk(
            document_id=doc["id"],
            document_name=doc["metadata"]["name"],
            tags=doc["metadata"]["tags"],
            document_type=doc["metadata"]["document_type"],
            title=chunk["title"],
            text=chunk["text"],
            vector=embedding,
            image=chunk["image"]
        ).model_dump()
        for chunk, embedding in zip(doc_chunks, embeddings)
    ])

    await tbl.add(chunks)
    time.sleep(5)

Processing documents:   0%|          | 0/8 [00:00<?, ?it/s]

Describing slides:   0%|          | 0/85 [00:00<?, ?it/s]

16:41:05.683 Message with 'claude-3-haiku-20240307' [LLM]
16:41:05.698 Message with 'claude-3-haiku-20240307' [LLM]
16:41:08.092 Message with 'claude-3-haiku-20240307' [LLM]
16:41:09.337 Message with 'claude-3-haiku-20240307' [LLM]
16:41:11.215 Message with 'claude-3-haiku-20240307' [LLM]
16:41:11.330 Message with 'claude-3-haiku-20240307' [LLM]
16:41:14.088 Message with 'claude-3-haiku-20240307' [LLM]
16:41:14.117 Message with 'claude-3-haiku-20240307' [LLM]
16:41:16.340 Message with 'claude-3-haiku-20240307' [LLM]
16:41:17.385 Message with 'claude-3-haiku-20240307' [LLM]
16:41:19.020 Message with 'claude-3-haiku-20240307' [LLM]
16:41:19.996 Message with 'claude-3-haiku-20240307' [LLM]
16:41:20.157 Message with 'claude-3-haiku-20240307' [LLM]
16:41:23.080 Message with 'claude-3-haiku-20240307' [LLM]
16:41:23.147 Message with 'claude-3-haiku-20240307' [LLM]
16:41:25.349 Message with 'claude-3-haiku-20240307' [LLM]
16:41:25.664 Message with 'claude-3-haiku-20240307' [LLM]
16:41:27.627 M

Describing slides:   0%|          | 0/127 [00:00<?, ?it/s]

16:44:04.188 Message with 'claude-3-haiku-20240307' [LLM]
16:44:04.194 Message with 'claude-3-haiku-20240307' [LLM]
16:44:06.892 Message with 'claude-3-haiku-20240307' [LLM]
16:44:07.551 Message with 'claude-3-haiku-20240307' [LLM]
16:44:09.010 Message with 'claude-3-haiku-20240307' [LLM]
16:44:09.827 Message with 'claude-3-haiku-20240307' [LLM]
16:44:11.569 Message with 'claude-3-haiku-20240307' [LLM]
16:44:12.782 Message with 'claude-3-haiku-20240307' [LLM]
16:44:14.184 Message with 'claude-3-haiku-20240307' [LLM]
16:44:15.095 Message with 'claude-3-haiku-20240307' [LLM]
16:44:17.082 Message with 'claude-3-haiku-20240307' [LLM]
16:44:18.692 Message with 'claude-3-haiku-20240307' [LLM]
16:44:19.821 Message with 'claude-3-haiku-20240307' [LLM]
16:44:20.761 Message with 'claude-3-haiku-20240307' [LLM]
16:44:22.090 Message with 'claude-3-haiku-20240307' [LLM]
16:44:23.081 Message with 'claude-3-haiku-20240307' [LLM]
16:44:24.391 Message with 'claude-3-haiku-20240307' [LLM]
16:44:25.294 M

Describing slides:   0%|          | 0/93 [00:00<?, ?it/s]

16:48:20.905 Message with 'claude-3-haiku-20240307' [LLM]
16:48:20.911 Message with 'claude-3-haiku-20240307' [LLM]
16:48:24.030 Message with 'claude-3-haiku-20240307' [LLM]
16:48:24.314 Message with 'claude-3-haiku-20240307' [LLM]
16:48:26.776 Message with 'claude-3-haiku-20240307' [LLM]
16:48:26.788 Message with 'claude-3-haiku-20240307' [LLM]
16:48:29.005 Message with 'claude-3-haiku-20240307' [LLM]
16:48:29.309 Message with 'claude-3-haiku-20240307' [LLM]
16:48:31.710 Message with 'claude-3-haiku-20240307' [LLM]
16:48:31.735 Message with 'claude-3-haiku-20240307' [LLM]
16:48:35.112 Message with 'claude-3-haiku-20240307' [LLM]
16:48:35.699 Message with 'claude-3-haiku-20240307' [LLM]
16:48:37.572 Message with 'claude-3-haiku-20240307' [LLM]
16:48:37.847 Message with 'claude-3-haiku-20240307' [LLM]
16:48:40.236 Message with 'claude-3-haiku-20240307' [LLM]
16:48:40.671 Message with 'claude-3-haiku-20240307' [LLM]
16:48:43.894 Message with 'claude-3-haiku-20240307' [LLM]
16:48:44.168 M

Describing slides:   0%|          | 0/48 [00:00<?, ?it/s]

16:51:49.646 Message with 'claude-3-haiku-20240307' [LLM]
16:51:49.654 Message with 'claude-3-haiku-20240307' [LLM]
16:51:51.987 Message with 'claude-3-haiku-20240307' [LLM]
16:51:52.135 Message with 'claude-3-haiku-20240307' [LLM]
16:51:54.095 Message with 'claude-3-haiku-20240307' [LLM]
16:51:54.304 Message with 'claude-3-haiku-20240307' [LLM]
16:51:57.572 Message with 'claude-3-haiku-20240307' [LLM]
16:51:57.897 Message with 'claude-3-haiku-20240307' [LLM]
16:51:59.108 Message with 'claude-3-haiku-20240307' [LLM]
16:52:00.474 Message with 'claude-3-haiku-20240307' [LLM]
16:52:01.846 Message with 'claude-3-haiku-20240307' [LLM]
16:52:02.266 Message with 'claude-3-haiku-20240307' [LLM]
16:52:04.578 Message with 'claude-3-haiku-20240307' [LLM]
16:52:05.382 Message with 'claude-3-haiku-20240307' [LLM]
16:52:06.490 Message with 'claude-3-haiku-20240307' [LLM]
16:52:07.658 Message with 'claude-3-haiku-20240307' [LLM]
16:52:08.970 Message with 'claude-3-haiku-20240307' [LLM]
16:52:10.291 M

Describing slides:   0%|          | 0/36 [00:00<?, ?it/s]

16:53:06.388 Message with 'claude-3-haiku-20240307' [LLM]
16:53:06.393 Message with 'claude-3-haiku-20240307' [LLM]
16:53:08.777 Message with 'claude-3-haiku-20240307' [LLM]
16:53:09.181 Message with 'claude-3-haiku-20240307' [LLM]
16:53:11.380 Message with 'claude-3-haiku-20240307' [LLM]
16:53:14.257 Message with 'claude-3-haiku-20240307' [LLM]
16:53:18.558 Message with 'claude-3-haiku-20240307' [LLM]
16:53:18.808 Message with 'claude-3-haiku-20240307' [LLM]
16:53:20.919 Message with 'claude-3-haiku-20240307' [LLM]
16:53:22.175 Message with 'claude-3-haiku-20240307' [LLM]
16:53:23.169 Message with 'claude-3-haiku-20240307' [LLM]
16:53:24.161 Message with 'claude-3-haiku-20240307' [LLM]
16:53:25.118 Message with 'claude-3-haiku-20240307' [LLM]
16:53:26.055 Message with 'claude-3-haiku-20240307' [LLM]
16:53:27.113 Message with 'claude-3-haiku-20240307' [LLM]
16:53:28.279 Message with 'claude-3-haiku-20240307' [LLM]
16:53:28.579 Message with 'claude-3-haiku-20240307' [LLM]
16:53:30.404 M

Describing slides:   0%|          | 0/67 [00:00<?, ?it/s]

16:54:34.094 Message with 'claude-3-haiku-20240307' [LLM]
16:54:34.101 Message with 'claude-3-haiku-20240307' [LLM]
16:54:36.695 Message with 'claude-3-haiku-20240307' [LLM]
16:54:36.754 Message with 'claude-3-haiku-20240307' [LLM]
16:54:40.241 Message with 'claude-3-haiku-20240307' [LLM]
16:54:40.380 Message with 'claude-3-haiku-20240307' [LLM]
16:54:43.433 Message with 'claude-3-haiku-20240307' [LLM]
16:54:44.412 Message with 'claude-3-haiku-20240307' [LLM]
16:54:46.230 Message with 'claude-3-haiku-20240307' [LLM]
16:54:47.244 Message with 'claude-3-haiku-20240307' [LLM]
16:54:48.815 Message with 'claude-3-haiku-20240307' [LLM]
16:54:49.955 Message with 'claude-3-haiku-20240307' [LLM]
16:54:51.062 Message with 'claude-3-haiku-20240307' [LLM]
16:54:51.978 Message with 'claude-3-haiku-20240307' [LLM]
16:54:53.577 Message with 'claude-3-haiku-20240307' [LLM]
16:54:54.555 Message with 'claude-3-haiku-20240307' [LLM]
16:54:56.094 Message with 'claude-3-haiku-20240307' [LLM]
16:54:57.120 M

Describing slides:   0%|          | 0/42 [00:00<?, ?it/s]

16:56:47.110 Message with 'claude-3-haiku-20240307' [LLM]
16:56:47.115 Message with 'claude-3-haiku-20240307' [LLM]
16:56:49.169 Message with 'claude-3-haiku-20240307' [LLM]
16:56:51.163 Message with 'claude-3-haiku-20240307' [LLM]
16:56:51.797 Message with 'claude-3-haiku-20240307' [LLM]
16:56:54.041 Message with 'claude-3-haiku-20240307' [LLM]
16:56:55.580 Message with 'claude-3-haiku-20240307' [LLM]
16:56:57.640 Message with 'claude-3-haiku-20240307' [LLM]
16:56:58.454 Message with 'claude-3-haiku-20240307' [LLM]
16:57:00.525 Message with 'claude-3-haiku-20240307' [LLM]
16:57:02.144 Message with 'claude-3-haiku-20240307' [LLM]
16:57:03.342 Message with 'claude-3-haiku-20240307' [LLM]
16:57:05.129 Message with 'claude-3-haiku-20240307' [LLM]
16:57:07.053 Message with 'claude-3-haiku-20240307' [LLM]
16:57:08.603 Message with 'claude-3-haiku-20240307' [LLM]
16:57:09.983 Message with 'claude-3-haiku-20240307' [LLM]
16:57:11.995 Message with 'claude-3-haiku-20240307' [LLM]
16:57:12.675 M

Describing slides:   0%|          | 0/62 [00:00<?, ?it/s]

16:58:02.482 Message with 'claude-3-haiku-20240307' [LLM]
16:58:02.488 Message with 'claude-3-haiku-20240307' [LLM]
16:58:04.642 Message with 'claude-3-haiku-20240307' [LLM]
16:58:04.687 Message with 'claude-3-haiku-20240307' [LLM]
16:58:06.533 Message with 'claude-3-haiku-20240307' [LLM]
16:58:08.527 Message with 'claude-3-haiku-20240307' [LLM]
16:58:08.761 Message with 'claude-3-haiku-20240307' [LLM]
16:58:18.172 Message with 'claude-3-haiku-20240307' [LLM]
16:58:19.659 Message with 'claude-3-haiku-20240307' [LLM]
16:58:20.783 Message with 'claude-3-haiku-20240307' [LLM]
16:58:22.017 Message with 'claude-3-haiku-20240307' [LLM]
16:58:24.440 Message with 'claude-3-haiku-20240307' [LLM]
16:58:24.768 Message with 'claude-3-haiku-20240307' [LLM]
16:58:26.435 Message with 'claude-3-haiku-20240307' [LLM]
16:58:28.286 Message with 'claude-3-haiku-20240307' [LLM]
16:58:28.617 Message with 'claude-3-haiku-20240307' [LLM]
16:58:30.702 Message with 'claude-3-haiku-20240307' [LLM]
16:58:30.860 M

In [9]:
TOP_K = 3


query = "how do action potentials propagate through cardiac cells?"
r = await openai_client.embeddings.create(
    input=query, model="text-embedding-3-small"
)
q_embed = r.data[0].embedding

results = await tbl.vector_search(q_embed).limit(TOP_K).to_pandas()

results

17:01:07.878 Embedding Creation with 'text-embedding-3-small' [LLM]


Unnamed: 0,document_id,document_name,tags,document_type,title,text,vector,image,_distance
0,6658646bb929e25ed3e7a8a6,CV Anatomy - Electrophysiology.pdf,[BIO 201],slides,Slide 64,The slide appears to be about the cell-cell sp...,"[0.014561105, -0.020164335, 0.01443956, 0.0211...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.762806
1,6658646bb929e25ed3e7a8a6,CV Anatomy - Electrophysiology.pdf,[BIO 201],slides,Slide 69,The slide discusses the action potential morph...,"[-0.028713048, -0.03312183, 0.005801915, -0.00...",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.857792
2,6658646bb929e25ed3e7a8a6,CV Anatomy - Electrophysiology.pdf,[BIO 201],slides,Slide 31,The slide appears to be describing the mechani...,"[0.0014885111, -0.013880957, -0.0006357183, 0....",/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.905021


In [10]:
# Closest match first, shown as plain dicts so the full slide text is visible.
ranked_results = results.sort_values("_distance")
ranked_results.to_dict("records")



[{'document_id': '6658646bb929e25ed3e7a8a6',
  'document_name': 'CV Anatomy - Electrophysiology.pdf',
  'tags': array(['BIO 201'], dtype=object),
  'document_type': 'slides',
  'title': 'Slide 64',
  'text': 'The slide appears to be about the cell-cell spread of action potentials (APs) in the heart. It depicts the electrotronic spread of current through a series of cells (labeled Cell A, Cell B, Cell C, etc.) and explains the process of how an AP travels from one cell to the next.\n\nThe key points illustrated on the slide are:\n\n1. The opening of channels or current injected from other cells depolarizes Cell A to a subthreshold level (Va).\n2. This depolarization of Cell A spreads to Cell B, causing its depolarization.\n3. As the depolarization spreads from cell to cell, the threshold is reached and regenerative action potential will be initiated in Cell A.\n4. More depolarizing current in Cell A causes the action potential to conduct faster because cells further away reach the thres