In [None]:
import pdfplumber
import chromadb
from chromadb.utils.embedding_functions import GoogleGenerativeAiEmbeddingFunction

import os
from dotenv import load_dotenv

# Read settings from the .env file before any environment lookup happens.
load_dotenv()

# Gemini API key handed to the embedding function; None if the variable is unset.
gemini_key = os.environ.get("GEMINI_API_KEY")


# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path, chunk_size=1000):
    """Extract the text of a PDF as fixed-size, page-labelled chunks.

    Each page's text is obtained with ``page.extract_text()`` and cut into
    consecutive slices of at most ``chunk_size`` characters. Pages for which
    no text is returned are skipped.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.
        chunk_size: Maximum number of characters per chunk. Defaults to
            1000, the value that used to be hard-coded here.

    Returns:
        A list of ``(page_label, chunk)`` tuples in document order, where
        ``page_label`` is ``"page_<n>"`` and pages are numbered from 1.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Validate before opening the file so a bad size fails fast with a clear
    # message rather than an opaque error from ``range`` mid-extraction.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                # extract_text() returned nothing for this page; skip it.
                continue
            page_label = f"page_{page_num}"
            # Split into smaller chunks so embeddings work better.
            text_chunks.extend(
                (page_label, text[start:start + chunk_size])
                for start in range(0, len(text), chunk_size)
            )
    return text_chunks

# Step 2: Initialize ChromaDB with Gemini embedding
def create_chroma_collection():
    """Return the ``rinvoq_collection`` Chroma collection using Gemini embeddings.

    The embedding function is a ``GoogleGenerativeAiEmbeddingFunction`` for
    the ``models/embedding-001`` model, authenticated with the module-level
    ``gemini_key``. The collection is fetched, or created if it does not
    exist yet, on a newly constructed ``chromadb.Client()``.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    gemini_embedder = GoogleGenerativeAiEmbeddingFunction(
        model_name="models/embedding-001",
        api_key=gemini_key,
    )
    return chromadb.Client().get_or_create_collection(
        embedding_function=gemini_embedder,
        name="rinvoq_collection",
    )



# Step 3: Store PDF text into Chroma
def store_pdf_in_chroma(pdf_path):
    """Chunk a PDF and add every chunk to the Chroma collection.

    The PDF is split with ``extract_text_from_pdf`` and the collection comes
    from ``create_chroma_collection``. Each chunk is stored under its
    zero-based position (as a string id) with ``{"page": <page label>}`` as
    metadata.

    Args:
        pdf_path: Path of the PDF to ingest.

    Returns:
        The collection the chunks were added to. When the PDF yields no
        text, the collection is returned without anything being added.
    """
    text_chunks = extract_text_from_pdf(pdf_path)
    collection = create_chroma_collection()

    if not text_chunks:
        # Chroma's ``add`` validates that ids/documents are non-empty, so a
        # PDF with no extractable text would otherwise fail there with an
        # error that says nothing about the real cause.
        print(f"No text extracted from {pdf_path}; nothing inserted into ChromaDB.")
        return collection

    # Build the three parallel lists ``add`` expects, one entry per chunk.
    ids = [str(idx) for idx in range(len(text_chunks))]
    docs = [chunk for _, chunk in text_chunks]
    metadatas = [{"page": page} for page, _ in text_chunks]

    collection.add(
        ids=ids,
        documents=docs,
        metadatas=metadatas,
    )
    print(f"Inserted {len(docs)} chunks into ChromaDB.")

    return collection

# Step 4: Query
def query_chroma(collection, query_text, n_results=3):
    """Run a single text query against a collection.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``,
            such as the collection returned by ``store_pdf_in_chroma``.
        query_text: The query string; it is sent as a one-element list.
        n_results: Number of results to request. Defaults to 3, the value
            that used to be hard-coded here.

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    return collection.query(
        query_texts=[query_text],
        n_results=n_results,
    )


if __name__ == "__main__":
    # Script entry point: ingest the PDF, then print the matches for one
    # sample question.
    # The PDF must be present in the current working directory.
    pdf_path = "rinvoq_pi.pdf"
    collection = store_pdf_in_chroma(pdf_path)
    top_matches = query_chroma(collection, "What is Rinvoq used for?")
    print(top_matches)
