# AI Experimentation Notebook

This notebook contains a Python translation of the AI logic found in `langchainService.js` and `vectorService.js`.
It lets you test prompts, chains, and vector operations in isolation from the rest of the application.

## Setup
Ensure you have the following installed:
```bash
pip install langchain langchain-openai python-dotenv requests pypdf tiktoken numpy
```

In [None]:
import os
import requests
import time
import json
import re
from typing import List, Optional, Dict, Any
from pathlib import Path
from dotenv import load_dotenv

# LangChain Imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import JsonOutputParser

: 

In [None]:
# Locate and load the .env file.
# The notebook is assumed to live in backend/notebooks/, so backend/.env is
# tried first and the root-level .env is the fallback.
env_path = Path("../.env") if Path("../.env").exists() else Path("../../.env")
load_dotenv(dotenv_path=env_path)

# Report whether the OpenAI key is visible once the file has been loaded.
if os.getenv("OPENAI_API_KEY"):
    print("Environment loaded successfully.")
else:
    print("WARNING: OPENAI_API_KEY not found in environment")

## 1. LlamaParse Integration
Manual implementation of LlamaCloud PDF parsing to match `langchainService.js`.

In [None]:
def parse_pdf_with_llama(source: str | bytes) -> str:
    api_key = os.getenv("LLAMA_CLOUD_API_KEY")
    if not api_key:
        raise ValueError("Missing LLAMA_CLOUD_API_KEY")

    # 1. Get File Content
    file_content = None
    filename = "upload.pdf"

    if isinstance(source, bytes):
        file_content = source
    elif isinstance(source, str):
        if source.startswith("http"):
            res = requests.get(source)
            res.raise_for_status()
            file_content = res.content
        else:
            with open(source, "rb") as f:
                file_content = f.read()
            filename = os.path.basename(source)

    # 2. Upload File
    base_url = "https://api.cloud.llamaindex.ai/api/parsing"
    headers = {
        "Authorization": f"Bearer {api_key}"
    }
    files = {
        "file": (filename, file_content, "application/pdf")
    }

    upload_res = requests.post(f"{base_url}/upload", headers=headers, files=files)
    upload_res.raise_for_status()
    
    job_id = upload_res.json().get("id")
    print(f"[LlamaParse] Job started: {job_id}")

    # 3. Poll for result
    max_retries = 300
    for _ in range(max_retries):
        time.sleep(1)
        job_res = requests.get(f"{base_url}/job/{job_id}", headers=headers)
        if not job_res.ok:
            continue
        
        job_data = job_res.json()
        status = job_data.get("status")

        if status == "SUCCESS":
            result_res = requests.get(f"{base_url}/job/{job_id}/result/markdown", headers=headers)
            result_res.raise_for_status()
            data = result_res.json()
            return data.get("markdown", str(data))
        elif status == "FAILED":
            raise Exception(f"LlamaParse Job Failed: {job_data}")
    
    raise Exception("LlamaParse Timed Out")

## 2. LangChain Service
Core LLM execution logic.

In [None]:
def get_langchain_llm(model: str = 'gpt-4o-mini', temperature: float = 0.2) -> ChatOpenAI:
    """Build a ``ChatOpenAI`` client for ``model`` at the given ``temperature``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return ChatOpenAI(
            model_name=model,
            temperature=temperature,
            openai_api_key=key,
        )
    raise ValueError("Missing OPENAI_API_KEY")

def extract_text_from_url(url: str, timeout: float = 30.0) -> str:
    """Fetch ``url`` and return its body as text, best-effort.

    Simplified for the notebook: PDFs are not extracted here. A response is
    treated as a PDF when its ``Content-Type`` mentions ``application/pdf`` or
    the URL (with or without its query string/fragment) ends in ``.pdf``; in
    that case a placeholder string is returned instead of the body.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to the HTTP request, so an
            unresponsive server cannot hang the call indefinitely.

    Returns:
        The response text, the PDF placeholder, or an
        ``"Error fetching <url>: <reason>"`` string if anything goes wrong.
        This function never raises.
    """
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        content_type = res.headers.get('Content-Type', '')

        # Also test the URL without "?query" / "#fragment" so links such as
        # ".../file.pdf?token=abc" are still recognised as PDFs.
        lowered = url.lower()
        bare = lowered.partition('#')[0].partition('?')[0]
        is_pdf = (
            'application/pdf' in content_type.lower()
            or lowered.endswith('.pdf')
            or bare.endswith('.pdf')
        )
        if is_pdf:
            return "[PDF Content Placeholder - Use parse_pdf_with_llama for full extraction]"

        return res.text
    except Exception as e:
        # Deliberately broad: callers embed the returned string in a prompt
        # and rely on this helper not raising.
        return f"Error fetching {url}: {str(e)}"

async def execute_with_langchain(
    system_prompt: str,
    user_prompt: str,
    urls: Optional[List[str]] = None,
    model: str = 'gpt-4o-mini',
    temperature: float = 0.2,
    supports_images: bool = False,
    extra_context: str = '',
    schema: Optional[Dict] = None,
    parse_pdfs: bool = False,
) -> Dict[str, Any]:
    """Run a system/user prompt pair through a chat model with optional context.

    URLs are split into PDFs (``.pdf`` anywhere in the URL), images (URL ends
    with a known image extension) and everything else. Non-PDF, non-image
    URLs are fetched with ``extract_text_from_url``; PDFs contribute either a
    placeholder or, when ``parse_pdfs`` is true, the output of
    ``parse_pdf_with_llama``. The gathered text is appended to the user
    prompt. Images are attached only when ``supports_images`` is true.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Leading text of the human message.
        urls: Document/image URLs to include; ``None`` means no URLs.
        model: Chat model name. When images are attached and the name does
            not contain ``gpt-4o``, ``gpt-4o`` is used instead.
        temperature: Sampling temperature passed to the model.
        supports_images: Attach up to 10 image URLs to the human message.
        extra_context: Extra text appended to the document text (first
            30000 characters only).
        schema: When given, the model is invoked through
            ``with_structured_output(schema)``.
        parse_pdfs: When true, PDFs are parsed with ``parse_pdf_with_llama``
            (slow: it polls a remote job); otherwise a placeholder is used.

    Returns:
        ``{"data": <structured result>}`` when ``schema`` is given, otherwise
        ``{"text": <reply content>, "usage": <token usage or None>}``.
    """
    import asyncio  # function-scope import: only this coroutine needs it

    urls = list(urls or [])

    image_extensions = ('.png', '.jpg', '.jpeg', '.webp', '.gif')
    pdf_urls = [u for u in urls if '.pdf' in u.lower()]
    image_urls = [u for u in urls if u.lower().endswith(image_extensions)]
    claimed = set(pdf_urls) | set(image_urls)
    text_urls = [u for u in urls if u not in claimed]

    sections = []

    # Process text URLs. The helper does blocking HTTP, so it runs in a worker
    # thread to keep the event loop responsive.
    for i, url in enumerate(text_urls):
        text = await asyncio.to_thread(extract_text_from_url, url)
        sections.append(f"\n\n--- Document {i+1}: {url} ---\n{text}")

    # Process PDFs
    for i, url in enumerate(pdf_urls):
        if parse_pdfs:
            try:
                text = await asyncio.to_thread(parse_pdf_with_llama, url)
            except Exception as e:
                # Best-effort: record the failure in the context and move on.
                sections.append(f"\n\n--- PDF {i+1}: {url} (Error: {e}) ---\n")
                continue
        else:
            text = "[PDF Content - Enable LlamaParse call in notebook to fetch]"
        sections.append(f"\n\n--- PDF {i+1}: {url} ---\n{text}")

    if extra_context:
        sections.append(f"\n\n--- Additional Context ---\n{extra_context[:30000]}")

    combined_text = "".join(sections)

    # Build messages
    messages = [SystemMessage(content=system_prompt)]

    user_content_parts = [user_prompt]
    if combined_text.strip():
        user_content_parts.append(f"Relevant document text:\n{combined_text[:200000]}")
    prompt_text = "\n\n".join(user_content_parts)

    if supports_images and image_urls:
        vision_model = model if 'gpt-4o' in model else 'gpt-4o'
        llm = get_langchain_llm(vision_model, temperature)

        content_block = [{"type": "text", "text": prompt_text}]
        content_block.extend(
            {"type": "image_url", "image_url": {"url": url}}
            for url in image_urls[:10]
        )
        messages.append(HumanMessage(content=content_block))
    else:
        llm = get_langchain_llm(model, temperature)
        messages.append(HumanMessage(content=prompt_text))

    # Execute with the async API so the coroutine does not block the loop.
    if schema:
        structured_llm = llm.with_structured_output(schema)
        result = await structured_llm.ainvoke(messages)
        return {"data": result}

    response = await llm.ainvoke(messages)
    metadata = response.response_metadata or {}
    return {
        "text": response.content,
        # langchain_openai reports usage under "token_usage"; the original
        # "usage" key is kept as a fallback.
        "usage": metadata.get("token_usage") or metadata.get("usage"),
    }

## 3. Vector Service (In-Memory Mock)
Mocking the MongoDB Vector Store with a simple in-memory list.

In [None]:
class InMemoryVectorStore:
    """In-memory list of embedded records, searchable by cosine similarity."""

    def __init__(self):
        # Each record is a dict with "embedding", "content" and "metadata" keys.
        self.store = []

    def add(self, embedding, content, metadata):
        """Append one record made of ``embedding``, ``content`` and ``metadata``."""
        self.store.append({
            "embedding": embedding,
            "content": content,
            "metadata": metadata,
        })

    def search(self, query_vector, limit=5):
        """Return the stored records most similar to ``query_vector``.

        Args:
            query_vector: Sequence of numbers with the same length as the
                stored embeddings.
            limit: Maximum number of records to return; ``None`` returns all
                of them, and a value of zero or less returns none.

        Returns:
            Copies of the stored records, each with an added ``"score"`` key
            (cosine similarity as a ``float``), sorted from most to least
            similar. Records with equal scores keep their insertion order.
        """
        import numpy as np

        if not self.store or (limit is not None and limit <= 0):
            return []

        q = np.asarray(query_vector, dtype=float)
        q_norm = np.linalg.norm(q)  # loop-invariant, computed once

        results = []
        for item in self.store:
            v = np.asarray(item["embedding"], dtype=float)
            # Cosine similarity: (A . B) / (||A|| * ||B||)
            denom = q_norm * np.linalg.norm(v)
            # A zero-length vector has no direction; score it 0.0 rather than
            # dividing by zero, which would produce NaN and break the sort.
            score = float(np.dot(q, v) / denom) if denom > 0 else 0.0
            results.append({**item, "score": score})

        return sorted(results, key=lambda x: x["score"], reverse=True)[:limit]

# Global store instance: the single module-level InMemoryVectorStore that the
# ingest_text and search_similar helpers in this notebook write to and read from.
vector_store = InMemoryVectorStore()

def get_embedding(text: str) -> List[float]:
    """Embed ``text`` with the OpenAI ``text-embedding-3-small`` model.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector returned by ``OpenAIEmbeddings.embed_query``.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset or empty (same check and
            message as ``get_langchain_llm``).
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Missing OPENAI_API_KEY")

    embeddings = OpenAIEmbeddings(
        openai_api_key=api_key,
        model="text-embedding-3-small",
    )
    return embeddings.embed_query(text)

async def ingest_text(text: str, home_id: str, meta: Optional[Dict] = None):
    """Split ``text`` into chunks, embed each one, and add it to ``vector_store``.

    Args:
        text: Text to ingest. Empty or whitespace-only text is ignored.
        home_id: Stored under the ``"homeId"`` metadata key of every chunk.
        meta: Extra metadata merged into every chunk's metadata; its keys
            take precedence over ``"homeId"``. ``None`` means no extras.

    Returns:
        The number of chunks added (0 when ``text`` is empty).
    """
    if not text or not text.strip():
        return 0

    # The splitter lives in the standalone ``langchain_text_splitters``
    # package in newer LangChain releases; fall back to the legacy path.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError:
        from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    print(f"Split into {len(chunks)} chunks...")

    extra = meta or {}
    for chunk in chunks:
        emb = get_embedding(chunk)
        # A fresh dict per chunk so stored records never share metadata.
        vector_store.add(emb, chunk, {"homeId": home_id, **extra})

    return len(chunks)

async def search_similar(query: str, home_id: str, limit: int = 5):
    """Return up to ``limit`` stored chunks for ``home_id`` most similar to ``query``.

    Args:
        query: Text to embed and search with. An empty query returns ``[]``.
        home_id: Only records whose ``"homeId"`` metadata equals this value
            are returned.
        limit: Maximum number of records to return.

    Returns:
        Records from ``vector_store.search`` (each with a ``"score"`` key),
        best match first.
    """
    if not query:
        return []

    query_emb = get_embedding(query)
    # Rank the whole store, filter by home, and only then truncate. Truncating
    # first would let other homes' records use up the ``limit`` slots.
    ranked = vector_store.search(query_emb, len(vector_store.store))
    matches = [r for r in ranked if r["metadata"].get("homeId") == home_id]
    return matches[:limit]

## 4. Playground
Use the cells below to experiment.

In [None]:
# Example: Chat
SYSTEM_PROMPT = "You are a helpful assistant."
USER_PROMPT = "Explain the benefits of vector databases briefly."

# Need to run this in async context or use await in top-level notebook
result = await execute_with_langchain(SYSTEM_PROMPT, USER_PROMPT)
print(result['text'])

In [None]:
# Example: Ingest & Search
sample_text = """
Buildwise is an AI-powered home building platform.
It helps manage permits, designs, and contractors.
Buildwise uses MongoDB and LangChain.
"""

await ingest_text(sample_text, home_id="test-home-1")

search_res = await search_similar("What tech stack does Buildwise use?", home_id="test-home-1")
for r in search_res:
    print(f"Score: {r['score']:.4f} | Content: {r['content']}")