## Environment variables

In [1]:
# Storage folders (relative paths, resolved against the notebook's working directory)
PROCESSED_FOLDER = "processed_data"
EXTRACTION_FOLDER = "extracted_data"

# SQLite database file opened by the DB helper functions
LOCAL_DB_PATH = "mydatabase.db"

# Embedding ("vectorization") model: Hugging Face repo id and the local snapshot directory
VECT_MODEL_NAME = "llamaindex/vdr-2b-multi-v1"
VECT_MODEL_LOCAL_PATH = "weights_vect/"

# Chat model: Hugging Face repo id passed to from_pretrained
CHAT_MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"


## Imports

In [2]:
%pip install qwen_vl_utils
%pip install torchvision
%pip install pdf2image
%pip install pylatex
%pip install llama-index
%pip install llama-index-embeddings-huggingface
%pip install datasets
%pip install sqlite_vec
%pip install pymupdf
%pip install aiofiles
%pip install opencv-python
%pip install accelerate

%pip install llama-cloud==0.1.19
%pip install llama-cloud-services==0.6.22
%pip install llama-index==0.12.35
%pip install llama-index-agent-openai==0.4.7
%pip install llama-index-cli==0.4.1
%pip install llama-index-core==0.12.35
%pip install llama-index-embeddings-huggingface==0.5.4
%pip install llama-index-embeddings-openai==0.3.1
%pip install llama-index-indices-managed-llama-cloud==0.6.11
%pip install llama-index-llms-openai==0.3.38
%pip install llama-index-multi-modal-llms-openai==0.4.3
%pip install llama-index-program-openai==0.3.1
%pip install llama-index-question-gen-openai==0.3.0
%pip install llama-index-readers-file==0.4.7
%pip install llama-index-readers-llama-parse==0.4.0
%pip install llama-parse==0.6.22


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [3]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from typing import Dict, Any, List, Deque, Optional
from qwen_vl_utils import process_vision_info
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from pylatex import Document, NoEscape
from datasets import load_dataset
from datetime import datetime
from collections import deque
from PIL import Image
import numpy as np
import accelerate
import sqlite_vec
import aiosqlite
import threading
import tempfile
import aiofiles
import asyncio
import struct
import shutil
import base64
import torch
import glob
import time
import json
import math
import cv2
import sys
import os
import re
import io

  from .autonotebook import tqdm as notebook_tqdm


## AI System Prompt

In [4]:
# System prompt for the chat model. It asks for a normal answer based on the
# provided documents, except when the user wants to edit a document: in that case
# the reply must be a single JSON object with the keys shown in the prompt.
PROMPT = """
You are a useful assistant, expert in machine learning.
You are connected to a RAG system and can answer the user with relevant documents.
If and only if the user wants to edit a document, just answer with a single JSON like the following:
{"image_number": int, "original_value": int, "new_value": int}
Otherwise just answer the question with the provided documents."""

## DB Functions

In [5]:
async def db_get_vector_list(query_vector: list[float], amount: int) -> list[dict]:
    """Return the stored page vectors most similar to ``query_vector``.

    Similarity is ``1 - vec_distance_cosine(...)`` as computed by the sqlite-vec
    extension over the ``page_images_vectors`` table.

    Args:
        query_vector: Query embedding as a sequence of floats.
        amount: Maximum number of rows to return (used as the SQL ``LIMIT``).

    Returns:
        A list of ``{"page_id": ..., "similarity": ...}`` dicts ordered from most
        to least similar.

    Raises:
        Exception: Any error raised while loading the extension or running the
            query is logged and re-raised unchanged.
    """
    # Serialize as little-endian float32, the same layout store_page_vector writes
    # with struct.pack('<...f').
    query_vector_blob = np.asarray(query_vector, dtype="<f4").tobytes()

    query = """
            SELECT 
                piv.page_id,
                1 - vec_distance_cosine(piv.vector_data, ?) as similarity
            FROM page_images_vectors piv
            ORDER BY vec_distance_cosine(piv.vector_data, ?)
            LIMIT ?
        """
    params = (query_vector_blob, query_vector_blob, amount)

    async with aiosqlite.connect(LOCAL_DB_PATH) as conn:
        conn.row_factory = aiosqlite.Row

        # Use aiosqlite's public extension API rather than its private
        # `_execute` / `_conn` internals.
        await conn.enable_load_extension(True)
        try:
            await conn.load_extension(sqlite_vec.loadable_path())

            # Sanity check that sqlite-vec is really loaded; the cursor is closed
            # immediately because the value itself is not needed.
            async with conn.execute("SELECT vec_version();"):
                pass

            async with conn.execute(query, params) as cursor:
                results = await cursor.fetchall()
            return [{'page_id': row['page_id'], 'similarity': row['similarity']} for row in results]
        except Exception as e:
            print(f"Database query failed: {e}")
            raise
        finally:
            # Always turn extension loading back off, whichever step failed.
            await conn.enable_load_extension(False)



async def store_page_vector(document_id: str, page_number: int, vector: list, page_id: Optional[str] = None) -> None:
    """Store one page embedding in ``page_images_vectors``.

    The row is keyed by ``page_id``. When ``page_id`` is not given (or is empty),
    it is looked up in ``page_images`` from ``document_id`` and ``page_number``.

    Args:
        document_id: Identifier of the document the page belongs to.
        page_number: Page number used for the fallback lookup and for logging.
        vector: Embedding values; must contain exactly 1536 numbers.
        page_id: Optional explicit page identifier.

    Raises:
        ValueError: If the vector length is not 1536, or if no page row exists
            for ``document_id`` / ``page_number`` in the fallback lookup.
        Exception: Database errors are logged and re-raised unchanged.
    """
    print(f"[{datetime.now()}] Storing page vector in database - Document: {document_id}, Page: {page_number}")
    print(f"SAMPLE VECTOR: {vector[:1]}")  # Log first element for verification

    # Validate before touching the database so bad input never opens a connection.
    expected_dim = 1536
    if len(vector) != expected_dim:
        raise ValueError(f"Vector length {len(vector)} does not match expected dimension 1536")

    # Little-endian float32 blob.
    vector_blob = struct.pack(f'<{len(vector)}f', *vector)

    insert_query = """
                INSERT OR REPLACE INTO page_images_vectors (page_id, vector_data) VALUES (?, ?)
            """
    select_query = """
                SELECT page_id FROM page_images 
                WHERE document_id = ? AND page_number = ?
            """

    async with aiosqlite.connect(LOCAL_DB_PATH) as conn:
        await conn.enable_load_extension(True)
        try:
            await conn.load_extension(sqlite_vec.loadable_path())

            page_id_was_given = bool(page_id)
            if not page_id_was_given:
                # Fall back to document_id and page_number.
                async with conn.execute(select_query, (document_id, page_number)) as cursor:
                    row = await cursor.fetchone()
                if not row:
                    print(f"[{datetime.now()}] ERROR: Page not found for Document: {document_id}, Page: {page_number}")
                    raise ValueError("Page not found")
                page_id = row[0]

            # Single insert path shared by both ways of obtaining page_id.
            try:
                await conn.execute(insert_query, (page_id, vector_blob))
                await conn.commit()
            except Exception as e:
                print(f"[{datetime.now()}] Database error while storing page vector: {str(e)}")
                raise

            if page_id_was_given:
                print(f"[{datetime.now()}] Successfully stored page vector - Document: {document_id}, Page: {page_number}, Page ID: {page_id}")
            else:
                print(f"[{datetime.now()}] Successfully stored page vector - Document: {document_id}, Page: {page_number}")
        finally:
            # Runs on every exit path, including a failed lookup.
            await conn.enable_load_extension(False)



async def db_get_image_latex(image_id: str):
    """Look up the LaTeX source stored for a page image.

    Args:
        image_id: Value matched against ``page_images.page_id``.

    Returns:
        The ``latex_code`` column of the matching row, or ``None`` when no row
        has that ``page_id``.
    """
    sql = """
            SELECT
                pi.latex_code
            FROM page_images pi
            WHERE pi.page_id = ?
        """
    async with aiosqlite.connect(LOCAL_DB_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(sql, (image_id,)) as cur:
            record = await cur.fetchone()
            if not record:
                return None
            return record[0]

## Download embedding model (around 40 seconds)

In [6]:
# Download the embedding model snapshot only when the local directory is missing;
# otherwise reuse what is already on disk.
if os.path.exists(VECT_MODEL_LOCAL_PATH):
    print(f"Model already exists at {VECT_MODEL_LOCAL_PATH}")
else:
    print(f"Downloading vect_model to {VECT_MODEL_LOCAL_PATH}")
    snapshot_download(
        repo_id=VECT_MODEL_NAME,
        local_dir=VECT_MODEL_LOCAL_PATH,
        local_dir_use_symlinks=False,
    )

Model already exists at weights_vect/


## Load embedding model

In [7]:
# Choose the compute device: Apple MPS first, then CUDA, otherwise CPU.
DEVICE = "cpu"
if torch.backends.mps.is_available():
    DEVICE = "mps"
    print("MPS backend is available. Using Apple Silicon GPU.")
elif torch.cuda.is_available():
    DEVICE = "cuda"
    print(f"CUDA Available: {torch.cuda.is_available()}")
    # Device details are informational only; failing to read them is not fatal.
    try:
        print(f"Device: {torch.cuda.current_device()}")
        print(f"Device Name: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"Could not get CUDA device details: {e}")
else:
    print("MPS and CUDA not available. Using CPU.")

# Load the embedding model from the local snapshot directory only.
print(f"Loading vect_model: {VECT_MODEL_LOCAL_PATH}")
vect_model = HuggingFaceEmbedding(
    model_name=VECT_MODEL_LOCAL_PATH,
    device=DEVICE,
    trust_remote_code=True,
    local_files_only=True,
)
print(vect_model)

MPS backend is available. Using Apple Silicon GPU.
Loading vect_model: weights_vect/


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


model_name='weights_vect/' embed_batch_size=10 callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x320874210> num_workers=None max_length=32768 normalize=True query_instruction=None text_instruction=None cache_folder=None show_progress_bar=False


## Vectorization functions

In [8]:
async def embed_text(request: dict):
    """Embed every query string found under ``request["texts"]``.

    Args:
        request: Mapping with a ``"texts"`` entry holding the strings to embed.

    Returns:
        One embedding per input text, in input order. NumPy arrays returned by
        the model are converted to plain lists.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised as
            ``Exception("Text embedding failed: ...")``.
    """
    try:
        vectors = []
        for query_text in request["texts"]:
            print(f"Processing user query: {query_text}\n")

            vector = vect_model.get_query_embedding(query_text)
            vectors.append(vector.tolist() if isinstance(vector, np.ndarray) else vector)

        return vectors

    except Exception as err:
        import traceback
        print(f"Error details: {traceback.format_exc()}")
        raise Exception(f"Text embedding failed: {str(err)}")



async def embed_images(files: dict):
    """Embed a single image supplied as an upload-style ``files`` mapping.

    Args:
        files: Mapping with exactly one entry whose value is a 3-tuple; the
            second element holds the raw image bytes. The other two elements
            are ignored.

    Returns:
        The image embedding; NumPy arrays returned by the model are converted
        to plain lists.

    Raises:
        Exception: Any failure (including a mapping that does not hold exactly
            one file) is logged with its traceback and re-raised as
            ``Exception("Image embedding failed: ...")``.
    """
    try:
        if len(files) != 1:
            raise ValueError("Expected exactly one file for embedding")
        file_tuple = list(files.values())[0]
        _, image_bytes, _ = file_tuple

        tmp_file_path = None
        try:
            # The model takes a file path, so the bytes go through a temp file.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(image_bytes)

            print(f"Processing image from temporary file: {tmp_file_path}")

            # Generate embedding using the file path
            image_embedding = vect_model.get_image_embedding(tmp_file_path)
        finally:
            # Remove the temp file even when writing or embedding fails, so
            # failed pages do not accumulate files on disk.
            if tmp_file_path is not None and os.path.exists(tmp_file_path):
                os.unlink(tmp_file_path)

        if isinstance(image_embedding, np.ndarray):
            image_embedding = image_embedding.tolist()

        return image_embedding

    except Exception as e:
        import traceback
        print(f"Error details: {traceback.format_exc()}")
        raise Exception(f"Image embedding failed: {str(e)}")

## Embedding class

In [9]:
class EmbeddingQueue:
    """In-memory FIFO queue that embeds page images one at a time and stores the vectors.

    Tasks are added with ``add_image_task`` and consumed by a single processor
    coroutine (``process_queue``). Per-document counters track how many pages
    have been processed so a document can be marked complete and its tracking
    data dropped. ``pause_current_task`` / ``resume_processing`` let another
    caller temporarily stop the processor from picking up new tasks.

    State flags:
        processing: True while a processor coroutine is active.
        _paused: True between ``pause_current_task`` and ``resume_processing``.
    """

    # Maximum number of retry attempts for a failed task.
    MAX_RETRIES = 3
    # Delay (seconds) before failed tasks are put back on the queue.
    RETRY_DELAY_SECONDS = 10
    # Interval (seconds) between stalled-document checks.
    STALLED_CHECK_INTERVAL_SECONDS = 300
    # A document younger than this (seconds) is never considered stalled.
    STALLED_MIN_AGE_SECONDS = 600
    # After this many seconds a document with >= 50% of pages is force-completed.
    STALLED_FORCE_AGE_SECONDS = 1200

    def __init__(self):
        self.image_queue: Deque[Dict[str, Any]] = deque()
        self.current_task = None
        self.lock = asyncio.Lock()
        self.processing = False
        self.processed_pages_count: Dict[str, int] = {}  # Track number of processed pages per document
        self.document_total_pages: Dict[str, int] = {}
        self.failed_pages: Dict[str, List[int]] = {}
        self.document_start_times: Dict[str, float] = {}

        # Set by pause_current_task, cleared by resume_processing.
        self._paused = False
        # Strong references to fire-and-forget tasks so they are not garbage
        # collected while still running.
        self._background_tasks: set = set()
        self._stalled_checker_task = None

        print(f"[{datetime.now()}] Initialized EmbeddingQueue instance")


    def _spawn(self, coro):
        """Schedule ``coro`` as a task and keep a reference until it finishes."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return task


    def _ensure_stalled_checker(self) -> bool:
        """Start the periodic stalled-document checker unless one is already running.

        Returns True when a new checker task was started.
        """
        if self._stalled_checker_task is not None and not self._stalled_checker_task.done():
            return False
        self._stalled_checker_task = self._spawn(self._check_stalled_documents_periodically())
        return True


    def start_background_tasks(self):
        """Start background tasks that require a running event loop"""
        if self._ensure_stalled_checker():
            print(f"[{datetime.now()}] Started background task for checking stalled documents")
        else:
            print(f"[{datetime.now()}] Stalled document checker is already running")


    async def _check_stalled_documents_periodically(self):
        """Periodically check for and process stalled documents"""
        while True:
            try:
                await asyncio.sleep(self.STALLED_CHECK_INTERVAL_SECONDS)
                await self._check_stalled_documents()
            except Exception as e:
                # Keep the checker alive; one failed pass must not stop future passes.
                print(f"[{datetime.now()}] Error in stalled document checker: {str(e)}")


    def _start_stalled_checker(self):
        """Start a background task to periodically check for stalled documents"""
        self._ensure_stalled_checker()


    @staticmethod
    def _percent_done(processed: int, total: int) -> float:
        """Percentage of pages processed; a document expecting no pages counts as 100%."""
        if total <= 0:
            return 100.0
        return (processed / total) * 100


    def _pages_in_queue(self, document_id: str) -> int:
        """Number of queued (not yet started) tasks belonging to ``document_id``."""
        return sum(1 for task in self.image_queue if task['document_id'] == document_id)


    async def _check_stalled_documents(self):
        """Mark documents complete when their processing appears to have stalled.

        A tracked document older than STALLED_MIN_AGE_SECONDS is considered
        stalled when either nothing is queued for it and at least 70% of its
        pages are done, or it is older than STALLED_FORCE_AGE_SECONDS and at
        least 50% of its pages are done.
        """
        current_time = time.time()

        print(f"[{datetime.now()}] Checking for stalled documents...")

        async with self.lock:
            # Iterate over a snapshot: _mark_document_complete mutates the dict.
            for doc_id in list(self.processed_pages_count.keys()):
                # Skip documents that are already completed
                if doc_id not in self.document_total_pages:
                    continue

                # First sighting without a start time: start the clock now.
                if doc_id not in self.document_start_times:
                    self.document_start_times[doc_id] = current_time
                    continue

                time_processing = current_time - self.document_start_times[doc_id]
                if time_processing < self.STALLED_MIN_AGE_SECONDS:
                    continue

                processed_count = self.processed_pages_count[doc_id]
                total_expected = self.document_total_pages[doc_id]
                percent_processed = self._percent_done(processed_count, total_expected)
                pages_in_queue = self._pages_in_queue(doc_id)

                stalled_timeout = (
                    (pages_in_queue == 0 and percent_processed >= 70)
                    or (time_processing >= self.STALLED_FORCE_AGE_SECONDS and percent_processed >= 50)
                )

                if stalled_timeout:
                    print(f"[{datetime.now()}] Document {doc_id} appears stalled:")
                    print(f"- Processing time: {time_processing:.1f} seconds")
                    print(f"- Processed {processed_count}/{total_expected} pages ({percent_processed:.1f}%)")
                    print(f"- Pages still in queue: {pages_in_queue}")

                    # Document is considered complete if stalled conditions are met
                    self._mark_document_complete(doc_id, processed_count, total_expected)


    async def add_image_task(self, document_id: str, document_title: str, image_bytes: bytes, page_number: int, total_pages: int, page_id: Optional[str] = None):
        """Add a new image embedding task to the queue.

        Starts the queue processor in the background when none is running and
        the queue is not paused.
        """
        print(f"[{datetime.now()}] Adding new task - Document ID: {document_id}, Page: {page_number+1}/{total_pages}")

        if document_id not in self.processed_pages_count:
            self.processed_pages_count[document_id] = 0
            self.document_total_pages[document_id] = total_pages
            self.failed_pages[document_id] = []
            self.document_start_times[document_id] = time.time()
            print(f"[{datetime.now()}] Initialized new document tracking - ID: {document_id}, Total Pages: {total_pages}")

        task = {
            'document_id': document_id,
            'document_title': document_title,
            'image_bytes': image_bytes,
            'page_number': page_number,
            'page_id': page_id,
            'timestamp': time.time(),
            'retry_count': 0
        }
        self.image_queue.append(task)

        queue_stats = {
            'queue_size': len(self.image_queue),
            'documents_in_progress': len(self.processed_pages_count),
            'current_document': document_id,
            'page_number': page_number
        }
        print(f"[{datetime.now()}] Task added to queue. Queue stats: {json.dumps(queue_stats)}")

        if self._paused:
            print(f"[{datetime.now()}] Queue is paused. Task will be processed after resume.")
        elif not self.processing:
            print(f"[{datetime.now()}] Queue processor not running. Starting new processing task.")
            self._spawn(self.process_queue())


    async def process_queue(self):
        """Process queued tasks until the queue is empty or processing is paused.

        Failed tasks are collected during a pass and put back on the queue after
        RETRY_DELAY_SECONDS, up to MAX_RETRIES attempts per task. The
        ``processing`` flag is always cleared on exit, even on unexpected errors.
        """
        if self.processing:
            print(f"[{datetime.now()}] Queue processor already running")
            return

        self.processing = True
        print(f"[{datetime.now()}] Starting queue processor")

        try:
            # Loop instead of recursing: a recursive call would hit the
            # "already running" guard above and return without doing any work.
            while True:
                failed_tasks = await self._drain_queue()

                if failed_tasks:
                    if self._paused:
                        print(f"[{datetime.now()}] Re-queueing {len(failed_tasks)} failed tasks for retry after resume")
                    else:
                        print(f"[{datetime.now()}] Processing {len(failed_tasks)} failed tasks after a delay")
                        # Wait a bit before retrying
                        await asyncio.sleep(self.RETRY_DELAY_SECONDS)
                    async with self.lock:
                        self.image_queue.extend(failed_tasks)

                if self._paused or not self.image_queue:
                    break
        finally:
            self.processing = False

        if self.image_queue:
            print(f"[{datetime.now()}] Queue processor stopped while paused - {len(self.image_queue)} tasks remain in queue")
        else:
            print(f"[{datetime.now()}] Queue processor finished - no more tasks in queue")


    async def _drain_queue(self) -> List[Dict[str, Any]]:
        """Run queued tasks until the queue is empty or paused; return tasks to retry."""
        failed_tasks: List[Dict[str, Any]] = []

        while self.image_queue and not self._paused:
            async with self.lock:
                # Re-check after waiting for the lock: the state may have changed.
                if self._paused or not self.image_queue:
                    break
                task = self.image_queue.popleft()
                self.current_task = task

            try:
                await self._process_task(task, failed_tasks)
            except Exception as e:
                print(f"[{datetime.now()}] Error processing task for document {task.get('document_id')}, page {task.get('page_number')}: {str(e)}")
            finally:
                self.current_task = None

        return failed_tasks


    async def _process_task(self, task: Dict[str, Any], failed_tasks: List[Dict[str, Any]]):
        """Embed and store one page; on failure schedule a retry via ``failed_tasks``."""
        document_id = task['document_id']
        page_number = task['page_number']
        page_id = task.get('page_id')
        wait_time = time.time() - task['timestamp']

        print(f"""[{datetime.now()}] Starting task processing:
                - Document ID: {document_id}
                - Page: {page_number}
                - Queue size: {len(self.image_queue)}
                - Wait time: {wait_time:.2f} seconds""")

        processing_start = time.time()
        files = {
            'files': ('image.png', task['image_bytes'], 'image/png')
        }

        try:
            print(f"[{datetime.now()}] Calling embed_image API for document {document_id}, page {page_number}")
            vector = await embed_images(files)

            # Store vector in database
            await store_page_vector(document_id, page_number, vector, page_id)

            # Update processed page count
            self.processed_pages_count[document_id] = self.processed_pages_count.get(document_id, 0) + 1

            # Check if document is complete
            await self._check_document_completion(document_id)

            processing_time = time.time() - processing_start

            print(f"""[{datetime.now()}] Task completed successfully:
                        - Document ID: {document_id}
                        - Page: {page_number}
                        - Processing time: {processing_time:.2f} seconds
                        - Vector size: {len(vector)}""")

        except Exception as e:
            print(f"[{datetime.now()}] Error embedding image for document {document_id}, page {page_number}: {str(e)}")

            # Track this failed page once, however many times it is retried.
            if document_id in self.failed_pages and page_number not in self.failed_pages[document_id]:
                self.failed_pages[document_id].append(page_number)

            # Add to failed tasks for later retry if under retry limit
            retry_count = task.get('retry_count', 0) + 1
            if retry_count <= self.MAX_RETRIES:
                task['retry_count'] = retry_count
                task['last_error'] = str(e)
                task['last_attempt'] = time.time()
                failed_tasks.append(task)
                print(f"[{datetime.now()}] Added to retry queue (attempt {retry_count}/{self.MAX_RETRIES})")
            else:
                print(f"[{datetime.now()}] Max retries exceeded for document {document_id}, page {page_number}")
                # The document may still qualify as complete without this page.
                await self._check_document_completion(document_id)


    async def _check_document_completion(self, document_id: str):
        """Check if a document's page processing is complete"""
        if document_id not in self.processed_pages_count or document_id not in self.document_total_pages:
            return

        total_pages = self.document_total_pages[document_id]
        processed_pages = self.processed_pages_count[document_id]

        percent_complete = self._percent_done(processed_pages, total_pages)
        pages_in_queue = self._pages_in_queue(document_id)

        # Document is considered complete if either:
        # 1. All pages processed successfully (100%)
        # 2. No more pages in queue AND at least 70% processed
        # 3. At least 95% processed (regardless of queue)
        is_complete = (
            (processed_pages == total_pages) or
            (pages_in_queue == 0 and percent_complete >= 70) or
            (percent_complete >= 95)
        )

        if is_complete:
            print(f"[{datetime.now()}] Document processing complete: {document_id} with {processed_pages}/{total_pages} pages ({percent_complete:.1f}%)")
            self._mark_document_complete(document_id, processed_pages, total_pages)


    def _mark_document_complete(self, document_id: str, processed_pages: int, total_pages: int):
        """Mark a document as complete and clean up tracking data"""
        print(f"[{datetime.now()}] Marking document as complete: {document_id}")
        print(f"[{datetime.now()}] - Processed pages: {processed_pages}/{total_pages}")

        # Clean up tracking data
        self.processed_pages_count.pop(document_id, None)
        self.document_total_pages.pop(document_id, None)
        self.failed_pages.pop(document_id, None)
        self.document_start_times.pop(document_id, None)


    async def pause_current_task(self):
        """Stop the processor from picking up further tasks until resumed.

        A task that is already in flight cannot be interrupted and is allowed to
        finish. It is deliberately not put back on the queue: the processor
        coroutine keeps running it, so re-queueing would embed and count the
        same page twice.
        """
        async with self.lock:
            self._paused = True
            if self.current_task:
                print(f"[{datetime.now()}] Pause requested; the in-flight task will finish before the queue stops")
            print(f"[{datetime.now()}] Queue processing paused")


    async def resume_processing(self):
        """Resume queue processing if paused.

        Clears the pause flag and, when no processor is active and tasks are
        waiting, starts one in the background so the caller is not blocked until
        the whole queue has been processed.
        """
        async with self.lock:
            self._paused = False
            should_start = not self.processing and bool(self.image_queue)
            if should_start:
                print(f"[{datetime.now()}] Resuming queue processing")

        # Start outside the lock: the processor acquires the same lock itself.
        if should_start:
            self._spawn(self.process_queue())

#### Run it

In [10]:
# Create the module-level queue instance that the retrieval helpers pause and
# resume around each search.
print(f"[{datetime.now()}] Initializing global embedding queue")
embedding_queue = EmbeddingQueue()

[2025-05-25 19:04:37.967035] Initializing global embedding queue
[2025-05-25 19:04:37.967132] Initialized EmbeddingQueue instance


## Download & Load Chat Model

In [11]:
# Set up device
# Choose the device used for chat-model inputs: Apple MPS first, then CUDA, else CPU.
DEVICE = "cpu"
if torch.backends.mps.is_available():
    DEVICE = "mps"
    torch.mps.empty_cache()
    print("MPS backend is available. Using Apple Silicon GPU")
elif torch.cuda.is_available():
    DEVICE = "cuda"
    print(f"CUDA available: {torch.cuda.is_available()}")
    # Device details are informational only; failing to read them is not fatal.
    try:
        print(f"Device: {torch.cuda.current_device()}")
        print(f"Device name: {torch.cuda.get_device_name(0)}")
    except Exception as e:
        print(f"Could not get CUDA device details: {e}")
else:
    print("MPS and CUDA not available, using CPU")


# Load the chat model and its matching processor from the same checkpoint name.
chat_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    CHAT_MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(CHAT_MODEL_NAME)

MPS backend is available. Using Apple Silicon GPU


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.15s/it]


## Generation functions

In [12]:
async def generate_answer(messages: list, images: list = None):
    """Start streaming generation for a conversation, optionally with images.

    Args:
        messages: Conversation so far, in the chat-template message format.
            The list is not modified.
        images: Optional base64-encoded image strings. When given, they are
            appended to the conversation as one extra user turn of image parts.

    Returns:
        A ``TextIteratorStreamer`` yielding generated text while generation runs
        in a background thread. If preparing the request fails, an iterator
        yielding a single error message string is returned instead.
    """
    image_parts = [
        {
            "type": "image",
            "image": f"data:image;base64,{image_b64}"
        }
        for image_b64 in (images or [])
    ]

    conv_to_send = list(messages)
    # Only add the extra user turn when there is something in it; an empty
    # user message would otherwise be rendered into the prompt.
    if image_parts:
        conv_to_send.append({"role": "user", "content": image_parts})

    try:
        text = processor.apply_chat_template(
            conv_to_send, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(conv_to_send)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        streamer = TextIteratorStreamer(
            processor.tokenizer,
            skip_special_tokens=True,
            skip_prompt=True
        )

        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": 1024
        }

        # generate() blocks until done, so it runs in a thread while the caller
        # consumes the streamer.
        thread = threading.Thread(target=chat_model.generate, kwargs=generation_kwargs)
        thread.start()

        # Return the streamer that will yield tokens
        return streamer

    except Exception as e:
        return iter(["Error generating response: " + str(e)])

## Retrieval functions

In [13]:
async def get_similar_vectors(query: str, amount: int) -> list:
    """Return the page ids whose stored vectors are most similar to ``query``.

    Image embedding is paused while the query is embedded and the database is
    searched, and resumed afterwards whether or not the search succeeded.

    Args:
        query: Text to embed and search with.
        amount: Maximum number of page ids to return.

    Returns:
        Page ids ordered as returned by ``db_get_vector_list``.

    Raises:
        Exception: Errors from embedding or the database lookup are logged and
            re-raised unchanged.
    """
    # Pause current image embedding if any
    await embedding_queue.pause_current_task()

    try:
        payload = {"texts": [query]}
        text_vectors = await embed_text(payload)

        # embed_text returns one embedding per input text; exactly one text was
        # sent, so pass that single vector rather than the enclosing list.
        query_vector = text_vectors[0]

        image_vectors = await db_get_vector_list(query_vector, amount)
        return [img_vec['page_id'] for img_vec in image_vectors]

    except Exception as e:
        print(f"Error in vector processing: {str(e)}")
        raise

    finally:
        await embedding_queue.resume_processing()



async def combine_search_results(optimized_query: str, max_results: int = 3) -> list[str]:
    """Run the vector search and return up to ``max_results`` unique page ids.

    Ids are converted to strings, de-duplicated, and kept in the order the
    search returned them.
    """
    vector_image_ids = await get_similar_vectors(optimized_query, amount=max_results)

    print(f"Vector image IDs retrieved by the system: {vector_image_ids}\n")

    # dict.fromkeys drops repeats while preserving first-seen order.
    best_image_ids = []
    for candidate_id in dict.fromkeys(str(raw_id) for raw_id in vector_image_ids):
        if len(best_image_ids) >= max_results:
            break
        best_image_ids.append(candidate_id)

    return best_image_ids

## Image processing functions

In [14]:
async def resize_base64_image(base64_string: str, max_pixels: int = 500000) -> Optional[str]:
    """Downscale a base64-encoded image to at most ``max_pixels`` and re-encode it as JPEG.

    Args:
        base64_string: Base64 text of the source image.
        max_pixels: Upper bound on ``width * height``. Larger images are scaled
            down with the aspect ratio preserved; smaller ones keep their size.

    Returns:
        Base64 text of the JPEG-encoded (quality 85) image, or ``None`` when
        the input cannot be decoded or processed (the error is printed).
    """
    try:
        image_bytes = base64.b64decode(base64_string)
        img = Image.open(io.BytesIO(image_bytes))
        orig_width, orig_height = img.size
        total_pixels = orig_width * orig_height

        if total_pixels > max_pixels:
            # Scale both sides by the same factor so the area shrinks to max_pixels.
            scale_factor = math.sqrt(max_pixels / total_pixels)
            # Never let a side collapse to 0 pixels for extreme aspect ratios.
            new_width = max(1, int(orig_width * scale_factor))
            new_height = max(1, int(orig_height * scale_factor))
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # JPEG cannot store alpha or palette images; without this conversion
        # such inputs fail in save() and the function returns None.
        if img.mode not in ("RGB", "L", "CMYK"):
            img = img.convert("RGB")

        buffer = io.BytesIO()
        img.save(buffer, format="JPEG", quality=85)
        resized_bytes = buffer.getvalue()
        resized_base64 = base64.b64encode(resized_bytes).decode('utf-8')
        return resized_base64

    except Exception as e:
        print(f"Error processing base64 image: {str(e)}")
        return None

## LaTeX functions

In [15]:
def auto_complete_latex_environments(latex_str):
    """
    Append the missing \\end{...} commands for environments left open in latex_str.

    Every \\begin{env} is pushed on a stack; an \\end{env} pops it only when it
    matches the innermost open environment (a non-matching \\end is ignored).
    Whatever is still open at the end is closed innermost-first, one \\end per
    line, after a newline. Input with nothing left open is returned unchanged.
    """
    marker_patterns = {
        'begin': r'\\begin\{([^}]+)\}',
        'end': r'\\end\{([^}]+)\}',
    }

    # Collect (position, kind, environment) for both markers, in text order.
    markers = sorted(
        (found.start(), kind, found.group(1))
        for kind, pattern in marker_patterns.items()
        for found in re.finditer(pattern, latex_str)
    )

    open_envs = []
    for _, kind, env_name in markers:
        if kind == 'begin':
            open_envs.append(env_name)
        elif open_envs and open_envs[-1] == env_name:
            open_envs.pop()
        # A non-matching \end means malformed LaTeX; it is skipped on purpose.

    if not open_envs:
        return latex_str

    closers = [f"\\end{{{env_name}}}" for env_name in reversed(open_envs)]
    return latex_str + "\n" + "\n".join(closers)



def latex_to_image_object(latex_str):
    """
    Compile a LaTeX fragment to PDF and return its first page as an image.

    The fragment is wrapped in a pylatex ``Document`` with a fixed set of
    packages and a ``\\specialcell`` macro, unclosed environments are completed
    with ``auto_complete_latex_environments``, and the PDF is rasterised with
    ``convert_from_path``. All intermediate files live in a temporary
    directory that is removed when the function returns.

    Args:
        latex_str: LaTeX body to render (inserted unescaped).

    Returns:
        The first element of the list returned by ``convert_from_path``.

    Raises:
        Exception: re-raised from ``generate_pdf`` when compilation fails, or
            raised directly when no PDF or no page image is produced.
    """
    required_packages = (
        'float',
        'booktabs',
        'graphicx',
        'multirow',
        'makecell',
        'cite',
        'threeparttable',
        'xcolor',
        'amssymb',
        'hyperref',
        'textcomp',
    )
    specialcell_macro = r'\newcommand{\specialcell}[2][c]{\begin{tabular}[#1]{@{}c@{}}#2\end{tabular}}'

    with tempfile.TemporaryDirectory() as work_dir:
        output_stem = os.path.join(work_dir, "temp_latex")

        print("Starting LaTeX compilation...")

        # Assemble the document: packages, helper macro, then the content.
        document = Document()
        for package_name in required_packages:
            document.packages.append(NoEscape(r'\usepackage{' + package_name + '}'))
        document.preamble.append(NoEscape(specialcell_macro))
        document.append(NoEscape(auto_complete_latex_environments(latex_str)))

        try:
            document.generate_pdf(output_stem, clean_tex=True)
        except Exception as compile_error:
            print(f"LaTeX compilation failed: {compile_error}")
            raise

        compiled_pdf = output_stem + ".pdf"
        if not os.path.exists(compiled_pdf):
            raise Exception("Failed to generate PDF file")

        print("PDF compiled successfully")

        # Point the converter at the directory that holds pdftoppm, if found.
        pdftoppm_binary = shutil.which("pdftoppm")
        poppler_dir = os.path.dirname(pdftoppm_binary) if pdftoppm_binary else None
        pages = convert_from_path(compiled_pdf, poppler_path=poppler_dir)

        if not pages:
            raise Exception("Failed to convert PDF to image")

        print("Image conversion successful")
        return pages[0]



def save_latex_as_image(latex_str, outname: str):
    """
    Render LaTeX and write the result to ``<outname>.png``.

    Args:
        latex_str: LaTeX body to render.
        outname: Output path without the ``.png`` extension; it may include
            directories.

    Returns:
        The path of the PNG file that was written.

    Raises:
        Exception: propagated from ``latex_to_image_object`` when rendering fails.
    """
    png_path = f"{outname}.png"

    # The "outputs" folder has always been created here; keep that side effect
    # for callers that rely on it.
    os.makedirs("outputs", exist_ok=True)

    # Also create the directory the file is actually written to, so an
    # ``outname`` outside "outputs" does not fail at save time.
    target_dir = os.path.dirname(png_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    print("Starting render:", outname)
    image = latex_to_image_object(latex_str)
    image.save(png_path, "PNG")
    print("PNG generated:", os.path.exists(png_path))
    return png_path



def latex_to_bytes(latex_str, format="PNG"):
    """
    Render LaTeX and return the encoded image as raw bytes.

    Args:
        latex_str: LaTeX body to render.
        format: Image format name passed to the image's ``save`` method.

    Returns:
        The encoded image as a ``bytes`` object.
    """
    rendered = latex_to_image_object(latex_str)
    with io.BytesIO() as sink:
        rendered.save(sink, format=format)
        return sink.getvalue()



def latex_to_base64(latex_str, format="PNG"):
    """
    Render LaTeX and return the encoded image as a base64 string.

    Args:
        latex_str: LaTeX body to render.
        format: Image format name forwarded to ``latex_to_bytes``.

    Returns:
        The base64 text (UTF-8 decoded) of the encoded image.
    """
    encoded = base64.b64encode(latex_to_bytes(latex_str, format))
    return encoded.decode('utf-8')



def latex_to_io(latex_str, format="PNG"):
    """
    Render LaTeX into an in-memory binary stream.

    Args:
        latex_str: LaTeX body to render.
        format: Image format name passed to the image's ``save`` method.

    Returns:
        An ``io.BytesIO`` holding the encoded image, rewound to position 0.
    """
    stream = io.BytesIO()
    latex_to_image_object(latex_str).save(stream, format=format)
    stream.seek(0)  # rewind so the caller can read from the start
    return stream


def modify_numeric_values(latex_str, old_val, new_val):
    """
    Replace every stand-alone occurrence of ``old_val`` with ``new_val``.

    When ``old_val`` starts or ends with a digit, that edge must sit on a
    number boundary: it may not be glued to another digit or to a decimal or
    thousands separator followed or preceded by a digit. Replacing ``5``
    therefore leaves ``15``, ``0.5`` and ``5.5`` untouched. Edges that are not
    digits are matched literally, so non-numeric values behave like a plain
    substring replacement.

    Args:
        latex_str: Text to edit.
        old_val: Value to look for (string). An empty value is a no-op.
        new_val: Replacement text, inserted literally.

    Returns:
        The edited text.
    """
    # An empty search string would make a plain replace insert new_val
    # between every character of the document.
    if not old_val:
        return latex_str

    pattern = re.escape(old_val)
    if old_val[0].isdigit():
        # Not the tail of a longer number such as "15" or "0.5".
        pattern = r'(?<!\d)(?<!\d[.,])' + pattern
    if old_val[-1].isdigit():
        # Not the head of a longer number such as "50" or "5.5".
        pattern = pattern + r'(?!\d)(?![.,]\d)'

    # A callable replacement keeps backslashes in new_val (LaTeX macros) literal.
    return re.sub(pattern, lambda _match: new_val, latex_str)

## Run the RAG system

In [16]:
# Global counter used by show_base64_image to build a unique default window
# title ("Base64 Image <n>"); it is incremented each time a title is generated.
_window_counter = 0



async def get_files(image_id):
    """
    Read ``<PROCESSED_FOLDER>/<image_id>_full.jpg`` and return it as base64 text.

    Args:
        image_id: Identifier used to build the file name.

    Returns:
        The file content as a base64 string, or ``None`` when the file is
        missing or cannot be read (an error line is printed in that case).
    """
    file_path = os.path.join(PROCESSED_FOLDER, f"{image_id}_full.jpg")
    encoded_image = None
    try:
        # The handle is closed automatically when the ``async with`` block ends.
        async with aiofiles.open(file_path, 'rb') as image_file:
            raw_content = await image_file.read()
            encoded_image = base64.b64encode(raw_content).decode('utf-8')
    except FileNotFoundError:
        print(f"Error file not found: {file_path}")
    except Exception as e:
        print(f"Error processing image {image_id}: {str(e)}")
    return encoded_image



async def catch_tool_call(text: str):
    """
    Extract an image-edit tool call from a model answer.

    The answer is searched for the first JSON object that contains the keys
    ``image_number``, ``original_value`` and ``new_value``. The object may be
    inside a fenced json block or inline in the text. Every opening brace is
    tried as a possible start, so unrelated braces earlier in the answer
    (for example LaTeX arguments) and nested objects do not break extraction.

    Args:
        text: Full text generated by the model.

    Returns:
        A tuple ``(image_number, original_value, new_value)`` taken from the
        JSON object, or ``(None, None, None)`` when no valid tool call exists.
    """
    required_keys = ("image_number", "original_value", "new_value")

    # Square brackets are stripped, as before, so list-wrapped payloads such as
    # [{"image_number": [1], ...}] collapse to a plain object.
    cleaned = text.replace("[", "").replace("]", "").strip()

    decoder = json.JSONDecoder()
    saw_candidate = False
    start = cleaned.find("{")
    while start != -1:
        saw_candidate = True
        try:
            # raw_decode parses one JSON value starting at ``start`` and
            # tolerates trailing text such as prose or a closing fence.
            payload, end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            payload = None
        if isinstance(payload, dict) and all(key in payload for key in required_keys):
            print("Extracted JSON:", cleaned[start:end])
            return tuple(payload[key] for key in required_keys)
        start = cleaned.find("{", start + 1)

    if saw_candidate:
        print("Error decoding JSON")
    return None, None, None


async def show_base64_image(base64_str, window_name=None):
    """
    Decode a base64-encoded image and show it in an OpenCV window.

    Args:
        base64_str: Base64 text of an encoded image.
        window_name: Window title; when omitted, a title of the form
            "Base64 Image <n>" is generated from the module-level counter.

    Returns:
        None. Failures are reported with a printed message instead of raising.
    """
    global _window_counter
    try:
        if window_name is None:
            window_name = f"Base64 Image {_window_counter}"
            _window_counter += 1

        # base64 text -> raw bytes -> uint8 buffer -> decoded colour image
        pixel_buffer = np.frombuffer(base64.b64decode(base64_str), np.uint8)
        decoded = cv2.imdecode(pixel_buffer, cv2.IMREAD_COLOR)

        if decoded is not None:
            cv2.imshow(window_name, decoded)
            # 1 ms wait lets the window render without blocking on a key press.
            cv2.waitKey(1)
        else:
            print(f"Failed to decode image for window {window_name}")
    except Exception as e:
        print(f"Error displaying image {window_name}: {str(e)}")



async def rag(messages: list, old_image_ids: list = None):
    """
    Run one retrieval-augmented chat turn.

    Steps:
      1. Retrieve image ids for the latest user message.
      2. Load, resize and display each retrieved image; images that cannot be
         loaded are skipped so the model never receives an empty entry.
      3. Append a "tool" message listing the image numbers, stream the model
         answer to stdout and record it as an "assistant" message.
      4. If the answer contains a tool call, apply the requested value change
         to the LaTeX of the referenced image from the previous turn
         (``old_image_ids``) and display the re-rendered result.

    Args:
        messages: Chat history; the last entry must be the user message, with
            its text at ``["content"][0]["text"]``. The list is not mutated.
        old_image_ids: Image ids returned by the previous turn, or None on
            the first turn.

    Returns:
        A tuple ``(messages, image_ids)``: the extended history and the ids of
        the images actually passed to the model this turn, in display order.
    """
    query = messages[-1]["content"][0]["text"]
    retrieved_ids = await combine_search_results(query)

    image_ids = []
    images_bytes = []
    for image_id in retrieved_ids:
        raw_base64 = await get_files(image_id)
        resized_base64 = None
        if raw_base64:
            resized_base64 = await resize_base64_image(raw_base64)
        if not resized_base64:
            # Keep ids and images aligned: drop entries that failed to load.
            print(f"Skipping image {image_id}: it could not be loaded")
            continue
        await show_base64_image(resized_base64)
        image_ids.append(image_id)
        images_bytes.append(resized_base64)

    image_numbers = list(range(len(images_bytes)))
    indexes_message = [{"role": "tool", "content": [{"type": "text", "text": str(image_numbers)}]}]
    # Concatenation builds a new list, so the caller's list is left untouched.
    messages = messages + indexes_message

    streamer = await generate_answer(messages, images_bytes)
    answer_chunks = []
    for chunk in streamer:
        answer_chunks.append(chunk)
        print(chunk, end="", flush=True)
    full_answer = "".join(answer_chunks)

    messages.append({
            "role": "assistant",
            "content": [{
                "type": "text",
                "text": full_answer
            }]
        })

    image_number, original_value, new_value = await catch_tool_call(full_answer)

    # Compare against None: image number 0 is a valid (but falsy) index.
    if image_number is not None:
        try:
            image_index = int(image_number)
        except (TypeError, ValueError):
            print(f"Ignoring tool call with invalid image number: {image_number!r}")
            image_index = None

        if image_index is not None and not old_image_ids:
            print("Ignoring tool call: there are no images from a previous turn")
            image_index = None

        if image_index is not None:
            # Wrap around the number of available ids so the index is always valid.
            target_id = old_image_ids[image_index % len(old_image_ids)]
            latex_code = await db_get_image_latex(target_id)
            if latex_code:
                new_latex = modify_numeric_values(latex_code, str(original_value), str(new_value))
                image_base64 = latex_to_base64(new_latex)
                await show_base64_image(image_base64)
            else:
                print(f"No LaTeX source available for image {target_id}")

    return messages, image_ids



async def main():
    """
    Interactive chat loop around ``rag``.

    The conversation starts with the system prompt (``PROMPT``). Each question
    typed by the user is appended to the history and answered by ``rag``; the
    image ids returned by one turn are passed to the next turn.

    The loop ends when the user types ``exit`` or ``quit``, or when input is
    closed or interrupted. Blank lines are ignored.
    """
    messages = [{"role": "system",
            "content": [{
                "type": "text",
                "text": PROMPT}]}]

    old_image_ids = None

    while True:
        try:
            text = input("Enter your question:\n")
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or prompt interrupted: leave cleanly.
            break

        question = text.strip()
        if question.lower() in ("exit", "quit"):
            break
        if not question:
            # Nothing to search for; ask again instead of querying with "".
            continue

        messages.append({
            "role": "user",
            "content": [{
                "type": "text",
                "text": text
            }]
        })

        messages, old_image_ids = await rag(messages, old_image_ids)



# Entry point for the cell. The bare top-level `await` is only valid where
# the interpreter supports it (e.g. IPython/Jupyter autoawait); a plain
# Python script would need asyncio.run(main()) instead.
if __name__ == "__main__":
    await main()

[2025-05-25 19:05:19.258357] Queue processing paused
Processing user query: what is the Threashold Algorithm

Vector image IDs retrieved by the system: ['05e6004b-9db1-4e0d-98a7-0fdaf15c3009', '315fd4b8-6e87-4e66-8849-1457e2468b57', '39e0fbde-1f41-4595-a473-8793ab46b609']

The Variable Threshold Gaussian Process Optimization Algorithm (Algorithm 3) is designed to optimize hyperparameters for machine learning models, particularly focusing on maximizing expected improvement while minimizing uncertainty. This approach differs from traditional methods by dynamically adjusting the threshold value based on the probability of improvement at each step, which is inversely proportional to the probability of improvement at the highest uncertainty point.

Key aspects of this algorithm include:

1. **Dynamic Thresholding**: The algorithm uses a variable threshold that adapts based on the current state of the model's performance and uncertainty. This allows it to explore regions of high uncertainty 

CancelledError: 