Project Structure
- Set up chunking and processing steps for input files (PDF, DOCX, TXT)
- Set up the PGVector vector store (VS) on PostgreSQL => Done
- Use LLMs (OpenAI, Gemini, ...) to query vectors from the VS

# Connect to the PostgreSQL database

In [1]:
import sys
import os
from dotenv import load_dotenv

# Populate os.environ from the deployment .env file. The path is absolute and
# machine-specific; later cells read the POSTGRES_* variables from os.environ.
load_dotenv(dotenv_path='/Users/longnv/Coding/rag_llama_index/deployment/.env')

True

In [13]:
import psycopg2

# Database name and credentials are read from the environment (a missing
# variable raises KeyError); host and port are fixed to a local server.
db_name = os.environ['POSTGRES_DB']
password = os.environ['POSTGRES_PASSWORD']
user = os.environ['POSTGRES_USER']
host = "localhost"
port = "5432"

connection_params = {
    "dbname": db_name,
    "host": host,
    "password": password,
    "port": port,
    "user": user,
}
conn = psycopg2.connect(**connection_params)

# Autocommit: each statement takes effect immediately, without an explicit
# conn.commit().
conn.autocommit = True

# Disabled on purpose -- destructive reset that drops and recreates the database:
# with conn.cursor() as c:
#     c.execute(f"DROP DATABASE IF EXISTS {db_name}")
#     c.execute(f"CREATE DATABASE {db_name}")

In [16]:
# Sanity check: show the connection status code and the DSN parameters in
# use, then run a trivial query to confirm the server responds.
print(conn.status)
dsn_parameters = conn.get_dsn_parameters()
print(dsn_parameters)
with conn.cursor() as cursor:
    cursor.execute("SELECT 1")
    print("Connection is active")

1
{'user': 'admin', 'channel_binding': 'prefer', 'dbname': 'rag_db', 'host': 'localhost', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'}
Connection is active


# Checking file input

In [None]:
from pydantic import BaseModel, Field, field_validator
from pydantic_ai import Agent, RunContext
from uuid import UUID
from chonkie import TokenChunker, SemanticChunker

In [None]:
# Request parameters for chunking a single document.
# (Documented with comments rather than a class docstring so the model's
# generated schema description is left untouched.)
class ChunkingRequest(BaseModel):
    # Required: identifier of the document to chunk.
    doc_id: UUID = Field(
        default=...,
        description="The ID of the document to chunk",
    )
    # Chunk size, limited to 100..4096; None is accepted.
    chunk_size: int | None = Field(
        default=512,
        ge=100,
        le=4096,
        description="Maximum tokens per chunk",
    )
    # Overlap between chunks, limited to 0..200; None is accepted.
    overlap: int | None = Field(
        default=50,
        ge=0,
        le=200,
        description="Token overlap between chunks",
    )
    # Whether to re-chunk even when chunks already exist.
    force_rechunk: bool | None = Field(
        default=False,
        description="Force re-chunking even if chunks exist",
    )

class ChunkingAgent:
    """Split document text into chunk strings using the chonkie chunkers.

    When ``use_semantic`` is true, ``SemanticChunker`` is tried first and
    ``TokenChunker`` is used as a fallback if it raises; otherwise
    ``TokenChunker`` is used directly.

    Args:
        default_chunk_size: Chunk size used when a request's ``chunk_size``
            is ``None``.
        default_overlap: Overlap used when a request's ``overlap`` is
            ``None``.
        use_semantic: Whether to try ``SemanticChunker`` before
            ``TokenChunker``.
    """

    def __init__(self, default_chunk_size: int = 1024, default_overlap: int = 50, use_semantic: bool = True):
        self.default_chunk_size = default_chunk_size
        self.default_overlap = default_overlap
        self.use_semantic = use_semantic

    def chunk_document(self, content: str, request: ChunkingRequest) -> list[str]:
        """Chunk a document using the chonkie library.

        Args:
            content: The document content to chunk.
            request: Chunking parameters. ``chunk_size`` and ``overlap`` fall
                back to this agent's defaults only when they are ``None``;
                an explicit ``overlap`` of 0 is honoured.

        Returns:
            List of chunk texts; an empty list for empty ``content``.

        Raises:
            Exception: Whatever ``TokenChunker`` raises. Errors from
                ``SemanticChunker`` are reported via ``print`` and trigger
                the ``TokenChunker`` fallback instead of propagating.
        """
        if not content:
            # Nothing to chunk: skip building a chunker (and its embedding model).
            return []

        # Compare against None rather than using `or`, so a legitimate falsy
        # value such as overlap=0 is not replaced by the default.
        chunk_size = self.default_chunk_size if request.chunk_size is None else request.chunk_size
        overlap = self.default_overlap if request.overlap is None else request.overlap

        if not self.use_semantic:
            raw_chunks = self._token_chunks(content, chunk_size, overlap)
        else:
            try:
                raw_chunks = self._semantic_chunks(content, chunk_size)
            except Exception as e:
                # Deliberate best-effort: any semantic-chunking failure
                # degrades to token-based chunking instead of aborting.
                print(f"SemanticChunker failed: {e}")
                print("Falling back to TokenChunker...")
                raw_chunks = self._token_chunks(content, chunk_size, overlap)

        return [chunk.text for chunk in raw_chunks]

    def _semantic_chunks(self, content: str, chunk_size: int):
        """Chunk ``content`` with SemanticChunker using the fixed settings below."""
        chunker = SemanticChunker(
            embedding_model="all-MiniLM-L6-v2",  # More compatible model
            threshold=0.5,  # Lower threshold for more grouping
            chunk_size=chunk_size,
            mode="cumulative",  # Better for document structure
            min_sentences=3,  # Ensure meaningful chunks
            similarity_window=5,  # Consider more sentences for similarity
        )
        return chunker.chunk(content)

    def _token_chunks(self, content: str, chunk_size: int, overlap: int):
        """Chunk ``content`` with TokenChunker using the given size and overlap."""
        chunker = TokenChunker(chunk_size=chunk_size, chunk_overlap=overlap)
        return chunker.chunk(content)