Project Structure
- Setup Chunking and processing steps for inputs (PDFs, docx, txt)
- Setup PGVector Store (VS) + postgresql => Done
- Using LLMs (OpenAI, Gemini,...) to query vectors from VS

# Create PostgreDB 

In [1]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv('/Users/longnv/Coding/rag_llama_index/deployment/.env')

True

In [13]:
import psycopg2

db_name = os.environ['POSTGRES_DB']
host = "localhost"
password = os.environ['POSTGRES_PASSWORD']
port = "5432"
user = os.environ['POSTGRES_USER']
# conn = psycopg2.connect(connection_string)
conn = psycopg2.connect(
    dbname=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

# with conn.cursor() as c:
#     c.execute(f"DROP DATABASE IF EXISTS {db_name}")
#     c.execute(f"CREATE DATABASE {db_name}")

In [16]:
print(conn.status)
print(conn.get_dsn_parameters())
with conn.cursor() as cursor:
    cursor.execute("SELECT 1")
    print("Connection is active")

1
{'user': 'admin', 'channel_binding': 'prefer', 'dbname': 'rag_db', 'host': 'localhost', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'}
Connection is active


# Checking file input

In [None]:
from pydantic import BaseModel, Field, field_validator
from pydantic_ai import Agent, RunContext
from uuid import UUID
from chonkie import TokenChunker, SemanticChunker

In [20]:
import os
from pathlib import Path
import PyPDF2
from chonkie import SemanticChunker
from sentence_transformers import SentenceTransformer


def extract_text_from_pdf(pdf_path):
    """
    Extract text from PDF file using PyPDF2.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Extracted text from all pages
    """
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n"

    return text


def chunk_with_semantic_chunker(text, chunk_size=512, similarity_threshold=0.7, embedding_model=None):
    """
    Chunk text using Chonkie's SemanticChunker with custom embedding model.

    Args:
        text (str): Text to chunk
        chunk_size (int): Maximum tokens per chunk
        similarity_threshold (float): Similarity threshold for semantic chunking
        embedding_model: Custom embedding model (SentenceTransformer or similar)

    Returns:
        list: List of chunks
    """
    if embedding_model:
        chunker = SemanticChunker(
            chunk_size=chunk_size,
            similarity_threshold=similarity_threshold,
            embedding_model=embedding_model
        )
    else:
        # Use default embedding model
        chunker = SemanticChunker(
            chunk_size=chunk_size,
            similarity_threshold=similarity_threshold
        )

    chunks = chunker.chunk(text)
    return chunks


def save_chunks_to_file(chunks, output_path, chunker_type):
    """
    Save chunks to a text file.

    Args:
        chunks (list): List of chunks
        output_path (str): Output file path
        chunker_type (str): Type of chunker used
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Chunks created using {chunker_type}\n")
        f.write("=" * 50 + "\n\n")

        for i, chunk in enumerate(chunks, 1):
            f.write(f"Chunk {i}:\n")
            f.write("-" * 20 + "\n")
            f.write(f"{chunk.text}\n\n")
            f.write(f"Tokens: {chunk.token_count}\n")
            if hasattr(chunk, 'start_index'):
                f.write(f"Start Index: {chunk.start_index}\n")
            if hasattr(chunk, 'end_index'):
                f.write(f"End Index: {chunk.end_index}\n")
            f.write("\n" + "="*50 + "\n\n")

In [None]:
print("\nStep 1: Loading documents")
texts = extract_text_from_pdf(r'D:\Project\rag_with_llama\docs\llama2.pdf')

# Create output directory
output_dir = "chunked_output"
os.makedirs(output_dir, exist_ok=True)

print("\nStep 2: Loading embedding models...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
model_name = "all-MiniLM-L6-v2"

print("\nStep 3: Chunking with SemanticChunker")
semantic_chunks = chunk_with_semantic_chunker(
    texts,
    chunk_size=512,
    similarity_threshold=0.7,
    embedding_model=embedding_model
    )


Step 1: Loading documents

Step 2: Loading embedding models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Step 3: Chunking with SemanticChunker


ValueError: embedding_model must be a string or a BaseEmbeddings object