In [12]:
import os
import glob
import re
import time
from dotenv import load_dotenv
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from tqdm import tqdm


In [7]:

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The Google API key is required by the embedding function below; fail fast
# with a clear message instead of erroring later inside the embedding call.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå GOOGLE_API_KEY not found. Please check your .env file.")

# Embedding function handed to Chroma; it is attached to the collection below
# so documents added to that collection are embedded with this model.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    model_name="models/text-embedding-004",
    api_key=api_key,
)

# Chroma client rooted at the local "mkdocs_db/" directory.
chroma_client = chromadb.PersistentClient(path="mkdocs_db/")

# Re-use the "MkDocsGuide" collection when it already exists so that
# re-running this cell does not fail or start from scratch.
collection = chroma_client.get_or_create_collection(
    embedding_function=google_ef,
    name="MkDocsGuide",
)

print("‚úÖ System initialized. Connected to ChromaDB.")

‚úÖ System initialized. Connected to ChromaDB.


In [8]:
def clean_text(text):
    """
    Strip markup noise from a MkDocs user-guide page before it is chunked.

    The following are removed, in this order:
      1. the ``<div id="mkdocs-theme-images" ...> ... </div>`` block,
      2. Markdown image tags of the form ``![alt](target)``,
      3. HTML comments (``<!-- ... -->``), including multi-line ones,
      4. runs of three or more newlines, which collapse to a single blank line.

    Args:
        text: Raw Markdown source of one page.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Non-greedy match up to the FIRST closing </div>; a carousel containing
    # nested <div> elements would only be removed up to the inner close tag.
    text = re.sub(r'<div id="mkdocs-theme-images".*?</div>', '', text, flags=re.DOTALL)

    # Inline images carry no useful text for retrieval.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)

    # NOTE(review): this pattern was previously the empty string, which made
    # the substitution a no-op. Because the call passes re.DOTALL (only
    # meaningful for a pattern containing "."), the intended pattern is assumed
    # to be an HTML comment matcher that was lost in an export -- confirm.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Removing blocks above can leave large vertical gaps; normalise them.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


# Heading markers to split on, each paired with the metadata key under which
# the matching heading text is stored: "#" -> "Header 1", "##" -> "Header 2",
# "###" -> "Header 3".
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

# First pass: split a page along its Markdown headings.
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Second pass: cap section size at 1000 characters, with a 100-character
# overlap between consecutive chunks.
rec_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

print("‚úÖ Cleaning and splitting logic defined.")

‚úÖ Cleaning and splitting logic defined.


In [9]:
# Parallel lists: entry k of each list describes the same chunk.
all_texts = []
all_metadatas = []
all_ids = []

# glob.glob returns paths in arbitrary, OS-dependent order. Sorting keeps the
# chunk order -- and therefore the batch boundaries used when inserting --
# stable from one run to the next.
md_files = sorted(glob.glob("user-guide/*.md"))

print(f"üìÇ Found {len(md_files)} Markdown files.")

for file_path in tqdm(md_files, desc="Processing Files"):

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    cleaned_content = clean_text(content)

    # Two-stage split: by Markdown heading first, then by size.
    md_splits = md_splitter.split_text(cleaned_content)
    final_splits = rec_splitter.split_documents(md_splits)

    file_name = os.path.basename(file_path)

    for i, doc in enumerate(final_splits):

        # Breadcrumb built from the chunk's "Header ..." metadata values,
        # e.g. "Title > Section > Subsection".
        header_path = " > ".join(
            value for key, value in doc.metadata.items() if key.startswith("Header")
        )

        # Prefix each chunk with its file and section so the embedded text
        # carries that context along with the content itself.
        enriched_text = f"File: {file_name}\nSection: {header_path}\nContent:\n{doc.page_content}"
        all_texts.append(enriched_text)

        # Copy before adding "source" so the splitter's document is not mutated.
        meta = doc.metadata.copy()
        meta["source"] = file_name
        all_metadatas.append(meta)

        # IDs are unique as long as base file names are unique, which holds
        # for a single-directory glob.
        all_ids.append(f"{file_name}-chunk-{i}")

print(f"\n‚úÖ Processing complete. Prepared {len(all_texts)} chunks.")

üìÇ Found 9 Markdown files.


Processing Files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:00<00:00, 254.33it/s]


‚úÖ Processing complete. Prepared 144 chunks.





In [10]:
def batch_insert(texts, metadatas, ids, batch_size=20, target_collection=None,
                 pause_seconds=1.5, retry_wait_seconds=10.0):
    """
    Insert chunks into a Chroma collection in batches, skipping stored IDs.

    For every batch the collection is first queried for the batch's IDs; only
    the IDs that are not yet stored are inserted. If an insert raises, the
    function waits ``retry_wait_seconds`` and tries that batch once more; a
    second failure propagates to the caller.

    Args:
        texts: Documents to insert.
        metadatas: One metadata dict per document, same order as ``texts``.
        ids: One unique ID per document, same order as ``texts``.
        batch_size: Maximum number of documents per insert call (>= 1).
        target_collection: Object exposing ``get(ids=...)`` and
            ``add(documents=..., metadatas=..., ids=...)``. Defaults to the
            module-level ``collection``.
        pause_seconds: Delay after each inserted batch (presumably to stay
            under the embedding API's rate limit -- confirm).
        retry_wait_seconds: Delay before the single retry of a failed batch.

    Raises:
        ValueError: If the three input lists differ in length or
            ``batch_size`` is smaller than 1.
        Exception: Whatever the collection raises when the retry also fails.
    """
    if not (len(texts) == len(metadatas) == len(ids)):
        raise ValueError("texts, metadatas and ids must have the same length.")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    store = collection if target_collection is None else target_collection

    total = len(texts)
    print(f"üöÄ Starting insertion of {total} chunks...")

    for i in range(0, total, batch_size):
        batch_end = min(i + batch_size, total)
        batch_ids = ids[i:batch_end]

        print(f"   Processing batch {i} to {batch_end}...")

        # Ask the store which of this batch's IDs it already holds, so that
        # re-running the notebook neither re-embeds nor re-submits them.
        already_stored = set(store.get(ids=batch_ids)['ids'])
        missing = [j for j in range(i, batch_end) if ids[j] not in already_stored]

        if not missing:
            print("      ‚è© Batch already exists. Skipping.")
            continue

        payload = {
            "documents": [texts[j] for j in missing],
            "metadatas": [metadatas[j] for j in missing],
            "ids": [ids[j] for j in missing],
        }

        try:
            store.add(**payload)
        except Exception as exc:
            # Broad catch on purpose: the concrete exception types raised by
            # the embedding/HTTP layers are not visible from here. One retry
            # after a wait; a second failure is allowed to propagate.
            print(f"      Insert failed ({exc}). Retrying in {retry_wait_seconds}s...")
            time.sleep(retry_wait_seconds)
            store.add(**payload)
            print("      ‚úÖ Retry successful.")

        time.sleep(pause_seconds)

# Push every prepared chunk into the collection (default batch size).
batch_insert(
    texts=all_texts,
    metadatas=all_metadatas,
    ids=all_ids,
)

print("\nüéâ Database creation successful! You can now run RAG.py.")

üöÄ Starting insertion of 144 chunks...
   Processing batch 0 to 20...
      ‚è© Batch already exists. Skipping.
   Processing batch 20 to 40...
      ‚è© Batch already exists. Skipping.
   Processing batch 40 to 60...
      ‚è© Batch already exists. Skipping.
   Processing batch 60 to 80...
      ‚è© Batch already exists. Skipping.
   Processing batch 80 to 100...
      ‚è© Batch already exists. Skipping.
   Processing batch 100 to 120...
      ‚è© Batch already exists. Skipping.
   Processing batch 120 to 140...
      ‚è© Batch already exists. Skipping.
   Processing batch 140 to 144...
      ‚è© Batch already exists. Skipping.

üéâ Database creation successful! You can now run RAG.py.
