In [1]:
# %%
# DEBUG SCRIPT: Find out why "gdje_smo.md" isn't being retrieved

import chromadb
from google import genai
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then build the Gemini client
# from the GOOGLE_API_KEY value (None when the variable is unset).
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

# Get embedding function
from google.genai import types
from chromadb import EmbeddingFunction, Documents, Embeddings
from google.api_core import retry

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the Gemini embedding model.

    ``document_mode`` selects the task type sent with each request:
    ``retrieval_document`` when true, ``retrieval_query`` otherwise.
    """

    # Class-level default; set it to False on the instance before embedding
    # search queries.
    document_mode = True

    def __init__(self, client):
        """Store the API client and prepare the retry wrapper.

        Args:
            client: object exposing ``models.embed_content``.
        """

        def _should_retry(exc):
            # Retry only API errors whose code is 429 or 503.
            if not isinstance(exc, genai.errors.APIError):
                return False
            return exc.code in {429, 503}

        self.client = client
        self._retry = retry.Retry(predicate=_should_retry)

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one vector per returned embedding."""
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        # Wrap the API call so that retryable errors are re-attempted.
        embed_with_retry = self._retry(self.client.models.embed_content)
        response = embed_with_retry(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(task_type=embedding_task),
        )
        return [embedding.values for embedding in response.embeddings]

embed_fn = GeminiEmbeddingFunction(client)

# Open the persistent Chroma store under ./output_hr and fetch the existing
# collection, attaching the Gemini embedding function to it.
DB_NAME = "hrstud-bot-hr"
chroma_client = chromadb.PersistentClient(path="./output_hr")
collection = chroma_client.get_collection(DB_NAME, embedding_function=embed_fn)

# Report the collection size, followed by one blank line.
print(f"Total documents in collection: {collection.count()}", end="\n\n")

# %%
# STEP 1: Confirm that chunks of the target document are in the collection
print("=" * 60)
print("STEP 1: Search for 'gdje_smo.md' in collection")
print("=" * 60)

# Exact-match metadata filter on the `source` field, capped at 100 entries.
# `all_docs` is read again by the diagnosis step at the end of the script.
all_docs = collection.get(
    where={"source": {"$eq": "fhs.hr_o_nama_gdje_smo.md"}},
    limit=100,
)

matched_ids = all_docs['ids']
print(f"Found {len(matched_ids)} chunks from 'fhs.hr_o_nama_gdje_smo.md'")

if not matched_ids:
    print("❌ Document NOT FOUND in collection!")
    print("This means the document wasn't indexed properly.")
else:
    print("\n--- Sample chunks from this document ---")
    # Show at most the first five chunks.
    sample = zip(matched_ids[:5], all_docs['documents'][:5], all_docs['metadatas'][:5])
    for position, (chunk_id, text, metadata) in enumerate(sample, start=1):
        print(f"\nChunk {position}:")
        print(f"ID: {chunk_id}")
        print(f"Header: {metadata.get('header_path', 'N/A')}")
        print(f"Content preview: {text[:200]}...")

# %%
# STEP 2: Run several phrasings of the question through vector search
print("\n" + "=" * 60)
print("STEP 2: Test different search queries")
print("=" * 60)

# Switch the embedding function to query mode so the search text is embedded
# with the "retrieval_query" task type.
embed_fn.document_mode = False

test_queries = [
    "Kako doći do Kampusa Borongaj",
    "ZET linija 215 236",
    "javni prijevoz Borongaj",
    "tramvaj autobus Kampus",
    "Čavićeva Lozarinska",
]

for query in test_queries:
    print(f"\n--- Query: '{query}' ---")
    results = collection.query(query_texts=[query], n_results=3)

    # One query text was submitted, so its hits are at index 0 of each field.
    hit_texts = results['documents'][0]
    hit_metadatas = results['metadatas'][0]
    hit_distances = results['distances'][0]

    hits = zip(hit_texts, hit_metadatas, hit_distances)
    for rank, (text, metadata, distance) in enumerate(hits, start=1):
        print(f"\nResult {rank} (distance: {distance:.3f}):")
        print(f"Source: {metadata.get('source', 'Unknown')}")
        print(f"Header: {metadata.get('header_path', 'N/A')}")
        print(f"Content: {text[:150]}...")

# %%
# STEP 3: Verify that the source markdown file is present on disk
print("\n" + "=" * 60)
print("STEP 3: Check original markdown file")
print("=" * 60)

file_path = "./markdown_hr/fhs.hr_o_nama_gdje_smo.md"
if not os.path.exists(file_path):
    print(f"❌ File NOT FOUND: {file_path}")
else:
    print(f"✓ File exists: {file_path}")
    # Read the whole file as UTF-8 text; only the first 500 characters are shown.
    with open(file_path, encoding='utf-8') as f:
        content = f.read()
    print(f"File size: {len(content)} characters")
    print("\n--- File content preview ---")
    print(content[:500])

# %%
# STEP 4: Manual search in collection by content
print("\n" + "="*60)
print("STEP 4: Search for 'ZET' or 'Borongaj' in all documents")
print("="*60)

# Substring filter on the stored chunk text (no embeddings involved), run once
# per keyword so that both terms named in the banner are actually checked.
# NOTE(review): this fetches every matching chunk, which may be slow on a
# large collection; acceptable for a one-off debugging session.
for keyword in ("ZET", "Borongaj"):
    results = collection.get(
        where_document={"$contains": keyword}
    )

    hit_count = len(results['ids'])
    print(f"\nFound {hit_count} chunks containing '{keyword}'")

    if not hit_count:
        # A keyword miss only shows that no stored chunk contains this text.
        # Whether the target document was indexed is what STEP 1 determines.
        print(f"❌ NO chunks found containing '{keyword}'!")
        print("No indexed chunk contains this keyword; see STEP 1 for whether the document itself is in the collection.")
        continue

    print(f"\n--- First 3 chunks containing '{keyword}' ---")
    preview = zip(results['documents'][:3], results['metadatas'][:3])
    for position, (doc, meta) in enumerate(preview, start=1):
        print(f"\nChunk {position}:")
        print(f"Source: {meta.get('source', 'Unknown')}")
        print(f"Content: {doc[:300]}...")

# %%
# STEP 5: Print a diagnosis chosen by whether STEP 1 found any chunks
print("\n" + "=" * 60)
print("DIAGNOSIS & SOLUTION")
print("=" * 60)

# Number of chunks the STEP 1 metadata lookup returned for the target file.
indexed_chunk_count = len(all_docs['ids'])

if indexed_chunk_count:
    print(f"""
✓ Document IS in collection ({indexed_chunk_count} chunks)

❌ PROBLEM: Vector search is not ranking it highly enough

POSSIBLE CAUSES:
1. Query embedding doesn't match document embedding well
2. Chunks are too small/fragmented
3. Important keywords are split across chunks

SOLUTION:
Try these alternative queries:
- "javni prijevoz kampus Borongaj ZET linije"
- "autobusne linije do Fakulteta"

Or improve chunking strategy (increase chunk_size to 1500)
""")
else:
    print("""
❌ PROBLEM IDENTIFIED: The document 'fhs.hr_o_nama_gdje_smo.md' is NOT in the collection.

POSSIBLE CAUSES:
1. The file wasn't in the './markdown_hr' folder when you created the collection
2. The file had an encoding error during parsing
3. The parse_markdown_for_metadata() function skipped it

SOLUTION:
1. Verify the file exists at: ./markdown_hr/fhs.hr_o_nama_gdje_smo.md
2. Delete the collection: rm -rf ./output_hr
3. Re-run the document parsing and collection creation:
   
   md_documents = parse_markdown_for_metadata("./markdown_hr")
   print(f"Parsed {len(md_documents)} documents")
   
   # Check if gdje_smo is in the parsed documents
   gdje_smo_docs = [d for d in md_documents if 'gdje_smo' in d.metadata.get('source', '')]
   print(f"Found {len(gdje_smo_docs)} chunks from gdje_smo.md")
   
   # Then recreate collection
   create_collection(chroma_persistent_client, gemini_embedding_function, md_documents)
""")

Total documents in collection: 19126

STEP 1: Search for 'gdje_smo.md' in collection
Found 2 chunks from 'fhs.hr_o_nama_gdje_smo.md'

--- Sample chunks from this document ---

Chunk 1:
ID: hrstud-bot-hr_doc_18599
Header: 
Content preview: [Article Link](https://www.fhs.hr/o_nama/gdje_smo)...

Chunk 2:
ID: hrstud-bot-hr_doc_18600
Header: 
Content preview: Kako stići do nas? Do Znanstveno – učilišnog kampusa Borongaj može se doći javnim prijevozom:
* - iz Ulice grada Gospića redovnom tramvajskom linijom broj 2, 3 ili 13: sići na stanici ”Čavićeva” odakl...

STEP 2: Test different search queries

--- Query: 'Kako doći do Kampusa Borongaj' ---

Result 1 (distance: 0.780):
Source: fhs.hr_predmet_pupd_b.md
Header: Obavijesti
Content: #####  [Upis ocjena](https://www.fhs.hr/predmet/pupd_b?@=21ft4#news_116095)
Poštovane studentice i poštovani studenti, upis ocjena biti će 12. srpnja ...

Result 2 (distance: 0.781):
Source: fhs.hr_predmet_mnp_a.md
Header: Obavijesti
Content: #####  [Upis ocjena