In [1]:
# Debug script to find out why "Tko predaje Opća povijest srednjeg vijeka" fails

import chromadb
from google import genai
import os
from dotenv import load_dotenv
from google.genai import types
from chromadb import EmbeddingFunction, Documents, Embeddings
from google.api_core import retry

# Setup: call load_dotenv() first so the key lookup below runs afterwards,
# then build the Gemini client from the GOOGLE_API_KEY environment variable.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the Gemini embedding model.

    The ``document_mode`` flag selects the embedding task type: leave it
    ``True`` to embed texts as documents, set it to ``False`` to embed
    texts as queries.
    """

    # True -> "retrieval_document" task; False -> "retrieval_query" task.
    document_mode = True

    def __init__(self, client):
        """Store the Gemini client and prepare the retry wrapper.

        Args:
            client: Object exposing ``models.embed_content``.
        """

        def _should_retry(exc):
            # Retry only API errors reporting code 429 or 503.
            return isinstance(exc, genai.errors.APIError) and exc.code in {429, 503}

        self.client = client
        self._retry = retry.Retry(predicate=_should_retry)

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one vector per embedding in the response."""
        if self.document_mode:
            task = "retrieval_document"
        else:
            task = "retrieval_query"

        # Wrap the API call so failures matching the predicate are retried.
        embed_with_retry = self._retry(self.client.models.embed_content)
        response = embed_with_retry(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(task_type=task),
        )
        return [embedding.values for embedding in response.embeddings]

# One shared embedding function instance, handed to the collection below.
embed_fn = GeminiEmbeddingFunction(client)

# Open the persisted Chroma store and fetch the collection by name.
DB_NAME = "hrstud-bot-hr"
chroma_client = chromadb.PersistentClient(path="./output_hr")
collection = chroma_client.get_collection(DB_NAME, embedding_function=embed_fn)

# Quick sanity summary, followed by one blank line.
print(f"Collection: {collection.name}")
total_documents = collection.count()
print(f"Total documents: {total_documents}", end="\n\n")

# ==============================================================================
# STEP 1: Check if the documents exist
# ==============================================================================
print("="*70)
print("STEP 1: Search for documents containing course info")
print("="*70)

# Substring filter for chunks that mention the course by its exact name.
# No `limit` is passed on purpose: the count printed below (and the
# `course_docs` result reused by the diagnosis step) must cover every matching
# chunk. With a limit the "Found N chunks" figure silently saturates at the
# limit. Only the preview further down is capped.
course_docs = collection.get(
    where_document={"$contains": "Opća povijest srednjega vijeka"},
)

print(f"Found {len(course_docs['ids'])} chunks containing 'Opća povijest srednjega vijeka'")

if course_docs['ids']:
    print("\n--- First 3 chunks ---")
    for i, (doc_id, doc, meta) in enumerate(zip(
        course_docs['ids'][:3],
        course_docs['documents'][:3],
        course_docs['metadatas'][:3],
    ), start=1):
        # A chunk stored without metadata would otherwise crash on `.get`.
        meta = meta or {}

        print(f"\nChunk {i}:")
        print(f"ID: {doc_id}")
        print(f"Source: {meta.get('source', 'Unknown')}")
        print(f"Header: {meta.get('header_path', 'N/A')}")
        print("Content preview (first 300 chars):")
        print(doc[:300])
        print("...")

        # Check if it contains instructor info (case-insensitive; matches
        # either the surname or the first name).
        lowered = doc.lower()
        if "jerković" in lowered or "marko" in lowered:
            print("✓ CONTAINS INSTRUCTOR INFO!")
        else:
            print("✗ No instructor info in this chunk")
else:
    print("❌ NO documents found with course name!")

# ==============================================================================
# STEP 2: Check if instructor document exists
# ==============================================================================
print("\n" + "="*70)
print("STEP 2: Search for Marko Jerković documents")
print("="*70)

# Substring filter for chunks that mention the instructor's full name.
# No `limit` is passed so the reported count is the real number of matching
# chunks rather than being capped; only the preview below is restricted.
jerkovic_docs = collection.get(
    where_document={"$contains": "Marko Jerković"},
)

print(f"Found {len(jerkovic_docs['ids'])} chunks containing 'Marko Jerković'")

if jerkovic_docs['ids']:
    print("\n--- Sample chunks ---")
    for i, (doc, meta) in enumerate(
        zip(jerkovic_docs['documents'][:2], jerkovic_docs['metadatas'][:2]),
        start=1,
    ):
        # A chunk stored without metadata would otherwise crash on `.get`.
        meta = meta or {}

        print(f"\nChunk {i}:")
        print(f"Source: {meta.get('source', 'Unknown')}")
        print("Content preview:")
        print(doc[:300])
        print("...")
else:
    # Report the empty case explicitly, as STEP 1 does, instead of staying silent.
    print("❌ NO documents found with instructor name!")

# ==============================================================================
# STEP 3: Try different query variations
# ==============================================================================
print(f"\n{'=' * 70}")
print("STEP 3: Test different query variations")
print("=" * 70)

# From here on texts are embedded with the "retrieval_query" task type.
embed_fn.document_mode = False

# The original failing question plus rephrasings that add the course name,
# role keywords, or the instructor's name.
test_queries = [
    "Tko predaje Opća povijest srednjeg vijeka",
    "Opća povijest srednjega vijeka nositelj",
    "Opća povijest srednjega vijeka profesor",
    "Marko Jerković povijest",
    "nositelj predmeta Opća povijest srednjega vijeka",
    "izv. prof. dr. sc. Marko Jerković Opća povijest",
]

for query in test_queries:
    print(f"\n--- Query: '{query}' ---")
    results = collection.query(query_texts=[query], n_results=3)

    # One query text was sent, so row 0 of each field holds its hits.
    hits = zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0],
    )
    for rank, (doc, meta, dist) in enumerate(hits, start=1):
        print(f"\nResult {rank} (distance: {dist:.3f}):")
        print(f"Source: {meta.get('source', 'Unknown')}")
        print(f"Content: {doc[:200]}...")

# ==============================================================================
# STEP 4: Check what the top result actually contains
# ==============================================================================
print(f"\n{'=' * 70}")
print("STEP 4: Detailed analysis of top result")
print("=" * 70)

# Make sure the text is embedded as a query even if this step runs on its own.
embed_fn.document_mode = False
original_query = "Tko predaje Opća povijest srednjeg vijeka"
results = collection.query(query_texts=[original_query], n_results=1)

# One query, one requested result: element [0][0] of each field is the top hit.
top_doc, top_meta, top_dist = (
    results[field][0][0] for field in ('documents', 'metadatas', 'distances')
)

print(f"Top result for: '{original_query}'")
print(f"Distance: {top_dist:.3f}")
print(f"Source: {top_meta.get('source', 'Unknown')}")
print(f"Header path: {top_meta.get('header_path', 'N/A')}")
print(f"\nFull content:\n{top_doc}")

# ==============================================================================
# STEP 5: Diagnosis
# ==============================================================================
print("\n" + "="*70)
print("DIAGNOSIS")
print("="*70)

# Count how many of the course-name chunks also mention the instructor
# (case-insensitive; either the surname or the first name counts).
both_count = sum(
    1
    for text in (doc.lower() for doc in course_docs['documents'])
    if "jerković" in text or "marko" in text
)

print(f"\nChunks containing course name: {len(course_docs['ids'])}")
print(f"Chunks containing BOTH course name AND instructor: {both_count}")

if not course_docs['ids']:
    # Without any course-name chunk, both_count is trivially 0. Handle this
    # before the next branch, whose "split across chunks" diagnosis and
    # chunk_size advice would be wrong here.
    print("\n❌ PROBLEM IDENTIFIED:")
    print("No chunk contains the course name at all!")
    print("\nSOLUTION: Check that the course page was ingested into the collection")
elif both_count == 0:
    print("\n❌ PROBLEM IDENTIFIED:")
    print("The course name and instructor are NOT in the same chunk!")
    print("This is why the query fails - the embedding can't find a chunk with both pieces of info.")
    print("\nSOLUTION: Increase chunk_size to keep related info together")
elif top_dist > 0.75:
    # top_dist is the distance of the best hit for the original query (STEP 4).
    print("\n❌ PROBLEM IDENTIFIED:")
    print("Documents exist but semantic similarity is too low (distance > 0.75)")
    print("The query phrasing doesn't match document phrasing well.")
    print("\nSOLUTION: Use query expansion to add keywords like 'nositelj', 'profesor'")
else:
    print("\n✓ Documents seem OK, but something else is wrong")
    print("Check the prompt in get_article_hr() - maybe it's not extracting the info correctly")

Collection: hrstud-bot-hr
Total documents: 8692

STEP 1: Search for documents containing course info
Found 10 chunks containing 'Opća povijest srednjega vijeka'

--- First 3 chunks ---

Chunk 1:
ID: hrstud-bot-hr_doc_4130
Source: fhs.hr_studiji_prijediplomski_dvopredmetni_studiji_povijest.md
Header: Povijest (dvopredmetni prijediplomski)
Content preview (first 300 chars):
5.0 |  [Opća povijest srednjega vijeka (38043)](https://www.fhs.hr/predmet/opsv) Jerković, M. |  | 30 30P |  [__](javascript:show_window\('/.cms/predmet_info?_v1=r__OG1FyzdMbIxOAwZHej_lCZ3BV8MngE5Ed7I09VTHFWwDwSlP7pB7_GZvayu9oQC9NjKX6LRK3DWgrTPeRXUKhYLIeJpAcZw4DslMNwj4lxJwBjQSaMLVQD-kAEUjCZHRLtGPomn
...
✓ CONTAINS INSTRUCTOR INFO!

Chunk 2:
ID: hrstud-bot-hr_doc_4182
Source: fhs.hr_studiji_prijediplomski_dvopredmetni_studiji_povijest.md
Header: Povijest (dvopredmetni prijediplomski)
Content preview (first 300 chars):
5.0 |  [Opća povijest srednjega vijeka (38043)](https://www.fhs.hr/predmet/opsv) Jerković, M. |  | 30 