In [1]:
# %%
import sys
import time
from google import genai
from google.genai import types
from IPython.display import Markdown
from IPython.display import display
from dotenv import load_dotenv
import os
import glob
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from langcodes import Language
from collections import defaultdict
from tqdm import tqdm
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re
from collections import defaultdict
from typing import List, Tuple
import fasttext
import re
from typing import Any, Dict, List, Tuple

# %%
def import_google_api():
    """
    Build a Gemini API client from the environment and list embedding models.

    Loads variables from a .env file (via ``load_dotenv``), reads
    ``GOOGLE_API_KEY`` from the environment and constructs a ``genai.Client``
    with it. As a quick sanity check, the name of every model whose
    ``supported_actions`` includes ``"embedContent"`` is printed.

    Returns:
        The constructed ``genai.Client``.
    """
    load_dotenv()
    api_client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

    # Simple model check: lazily walk the model listing and print each
    # embedding-capable model as soon as it is seen.
    embedding_model_names = (
        model.name
        for model in api_client.models.list()
        if "embedContent" in model.supported_actions
    )
    for model_name in embedding_model_names:
        print(model_name)

    return api_client

# %%
def embedding_function(client):
    """
    Wrap a Gemini client in a ChromaDB-compatible embedding function.

    Args:
        client: Gemini client exposing ``models.embed_content``.

    Returns:
        A ``GeminiEmbeddingFunction`` instance. Its ``document_mode`` attribute
        selects the embedding task: ``True`` embeds with ``retrieval_document``,
        ``False`` with ``retrieval_query``.
    """

    def _is_retryable(error):
        # Only API errors with status 429 or 503 are retried.
        return isinstance(error, genai.errors.APIError) and error.code in {429, 503}

    class GeminiEmbeddingFunction(EmbeddingFunction):
        # Callers toggle this flag before indexing (True) or querying (False).
        document_mode = True

        def __init__(self, client):
            self.client = client
            self._retry = retry.Retry(predicate=_is_retryable)

        def __call__(self, input: Documents) -> Embeddings:
            if self.document_mode:
                task = "retrieval_document"
            else:
                task = "retrieval_query"

            # Wrap the API call so retryable failures are re-attempted.
            embed_with_retry = self._retry(self.client.models.embed_content)
            response = embed_with_retry(
                model="models/text-embedding-004",
                contents=input,
                config=types.EmbedContentConfig(task_type=task),
            )
            return [embedding.values for embedding in response.embeddings]

    return GeminiEmbeddingFunction(client)

# %%
class Document:
    """
    Minimal text chunk container: the chunk text plus a metadata dict.

    NOTE(review): this definition shadows the ``Document`` name imported from
    ``langchain_core.documents`` at the top of the file; code below this point
    that says ``Document(...)`` gets this class, not the LangChain one.
    """

    def __init__(self, page_content: str, metadata: dict = None):
        # A fresh dict is created per instance when no metadata is supplied,
        # so instances never share a default mapping.
        self.metadata = {} if metadata is None else metadata
        self.page_content = page_content

In [2]:
# %%
def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """
    Split every Markdown file under ``directory`` into chunked Documents.

    Each file is split first on ``#``/``##``/``###`` headers and then into
    character chunks (2000 characters, 300 overlap). A compact prefix of the
    form ``"<doc name> | <header path>: "`` is prepended to a chunk's text when
    it is the first chunk of its file or carries header metadata; other chunks
    keep their text unchanged.

    Args:
        directory: Root folder searched recursively for ``*.md`` files.
        google_drive_path: Optional value stored as ``source_path`` for every
            chunk; when omitted (or empty) the local file path is stored.

    Returns:
        A list of Documents in a reproducible order (files sorted by path,
        chunks in file order). Each Document's metadata holds the header
        fields produced by the header splitter plus ``source``,
        ``article_link``, ``doc_name``, ``source_path``, ``chunk_index``,
        ``total_chunks``, ``header_path`` and ``is_first_chunk``. An empty list
        is returned when no Markdown files are found. Files that cannot be
        read are reported and skipped.
    """
    # glob's result order depends on the file system. Sorting keeps the chunk
    # order stable between runs, which matters when a caller derives IDs from
    # list positions.
    markdown_files = sorted(glob.glob(os.path.join(directory, '**/*.md'), recursive=True))
    if not markdown_files:
        return []

    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report which file failed and carry on with the rest.
            print(f"Error reading {filepath}: {e}")
            continue

        filename_base = os.path.basename(filepath)
        name_no_ext = os.path.splitext(filename_base)[0]

        # Clean document identifier: drop the "fhs.hr_" marker, and use spaces
        # instead of underscores for the human-readable name.
        clean_id = name_no_ext.replace("fhs.hr_", "")
        doc_name_clean = clean_id.replace("_", " ")

        md_header_splits = markdown_splitter.split_text(markdown_text)
        final_splits = text_splitter.split_documents(md_header_splits)
        total_chunks = len(final_splits)

        for i, split in enumerate(final_splits):
            # Header path such as "Title > Section > Subsection", built from
            # the metadata entries whose key mentions "Header".
            header_context = " > ".join(v for k, v in split.metadata.items() if "Header" in k)

            if i == 0 or header_context:
                # Compact format: "DocName | Section: content"
                metadata_prefix = doc_name_clean
                if header_context:
                    metadata_prefix += f" | {header_context}"
                searchable_content = f"{metadata_prefix}: {split.page_content}"
            else:
                # Later chunks without headers: pure content
                searchable_content = split.page_content

            # Metadata kept alongside the chunk text for filtering and for
            # building source references later.
            metadata = split.metadata.copy()
            metadata["source"] = filename_base
            metadata["article_link"] = clean_id
            metadata["doc_name"] = doc_name_clean
            metadata["source_path"] = google_drive_path or filepath
            metadata["chunk_index"] = i
            metadata["total_chunks"] = total_chunks
            metadata["header_path"] = header_context
            metadata["is_first_chunk"] = (i == 0)

            all_documents.append(Document(page_content=searchable_content, metadata=metadata))

    return all_documents

# %%
def create_collection(chroma_client, gemini_embedding_function, documents_list):
    """
    Populate the "hrstud-bot-en" ChromaDB collection if it is still empty.

    The collection is fetched or created with the given embedding function
    (switched to document mode first). When the collection holds no entries,
    all documents are added in batches of 100 with a short pause between
    batches; otherwise nothing is added and the current count is reported.

    Args:
        chroma_client: Client exposing ``get_or_create_collection``.
        gemini_embedding_function: Embedding function with a settable
            ``document_mode`` attribute.
        documents_list: Objects exposing ``page_content`` and ``metadata``.
    """
    DB_NAME = "hrstud-bot-en"
    BATCH_SIZE = 100

    # Indexing must use the document-side embedding task.
    gemini_embedding_function.document_mode = True

    collection = chroma_client.get_or_create_collection(
        name=DB_NAME,
        metadata={"model": "models/text-embedding-004", "dimension": 768},
        embedding_function=gemini_embedding_function,
    )

    texts = []
    metas = []
    for doc in documents_list:
        texts.append(doc.page_content)
        metas.append(doc.metadata)
    # IDs are positional: "<collection name>_doc_<index in documents_list>".
    doc_ids = [f"{DB_NAME}_doc_{position}" for position in range(len(texts))]

    if collection.count() != 0:
        print(f"Collection '{DB_NAME}' already has {collection.count()} documents.")
        return

    print(f"Adding {len(texts)} documents to ChromaDB collection: {DB_NAME}")

    for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Adding documents", unit="batch"):
        # Slicing clamps at the end of the list, so the last batch may be short.
        stop = start + BATCH_SIZE
        collection.add(
            documents=texts[start:stop],
            metadatas=metas[start:stop],
            ids=doc_ids[start:stop],
        )
        # Brief pause between batches.
        time.sleep(0.2)

    print(f"\nCollection '{DB_NAME}' now contains {collection.count()} documents.")

# %%
def persistent_client(embed_fn):
    """
    Open the persisted "hrstud-bot-en" collection stored under ./output_en.

    Prints the collection's name, entry count and metadata.

    Args:
        embed_fn: Embedding function to attach to the opened collection.

    Returns:
        A ``(embed_fn, collection)`` tuple: the embedding function that was
        passed in, and the opened collection.
    """
    DB_NAME = "hrstud-bot-en"
    storage_path = "./output_en"

    store = chromadb.PersistentClient(path=storage_path)
    opened = store.get_collection(DB_NAME, embedding_function=embed_fn)

    # Short connection summary for the notebook output.
    print(f"Connected to collection: {opened.name}")
    print(f"Documents: {opened.count()}")
    print(f"Metadata: {opened.metadata}")

    return embed_fn, opened

# %%
def extract_document_filter(query: str) -> dict:
    """
    Derive a ``doc_name`` filter from a user query, if one is implied.

    A filter is produced only when the query (case-insensitively) contains one
    of the phrases "in the document", "on the page" or "document about" AND a
    keyword from one of the topic groups below. The first matching group, in
    declaration order, wins.

    Returns:
        ``{"doc_name": {"$contains": <topic>}}`` for the matched topic, or
        ``None`` when no filter applies.

    NOTE(review): ``$contains`` is documented by ChromaDB for ``where_document``
    (document text) filters; confirm that the installed version accepts it in a
    metadata ``where`` clause before relying on this filter.
    """
    lowered = query.lower()

    # Without an explicit reference to a document/page there is never a filter.
    explicit_reference_phrases = ("in the document", "on the page", "document about")
    if not any(phrase in lowered for phrase in explicit_reference_phrases):
        return None

    # Topic -> keywords that signal it (plain substring matching).
    topic_keywords = {
        "admission": ["admission", "enroll", "apply"],
        "program": ["program", "study", "course"],
        "schedule": ["schedule", "timetable"],
        "exam": ["exam", "test", "assessment"],
    }

    for topic, keywords in topic_keywords.items():
        for keyword in keywords:
            if keyword in lowered:
                return {"doc_name": {"$contains": topic}}

    return None  # No specific filter

# %%
def extract_article_link_from_content(content: str) -> str:
    """
    Find a Markdown link labelled exactly "Article Link" near the start of the text.

    Only the first 1000 characters of ``content`` are searched, and only
    http/https URLs are accepted.

    Returns:
        The link re-rendered as ``[Article Link](<url>)``, or ``None`` when no
        such link is found in the searched prefix.
    """
    searched_prefix = content[:1000]

    # Exact label "[Article Link]" followed by "(http...)" up to the first ")".
    found = re.compile(r'\[Article Link\]\((https?://[^\)]+)\)').search(searched_prefix)
    if found is None:
        return None

    return f"[Article Link]({found.group(1)})"

In [3]:
def get_article_en(user_query, embed_fn, collection, client):
    """
    Answer a question about the faculty using retrieved chunks as context.

    Steps: expand the query with topic keywords, run a vector search on
    ``collection`` (optionally with the metadata filter derived by
    ``extract_document_filter``), keep sufficiently close and non-duplicate
    passages, and ask the Gemini model to answer from that context only.

    Args:
        user_query: The user's question.
        embed_fn: Embedding function attached to ``collection``. Its
            ``document_mode`` is set to ``False`` and left that way.
        collection: Collection exposing ``query(query_texts=..., n_results=...,
            where=...)``.
        client: Gemini client exposing ``models.generate_content``.

    Returns:
        The model's answer as stripped text. A fixed apology message is
        returned when no passage passes the distance threshold, and another
        when the model returns no text.
    """
    # 1. QUERY PREPARATION
    # Queries are embedded with the query-side task.
    embed_fn.document_mode = False
    query_lower = user_query.lower()

    # Initial values
    expanded_query = user_query
    n_results_to_fetch = 12
    metadata_filter = extract_document_filter(user_query)

    # 2. QUERY EXPANSION LOGIC (first matching topic wins; substring matching)
    if any(word in query_lower for word in ["teaches", "who", "professor", "instructor", "course", "class"]):
        expanded_query = f"{user_query} professor instructor title teaching course class program study"

    elif any(word in query_lower for word in ["get to", "location", "where", "campus", "borongaj", "bus", "train"]):
        expanded_query = f"{user_query} location address campus borongaj bus 215 236 train station Trnava"

    elif any(word in query_lower for word in ["program", "offer", "study", "admission", "major"]):
        expanded_query = f"{user_query} list of programs undergraduate graduate doctoral study croatology history sociology psychology communication philosophy"
        n_results_to_fetch = 20

    # 3. VECTOR SEARCH WITH OPTIONAL FILTERING
    query_params = {
        "query_texts": [expanded_query],
        "n_results": n_results_to_fetch
    }

    if metadata_filter:
        query_params["where"] = metadata_filter
        print(f"Applying metadata filter: {metadata_filter}")

    try:
        result = collection.query(**query_params)
    except Exception as exc:
        # The optional filter must never make a question unanswerable: if the
        # filtered search fails (for example because the store rejects the
        # filter expression), fall back to an unfiltered search. Failures of
        # an unfiltered search are propagated unchanged.
        if "where" not in query_params:
            raise
        print(f"Metadata filter could not be applied ({exc}); retrying without filter.")
        del query_params["where"]
        result = collection.query(**query_params)

    all_passages = result["documents"][0]
    all_metadatas = result["metadatas"][0]
    all_distances = result["distances"][0]

    # 4. DEDUPLICATION & CONTEXT PREPARATION
    context_list = []
    seen_passages = set()

    # Dynamic threshold based on query type
    distance_threshold = 0.85 if "program" in query_lower or "study" in query_lower else 0.90

    for p, m, d in zip(all_passages, all_metadatas, all_distances):
        if d >= distance_threshold:
            continue

        # Use first 200 chars as fingerprint to drop near-identical passages
        fingerprint = p.strip()[:200]
        if fingerprint in seen_passages:
            continue
        seen_passages.add(fingerprint)

        chunk_url = m.get("article_link", "")
        header = m.get("header_path", "")

        # Include header context if available
        context_entry = f"Source URL: {chunk_url}"
        if header:
            context_entry += f"\nSection: {header}"
        context_entry += f"\n{p.strip()}"

        context_list.append(context_entry)

    if not context_list:
        return "Unfortunately, I cannot find information about your query in the knowledge base. Please contact the student office for additional information."

    context = "\n\n".join(context_list)
    query_oneline = user_query.replace("\n", " ")

    # 5. PROMPT
    # The "Source" line of the answer is chosen by the model from the
    # per-passage "Source URL" entries in the context.
    prompt = f"""You are a kind, precise, and informative chatbot for the **Faculty of Croatian Studies**. Your task is to answer questions about the faculty.

**CRITICAL RULES:**
1. Use ONLY the provided documentation (CONTEXT).
2. Respond in **English**.
3. **GROUPING:** If the same person teaches multiple courses, mention their name ONLY ONCE. Group programs by level.
4. **LINKS:** Display courses, emails, and programs as Markdown links [Name](URL).
5. **NO INTRODUCTION:** Start directly with the relevant answer without phrases like "Of course...".

**FORMATTING:**
* First line: **Source:** The link that seems most relevant from the documentation.
* Blank line after source.
* **Bold text** for key terms.
* Lists (bullet points) for enumeration.
* If the provided link is in this format, e.g., for Email: E-mail: [idzinic@fhs.hr](javascript:startMail('qvvmva@pus.feu');)
    - display as a plain link without the javascript part, e.g., E-mail: idzinic@fhs.hr
    - remove the javascript part from the link for every link
**AVAILABLE DOCUMENTATION:**
{context}

**USER QUESTION:** {query_oneline}

**ANSWER:**"""

    # 6. GENERATE RESPONSE
    answer = client.models.generate_content(
        model="gemini-2.0-flash-exp",
        contents=prompt,
        config={
            "max_output_tokens": 2048,
            "temperature": 0.1,
            "top_p": 0.9
        }
    )

    # The response may carry no text at all; treat that as "no answer"
    # instead of failing on None.strip().
    answer_text = (answer.text or "").strip()
    if not answer_text:
        return "Unfortunately, I could not generate an answer to your query. Please try rephrasing your question."

    return answer_text

In [4]:
# %%
# USAGE EXAMPLE
# Builds the vector index from the local Markdown export. The statements below
# depend on each other and must run in this order.

# Folder that is searched recursively for *.md files.
markdown_folder = "./markdown_en"

# STEP 1: Parse and chunk documents (run once or when documents change)
md_documents = parse_markdown_for_metadata(markdown_folder)

# STEP 2: Create collection and add documents (run once)
# import_google_api() also prints the embedding-capable model names.
client = import_google_api()
gemini_embedding_function = embedding_function(client)
# Same on-disk location that persistent_client() opens later.
chroma_persistent_client = chromadb.PersistentClient(path="./output_en")
# Only inserts when the collection is empty; otherwise it just reports the
# existing document count.
create_collection(chroma_persistent_client, gemini_embedding_function, md_documents)

# %%
# STEP 3: Query the system
# The client and embedding function are created again here (presumably so this
# cell can be run without re-running the indexing steps above).

client = import_google_api()
gemini_embedding_function = embedding_function(client)
# Opens the existing persisted collection and prints a short summary of it.
embed_fn, collection = persistent_client(gemini_embedding_function)

user_query = "Who is Ivo Džinić?"
response = get_article_en(
    user_query=user_query,
    embed_fn=embed_fn,
    collection=collection,
    client=client
)
# get_article_en returns a string; render it as Markdown in the notebook.
display(Markdown(response))

# %%
# ADVANCED: Test multiple queries
# Reuses `embed_fn`, `collection` and `client` from the STEP 3 cell, so that
# cell must have been run first.

test_queries = [
    "What classes does Mato Škerbić teach?",
    "How do I get to campus?",
    "What study programs are available at the faculty?"
]

for query in test_queries:
    # Banner separating the output of each query.
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_en(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client,
    )
    display(Markdown(response))
    print("\n")

Processing documents: 100%|██████████| 820/820 [00:01<00:00, 489.49it/s]


models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Adding 5857 documents to ChromaDB collection: hrstud-bot-en


Adding documents: 100%|██████████| 59/59 [01:43<00:00,  1.76s/batch]



Collection 'hrstud-bot-en' now contains 5857 documents.
models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Connected to collection: hrstud-bot-en
Documents: 5857
Metadata: {'dimension': 768, 'model': 'models/text-embedding-004'}


**Source:** [staff_ivo.dzinic](https://www.fhs.hr/staff/ivo.dzinic)

**Ivo Džinić** is a Full Professor at the Faculty of Croatian Studies, in the Department of Philosophy and Cultural Studies.

His contact information:
*   Cabinet: Building 78, room 23
*   Consultations: On Fridays, from 11 am to 12 am
*   Public phone number: 01 245-7622
*   Internal phone number: 7622
*   E-mail: [idzinic@fhs.hr](mailto:idzinic@fhs.hr)

He teaches the following courses:

**Undergraduate:**
*   [Ancient Philosophy (214822)](https://www.fhs.hr/en/course/ancphi_a)
*   [A Philosophical and Theological Approach to Population (214008)](https://www.fhs.hr/en/course/apatatp)
*   [Introduction to Cultural Studies (214820)](https://www.fhs.hr/en/course/itcs_b)
*   [Philosophical and Cultural Anthropology (214823)](https://www.fhs.hr/en/course/paca_a)
*   [Philosophy of Culture (214828)](https://www.fhs.hr/en/course/poc_f)

**Graduate:**
*   [Philosophy of Myth and Religion (187904)](https://www.fhs.hr/en/course/pomar)

**Doctoral:**
*   [Hermeneutics (258229)](https://www.fhs.hr/en/course/her)

He was born in Vinkovci in 1975. He speaks Croatian, German, English and Italian and reads Latin and Ancient Greek. He is married and the father of one child.


############################################################
QUERY: What classes does Mato Škerbić teach?
############################################################


**Source:** [staff_matija_mato.skerbic](https://www.fhs.hr/staff/matija_mato.skerbic)

**Matija Mato Škerbić** teaches the following courses:

**Undergraduate:**
*   [Ethics (214834)](https://www.fhs.hr/en/course/eth_a)
*   [Integrative Bioethics (214015)](https://www.fhs.hr/en/course/intbio)
*   [New ethical culture (187891)](https://www.fhs.hr/en/course/nec)

**Graduate:**
*   [Methods of Teaching Philosophy, Logic and Ethics (214635)](https://www.fhs.hr/en/course/motplae)
*   [Philosophical methodology (201535)](https://www.fhs.hr/en/course/phimet)
*   [Philosophy of Education (61957)](https://www.fhs.hr/en/course/poe)
*   [Philosophy of Game and Sports (187914)](https://www.fhs.hr/en/course/pogas)
*   [Master thesis (214814)](https://www.fhs.hr/en/course/masthe_b)




############################################################
QUERY: How do I get to campus?
############################################################


There is no information about how to get to campus in the provided documentation.




############################################################
QUERY: What study programs are available at the faculty?
############################################################


**Source:** [https://www.fhs.unizg.hr/en/study](https://www.fhs.unizg.hr/en/study)

The Faculty of Croatian Studies offers the following study programs:

*   [Undergraduate Study](https://www.fhs.unizg.hr/en/undergraduate_study)
*   [Graduate Study](https://www.fhs.unizg.hr/en/graduate_study)
*   [Postgraduate Study](https://www.fhs.unizg.hr/en/postgraduate_study)
*   [Lifelong learning programs](https://www.fhs.unizg.hr/en/study/lifelong_learning_programs)

Here is a breakdown of the single and double major programs at the undergraduate and graduate levels:

**Undergraduate Programs:**

*   Single Major:
    *   [Communication Studies](https://www.fhs.unizg.hr/en/undergraduate_study/communication_sciences)
    *   [Croatology](https://www.fhs.unizg.hr/en/undergraduate_study/croatology)
    *   [History](https://www.fhs.unizg.hr/en/undergraduate_study/history)
    *   [Psychology](https://www.fhs.unizg.hr/en/undergraduate_study/psychology)
    *   [Sociology](https://www.fhs.unizg.hr/en/undergraduate_study/sociology)
*   Double Major:
    *   [Philosophy and Culture](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/philosophy)
    *   [Communication Studies](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/com)
    *   [Croatology](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/croatian_studies)
    *   [Latin Language](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/latin_language)
    *   [History](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/history)
    *   [Sociology](https://www.fhs.unizg.hr/en/undergraduate_study/double_major/sociology)

**Graduate Programs:**

*   Single Major:
    *   [Communication Studies](https://www.fhs.unizg.hr/en/graduade_study/communicology)
    *   [Croatology](https://www.fhs.unizg.hr/en/graduade_study/croatology/teaching_stream)
    *   [History](https://www.fhs.unizg.hr/en/graduade_study/history/science_stream)
    *   [Psychology](https://www.fhs.unizg.hr/en/graduade_study/psychology)
    *   [Sociology](https://www.fhs.unizg.hr/en/graduade_study/sociology/science_stream)
*   Double Major:
    *   [Croatian latinity](https://www.fhs.unizg.hr/en/graduade_study/croatian_latinity)
    *   [Croatology](https://www.fhs.unizg.hr/en/graduade_study/croatology/double_major)
    *   [History](https://www.fhs.unizg.hr/en/graduade_study/history/double_major)
    *   [Philosophy](https://www.fhs.unizg.hr/en/graduade_study/philosophy/double_major)
    *   [Sociology](https://www.fhs.unizg.hr/en/graduade_study/sociology/teaching_stream)

**Postgraduate Programs:**

*   [Croatology](https://www.fhs.hr/croatology)
*   [History](https://www.fhs.hr/history)
*   [Philosophy](https://www.fhs.hr/philosophy)



