In [None]:
import sys
import time
from google import genai
from google.genai import types
from IPython.display import Markdown
from IPython.display import display
from dotenv import load_dotenv
import os
import glob
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from langcodes import Language
from collections import defaultdict
from tqdm import tqdm
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re
from collections import defaultdict
from typing import List, Tuple
import fasttext
import re
from typing import Any, Dict, List, Tuple

In [None]:
def import_google_api():
    """Create a Gemini API client and print the embedding-capable models.

    Environment variables are loaded with ``load_dotenv()`` and the value of
    ``GOOGLE_API_KEY`` is passed to ``genai.Client``. As a simple check, the
    name of every model whose ``supported_actions`` contains
    ``"embedContent"`` is printed.

    Returns:
        The ``genai.Client`` instance that was created.
    """
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")

    client = genai.Client(api_key=api_key)

    # Simple model check: list models that can be used for embeddings.
    for model in client.models.list():
        # supported_actions is an optional field and can be None; treat that
        # as "no actions" instead of failing with a TypeError on `in`.
        if "embedContent" in (model.supported_actions or ()):
            print(model.name)

    return client

In [None]:
def embedding_function(client):
    """Build a ChromaDB embedding function backed by the Gemini embedding API.

    Args:
        client: Object exposing ``models.embed_content``; stored on the
            returned instance as ``client``.

    Returns:
        A ``GeminiEmbeddingFunction`` instance. Its ``document_mode``
        attribute selects the embedding task type: ``"retrieval_document"``
        while it is truthy, ``"retrieval_query"`` otherwise.
    """

    def _is_transient(error):
        # Only API errors with status 429 or 503 are worth retrying.
        if not isinstance(error, genai.errors.APIError):
            return False
        return error.code in {429, 503}

    class GeminiEmbeddingFunction(EmbeddingFunction):
        # True -> embed as documents, False -> embed as search queries.
        document_mode = True

        def __init__(self, client):
            self.client = client
            self._retry = retry.Retry(predicate=_is_transient)

        def __call__(self, input: Documents) -> Embeddings:
            if self.document_mode:
                task = "retrieval_document"
            else:
                task = "retrieval_query"

            # Wrap the API call so transient failures are retried.
            embed_with_retry = self._retry(self.client.models.embed_content)
            result = embed_with_retry(
                model="models/text-embedding-004",
                contents=input,
                config=types.EmbedContentConfig(task_type=task),
            )
            return [item.values for item in result.embeddings]

    return GeminiEmbeddingFunction(client)

In [None]:
# Minimal local document record: chunk text plus a metadata dictionary.
# NOTE(review): this definition rebinds the name `Document`, which the import
# cell also imports from langchain_core.documents, so code below this cell
# uses this class instead -- confirm that is intended.
class Document:
    """Text content paired with a metadata dictionary."""

    def __init__(self, page_content: str, metadata: dict = None):
        self.page_content = page_content
        # A new dict per instance when none is given; a supplied dict is
        # stored as-is (not copied).
        self.metadata = {} if metadata is None else metadata

# NOTE: doc_stats and all_chunks from the original snippet are not defined
# or used anywhere in this notebook. google_drive_path is an optional
# parameter of parse_markdown_for_metadata, and filename_base is derived
# per file inside that function; no module-level placeholders are needed.

# Disabled earlier version of parse_markdown_for_metadata, kept only for
# reference. It is wrapped in a bare triple-quoted string, so it is never
# executed and defines nothing. This variant created ONE un-chunked Document
# per markdown file; the active `def parse_markdown_for_metadata` that follows
# this string splits each file into chunks instead.
'''def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """
    Reads markdown files in a directory (and subdirectories) and creates a 
    single Document for each file, adding relevant metadata, but does not chunk.
    """
    markdown_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)
    if not markdown_files:
        print("No markdown files found")
        return []

    print(f"Processing {len(markdown_files)} markdown files...")
    
    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except Exception as e:
            print(f"\nWarning: Could not read file {filepath}: {e}")
            continue

        # Extract basic file name for metadata
        filename_base = os.path.basename(filepath)

        # Create a single Document for the entire file content
        doc = Document(page_content=markdown_text)

        # Add metadata
        doc.metadata["source"] = filename_base
        # Use google_drive_path if provided, otherwise use local path
        doc.metadata["source_path"] = google_drive_path or filepath 
        
        # NOTE: Since we are not chunking by headers, we won't have a specific header.
        # We set it to an empty string or a placeholder.
        doc.metadata["header"] = "" 
        
        # Since the entire file is one document, these values reflect that.
        doc.metadata["chunk_index"] = 0
        doc.metadata["total_chunks"] = 1
        doc.metadata["is_complete_doc"] = True
        
        all_documents.append(doc)

    print(f"\nSuccessfully processed {len(all_documents)} files into documents.")
    return all_documents'''

def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """Split every markdown file under ``directory`` into metadata-tagged chunks.

    Files matching ``**/*.md`` are collected recursively. Each file is split
    first on ``#`` / ``##`` / ``###`` headers and then into character chunks
    (chunk_size=2000, chunk_overlap=300). Every resulting chunk becomes a
    ``Document`` whose metadata holds the splitter's header entries plus:

    * ``source``      - file name including extension
    * ``doc_name``    - file name without extension, underscores replaced by spaces
    * ``source_path`` - ``google_drive_path`` if given (truthy), else the local path
    * ``chunk_index`` - position of the chunk within its file, starting at 0
    * ``header_path`` - header values joined with " > "

    Args:
        directory: Root folder to search for markdown files.
        google_drive_path: Optional link stored as ``source_path`` for every
            chunk of every file, replacing the local file path.

    Returns:
        A flat list of ``Document`` chunks in sorted file-path order, or an
        empty list when no markdown file is found. Files that cannot be read
        are reported with ``print`` and skipped.
    """
    # glob returns matches in a filesystem-dependent order. create_collection
    # in this notebook derives document IDs from list position, so sort to
    # keep chunk order (and therefore IDs) reproducible between runs.
    markdown_files = sorted(glob.glob(os.path.join(directory, '**/*.md'), recursive=True))
    if not markdown_files:
        return []

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except Exception as e:
            # Best effort: report which file failed and carry on with the rest.
            print(f"Error reading {filepath}: {e}")
            continue

        filename_base = os.path.basename(filepath)

        # File name without the extension, underscores turned into spaces so
        # the name is searchable, e.g. "fhs.hr en course masthe b".
        name_no_ext = os.path.splitext(filename_base)[0]
        doc_name_clean = name_no_ext.replace("_", " ")

        md_header_splits = markdown_splitter.split_text(markdown_text)
        final_splits = text_splitter.split_documents(md_header_splits)

        for i, split in enumerate(final_splits):
            # Build the header path from the splitter's own metadata before
            # any of our keys are added, and work on a copy so the splitter's
            # dict is not mutated.
            header_context = " > ".join([v for k, v in split.metadata.items() if "Header" in k])

            metadata = dict(split.metadata)
            metadata["source"] = filename_base  # original file name
            metadata["doc_name"] = doc_name_clean  # searchable clean name
            metadata["source_path"] = google_drive_path or filepath
            metadata["chunk_index"] = i
            metadata["header_path"] = header_context

            all_documents.append(Document(page_content=split.page_content, metadata=metadata))

    return all_documents

In [None]:
def create_collection(chroma_client, gemini_embedding_function, documents_list):
    """
    Get or create the "hrstud-bot-en" collection and fill it if it is empty.

    The embedding function is switched to document mode, then the collection
    is fetched (or created). Documents are added in batches of 100 only when
    the collection currently holds nothing; a non-empty collection is left
    untouched and just reported.

    Args:
        chroma_client: Client providing ``get_or_create_collection``.
        gemini_embedding_function: Embedding function attached to the
            collection; its ``document_mode`` attribute is set to True.
        documents_list: Items exposing ``page_content`` and ``metadata``.
            IDs are derived from each item's position in this list.
    """
    DB_NAME = "hrstud-bot-en"
    BATCH_SIZE = 100

    gemini_embedding_function.document_mode = True

    collection = chroma_client.get_or_create_collection(
        name=DB_NAME,
        metadata={"model": "models/text-embedding-004", "dimension": 768},
        embedding_function=gemini_embedding_function,
    )

    texts = [item.page_content for item in documents_list]
    metas = [item.metadata for item in documents_list]
    chunk_ids = [f"{DB_NAME}_doc_{position}" for position in range(len(texts))]

    # Anything already stored means ingestion is skipped entirely.
    if collection.count() != 0:
        print(f"Collection '{DB_NAME}' already has {collection.count()} documents.")
        return

    print(f"Adding {len(texts)} documents to ChromaDB collection: {DB_NAME}")

    batch_starts = range(0, len(texts), BATCH_SIZE)
    for start in tqdm(batch_starts, desc="Adding documents", unit="batch"):
        # Slicing clips at the end of the list, so the last batch may be short.
        window = slice(start, start + BATCH_SIZE)
        collection.add(
            documents=texts[window],
            metadatas=metas[window],
            ids=chunk_ids[window],
        )
        # Short pause between batches for API stability.
        time.sleep(0.2)

    print(f"\nCollection '{DB_NAME}' now contains {collection.count()} documents.")

In [None]:
def persistent_client(embed_fn):
    """
    Open the on-disk ChromaDB store and fetch the existing bot collection.

    A ``chromadb.PersistentClient`` is opened on "./output_en" and the
    "hrstud-bot-en" collection is looked up with ``embed_fn`` attached.
    The collection's name, document count and metadata are printed.

    Args:
        embed_fn: Embedding function to attach to the collection.

    Returns:
        Tuple ``(embed_fn, collection)`` - the same embedding function that
        was passed in, and the collection handle.
    """
    DB_NAME = "hrstud-bot-en"

    store = chromadb.PersistentClient(path="./output_en")
    # NOTE(review): get_collection (not get_or_create) is used, so this
    # presumably fails if the collection was never created - confirm.
    bot_collection = store.get_collection(DB_NAME, embedding_function=embed_fn)

    print(f"Connected to collection: {bot_collection.name}")
    print(f"Documents: {bot_collection.count()}")
    print(f"Metadata: {bot_collection.metadata}")

    return embed_fn, bot_collection

In [None]:
def get_article_en(user_query, embed_fn, collection, client):
    """Answer a question in English from the best-matching chunks of the collection.

    The embedding function is switched to query mode, the 7 nearest chunks
    are fetched from ``collection``, a prompt containing those chunks and
    their source links is built, and "gemini-2.5-flash" is asked for one
    unified answer.

    Args:
        user_query: The user's question. Newlines are replaced by spaces in
            the prompt; the unmodified text is used for retrieval.
        embed_fn: Embedding function attached to ``collection``. Its
            ``document_mode`` attribute is set to False and left that way.
        collection: Object providing ``query(query_texts=..., n_results=...)``
            returning a mapping with "documents" and "metadatas" lists.
        client: Object providing ``models.generate_content``.

    Returns:
        The model's answer with surrounding whitespace stripped. If retrieval
        returns no passages, or the model response carries no text, a polite
        fallback message is returned instead of raising.
    """
    no_context_message = (
        "I could not find an answer to your question in the knowledge base. "
        "Please contact the appropriate office of the Faculty of Croatian Studies."
    )
    no_generation_message = (
        "I was unable to generate an answer to your question at this time. "
        "Please try again or contact the appropriate office of the Faculty of Croatian Studies."
    )

    # Query mode: the embedding function in this notebook selects the
    # "retrieval_query" task type while document_mode is False.
    embed_fn.document_mode = False
    n_results_to_fetch = 7
    result = collection.query(query_texts=[user_query], n_results=n_results_to_fetch)

    all_passages = result["documents"][0] or []
    # A chunk stored without metadata may come back as None; normalise to {}.
    all_metadatas = [metadata or {} for metadata in (result["metadatas"][0] or [])]

    query_oneline = user_query.replace("\n", " ")
    print(query_oneline)

    # Nothing retrieved (e.g. empty collection): there is no context to answer
    # from, so skip the model call rather than failing on all_metadatas[0].
    if not all_passages:
        return no_context_message

    # Extract the main source link (from the first/most relevant result)
    if all_metadatas:
        main_source_link = all_metadatas[0].get("source_path", "Link not available")
    else:
        main_source_link = "Link not available"

    # Construct context
    context_list = []
    source_links = []  # Collect all unique source links, in ranking order

    for i, (passage, metadata) in enumerate(zip(all_passages, all_metadatas)):
        source_name = metadata.get("source", "Unknown source")
        source_path = metadata.get("source_path", "")

        # Collect unique sources for the bottom reference
        if source_path and source_path not in source_links:
            source_links.append(source_path)

        context_list.append(f"--- Source: {source_name} (Part {i+1} of {len(all_passages)}) ---\n{passage.strip()}")

    context = "\n\n".join(context_list)

    # Format sources for the bottom of the answer
    sources_text = "\n".join([f"- {link}" for link in source_links])

    prompt = f"""
    You are a kind, precise, and informative chatbot of the **Faculty of Croatian Studies**. 
    Your main task is to answer questions from students, prospective students, and staff about the faculty, 
    including information about study programs, courses, departments, admissions, and general school information.

    **CRITICAL RULES:**
    1.  Use ONLY the information provided in the supplied documentation.
    2.  Respond **in English**.
    3.  Be concise but complete — **synthesize all relevant details from ALL context sources into ONE cohesive answer**.
    4.  If the documentation does not contain the answer, clearly and politely state that you cannot find the answer 
        in the knowledge base and direct the user to contact the appropriate office.
    5.  Note if some classes are not offered in English.
    6.  **Do not use phrases like "Of course, I can help you!" or "Here is some information about...". 
        Start directly with the relevant answer.**
    7.  **IMPORTANT: Provide ONE unified answer, not multiple separate responses for each source.**

    **ANSWER FORMATTING:**
    * If there is ONE main source, start with: **Source: [Main Source Link]** followed by a blank line.
    * If there are MULTIPLE sources, provide the answer first, then at the end add a "**Sources:**" section listing all source links.
    * Use bold text for key terms (e.g., **Admissions**, **Philosophy**, **Head of Department**).
    * **When listing courses taught by a professor, organize them by level:**
    - **Undergraduate Courses:**
    - **Graduate Courses:**
    - **Doctoral Courses:**
    * For each course, include relevant details like ECTS credits, course hours, and language availability.
    * Responses should be professional and formal, yet polite in tone.
    * **Combine information from all sources into a single, coherent response.**
    * **DO NOT repeat "The source link is..." multiple times. Use it ONCE at the top if there's one main source, OR list all sources at the bottom.**

    **AVAILABLE DOCUMENTATION (Context):**
    {context}

    **USER QUESTION:** {query_oneline}

    **INSTRUCTIONS FOR SOURCE CITATION:**
    Main source link: {main_source_link}
    All source links: {sources_text}

    If the answer comes primarily from ONE source, start with "**Source:** {main_source_link}".
    If the answer uses MULTIPLE sources, end your response with:

    **Sources:**
    {sources_text}

    **ANSWER (provide ONE unified response with courses organized by academic level):**
    """

    answer = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config={
            "max_output_tokens": 2048,
            "temperature": 0.2,
            "top_p": 0.9
        }
    )

    # The response's text can be None when no text part was produced;
    # calling .strip() on it would raise AttributeError.
    answer_text = answer.text
    if answer_text is None:
        return no_generation_message

    return answer_text.strip()

In [None]:
# INGESTION: parse the markdown files and build the ChromaDB collection.
# These statements are live code and run every time this cell is executed.

markdown_folder = "./markdown_en"
# Folder above is searched recursively for *.md files.
# STEP 1: Parse and chunk documents (run once or when documents change)
md_documents = parse_markdown_for_metadata(markdown_folder)
# STEP 2: Create collection and add documents (run once).
# create_collection only adds documents while the collection is empty, so
# re-running this against an already filled "./output_en" store adds nothing.
client = import_google_api()
gemini_embedding_function = embedding_function(client)
chroma_persistent_client = chromadb.PersistentClient(path="./output_en")
create_collection(chroma_persistent_client, gemini_embedding_function, md_documents)

In [None]:
# STEP 3: Query the system (run for each query)
# A new API client and embedding function are created here, so this cell does
# not rely on the objects created by the ingestion cell.
client = import_google_api()
gemini_embedding_function = embedding_function(client)
embed_fn, collection = persistent_client(gemini_embedding_function)
# embed_fn, collection and client are reused by the test-query cells below.
user_query = "Who is Sandro Skansi?"  # Example query
response = get_article_en(
    user_query=user_query,
    embed_fn=embed_fn,
    collection=collection,
    client=client,
)
# Render the answer as Markdown so bold text and lists display properly.
display(Markdown(response))

In [None]:
# ADVANCED: Test multiple queries
# Requires embed_fn, collection and client from the STEP 3 cell.
test_queries = [
    "What classes does Marko Jerković teach",
    "What classes does Sandro Skansi teach?",
    "Who teaches Medieval European History?",
    "What History Undergraduate classes are offered in English?",
    "What is the number of Snježana Konovski?" # either there is no data or too little data on the English page

]
# Print a banner with each query, then render its answer as Markdown.
for query in test_queries:
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_en(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client,
    )
    display(Markdown(response))
    print("\n")

In [None]:
# ADVANCED: Test multiple queries (two-question subset)
# NOTE(review): this cell repeats the loop of the previous test cell with a
# shorter list and rebinds test_queries; consider a shared helper function.
# Requires embed_fn, collection and client from the STEP 3 cell.
test_queries = [
    "What History Undergraduate classes are offered in English?",
    "What is the number of Snježana Konovski?"

]
# Print a banner with each query, then render its answer as Markdown.
for query in test_queries:
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_en(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client,
    )
    display(Markdown(response))
    print("\n")