In [1]:
import sys
import time
from google import genai
from google.genai import types
from IPython.display import Markdown
from IPython.display import display
from dotenv import load_dotenv
import os
import glob
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from langcodes import Language
from collections import defaultdict
from tqdm import tqdm
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import re
from collections import defaultdict
from typing import List, Tuple
import fasttext
import re
from typing import Any, Dict, List, Tuple

In [2]:
def import_google_api():
    """Load environment variables and build a Gemini API client.

    Reads ``GOOGLE_API_KEY`` from the process environment (after loading any
    ``.env`` file via ``load_dotenv``) and passes it to ``genai.Client``.
    As a diagnostic side effect, prints the name of every model that lists
    ``embedContent`` among its supported actions.

    Returns:
        The constructed ``genai.Client`` instance.
    """
    load_dotenv()
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Diagnostic listing of embedding-capable models.
    # `supported_actions` is optional on the model object, so treat a missing
    # value as "no actions" instead of failing on `in None`.
    for m in client.models.list():
        if "embedContent" in (m.supported_actions or ()):
            print(m.name)

    return client

In [3]:
def embedding_function(client):
    """Build a ChromaDB embedding function that delegates to Gemini.

    The returned object embeds with ``models/text-embedding-004``. Its
    ``document_mode`` flag selects the task type: ``retrieval_document`` when
    true (indexing), ``retrieval_query`` when false (searching).

    Args:
        client: Gemini client exposing ``models.embed_content``.

    Returns:
        A ``GeminiEmbeddingFunction`` instance bound to ``client``.
    """

    def _is_transient(exc):
        # Only rate-limit (429) and service-unavailable (503) API errors are retried.
        return isinstance(exc, genai.errors.APIError) and exc.code in {429, 503}

    class GeminiEmbeddingFunction(EmbeddingFunction):
        # Class-level default; callers flip it per instance before querying.
        document_mode = True

        def __init__(self, client):
            self.client = client
            self._retry = retry.Retry(predicate=_is_transient)

        def __call__(self, input: Documents) -> Embeddings:
            if self.document_mode:
                embedding_task = "retrieval_document"
            else:
                embedding_task = "retrieval_query"

            embed_with_retry = self._retry(self.client.models.embed_content)
            response = embed_with_retry(
                model="models/text-embedding-004",
                contents=input,
                config=types.EmbedContentConfig(task_type=embedding_task),
            )
            return [embedding.values for embedding in response.embeddings]

    return GeminiEmbeddingFunction(client)

In [4]:
# Minimal local container with `page_content` and `metadata` attributes.
# NOTE: this definition shadows the `Document` imported from
# `langchain_core.documents` at the top of the notebook.
class Document:
    """Pair a chunk of text with a metadata dictionary."""

    def __init__(self, page_content: str, metadata: dict = None):
        # A fresh dict is created per instance when no metadata is supplied,
        # so instances never share a default mapping.
        self.metadata = {} if metadata is None else metadata
        self.page_content = page_content

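# NOTE: doc_stats, filename_base, google_drive_path and all_chunks from the
# original snippet are not defined at module level in this notebook.
# The triple-quoted strings that follow are earlier drafts of
# parse_markdown_for_metadata kept for reference; they are plain string
# literals and are never executed. Only the final, unquoted definition is active.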

'''def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """
    Reads markdown files in a directory (and subdirectories) and creates a 
    single Document for each file, adding relevant metadata, but does not chunk.
    """
    markdown_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)
    if not markdown_files:
        print("No markdown files found")
        return []

    print(f"Processing {len(markdown_files)} markdown files...")
    
    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except Exception as e:
            print(f"\nWarning: Could not read file {filepath}: {e}")
            continue

        # Extract basic file name for metadata
        filename_base = os.path.basename(filepath)

        # Create a single Document for the entire file content
        doc = Document(page_content=markdown_text)

        # Add metadata
        doc.metadata["source"] = filename_base
        # Use google_drive_path if provided, otherwise use local path
        doc.metadata["source_path"] = google_drive_path or filepath 
        
        # NOTE: Since we are not chunking by headers, we won't have a specific header.
        # We set it to an empty string or a placeholder.
        doc.metadata["header"] = "" 
        
        # Since the entire file is one document, these values reflect that.
        doc.metadata["chunk_index"] = 0
        doc.metadata["total_chunks"] = 1
        doc.metadata["is_complete_doc"] = True
        
        all_documents.append(doc)

    print(f"\nSuccessfully processed {len(all_documents)} files into documents.")
    return all_documents'''
'''def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    markdown_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)
    if not markdown_files:
        return []

    # 1. Definiraj zaglavlja koja želiš pratiti
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    # Inicijaliziraj splittere
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    
    # Sekundarni splitter koji pazi na veličinu (da chunk ne bude prevelik za embedding)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, 
        chunk_overlap=100
    )
    
    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except Exception as e:
            print(f"Error: {e}")
            continue

        # 2. Prvo splitaj po Markdown naslovima
        md_header_splits = markdown_splitter.split_text(markdown_text)

        # 3. Dodatno splitaj ako su sekcije preduge i dodaj metapodatke
        final_splits = text_splitter.split_documents(md_header_splits)

        for i, split in enumerate(final_splits):
            filename_base = os.path.basename(filepath)
            
            # Kreiraj novi Document objekt (kompatibilan s tvojim ChromaDB setupom)
            doc = Document(page_content=split.page_content)
            
            # Kopiraj metapodatke iz splittera (tu su sada Header 1, Header 2 itd.)
            doc.metadata = split.metadata
            doc.metadata["source"] = filename_base
            doc.metadata["source_path"] = google_drive_path or filepath
            doc.metadata["chunk_index"] = i
            
            # Spoji naslove u jedan string radi lakše pretrage (opcionalno)
            header_context = " > ".join([v for k, v in split.metadata.items() if "Header" in k])
            doc.metadata["header_path"] = header_context
            
            all_documents.append(doc)

    return all_documents'''
'''def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    markdown_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)
    if not markdown_files:
        return []

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    
    # CHANGED: Increased chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,    # Changed from 800
        chunk_overlap=300   # Changed from 100
    )
    
    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except Exception as e:
            print(f"Error: {e}")
            continue

        md_header_splits = markdown_splitter.split_text(markdown_text)
        final_splits = text_splitter.split_documents(md_header_splits)

        for i, split in enumerate(final_splits):
            filename_base = os.path.basename(filepath)
            
            doc = Document(page_content=split.page_content)
            
            doc.metadata = split.metadata
            doc.metadata["source"] = filename_base
            doc.metadata["source_path"] = google_drive_path or filepath
            doc.metadata["chunk_index"] = i
            
            header_context = " > ".join([v for k, v in split.metadata.items() if "Header" in k])
            doc.metadata["header_path"] = header_context
            
            all_documents.append(doc)

    return all_documents'''
def parse_markdown_for_metadata(directory: str, google_drive_path: str = None) -> List[Document]:
    """Split every Markdown file under ``directory`` into enriched chunks.

    Each file is first split on ``#``/``##``/``###`` headers and then into
    character chunks (size 2000, overlap 300). Every chunk's text is prefixed
    with the cleaned document name and its header path so that both are part
    of the embedded content.

    Args:
        directory: Root folder searched recursively for ``*.md`` files.
        google_drive_path: Optional value stored as ``source_path`` for every
            chunk; when falsy, the local file path is stored instead.

    Returns:
        A list of ``Document`` objects (empty when no Markdown files exist).
        Metadata holds the splitter's header keys plus ``source``,
        ``article_link``, ``doc_name``, ``source_path``, ``chunk_index`` and
        ``header_path``. Note that ``article_link`` is the file name without
        extension and without ``fhs.hr_`` — it is an identifier, not a URL.
    """
    # glob returns paths in arbitrary order; sorting keeps the chunk order
    # (and the positional IDs create_collection derives from it) reproducible.
    markdown_files = sorted(glob.glob(os.path.join(directory, '**/*.md'), recursive=True))
    if not markdown_files:
        return []

    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

    all_documents = []

    for filepath in tqdm(markdown_files, desc="Processing documents"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                markdown_text = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report which file failed and keep processing the rest.
            print(f"Error reading {filepath}: {e}")
            continue

        filename_base = os.path.basename(filepath)
        name_no_ext = os.path.splitext(filename_base)[0]

        # Strip domain and underscores for cleaner metadata
        clean_id = name_no_ext.replace("fhs.hr_", "")
        doc_name_clean = clean_id.replace("_", " ")

        md_header_splits = markdown_splitter.split_text(markdown_text)
        final_splits = text_splitter.split_documents(md_header_splits)

        for i, split in enumerate(final_splits):
            # Header path is built from the splitter's "Header N" keys only,
            # before any of our own keys are added.
            header_context = " > ".join([v for k, v in split.metadata.items() if "Header" in k])

            # Prepend document name and section so they are indexed by the
            # vector search together with the chunk text.
            searchable_content = f"Dokument: {doc_name_clean}\nSekcija: {header_context if header_context else 'Opće'}\n\n{split.page_content}"

            # Copy the splitter metadata so the split object is not mutated.
            metadata = dict(split.metadata)
            metadata["source"] = filename_base
            metadata["article_link"] = clean_id
            metadata["doc_name"] = doc_name_clean
            metadata["source_path"] = google_drive_path or filepath
            metadata["chunk_index"] = i
            metadata["header_path"] = header_context

            all_documents.append(Document(page_content=searchable_content, metadata=metadata))

    return all_documents

In [5]:
def create_collection(chroma_client, gemini_embedding_function, documents_list):
    """Populate the ``hrstud-bot-hr`` ChromaDB collection if it is empty.

    The collection is fetched or created with the given embedding function
    (switched to document mode). When it already holds documents nothing is
    added; otherwise the documents are inserted in batches of 100 with a
    short pause between batches.

    Args:
        chroma_client: Client exposing ``get_or_create_collection``.
        gemini_embedding_function: Embedding function with a ``document_mode`` flag.
        documents_list: Objects exposing ``page_content`` and ``metadata``.

    Returns:
        None. Progress and final counts are printed.
    """
    DB_NAME = "hrstud-bot-hr"
    embed_fn = gemini_embedding_function
    embed_fn.document_mode = True

    db = chroma_client.get_or_create_collection(
        name=DB_NAME,
        metadata={"model": "models/text-embedding-004", "dimension": 768},
        embedding_function=embed_fn
    )

    documents = [entry.page_content for entry in documents_list]
    metadatas = [entry.metadata for entry in documents_list]
    # IDs are purely positional: "<collection>_doc_<index>".
    ids = [f"{DB_NAME}_doc_{position}" for position, _ in enumerate(documents)]

    # Guard clause: an already-populated collection is left untouched.
    if db.count() != 0:
        print(f"Collection '{DB_NAME}' already has {db.count()} documents.")
        return

    print(f"Adding {len(documents)} documents to ChromaDB collection: {DB_NAME}")

    BATCH_SIZE = 100
    batch_starts = range(0, len(documents), BATCH_SIZE)

    for start in tqdm(batch_starts, desc="Adding documents", unit="batch"):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        stop = start + BATCH_SIZE
        db.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )
        # Brief pause between batches to go easy on the embedding API.
        time.sleep(0.2)

    print(f"\nCollection '{DB_NAME}' now contains {db.count()} documents.")

In [6]:
def persistent_client(embed_fn):
    """Open the existing ``hrstud-bot-hr`` collection from ``./output_hr``.

    Prints the collection's name, document count and metadata.

    Args:
        embed_fn: Embedding function attached to the collection handle.

    Returns:
        A ``(embed_fn, collection)`` tuple; ``embed_fn`` is returned unchanged.
    """
    DB_NAME = "hrstud-bot-hr"
    persist_dir = "./output_hr"

    # get_collection (unlike get_or_create) expects the collection to exist already.
    collection = chromadb.PersistentClient(path=persist_dir).get_collection(
        DB_NAME, embedding_function=embed_fn
    )

    print(f"Connected to collection: {collection.name}")
    print(f"Documents: {collection.count()}")
    print(f"Metadata: {collection.metadata}")
    return embed_fn, collection

In [27]:
# Helper returning the canned "no answer" reply.
def _no_answer_response():
    """Return the standard Croatian message for an unanswerable query."""
    apology = "Ispričavamo se, ali ne mogu pronaći relevantan odgovor u bazi znanja."
    referral = "Molimo kontaktirajte odgovarajuću službu za dodatne informacije."
    return f"{apology} {referral}"

# Load a fastText model from a path relative to the working directory; this
# runs (and can fail) as soon as the cell executes.
# Presumably this is the fastText language-identification model — TODO confirm.
# NOTE(review): LID_MODEL is not referenced by any active code visible in this
# notebook — confirm whether it is still needed.
LID_MODEL = fasttext.load_model('./fasttext/lid.176.ftz')

'''def get_article_hr(user_query, embed_fn, collection, client):
    
    # Switch to query mode when generating embeddings
    embed_fn.document_mode = False

    # Retrieve top 1 document (based on your n_results=1 in the original code)
    # The result structure is a dict: {'ids': [[]], 'distances': [[]], 'documents': [[]], 'metadatas': [[]], ...}
    n_results_to_fetch = 30 # Fetch more results for a richer context
    result = collection.query(query_texts=[user_query], n_results=n_results_to_fetch)
    
    # Extract documents (list of passages) and metadatas (list of dicts)
    all_passages = result["documents"][0]
    all_metadatas = result["metadatas"][0]

    query_oneline = user_query.replace("\n", " ")
    print(query_oneline)
    
    # 1. CONSTRUCT THE CONTEXT
    context_list = []
    # Use the metadata from the top result to define the main source link
    # Assuming 'source_path' contains the URL or relevant file path
    #document_link = all_metadatas[0].get("source_path", "Link nije dostupan")
    
    for i, (passage, metadata) in enumerate(zip(all_passages, all_metadatas)):
        # Format the context for the model
        source_name = metadata.get("source", "Nepoznat izvor")
        # I removed the redundant "PASSAGE: " wrapper that was causing issues
        context_list.append(f"--- Izvor: {source_name} (Dio {i+1} od {len(all_passages)}) ---\n{passage.strip()}")

    # Join all context chunks into a single string
    context = "\n\n".join(context_list)
    
    # 2. CONSTRUCT THE PROMPT
    # The document_link is now a defined variable
    prompt = f"""
    Ti si ljubazan, precizan i informativan chatbot **Fakulteta Hrvatskih studija**. Tvoja je glavna zadaća odgovarati na pitanja studenata, potencijalnih studenata i osoblja o fakultetu, uključujući informacije o studijima, nastavi, smjerovima, prijavama, i općenitim informacijama o školi.

    **KRITIČNA PRAVILA:**
    1.  Koristi ISKLJUČIVO informacije iz dostavljene dokumentacije.
    2.  Odgovaraj na **Hrvatskom jeziku**.
    3.  Budi koncizan ali potpun — navedi sve relevantne detalje iz konteksta.
    4.  Ako dokumentacija ne sadrži odgovor, jasno i ljubazno reci da ne možeš pronaći odgovor u bazi znanja i uputi na kontaktiranje odgovarajuće službe.
    5.  **Ne smiješ koristiti fraze poput "Naravno, mogu vam pomoći!" ili "Evo nekoliko informacija o...". Odmah započni s relevantnim odgovorom.**

    **FORMATIRANJE ODGOVORA:**
    * Sve odgovore započni s **Izvorni link je [LINK](url)**, nakon čega slijedi prazan red.
    * Nemoj navoditi izvorni link dokumenta samo URL. npr nemoj navoditi: Izvorni link: ./markdown/fhs.hr_predmet_opsv.md
    * Koristi podebljani tekst za ključne pojmove (npr. **Upisi**, **Filozofija**, **Pročelnik**).
    * Koristi popise (liste) za nabrajanje informacija (studiji, uvjeti, rokovi).
    * Odgovori trebaju biti profesionalni i službeni, ali s ljubaznim tonom.

    **DOSTUPNA DOKUMENTACIJA (Kontekst):**
    {context}

    **KORISNIČKO PITANJE:** {query_oneline}

    **ODGOVOR:**
    """
    
    # 3. Call the model
    answer = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt, # Use the full prompt
        config={
            "max_output_tokens": 2048,
            "temperature": 0.2,
            "top_p": 0.9
        }
    )
    
    # Prepend the link as per your strict instruction, since Gemini might not format the first line perfectly
    #final_response = f"Izvorni link: {document_link}\n\n{answer.text.strip()}"
    
    #return final_response

    return answer.text.strip()'''

'''def get_article_hr(user_query, embed_fn, collection, client):
    
    # Switch to query mode when generating embeddings
    embed_fn.document_mode = False

    # Retrieve top N documents for richer context
    n_results_to_fetch = 10
    result = collection.query(query_texts=[user_query], n_results=n_results_to_fetch)
    
    # Extract documents (list of passages) and metadatas (list of dicts)
    all_passages = result["documents"][0]
    all_metadatas = result["metadatas"][0]
    all_distances = result["distances"][0]

    query_oneline = user_query.replace("\n", " ")
    print(f"Query: {query_oneline}")
    print(f"Top 5 results (distances): {all_distances[:5]}")
    
    # 1. CONSTRUCT THE CONTEXT with source tracking
    context_list = []
    sources_used = set()  # Track unique sources
    
    for i, (passage, metadata, distance) in enumerate(zip(all_passages, all_metadatas, all_distances)):
        source_name = metadata.get("source", "Nepoznat izvor")
        source_path = metadata.get("source_path", "")
        header_path = metadata.get("header_path", "")
        
        # Track sources for final reference
        sources_used.add((source_name, source_path))
        
        # Format context with all relevant metadata
        context_entry = f"""--- Izvor {i+1}: {source_name} ---
Sekcija: {header_path if header_path else "Opće informacije"}
Relevantnost: {distance:.3f}

{passage.strip()}"""
        
        context_list.append(context_entry)

    # Join all context chunks
    context = "\n\n".join(context_list)
    
    # Format sources for the model to reference
    sources_formatted = "\n".join([f"- {name}: {path}" for name, path in sorted(sources_used)])
    
    # 2. CONSTRUCT THE PROMPT
    prompt = f"""Ti si ljubazan, precizan i informativan chatbot **Fakulteta Hrvatskih studija**. Tvoja je glavna zadaća odgovarati na pitanja studenata, potencijalnih studenata i osoblja o fakultetu.

**KRITIČNA PRAVILA:**
1. Koristi ISKLJUČIVO informacije iz dostavljene dokumentacije.
2. Odgovaraj na **Hrvatskom jeziku**.
3. Budi koncizan ali potpun — navedi sve relevantne detalje iz konteksta.
4. Ako dokumentacija ne sadrži odgovor, jasno reci da ne možeš pronaći odgovor u bazi znanja.
5. **Ne koristi fraze poput "Naravno, mogu vam pomoći!" - odmah započni s relevantnim odgovorom.**

**FORMATIRANJE ODGOVORA:**
* Na POČETKU odgovora, u prvoj liniji, navedi izvor(e) u formatu: **Izvor:** [naziv dokumenta]
* Nakon izvora ostavi prazan red, pa nastavi s odgovorom.
* Koristi podebljani tekst za ključne pojmove (npr. **ZET linija 215**, **Kampus Borongaj**).
* Koristi popise za nabrajanje informacija.
* Ton: profesionalan i ljubazan.

**DOSTUPNI IZVORI:**
{sources_formatted}

**KONTEKST (30 najrelevantnijih dijelova dokumentacije):**
{context}

**KORISNIČKO PITANJE:** {query_oneline}

**TVOJ ODGOVOR (započni s **Izvor:** na prvoj liniji):**"""
    
    # 3. Call the model
    answer = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config={
            "max_output_tokens": 2048,
            "temperature": 0.2,
            "top_p": 0.9
        }
    )
    
    return answer.text.strip()'''
import re
from typing import List

def get_article_hr(user_query, embed_fn, collection, client):
    """Answer a Croatian user question from the indexed faculty documents.

    Steps: switch the embedding function to query mode, expand the query with
    topic keywords, run a vector search, keep hits with distance below 0.90
    (de-duplicated on their first 150 characters), build a prompt from the
    kept passages and ask Gemini for the answer.

    Args:
        user_query: The user's question.
        embed_fn: Embedding function; its ``document_mode`` is set to False.
        collection: Collection exposing ``query(query_texts=..., n_results=...)``.
        client: Gemini client exposing ``models.generate_content``.

    Returns:
        The stripped model answer, or a fixed Croatian "no information found"
        message when no passage passes the distance filter or the model
        returns no text.
    """
    no_info_message = "Nažalost, ne mogu pronaći informacije o vašem upitu u bazi znanja."
    default_url = "https://www.fhs.hr"

    # 1. PREPARATION AND QUERY EXPANSION
    embed_fn.document_mode = False
    query_lower = user_query.lower()

    # Defaults used when no topic keyword matches
    expanded_query = user_query
    n_results_to_fetch = 12

    # Teachers and courses
    if any(word in query_lower for word in ["predaje", "tko", "nastavnik", "profesor", "kolegij"]):
        expanded_query = f"{user_query} profesor nositelj zvanje nastava kolegij studij"

    # Location / directions
    elif any(word in query_lower for word in ["doći", "lokacija", "gdje", "kampus", "borongaj", "autobus", "vlak"]):
        expanded_query = f"{user_query} lokacija adresa kampus borongaj autobus 215 236 vlak stanica Trnava"

    # Study programmes: fetch 20 results because programmes are spread across
    # many files.
    # NOTE(review): this is a separate `if`, so it replaces any expansion
    # chosen above when both match — confirm that is intended.
    if any(word in query_lower for word in ["studij", "nudite", "program", "upisi", "smjer"]):
        expanded_query = f"{user_query} popis svih studija prijediplomski diplomski doktorski studij kroatologija povijest sociologija psihologija komunikologija filozofija"
        n_results_to_fetch = 20

    # 2. VECTOR SEARCH with the expanded query and chosen result count
    result = collection.query(
        query_texts=[expanded_query],
        n_results=n_results_to_fetch
    )

    all_passages = result["documents"][0]
    all_metadatas = result["metadatas"][0]
    all_distances = result["distances"][0]

    # 3. DEDUPLICATION AND CONTEXT ASSEMBLY
    context_list = []
    seen_passages = set()
    main_url = None

    for p, m, d in zip(all_passages, all_metadatas, all_distances):
        # Fairly loose threshold (0.90) so relevant departments are not missed
        if not (d < 0.90):
            continue

        fingerprint = p.strip()[:150]
        if fingerprint in seen_passages:
            continue
        seen_passages.add(fingerprint)

        # A hit may carry no metadata at all; treat that as an empty mapping.
        chunk_url = (m or {}).get("article_link", "")

        # 4. MAIN LINK: taken from the best hit that actually made it into the
        # context, not from a hit the distance filter rejected. An empty link
        # falls back to the faculty homepage.
        if main_url is None:
            main_url = chunk_url or default_url

        context_list.append(f"Izvor URL: {chunk_url}\n{p.strip()}")

    if not context_list:
        return no_info_message

    context = "\n\n".join(context_list)
    query_oneline = user_query.replace("\n", " ")
    article_link_markdown = f"[Article Link]({main_url})"

    # 5. PROMPT FOR GEMINI
    prompt = f"""
    Ti si ljubazan, precizan i informativan chatbot **Fakulteta Hrvatskih studija**. Tvoja je zadaća odgovarati na pitanja o fakultetu.

    **KRITIČNA PRAVILA:**
    1. Koristi ISKLJUČIVO dostavljenu dokumentaciju (KONTEKST). Ako se informacija o studiju nalazi u kontekstu, navedi je.
    2. Odgovaraj na **Hrvatskom jeziku**.
    3. **GRUPIRANJE:** Ako ista osoba predaje više kolegija, navedi ime osobe SAMO JEDNOM. Grupiraj studije po razinama (Prijediplomski, Diplomski, Doktorski).
    4. **POVEZNICE:** Kolegije, emailove i studije prikaži kao Markdown poveznice (npr. [Naziv](URL)) koristeći URL-ove iz konteksta.
    5. **BEZ UVODA:** Odmah započni s relevantnim odgovorom.

    **FORMATIRANJE ODGOVORA:**
    * Prva linija odgovora: **Izvor:** {article_link_markdown}
    * Nakon toga prazan red.
    * Koristi **podebljani tekst** za ključne pojmove.
    * Koristi liste (bullet points).

    **DOSTUPNA DOKUMENTACIJA (Kontekst):**
    {context}

    **KORISNIČKO PITANJE:** {query_oneline}

    **ODGOVOR:**
    """

    # 6. ANSWER GENERATION
    answer = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=prompt,
        config={
            "max_output_tokens": 2048,
            "temperature": 0.1,
            "top_p": 0.9
        }
    )

    # The response text can be None (no text parts returned); do not call
    # .strip() on it blindly.
    answer_text = answer.text
    if not answer_text or not answer_text.strip():
        return no_info_message

    return answer_text.strip()



In [30]:
# USAGE EXAMPLE: build the index. This cell is live code (nothing to uncomment).

markdown_folder = "./markdown_hr"
#
# STEP 1: Parse and chunk documents (run once or when documents change)
md_documents = parse_markdown_for_metadata(markdown_folder)
#
# STEP 2: Create the collection and add documents (run once).
# create_collection only inserts when the collection is empty, so re-running
# this cell against a populated store adds nothing.
client = import_google_api()
gemini_embedding_function = embedding_function(client)
chroma_persistent_client = chromadb.PersistentClient(path="./output_hr")
create_collection(chroma_persistent_client, gemini_embedding_function, md_documents)

Processing documents: 100%|██████████| 1095/1095 [00:00<00:00, 1216.87it/s]


models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Collection 'hrstud-bot-hr' already has 6776 documents.


In [38]:
# STEP 3: Query the system (run for each query)
#
# Rebuilds the client and embedding function, then opens the existing
# collection; `client`, `embed_fn` and `collection` are reused by later cells.
client = import_google_api()
gemini_embedding_function = embedding_function(client)
embed_fn, collection = persistent_client(gemini_embedding_function)
#
# Alternative example query:
#user_query = "Tko je Sandro Skansi?"
user_query = "razredbeni postupak?"  # Example query
response = get_article_hr(
    user_query=user_query,
    embed_fn=embed_fn,
    collection=collection,
    client=client
)
# Render the model's Markdown answer in the notebook output.
display(Markdown(response))

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001
Connected to collection: hrstud-bot-hr
Documents: 6776
Metadata: {'dimension': 768, 'model': 'models/text-embedding-004'}


Izvor: [Article Link](studiji_upis_na_fhs_razredbeni_postupak)

*   **Razredbeni postupak** za upis na studije Fakulteta hrvatskih studija opisan je na poveznici: [https://www.fhs.hr/studiji/upis_na_fhs/razredbeni_postupak](https://www.fhs.hr/studiji/upis_na_fhs/razredbeni_postupak)

In [None]:
'''# ADVANCED: Test multiple queries
# 
test_queries = [
    "Koje predmete predaje Mato Škerbić?",
    "Koje predmete predaje Sandro Skansi?",
    "Tko predaje Opća povijest srednjeg vijeka?"
]
# 
for query in test_queries:
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_hr(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client
    )
    display(Markdown(response))
    print("\n")'''

In [None]:
# ADVANCED: Test multiple queries
# Relies on `embed_fn`, `collection` and `client` created by the query cell above.
test_queries = [
    # Disabled example:
    #"Tko predaje Opća povijest srednjeg vijeka?"
    "Koje predmete predaje Mato Škerbić?"

]
# Print a banner per query, then render the Markdown answer.
for query in test_queries:
    print(f"\n{'#'*60}")
    print(f"QUERY: {query}")
    print(f"{'#'*60}")
    response = get_article_hr(
        user_query=query,
        embed_fn=embed_fn,
        collection=collection,
        client=client,
    )
    display(Markdown(response))
    print("\n")