In [3]:
! python -m pip install google-genai lxml numpy

Collecting numpy
  Downloading numpy-2.4.2-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Downloading numpy-2.4.2-cp313-cp313-win_amd64.whl (12.3 MB)
   ---------------------------------------- 0.0/12.3 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/12.3 MB 9.3 MB/s eta 0:00:02
   ---------- ----------------------------- 3.1/12.3 MB 8.3 MB/s eta 0:00:02
   --------------- ------------------------ 4.7/12.3 MB 8.3 MB/s eta 0:00:01
   ---------------------- ----------------- 6.8/12.3 MB 8.8 MB/s eta 0:00:01
   ----------------------------- ---------- 9.2/12.3 MB 9.2 MB/s eta 0:00:01
   ------------------------------------- -- 11.5/12.3 MB 9.6 MB/s eta 0:00:01
   ---------------------------------------- 12.3/12.3 MB 9.5 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-2.4.2



[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os

import numpy as np
from google import genai
from google.genai import client, types
from lxml import etree

# Read the API key from the environment so no credential is stored in the notebook.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "No API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable."
    )
# NOTE(review): this rebinds the name `client`, which is also imported from google.genai above.
client = genai.Client(api_key=api_key)

#### Läsa in XML/XSLT-filer

In [2]:
# Input document to index, and which chunker to use for it ("xml" or "xsl").
file_path = "xml_files/food.xml"
file_type = "xml"

In [3]:
# Parse the input file with lxml and keep its root element for the chunkers.
tree = etree.parse(file_path)
root = tree.getroot()

#### Chunks för XML-filer

In [4]:
def chunks_xml(element, path=""):
    """Flatten an XML element tree into one text chunk per element.

    Each chunk lists the element's path from the root, its tag, attributes and
    own text, followed by the text and attributes of its direct children.

    Args:
        element: Element to start from; it must expose ``tag``, ``attrib``,
            ``text`` and iteration over its children.
        path: Slash-separated path of the ancestors of ``element``.

    Returns:
        List of chunk strings, the element's own chunk first and then the
        chunks of its descendants; empty if ``element`` is not a real element.
    """
    # Iterating an lxml element also yields comments and processing
    # instructions, whose ``tag`` is a factory function rather than a string.
    # Skip them so no function repr leaks into paths or chunk text.
    if not isinstance(element.tag, str):
        return []

    current_path = f"{path}/{element.tag}"
    lines = [
        f"Element Path: {current_path}",
        f"Tag: {element.tag}",
        f"Attributes: {element.attrib}",
    ]
    if element.text and element.text.strip():
        lines.append(f"Text: {element.text.strip()}")

    children = [child for child in element if isinstance(child.tag, str)]

    # Summarise the direct children inside the parent's chunk so a single
    # chunk carries enough context to answer questions about the element.
    for child in children:
        if child.text and child.text.strip():
            lines.append(f"{child.tag}: {child.text.strip()}")
        if child.attrib:
            lines.append(f"{child.tag} attributes: {child.attrib}")

    chunks = ["\n".join(lines).strip()]

    for child in children:
        chunks.extend(chunks_xml(child, current_path))
    return chunks

#### Chunks för XSLT-filer

In [5]:
def _xsl_display_tag(raw_tag):
    """Return a readable tag, using ``xsl:``/``fo:`` for the XSLT and XSL-FO namespaces."""
    full_tag = str(raw_tag)
    if "}" in full_tag:
        ns, local_name = full_tag[1:].split("}", 1)
        if ns.endswith("1999/XSL/Transform"):
            return f"xsl:{local_name}"
        if ns.endswith("1999/XSL/Format"):
            return f"fo:{local_name}"
    # Un-namespaced tags and unknown namespaces are kept exactly as given.
    return full_tag


def chunks_xsl(element, path=""):
    """Flatten an XSLT element tree into one text chunk per element.

    Works like ``chunks_xml`` but shows tags in the XSLT and XSL-FO namespaces
    with an ``xsl:`` or ``fo:`` prefix, and omits the attributes line when the
    element has no attributes.

    Args:
        element: Element to start from; it must expose ``tag``, ``attrib``,
            ``text`` and iteration over its children.
        path: Slash-separated path of the ancestors of ``element``.

    Returns:
        List of chunk strings, the element's own chunk first and then the
        chunks of its descendants; empty if ``element`` is not a real element.
    """
    # Iterating an lxml element also yields comments and processing
    # instructions, whose ``tag`` is not a string; they are not part of the
    # stylesheet structure, so skip them.
    if not isinstance(element.tag, str):
        return []

    full_tag = _xsl_display_tag(element.tag)
    current_path = f"{path}/{full_tag}"
    lines = [f"Element Path: {current_path}", f"Tag: {full_tag}"]

    if element.attrib:
        lines.append(f"Attributes: {element.attrib}")
    if element.text and element.text.strip():
        lines.append(f"Text: {element.text.strip()}")

    children = [child for child in element if isinstance(child.tag, str)]

    # Child tags get the same prefix form as the element's own tag so the
    # chunk text is consistent with the paths.
    for child in children:
        child_tag = _xsl_display_tag(child.tag)
        if child.text and child.text.strip():
            lines.append(f"{child_tag}: {child.text.strip()}")
        if child.attrib:
            lines.append(f"{child_tag} attributes: {child.attrib}")

    chunks = ["\n".join(lines).strip()]

    for child in children:
        chunks.extend(chunks_xsl(child, current_path))
    return chunks

In [6]:
# Pick the chunker that matches the configured file type.
if file_type == "xml":
    chunks = chunks_xml(root)
elif file_type == "xsl":
    chunks = chunks_xsl(root)
else:
    # Fail loudly instead of continuing with an undefined or stale `chunks`.
    raise ValueError(f"Unsupported file_type: {file_type!r} (expected 'xml' or 'xsl')")

print("Filetype: ", file_type)
print("Amount of chunks: ", len(chunks))

Filetype:  xml
Amount of chunks:  26


#### Embeddings

In [7]:
def create_embeddings(text_list, model="gemini-embedding-001", batch_size=100,
                      task_type="SEMANTIC_SIMILARITY"):
    """Embed a list of texts in batches using the module-level ``client``.

    Args:
        text_list: Texts to embed.
        model: Name of the embedding model to call.
        batch_size: Maximum number of texts sent per request; must be positive.
        task_type: Task type passed to the embedding request config.

    Returns:
        List with one embedding (the ``values`` of each returned embedding)
        per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make ``range`` empty and silently return no
    # embeddings at all, so reject it up front.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []

    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]

        response = client.models.embed_content(
            model=model,
            contents=batch,
            config={"task_type": task_type}
        )
        all_embeddings.extend(e.values for e in response.embeddings)

    return all_embeddings

In [8]:
embeddings = create_embeddings(chunks)
# Report the number of vectors actually returned, not the number of input chunks.
print("Amount of embeddings: ", len(embeddings))

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'API key not valid. Please pass a valid API key.', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'API_KEY_INVALID', 'domain': 'googleapis.com', 'metadata': {'service': 'generativelanguage.googleapis.com'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'API key not valid. Please pass a valid API key.'}]}}

#### Semantisk sökning

In [20]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Gives 0 when either vector has zero length, where the cosine is undefined.
    """
    magnitudes = [np.linalg.norm(vec) for vec in (a, b)]
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0
    return np.dot(a, b) / (magnitudes[0] * magnitudes[1])

In [21]:
def semantic_search(query, chunks, embeddings, top_k=30):
    """Return the chunks most similar to ``query``, best match first.

    The query is embedded with the module-level ``client`` and compared with
    every entry of ``embeddings`` using ``cosine_similarity``.

    Args:
        query: Question text to search for.
        chunks: Chunk texts, index-aligned with ``embeddings``.
        embeddings: One embedding vector per chunk.
        top_k: Maximum number of chunks to return; values <= 0 return none.

    Returns:
        The selected chunks joined by blank lines, or an empty string when
        nothing is selected.
    """
    # ``scores[-0:]`` would select every chunk, so "no results" needs an
    # explicit branch; it also avoids a pointless embedding request.
    if top_k <= 0:
        return ""

    query_embedding = client.models.embed_content(
        model="gemini-embedding-001",
        contents=[query],
        config={"task_type": "SEMANTIC_SIMILARITY"}
    ).embeddings[0].values
    scores = [cosine_similarity(query_embedding, emb) for emb in embeddings]
    # argsort is ascending: keep the last top_k indices and reverse them.
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return "\n\n".join(chunks[i] for i in top_indices)

In [None]:
def generate_response(query, top_k=30, model="gemini-2.0-flash"):
    """Answer ``query`` from the indexed document using retrieval-augmented generation.

    Retrieves the most relevant chunks with ``semantic_search`` and sends them,
    together with fixed instructions and the question, to the generation model.

    Relies on the notebook-level ``chunks``, ``embeddings`` and ``client``,
    so the chunking and embedding cells must have been run first.

    Args:
        query: Question to answer.
        top_k: Number of chunks to retrieve as context.
        model: Name of the generation model to call.

    Returns:
        The ``text`` of the model response.
    """
    context = semantic_search(query, chunks, embeddings, top_k=top_k)

    # Instructions are kept in Swedish, matching the language of the questions.
    system_prompt = (
        "Du är en AI-assistent som analyserar XML och XSLT-filer. "
        "All information ska hämtas via RAG från den relevanta kontexten, inga hårdkodade värden. "
        "Svara kort, exakt och korrekt baserat på kontexten. "
        "Svara endast direkt på frågan, inga extra förklaringar. "
        "Om frågan gäller root-element, ge endast taggen. "
        "Om frågan gäller värde, attribut, id eller text, ge endast värdet. "
        "Om frågan innehåller ordet 'path' eller 'XPath', returnera fullständig absolut path från root och inkludera predikat som behövs för att särskilja element. "
        "Om flera element med samma tagg finns, måste den fullständiga absoluta XPathen inkludera ett predikat som särskiljer rätt element baserat på relevant attribut eller child-element från kontexten, och får aldrig vara generell. "
        "Om frågan gäller ett element som inte finns i den chunkade RAG-kontexten, svara: 'Informationen finns inte i kontexten'. "
        "Om frågan gäller XSLT-element, hämta information från select- eller match-attribut enligt frågan."
    )

    # The instructions, question and context are sent as one prompt string.
    final_prompt = f"""
        {system_prompt}

        Fråga:
        {query}

        Relevant kontext:
        {context}
    """

    response = client.models.generate_content(
        model=model,
        contents=final_prompt
    )
    return response.text

In [57]:
# Questions to put to the RAG pipeline; each answer is printed below its question.
queries = ["Vilka är de unika elementen i filen?"]

for question in queries:
    print("Question:", question)
    print("Answer:", generate_response(question))

Question: Vilka är de unika elementen i filen?
Answer: breakfast_menu, name, calories, price, food, description

