## Notebook: Metadata usage, Metadata extraction.

This notebook demonstrates the metadata feature of the solution. It walks through:
* Ingestion of a few documents with metadata
* Q & A with no metadata filtering
* Q & A with metadata filtering
* **[Extra]** Example of extracting metadata from user queries for inclusion in the RAG /generate API call.

### Install Dependencies and import required modules

In [None]:
!pip install aiohttp

In [None]:
import aiohttp
import os
import json

### Base Configuration

* Helper functions in the following cell
* The code assumes a docker installation of the RAG Blueprint on the same server that is running this notebook

In [None]:
IPADDRESS = "localhost" # Replace this with the correct IP address if required
INGESTOR_SERVER_PORT = "8082"
INGESTOR_BASE_URL = f"http://{IPADDRESS}:{INGESTOR_SERVER_PORT}"  # Replace with your server URL if required

async def print_response(response, to_print=True):
    """Parse an API response body as JSON and optionally pretty-print it.

    Args:
        response: Object exposing awaitable ``json()`` and ``text()`` methods
            (the notebook passes the response yielded by an aiohttp request).
        to_print: When true, print the parsed JSON with a 2-space indent.

    Returns:
        The parsed JSON payload, or ``None`` when the body could not be
        parsed (the raw body text is printed instead).
    """
    try:
        response_json = await response.json()
    except json.JSONDecodeError:
        # The body is not valid JSON: show the raw text instead of crashing.
        print(await response.text())
        return None
    except aiohttp.ClientResponseError:
        # aiohttp rejected the response (see the aiohttp docs for
        # ``ClientResponse.json``): fall back to the raw text as well.
        print(await response.text())
        return None

    if to_print:
        print(json.dumps(response_json, indent=2))
    return response_json


RAG_SERVER_PORT = "8081"
RAG_BASE_URL = f"http://{IPADDRESS}:{RAG_SERVER_PORT}"  # Replace with your server URL

rag_url = f"{RAG_BASE_URL}/v1/generate"

import json
import base64
from IPython.display import display, Image, Markdown

async def print_streaming_response_and_citations(response_generator):
    """Print streamed answer text as it arrives, then render any citations.

    Args:
        response_generator: Async iterable of text lines. Each line may carry
            a ``"data: "`` prefix followed by a JSON object with a
            ``"choices"`` list and, optionally, a ``"citations"`` object.

    The text of each chunk is taken from ``choices[0]["delta"]["content"]``,
    falling back to ``choices[0]["message"]["content"]``. Lines that are empty,
    not valid JSON, not JSON objects, or without choices are skipped.
    Citations are read from the first chunk that has both choices and a
    non-empty ``"citations"`` value, and are shown after the stream ends.
    The function only prints/displays; it does not return the text.
    """
    first_chunk_data = None

    async for chunk in response_generator:
        if chunk.startswith("data: "):
            chunk = chunk[len("data: "):].strip()

        if not chunk:
            continue

        try:
            data = json.loads(chunk)
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            print(f"⚠️ Raw chunk content: {repr(chunk)}")
            continue

        # Valid JSON that is not an object carries neither choices nor citations.
        if not isinstance(data, dict):
            continue

        choices = data.get("choices") or []
        if not choices:
            continue

        # Capture first chunk with citations (if any)
        if first_chunk_data is None and data.get("citations"):
            first_chunk_data = data

        # Stream the content. ``or {}`` / ``or ""`` guard against keys that
        # are present but null, which would otherwise raise AttributeError
        # or print the literal text "None".
        first_choice = choices[0]
        delta = first_choice.get("delta") or {}
        text = delta.get("content")
        if not text:
            message = first_choice.get("message") or {}
            text = message.get("content") or ""
        print(text, end='', flush=True)

    print()  # Newline after completion

    # Display citations if any
    if first_chunk_data and first_chunk_data.get("citations"):
        citations = first_chunk_data["citations"]
        for idx, citation in enumerate(citations.get("results", [])):
            doc_type = citation.get("document_type", "text")
            content = citation.get("content", "")
            doc_name = citation.get("document_name", f"Citation {idx+1}")

            display(Markdown(f"\n**Citation {idx+1}: {doc_name}**"))

            if doc_type == "image":
                # Image citations are treated as base64-encoded image bytes.
                try:
                    image_bytes = base64.b64decode(content)
                    display(Image(data=image_bytes))
                except Exception as e:
                    # Best effort: a bad image must not hide the other citations.
                    display(Markdown(f"⚠️ Failed to render image:\n```\n{e}\n```"))
            else:
                display(Markdown(f"```\n{content}\n```"))

async def generate_answer(payload):
    """POST *payload* to the RAG generate endpoint and yield response lines.

    Args:
        payload: JSON-serializable request body sent to ``rag_url``.

    Yields:
        Each newline-terminated line of the response body, decoded as text
        and stripped of surrounding whitespace (blank lines are yielded as
        empty strings). A non-blank final line with no trailing newline is
        yielded as well.

    An ``aiohttp.ClientError`` is printed rather than raised; the generator
    then simply ends.
    """
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url=rag_url, json=payload) as response:
                # Buffer raw bytes and decode only complete lines: a
                # 1024-byte chunk boundary can fall inside a multi-byte
                # UTF-8 character, so decoding each chunk could fail.
                pending = b""
                async for chunk in response.content.iter_chunked(1024):
                    pending += chunk
                    while b"\n" in pending:
                        raw_line, pending = pending.split(b"\n", 1)
                        yield raw_line.decode().strip()

                # Flush whatever followed the last newline so the final
                # line of the stream is not lost.
                tail = pending.decode().strip()
                if tail:
                    yield tail
        except aiohttp.ClientError as e:
            print(f"Error: {e}")

### Ensure the solution is up and running 
#### Health Check Endpoint

**Purpose:**
This endpoint performs a health check on the server. It returns a 200 status code if the server is operational.

In [None]:
async def fetch_health_status():
    """Call the ingestor server's health endpoint and print its reply."""
    health_url = f"{INGESTOR_BASE_URL}/v1/health"
    # One statement opens the session and then the request; both are closed
    # in reverse order when the block exits.
    async with aiohttp.ClientSession() as http, http.get(health_url) as reply:
        await print_response(reply)

# Run the async function
await fetch_health_status()

### Create collection with the specified metadata schema

In [None]:
COLLECTION_NAME = "cars"

async def create_collection(
    collection_name: "str | None" = None,
    embedding_dimension: int = 2048,
    metadata_schema: "list | None" = None
):
    """Create a collection on the ingestor server and print the reply.

    Args:
        collection_name: Name of the collection to create.
        embedding_dimension: Embedding dimension sent with the request.
        metadata_schema: List of metadata field definitions (dicts with
            ``name``/``type``/``description`` in this notebook). ``None``
            is sent as an empty list.

    Returns:
        ``None`` after printing the server response, or the tuple
        ``(500, {"error": <message>})`` when the request raises
        ``aiohttp.ClientError``.
    """
    data = {
        "collection_name": collection_name,
        "embedding_dimension": embedding_dimension,
        # A fresh list per call: a mutable default argument would be shared
        # between calls.
        "metadata_schema": [] if metadata_schema is None else metadata_schema
    }

    headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(f"{INGESTOR_BASE_URL}/v1/collection", json=data, headers=headers) as response:
                await print_response(response)
        except aiohttp.ClientError as e:
            return 500, {"error": str(e)}

metadata_schema = [
    {
        "name": "manufacturer",
        "type": "string",
        "description": "manufacturer"
    },
    {
        "name": "model",
        "type": "string",
        "description": "model"
    },
    {
        "name": "year",
        "type": "string",
        "description": "year"
    }
]

# Create the collection
await create_collection(
    collection_name=COLLECTION_NAME,
    metadata_schema=metadata_schema # Optional argument, can be commented if metadata is not to be inserted
)

### Prepare the files and metadata

* Download files
* Prepare the metadata
* Upload files into the newly created collection

In [None]:
import os
import requests

os.makedirs('./data', exist_ok=True)

# Mapping of filenames to source URLs
files_to_download = {
    '2024_Ford_Escape_Owners_Manual_version_1_om_EN-US.pdf': 'https://www.fordservicecontent.com/Ford_Content/Catalog/owner_information/2024_Ford_Escape_Owners_Manual_version_1_om_EN-US.pdf',
    '2023_Edge_Owners_Manual_version_2_om_EN-US.pdf': 'https://www.fordservicecontent.com/Ford_Content/Catalog/owner_information/2023_Edge_Owners_Manual_version_2_om_EN-US.pdf',
    '2015-Edge-Owner-Manual-version-2_om_EN-US_06_2015.pdf': 'https://www.fordservicecontent.com/Ford_Content/Catalog/owner_information/2015-Edge-Owner-Manual-version-2_om_EN-US_06_2015.pdf'
}

print("Downloading files ...")
for filename, url in files_to_download.items():
    destination = os.path.join('./data', filename)
    print(f"Downloading {filename}...")

    try:
        # The timeout keeps a stalled server from hanging the cell, and the
        # context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code == 200:
                with open(destination, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"✅ Downloaded: {filename}")
            else:
                print(f"❌ Failed to download {filename} - HTTP {response.status_code}")
    except requests.RequestException as e:
        # Report the failure and carry on with the remaining files.
        print(f"❌ Failed to download {filename} - {e}")

# Local paths of the manuals to ingest (order matters for the pairing below).
FILEPATHS = [
    "./data/" + pdf_name
    for pdf_name in (
        "2015-Edge-Owner-Manual-version-2_om_EN-US_06_2015.pdf",
        "2023_Edge_Owners_Manual_version_2_om_EN-US.pdf",
        "2024_Ford_Escape_Owners_Manual_version_1_om_EN-US.pdf",
    )
]

# One metadata record per file, pairing each path with its (model, year).
# Years are integers here; a later step in this cell converts values to strings.
CUSTOM_METADATA = [
    {
        "filename": os.path.basename(pdf_path),
        "metadata": {
            "manufacturer": "ford",
            "model": car_model,
            "year": model_year,
        },
    }
    for pdf_path, (car_model, model_year) in zip(
        FILEPATHS,
        [("edge", 2015), ("edge", 2023), ("escape", 2024)],
    )
]

def stringify_metadata_values(obj):
    """Return a copy of *obj* in which every leaf value has been passed to ``str``.

    Dicts and lists are rebuilt recursively (dict keys are kept unchanged);
    any other value is converted with ``str``.
    """
    if isinstance(obj, list):
        return list(map(stringify_metadata_values, obj))
    if not isinstance(obj, dict):
        return str(obj)

    converted = {}
    for key, value in obj.items():
        converted[key] = stringify_metadata_values(value)
    return converted

print("Stringifying relevant metadata ...")

# Swap each record's metadata for its all-strings equivalent, in place.
for entry in CUSTOM_METADATA:
    entry.update(metadata=stringify_metadata_values(entry["metadata"]))

### Upload Documents

In [None]:
import aiohttp
import json
import os

async def upload_documents(collection_name: str = "") -> "str | None":
    """Upload every file in ``FILEPATHS`` to the ingestor as one request.

    The request is a multipart form with one ``documents`` field per file
    and a ``data`` field holding the JSON-encoded ingestion options, which
    include ``CUSTOM_METADATA``.

    Args:
        collection_name: Collection the documents are ingested into.

    Returns:
        The ``task_id`` value from the JSON response (``None`` if the key is
        absent), or ``None`` when the request raises ``aiohttp.ClientError``.
    """
    # Function-scope import keeps this cell self-contained.
    from contextlib import ExitStack

    data = {
        "collection_name": collection_name,
        "blocking": False,
        "split_options": {
            "chunk_size": 512,
            "chunk_overlap": 150
        },
        "custom_metadata": CUSTOM_METADATA,
        "generate_summary": False
    }

    # ExitStack closes every opened file once the request is over, including
    # when opening a later file or the request itself fails.
    with ExitStack() as open_files:
        form_data = aiohttp.FormData()
        for file_path in FILEPATHS:
            form_data.add_field(
                "documents",
                open_files.enter_context(open(file_path, "rb")),
                filename=os.path.basename(file_path),
                content_type="application/pdf"
            )

        form_data.add_field("data", json.dumps(data), content_type="application/json")

        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(f"{INGESTOR_BASE_URL}/v1/documents", data=form_data) as response:
                    resp_json = await response.json()
                    print("Response:", resp_json)
                    return resp_json.get("task_id")  # return the task_id
            except aiohttp.ClientError as e:
                print(f"Error: {e}")
                return None


async def get_task_status(
    task_id: str
):
    """Fetch the status record of an ingestion task without printing it.

    Returns whatever ``print_response`` returns for the status reply, or the
    tuple ``(500, {"error": <message>})`` when the request raises
    ``aiohttp.ClientError``.
    """
    status_url = f"{INGESTOR_BASE_URL}/v1/status"
    query = {"task_id": task_id}
    json_headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(status_url, params=query, headers=json_headers) as response:
                # False: parse the body but do not echo it.
                return await print_response(response, False)
        except aiohttp.ClientError as err:
            return 500, {"error": str(err)}

import asyncio

async def wait_until_task_complete(task_id, poll_interval=15, retry_interval=5):
    """Poll the task status until it is no longer ``"PENDING"``.

    Args:
        task_id: Identifier returned by the upload call. A falsy value
            (e.g. ``None`` after a failed upload) returns immediately.
        poll_interval: Seconds to wait between polls while the task is pending.
        retry_interval: Seconds to wait before retrying after an unusable reply.
    """
    if not task_id:
        # Nothing to poll; without this guard the loop would never end.
        print("⚠️ No task_id to poll; skipping wait.")
        return

    while True:
        # Pass the id as a plain string, matching the other get_task_status call.
        result = await get_task_status(task_id=task_id)

        # get_task_status returns None when the body cannot be parsed and a
        # (500, {...}) tuple on client errors; neither carries a "state".
        if not isinstance(result, dict):
            print("❌ No response received, retrying...")
            await asyncio.sleep(retry_interval)
            continue

        state = result.get("state", None)
        print(f"Current state: {state}")

        # NOTE(review): every state other than "PENDING" is reported as
        # completed, including any failure state the server may return.
        if state != "PENDING":
            print("✅ Task completed.")
            break

        await asyncio.sleep(poll_interval)  # wait before polling again

### Wait for the documents to be uploaded.

In [None]:
task_id = await upload_documents(collection_name=COLLECTION_NAME)
print("Upload documents ...task_id: {}".format(task_id))
if task_id:
    await get_task_status(task_id=task_id)
else:
    print("⚠️ Upload failed or no task_id returned.")
    
await wait_until_task_complete(task_id)

### Fetch documents

Ensuring the files exist in the collection.

In [None]:
async def fetch_documents(collection_name: str = ""):
    """List the documents stored in *collection_name* and print the reply.

    An ``aiohttp.ClientError`` is printed rather than raised.
    """
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(
                f"{INGESTOR_BASE_URL}/v1/documents",
                params={"collection_name": collection_name},
            ) as response:
                await print_response(response)
        except aiohttp.ClientError as err:
            print(f"Error: {err}")

await fetch_documents(collection_name=COLLECTION_NAME)

### Query: No specification of metadata etc. 

Notice that the response citations come from manuals for both the "escape" and the "edge".

In [None]:
payload = {
  "messages": [
    {
      "role": "user",
      "content": "What precautions should you take when jump starting the edge 2015?"
    }
  ],
  "use_knowledge_base": True,
  "temperature": 0.2,
  "top_p": 0.7,
  "max_tokens": 1024,
  "reranker_top_k": 4,
  "vdb_top_k": 10,
  "vdb_endpoint": "http://milvus:19530",
  "collection_names": ["cars"],
  "enable_query_rewriting": True,
  "enable_reranker": True,
  "enable_citations": True,
  "model": "nvidia/llama-3.3-nemotron-super-49b-v1",
  "reranker_model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
  "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
  # Provide url of the model endpoints if deployed elsewhere
  # "llm_endpoint": "",
  #"embedding_endpoint": "",
  #"reranker_endpoint": "",
  "stop": [],
  "filter_expr": ''
}
await print_streaming_response_and_citations(generate_answer(payload))

Notice: The results are from the 2015 and the 2023 car models. The question was specifically for the 2015 model. 

### Query: Specification of simple metadata

In the previous query the responses came from multiple car models, but the user really wanted a response only for their own car model,
which might be the "edge".
Here we direct the query to a given car model, i.e. "edge".

Notice the citations are confined to the "edge" model.
We successfully used the metadata to limit the search to "edge" models alone. 

In [None]:
payload = {
  "messages": [
    {
      "role": "user",
      "content": "What precautions should you take when jump starting the edge 2015?"
    }
  ],
  "use_knowledge_base": True,
  "temperature": 0.2,
  "top_p": 0.7,
  "max_tokens": 1024,
  "reranker_top_k": 2,
  "vdb_top_k": 10,
  "vdb_endpoint": "http://milvus:19530",
  "collection_names": ["cars"],
  "enable_query_rewriting": True,
  "enable_reranker": True,
  "enable_citations": True,
  "model": "nvidia/llama-3.3-nemotron-super-49b-v1",
  "reranker_model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
  "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
  # Provide url of the model endpoints if deployed elsewhere
  # "llm_endpoint": "",
  #"embedding_endpoint": "",
  #"reranker_endpoint": "",
  "stop": [],
  "filter_expr": 'content_metadata["model"] == "edge"'
}
await print_streaming_response_and_citations(generate_answer(payload))

Now notice, the answer generated and all citations are from the edge manuals, but they also refer to the 2023 year model despite being asked about the "2015 edge".

### Query: Specification of compound metadata

Notice in the previous example we got results from the 2015 and 2023 "edge" model. 
We'd likely want to get the results from a specific model of the car.

In the next example we limit the query to a "2015 edge" model. 
Notice, the citations are only from a 2015 edge model car.

In [None]:
payload = {
  "messages": [
    {
      "role": "user",
      "content": "What precautions should you take when jump starting the edge 2015?"
    }
  ],
  "use_knowledge_base": True,
  "temperature": 0.2,
  "top_p": 0.7,
  "max_tokens": 1024,
  "reranker_top_k": 2,
  "vdb_top_k": 10,
  "vdb_endpoint": "http://milvus:19530",
  "collection_names": ["cars"],
  "enable_query_rewriting": True,
  "enable_reranker": True,
  "enable_citations": True,
  "model": "nvidia/llama-3.3-nemotron-super-49b-v1",
  "reranker_model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
  "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
  # Provide url of the model endpoints if deployed elsewhere
  # "llm_endpoint": "",
  #"embedding_endpoint": "",
  #"reranker_endpoint": "",
  "stop": [],
  "filter_expr": 'content_metadata["model"] == "edge" and content_metadata["year"] == "2015"'
}
await print_streaming_response_and_citations(generate_answer(payload))

### Extra: Determine the metadata from the query.

The relevant metadata values could be extracted from the user query itself (wherever applicable).
The cell below provides an example of how an LLM could be used to extract metadata key-value pairs that can then be used to build a filter.

In [None]:
payload = {
  "messages": [
    {
      "role": "user",
      "content": """Extract elements from the user query if and only if they exist.
           There are two possible elements: "year" and "model".
           Return a dictionary containing only the elements found.
           Omit any keys that are not present in the query.
           All returned strings must be lowercase.
           Valid output examples:
           {"year": "2023", "model": "edge"}
           {}
           {"year": "2023"}
           {"model": "edge"}
           The only allowed values are:
           For "year": "2015", "2023".
           For "model": "edge", "escape".
           User Query:
           "How do you enable and use the Rear Occupant Alert System in the 2015 escape?"
           The response should be "model": "escape", "year": "2023"
        """
    }
  ],
  "use_knowledge_base": False,
  "temperature": 0.2,
  "top_p": 0.7,
  "max_tokens": 1024,
  "reranker_top_k": 2,
  "vdb_top_k": 10,
  "vdb_endpoint": "http://milvus:19530",
  "collection_names": ["cars"],
  "enable_query_rewriting": False,
  "enable_reranker": False,
  "enable_citations": False,
  "model": "nvidia/llama-3.3-nemotron-super-49b-v1",
  "reranker_model": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
  "embedding_model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
  # Provide url of the model endpoints if deployed elsewhere
  # "llm_endpoint": "",
  #"embedding_endpoint": "",
  #"reranker_endpoint": "",
  "stop": [],
}
extracted_metadata = await print_streaming_response_and_citations(generate_answer(payload))

Notice the response is {"model": "escape", "year": "2023"}, which can then be used
to <b>construct the query filter</b>. The LLM could also be used to generate the filter itself.

### Cleanup: Delete the collection

In [None]:
from typing import List

async def delete_collections(collection_names: "List[str] | None" = None):
    """Delete the named collections on the ingestor server and print the reply.

    Args:
        collection_names: Names of the collections to delete. A single name
            given as a string is treated as a one-element list. When nothing
            is given, no request is sent.

    An ``aiohttp.ClientError`` is printed rather than raised.
    """
    if not collection_names:
        # The old default ("") would have sent an empty string as the JSON body.
        print("No collection names provided; nothing to delete.")
        return

    if isinstance(collection_names, str):
        collection_names = [collection_names]

    url = f"{INGESTOR_BASE_URL}/v1/collections"
    async with aiohttp.ClientSession() as session:
        try:
            async with session.delete(url, json=collection_names) as response:
                await print_response(response)
        except aiohttp.ClientError as e:
            print(f"Error: {e}")

await delete_collections(collection_names=[COLLECTION_NAME])