In [1]:
import os

# ✅ Fix OpenMP crash (OMP: Error #15)
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import xml.etree.ElementTree as ET
import json
import chromadb
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import glob

In [2]:
# Initialize Hugging Face embedding model
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Change if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="./chromadb")  # Persistent storage
collection = chroma_client.get_or_create_collection(name="hospice_rules")

In [3]:
def parse_xml_element(element):
    """
    Recursively parses an XML element into a nested dictionary.
    """
    parsed_data = {"tag": element.tag}

    # If the element has attributes, store them
    if element.attrib:
        parsed_data["attributes"] = element.attrib

    # If the element has text, store it
    if element.text and element.text.strip():
        parsed_data["text"] = element.text.strip()

    # If the element has children, process them
    children = list(element)
    if children:
        parsed_data["children"] = [parse_xml_element(child) for child in children]

    return parsed_data


def flatten_hierarchy(data, parent_key=""):
    """
    Converts hierarchical XML data into a flat format suitable for embedding.
    """
    flat_data = []
    if isinstance(data, dict):
        tag = data.get("tag", "UNKNOWN")
        text = data.get("text", "")
        attributes = json.dumps(data.get("attributes", {}))
        key = f"{parent_key}/{tag}" if parent_key else tag

        # Create a formatted string for embedding
        entry = {
            "key": key,
            "content": f"Tag: {tag}, Attributes: {attributes}, Text: {text}".strip()
        }
        flat_data.append(entry)

        # Process children recursively
        for child in data.get("children", []):
            flat_data.extend(flatten_hierarchy(child, key))

    return flat_data


def chunk_text(text, chunk_size=200):
    """
    Splits long text into smaller chunks to improve embeddings and retrieval.
    """
    words = text.split()
    return [" ".join(words[i : i + chunk_size]) for i in range(0, len(words), chunk_size)]


def generate_embedding(text):
    """
    Generate embeddings using a Hugging Face transformer model.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**tokens)
        embedding = output.last_hidden_state[:, 0, :].squeeze().tolist()
    return embedding


def process_xml_file(file_path):
    """
    Parses and processes an XML file, returning a list of structured data.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    parsed_data = parse_xml_element(root)
    flat_data = flatten_hierarchy(parsed_data)
    return flat_data

In [4]:
# ✅ **Automatically Find All XML Files in `data/` Folder**
xml_folder = "data/"
xml_files = glob.glob(os.path.join(xml_folder, "*.xml"))  # Finds all XML files in `data/`
if not xml_files:
    print(f"No XML files found in {xml_folder}. Please place XML files there.")
    exit()

print(f"📂 Found {len(xml_files)} XML files in `{xml_folder}`: {xml_files}")


📂 Found 2 XML files in `data/`: ['data\\2025 Hospice Final Rule.xml', 'data\\2025 Hospice Proposed Rule.xml']


In [5]:
# ✅ **Process Each XML File**
for file in xml_files:
    print(f"📄 Processing `{file}`...")

    # Parse and flatten XML data
    structured_data = process_xml_file(file)

    # Generate embeddings and store in ChromaDB
    for i, entry in tqdm(enumerate(structured_data), total=len(structured_data), desc=f"🔄 Embedding `{file}`"):
        chunks = chunk_text(entry["content"])  # ✅ **Chunk long text**
        
        for j, chunk in enumerate(chunks):
            embedding = generate_embedding(chunk)

            # Add to ChromaDB
            collection.add(
                ids=[f"{file}-{i}-{j}"],
                embeddings=[embedding],
                metadatas=[{"source": file, "key": entry["key"], "text": chunk}]
            )

print("✅ Data successfully stored in ChromaDB!")

📄 Processing `data\2025 Hospice Final Rule.xml`...


🔄 Embedding `data\2025 Hospice Final Rule.xml`:   0%|          | 0/1139 [00:00<?, ?it/s]Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-610-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-611-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-612-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-613-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-614-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-615-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-616-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-617-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-618-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-619-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-620-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-621-0
Add of existing embedding ID: data\2025 Hospice 

📄 Processing `data\2025 Hospice Proposed Rule.xml`...


🔄 Embedding `data\2025 Hospice Proposed Rule.xml`:   0%|          | 0/900 [00:00<?, ?it/s]Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-0-0
Insert of existing embedding ID: data\2025 Hospice Proposed Rule.xml-0-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-1-0
Insert of existing embedding ID: data\2025 Hospice Proposed Rule.xml-1-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-2-0
Insert of existing embedding ID: data\2025 Hospice Proposed Rule.xml-2-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-3-0
Insert of existing embedding ID: data\2025 Hospice Proposed Rule.xml-3-0
Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-4-0
Insert of existing embedding ID: data\2025 Hospice Proposed Rule.xml-4-0
🔄 Embedding `data\2025 Hospice Proposed Rule.xml`:   1%|          | 5/900 [00:00<00:19, 46.30it/s]Add of existing embedding ID: data\2025 Hospice Proposed Rule.xml-5-0
Insert of existing embeddin

✅ Data successfully stored in ChromaDB!





In [6]:
query_text = "hospice payment update"
query_embedding = generate_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # Number of top results
)

print("🔎 Search Results:", results)


🔎 Search Results: {'ids': [['data\\2025 Hospice Proposed Rule.xml-213-1', 'data\\2025 Hospice Final Rule.xml-24-0', 'data\\2025 Hospice Proposed Rule.xml-39-0', 'data\\2025 Hospice Final Rule.xml-377-1', 'data\\2025 Hospice Proposed Rule.xml-64-0']], 'embeddings': None, 'documents': [[None, None, None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'key': 'PRORULE/SUPLINF/P', 'source': 'data\\2025 Hospice Proposed Rule.xml', 'text': 'hospice payment update percentage in the FY 2025 final rule.'}, {'key': 'RULE/PREAMB/FURINF/P', 'source': 'data\\2025 Hospice Final Rule.xml', 'text': 'Tag: P, Attributes: {}, Text: For general questions about hospice payment policy, send your inquiry via email to:'}, {'key': 'PRORULE/PREAMB/FURINF/P', 'source': 'data\\2025 Hospice Proposed Rule.xml', 'text': 'Tag: P, Attributes: {}, Text: For general questions about hospice payment policy, send your inquiry via email to:'}, {'key': 'RULE/SUPLINF/P', 'source': 'data\\2025 Hospice Final Rule.xml'

In [7]:
from ctransformers import AutoModelForCausalLM

# ✅ Update the model path
MODEL_PATH = "models/falcon-7b-instruct.ggccv1.q5_1.bin"  # Adjust path if needed

# ✅ Load Falcon-7B with `ctransformers`
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    model_type="falcon",  # Specify Falcon model
    gpu_layers=0  # Set > 0 if you have GPU
)

# ✅ Function to chat with Falcon
def ask_falcon(question):
    prompt = f"### Instruction: Answer the following question using your knowledge.\n\nQuestion: {question}\n\nAnswer:"
    response = llm(prompt, max_new_tokens=512)
    return response

In [8]:
# Example
question = "What are the updates to the hospice payment rates for 2025?"
response = ask_falcon(question)
print("🤖 Falcon-7B Response:", response)

🤖 Falcon-7B Response:  In 2025, the hospice payment rates are increasing by 1.76% and will impact reimbursements paid by insurance companies to hospice care providers. Hospice providers are expected to receive an additional $7.8 billion in funding. Hospice services covered by Medicare will continue to be reimbursed at the current rate of $13.44 per day.
