In [30]:
import pandas as pd
import re

# === STEP 1: Load the file ===
file_path = "data/Hospice Text.xlsx"  # Update this if necessary
df = pd.read_excel(file_path)

# === STEP 2: Clean the data ===
df_cleaned = df[['Year', 'Type', 'Section', 'Text']].copy()
df_cleaned.dropna(subset=['Year', 'Type', 'Section', 'Text'], inplace=True)
df_cleaned['Year'] = df_cleaned['Year'].astype(int)
df_cleaned['doc_id'] = df_cleaned.index

# === STEP 3: Extract tags from Section ===
def extract_tags(section):
    tags = []

    # Final or Proposed
    if "Final" in section:
        tags.append("Final")
    elif "Proposed" in section:
        tags.append("Proposed")

    # Year like FY 2025
    year_match = re.search(r"FY\s?(\d{4})", section)
    if year_match:
        tags.append(year_match.group(1))

    # Keywords starting with "Hospice ..."
    keyword_matches = re.findall(r"Hospice [A-Za-z ]+", section)
    for match in keyword_matches:
        tags.append(match.strip())

    return list(set(tags))  # Unique tags

df_cleaned['tags'] = df_cleaned['Section'].apply(extract_tags)

# === STEP 4: Combine Section + Text for embedding ===
df_cleaned['combined_text'] = (
    "Section: " + df_cleaned['Section'] + "\n\n" + df_cleaned['Text']
)

# === STEP 5: Prepare for embedding/vector DB ===
documents = df_cleaned.apply(lambda row: {
    "doc_id": row["doc_id"],
    "year": row["Year"],
    "type": row["Type"],
    "section": row["Section"],
    "tags": row["tags"],
    "text": row["combined_text"]
}, axis=1).tolist()

# === STEP 6: View sample output ===
for doc in documents[:3]:
    print(doc)
    print("------")



{'doc_id': 0, 'year': 2025, 'type': 'Final', 'section': 'III.A.1-Final FY 2025 Hospice Wage Index', 'tags': ['2025', 'Hospice Wage Index', 'Final'], 'text': "Section: III.A.1-Final FY 2025 Hospice Wage Index\n\nThe hospice wage index is used to adjust payment rates for hospices under the Medicare program to reflect local differences in area wage levels, based on the location where services are furnished. Our regulations at §\u2009418.306(c) require each labor market to be established using the most current hospital wage data available, including any changes made by the Office of Management and Budget (OMB) to Metropolitan Statistical Area (MSA) definitions.\nIn general, OMB issues major revisions to statistical areas every 10 years, based on the results of the decennial census. However, OMB occasionally issues minor updates and revisions to statistical areas in the years between the decennial censuses. On September 14, 2018, OMB issued OMB Bulletin No. 18-04, which superseded the April

In [31]:
df_cleaned

Unnamed: 0,Year,Type,Section,Text,doc_id,tags,combined_text
0,2025,Final,III.A.1-Final FY 2025 Hospice Wage Index,The hospice wage index is used to adjust payme...,0,"[2025, Hospice Wage Index, Final]",Section: III.A.1-Final FY 2025 Hospice Wage In...
1,2025,Proposed,III.A.1-Proposed FY 2025 Hospice Wage Index,The hospice wage index is used to adjust payme...,1,"[2025, Proposed, Hospice Wage Index]",Section: III.A.1-Proposed FY 2025 Hospice Wage...
2,2024,Final,III.B.1-FY 2024 Hospice Wage Index,The hospice wage index is used to adjust payme...,2,"[Hospice Wage Index, 2024]",Section: III.B.1-FY 2024 Hospice Wage Index\n\...
3,2023,Final,IV.A.1-FY 2023 Hospice Wage Index,The hospice wage index is used to adjust payme...,3,"[Hospice Wage Index, 2023]",Section: IV.A.1-FY 2023 Hospice Wage Index\n\n...
4,2023,Proposed,III.A.1-Proposed FY 2023 Hospice Wage Index,The hospice wage index is used to adjust payme...,4,"[Proposed, Hospice Wage Index, 2023]",Section: III.A.1-Proposed FY 2023 Hospice Wage...
5,2025,Final,III.A.5-Hospice Cap Amount for FY 2025,As discussed in the FY 2016 Hospice Wage Index...,5,"[Hospice Cap Amount for FY, 2025]",Section: III.A.5-Hospice Cap Amount for FY 202...
6,2025,Proposed,III.A.5-Proposed Hospice Cap Amount for FY 2025,As discussed in the FY 2016 Hospice Wage Index...,6,"[Hospice Cap Amount for FY, 2025, Proposed]",Section: III.A.5-Proposed Hospice Cap Amount f...
7,2024,Final,III.B.4-Hospice Cap Amount for FY 2024,As discussed in the FY 2016 Hospice Wage Index...,7,"[Hospice Cap Amount for FY, 2024]",Section: III.B.4-Hospice Cap Amount for FY 202...
8,2023,Final,III.A.5-Hospice Cap Amount for FY 2023,As discussed in the FY 2016 Hospice Wage Index...,8,"[Hospice Cap Amount for FY, 2023]",Section: III.A.5-Hospice Cap Amount for FY 202...
9,2023,Final,III.A.5-Proposed Hospice Cap Amount for FY 2023,As discussed in the FY 2016 Hospice Wage Index...,9,"[Hospice Cap Amount for FY, Proposed, 2023]",Section: III.A.5-Proposed Hospice Cap Amount f...


In [None]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import faiss
import textwrap
import re
from openai import OpenAI

# === SETUP ===
load_dotenv()  # Load variables from .env

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
EMBED_MODEL = "text-embedding-3-small"

# === 1. LOAD & CLEAN EXCEL DATA ===
df = pd.read_excel("data/Hospice Text.xlsx")
df = df[['Year', 'Type', 'Section', 'Text']].dropna()
df['Year'] = df['Year'].astype(int)
df['doc_id'] = df.index

# === 2. TAG EXTRACTION ===
import re

def extract_tags(section):
    tags = []
    if "Final" in section:
        tags.append("Final")
    elif "Proposed" in section:
        tags.append("Proposed")

    year_match = re.search(r"FY\s?(\d{4})", section)
    if year_match:
        tags.append(year_match.group(1))

    keyword_matches = re.findall(r"Hospice [A-Za-z ]+", section)
    tags += [kw.strip() for kw in keyword_matches]
    return list(set(tags))

df['tags'] = df['Section'].apply(extract_tags)

In [33]:
# === 3. CHUNKING ===
def chunk_text(text, max_tokens=400):
    max_chars = max_tokens * 4
    return textwrap.wrap(text, width=max_chars, break_long_words=False)

chunked_docs = []
for _, row in df.iterrows():
    chunks = chunk_text(row['Text'])
    for i, chunk in enumerate(chunks):
        chunked_docs.append({
            "doc_id": f"{row['doc_id']}_chunk{i}",
            "year": row["Year"],
            "type": row["Type"],
            "section": row["Section"],
            "tags": row["tags"],
            "text": f"Section: {row['Section']}\n\n{chunk}"
        })


In [34]:
# === 4. EMBEDDING + FAISS SETUP ===
def get_embedding(text):
    response = client.embeddings.create(
        model=EMBED_MODEL,
        input=[text]
    )
    return response.data[0].embedding

texts = []
metadatas = []
embeddings = []

print("🔁 Embedding documents...")
for doc in chunked_docs:
    embedding = get_embedding(doc["text"])
    texts.append(doc["text"])
    metadatas.append({
        "doc_id": doc["doc_id"],
        "year": doc["year"],
        "type": doc["type"],
        "section": doc["section"],
        "tags": doc["tags"]
    })
    embeddings.append(np.array(embedding, dtype='float32'))

embedding_dim = len(embeddings[0])
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.vstack(embeddings))
print(f"✅ FAISS index built with {index.ntotal} chunks.")


🔁 Embedding documents...
✅ FAISS index built with 63 chunks.


In [35]:
# === 5. RETRIEVAL FUNCTION ===
def search(query, k=10, token_limit=3000):
    query_emb = get_embedding(query)
    query_vec = np.array(query_emb, dtype='float32').reshape(1, -1)

    distances, indices = index.search(query_vec, k)

    results = []
    total_tokens = 0
    for i in indices[0]:
        text = texts[i]
        tokens = len(text) // 4
        if total_tokens + tokens > token_limit:
            break
        results.append({
            "text": text,
            "metadata": metadatas[i]
        })
        total_tokens += tokens
    return results


In [36]:
# === 6. GPT-4 COMPLETION ===
def generate_answer(query, docs):
    context = "\n\n".join([
        f"{d['metadata']['year']} {d['metadata']['type']} - {d['metadata']['section']}\n{d['text']}"
        for d in docs
    ])
    
    messages = [
        {"role": "system", "content": "You are a healthcare compliance expert."},
        {"role": "user", "content": f"""Answer the following question using the retrieved documents.

User Question: {query}

Retrieved Documents:
{context}

Answer:"""}
    ]
    
    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages
    )
    
    return response.choices[0].message.content


In [37]:
# === 7. DEMO ===
if __name__ == "__main__":
    query = input("❓ Ask a healthcare compliance question:\n> ")
    top_docs = search(query)
    answer = generate_answer(query, top_docs)
    print("\n🤖 GPT-4 Answer:\n")
    print(answer)


🤖 GPT-4 Answer:

According to the documents, both the proposed and final 2025 Hospice Wage Index rules include the previously implemented policy of a permanent 5-percent cap on wage index decreases. In both rules, it's outlined that the proposed hospice wage index for FY 2025 would be based on the FY 2025 hospital pre-floor, pre-reclassified wage index for hospital cost reporting periods starting on or after October 1, 2020 and before October 1, 2021.

The methodology for applying the 5-percent decrease cap on wage index and making comparisons to previous fiscal years' wage index values appears consistent in both the proposed and final rules. There is an indication within the rules that the hospice wage index would not take into account any geographic reclassification of hospitals, with the logic behind this aligning with the regulations that govern hospice payment.

Furthermore, both the proposed and finalized rules agree with the idea of using pre-floor, pre-reclassification hospita

In [38]:
# Compare the 2025 proposed rule with the 2025 final rule regarding Hospice Wage Index.