In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import faiss
import textwrap
import re
from openai import OpenAI

# === SETUP ===
load_dotenv()  # Load variables from .env

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
EMBED_MODEL = "text-embedding-3-small"

# === LOAD & CLEAN EXCEL DATA ===
df = pd.read_excel("data/Hospice Text.xlsx")
df = df[['Year', 'Type', 'Section', 'Text']].dropna()
df['Year'] = df['Year'].astype(int)
df['doc_id'] = df.index

# === TAG EXTRACTION ===
def extract_tags(section):
    tags = []
    if "Final" in section:
        tags.append("Final")
    elif "Proposed" in section:
        tags.append("Proposed")
    year_match = re.search(r"FY\s?(\d{4})", section)
    if year_match:
        tags.append(year_match.group(1))
    keyword_matches = re.findall(r"Hospice [A-Za-z ]+", section)
    tags += [kw.strip() for kw in keyword_matches]
    return list(set(tags))

df['tags'] = df['Section'].apply(extract_tags)

# === CHUNKING ===
def chunk_text(text, max_tokens=400):
    max_chars = max_tokens * 4
    return textwrap.wrap(text, width=max_chars, break_long_words=False)

chunked_docs = []
for _, row in df.iterrows():
    chunks = chunk_text(row['Text'])
    for i, chunk in enumerate(chunks):
        chunked_docs.append({
            "doc_id": f"{row['doc_id']}_chunk{i}",
            "year": row["Year"],
            "type": row["Type"],
            "section": row["Section"],
            "tags": row["tags"],
            "text": f"Section: {row['Section']}\n\n{chunk}"
        })

In [2]:
# === EMBEDDING + FAISS SETUP ===
def get_embedding(text):
    response = client.embeddings.create(
        model=EMBED_MODEL,
        input=[text]
    )
    return response.data[0].embedding

texts = []
metadatas = []
embeddings = []

print("🔁 Embedding documents...")
for doc in chunked_docs:
    embedding = get_embedding(doc["text"])
    texts.append(doc["text"])
    metadatas.append({
        "doc_id": doc["doc_id"],
        "year": doc["year"],
        "type": doc["type"],
        "section": doc["section"],
        "tags": doc["tags"]
    })
    embeddings.append(np.array(embedding, dtype='float32'))

embedding_dim = len(embeddings[0])
index = faiss.IndexFlatL2(embedding_dim)
index.add(np.vstack(embeddings))
print(f"✅ FAISS index built with {index.ntotal} chunks.")

🔁 Embedding documents...
✅ FAISS index built with 63 chunks.


In [3]:
# === SEARCH FUNCTION ===
def search(query, k=10, token_limit=3000):
    q_embedding = get_embedding(query)
    q_vec = np.array(q_embedding, dtype='float32').reshape(1, -1)

    distances, indices = index.search(q_vec, k)

    results = []
    token_count = 0
    for i in indices[0]:
        text = texts[i]
        tokens = len(text) // 4
        if token_count + tokens > token_limit:
            break
        results.append({
            "text": text,
            "metadata": metadatas[i]
        })
        token_count += tokens
    return results


In [4]:
# === GPT-4: Q&A MODE ===
def generate_answer(query, docs):
    context = "\n\n".join([
        f"{d['metadata']['year']} {d['metadata']['type']} - {d['metadata']['section']}\n{d['text']}"
        for d in docs
    ])

    messages = [
        {"role": "system", "content": "You are a healthcare compliance expert."},
        {"role": "user", "content": f"""Answer the following question using the retrieved documents.

User Question: {query}

Retrieved Documents:
{context}

Answer:"""}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages
    )

    return response.choices[0].message.content

In [5]:
# === GPT-4: SUGGESTION MODE ===
def generate_recommendations(docs, context_query=None):
    context = "\n\n".join([
        f"{d['metadata']['year']} {d['metadata']['type']} - {d['metadata']['section']}\n{d['text']}"
        for d in docs
    ])

    prompt = f"""
You are a regulatory compliance advisor for hospice care providers.

Based on the following CMS healthcare rules, generate **practical, actionable suggestions** for a hospice agency.

Explain what they should do to stay compliant, reduce risk, and prepare effectively.

{f"Focus on this question: {context_query}" if context_query else ""}
\n\nRegulatory Text:\n{context}

Suggestions:
"""

    messages = [
        {"role": "system", "content": "You are a compliance consultant for hospice care agencies."},
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages
    )

    return response.choices[0].message.content

In [6]:
# === MAIN INTERFACE ===
if __name__ == "__main__":
    mode = input("Select mode: (1) Ask a question (2) Get suggestions\n> ").strip()

    query = input("\nEnter your query:\n> ")

    top_docs = search(query)

    if mode == "2":
        output = generate_recommendations(top_docs, context_query=query)
    else:
        output = generate_answer(query, top_docs)

    print("\n🤖 AI Response:\n")
    print(output)

BadRequestError: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:
# What actions should hospitals take in response to the 2025 final cap amount changes?

In [None]:
# Doing it with the reasoning agents. 

# Perplexity Pro, deep research/deep reasoning

# Give 3 documents, 2024 final, 2025 proposed, 2025 final, ask specifically coparision between finals, prposed vs final, 