In [4]:
from pathlib import Path
import json
import re
import numpy as np
import pandas as pd

Embeddings are computed locally with sentence-transformers and indexed with FAISS. This avoids relying on OpenAI API keys while still giving real embeddings and retrieval.

In [7]:
%pip -q install sentence-transformers faiss-cpu
from sentence_transformers import SentenceTransformer
import faiss

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Project layout. The root is taken as the parent of the kernel's current
# working directory, so the notebook must be started one level below it.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_PAGES_PATH = DATA_DIR.joinpath("raw_pages", "raw_pages.jsonl")

# Output folder; created on first run, left untouched if it already exists.
OUT_DIR = PROJECT_ROOT.joinpath("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Location of previously saved LangExtract results, if any:
EXTRACTIONS_JSONL = OUT_DIR.joinpath("extraction_results.jsonl")

# Report which inputs are actually present before later cells rely on them.
print("RAW_PAGES_PATH exists:", RAW_PAGES_PATH.exists())
print("EXTRACTIONS_JSONL exists:", EXTRACTIONS_JSONL.exists())
print("OUT_DIR:", OUT_DIR)

RAW_PAGES_PATH exists: True
EXTRACTIONS_JSONL exists: True
OUT_DIR: /Users/macbook/Documents/medicare-stars-nlq/outputs


In [11]:
# Load the page-level JSONL export: one JSON object per non-blank line.
pages = []
with open(RAW_PAGES_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank or trailing lines are not records; json.loads would raise on them.
            continue
        pages.append(json.loads(line))

print("Loaded pages:", len(pages))
# Only peek at the first record when there is one, so an empty file does not
# raise IndexError here.
print("Keys:", pages[0].keys() if pages else "(no pages loaded)")

Loaded pages: 229
Keys: dict_keys(['doc_name', 'relative_path', 'page_number', 'page_text'])


In [13]:
def chunk_text(text: str, chunk_size: int = 900, overlap: int = 150):
    """Split ``text`` into overlapping, fixed-size character windows.

    Args:
        text: Text to split. ``None`` or whitespace-only input yields ``[]``.
            Leading and trailing whitespace is stripped before chunking.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order. Every chunk except possibly
        the last has exactly ``chunk_size`` characters.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check an overlap >= chunk_size (or a non-positive chunk_size)
            never advances the window and the loop does not terminate, and a
            negative overlap silently skips text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    text = (text or "").strip()
    if not text:
        return []

    # Each window starts `step` characters after the previous one, so
    # consecutive windows share exactly `overlap` characters.
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end of the text; otherwise the next
        # window would be a pure suffix of this one.
        if start + chunk_size >= len(text):
            break
    return chunks

# Flatten every page with non-empty text into one record per chunk. The
# chunk_id combines the page's position in `pages` with the chunk's position
# on that page.
records = [
    {
        "doc_name": page.get("doc_name"),
        "relative_path": page.get("relative_path"),
        "page_number": page.get("page_number"),
        "chunk_id": f"p{page_idx}_c{chunk_idx}",
        "text": chunk,
    }
    for page_idx, page in enumerate(pages)
    if (page.get("page_text") or "").strip()
    for chunk_idx, chunk in enumerate(chunk_text((page.get("page_text") or "").strip()))
]

chunks_df = pd.DataFrame(records)
print("Total chunks:", len(chunks_df))
chunks_df.head(3)

Total chunks: 778


Unnamed: 0,doc_name,relative_path,page_number,chunk_id,text
0,2026 Star Ratings Measures.pdf,data/raw_pdfs/2026 Star Ratings Measures.pdf,1,p0_c0,2026 Star Ratings Measures and Weights There a...
1,2026 Star Ratings Measures.pdf,data/raw_pdfs/2026 Star Ratings Measures.pdf,1,p0_c1,024 Policy and Technical Changes to the Medica...
2,2026 Star Ratings Measures.pdf,data/raw_pdfs/2026 Star Ratings Measures.pdf,2,p1_c0,Table 1. 2026 Star Ratings Part C Measures and...


In [15]:
# Embedding model shared by the chunk-encoding cell and by retrieve() for queries,
# so documents and queries are embedded by the same model.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # small + fast

In [17]:
# Encode every chunk into a unit-length vector.
texts = chunks_df["text"].tolist()
emb = embed_model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Inner product over normalized vectors is cosine similarity, so a flat
# inner-product index ranks chunks by cosine similarity.
dim = emb.shape[-1]
index = faiss.IndexFlatIP(dim)
index.add(emb)

print("Embedding matrix:", emb.shape)
print("FAISS index size:", index.ntotal)

Batches: 100%|██████████████████████████████████| 25/25 [00:06<00:00,  4.13it/s]

Embedding matrix: (778, 384)
FAISS index size: 778





In [19]:
def retrieve(query: str, k: int = 5):
    """Return the chunks most similar to ``query``, best match first.

    The query is encoded with the module-level ``embed_model`` (normalized, to
    match how the chunks were encoded) and searched against the module-level
    FAISS ``index``; matching rows are taken from ``chunks_df`` by position.

    Args:
        query: Natural-language query text.
        k: Maximum number of hits to return. Values larger than the number of
            indexed vectors are clamped; values <= 0 yield an empty result.

    Returns:
        A copy of the matching ``chunks_df`` rows with an added ``score``
        column, sorted by descending score.
    """
    # FAISS pads the result with label -1 when fewer than k vectors exist, and
    # iloc[-1] would then silently return the *last* chunk as a bogus hit.
    k = min(int(k), index.ntotal)
    if k <= 0:
        empty = chunks_df.iloc[0:0].copy()
        empty["score"] = pd.Series(dtype="float32")
        return empty

    q_emb = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, idxs = index.search(q_emb, k)

    # Defensive: drop any remaining padded entries before positional lookup.
    found = idxs[0] >= 0
    hits = chunks_df.iloc[idxs[0][found]].copy()
    hits["score"] = scores[0][found]
    return hits.sort_values("score", ascending=False)

# Quick test: run one query with k=5 and display the score, source document,
# page number and text of the first 3 hits.
retrieve("What changed in 2026 star ratings weights?", k=5)[["score","doc_name","page_number","text"]].head(3)

Unnamed: 0,score,doc_name,page_number,text
0,0.627185,2026 Star Ratings Measures.pdf,1,2026 Star Ratings Measures and Weights There a...
348,0.601286,2026_tech_notes_2025_09_25.pdf,97,"erage, the plan’s scores improved. Keep in min..."
76,0.585301,2026_tech_notes_2025_09_25.pdf,10,(Last Updated 09/25/2025) Page 2 c. Re-specifi...


Minimal NLQ layer

In [21]:
def load_extractions(jsonl_path: Path) -> pd.DataFrame:
    """
    Loads LangExtract's annotated documents JSONL into a flat table.
    This tries to be resilient to slightly different internal schemas.

    Each non-blank line is parsed as one JSON document. The document id is
    read from ``document_id``, ``doc_id`` or ``id`` (first truthy value), and
    the extraction list from ``extractions`` or ``annotations``.

    Args:
        jsonl_path: Path to the JSONL file to read (UTF-8).

    Returns:
        DataFrame with one row per extraction and the columns
        ``document_id``, ``extraction_class``, ``extraction_text`` and
        ``attributes``. The columns are present even when no extraction was
        found, so downstream column access does not fail on an empty result.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    columns = ["document_id", "extraction_class", "extraction_text", "attributes"]
    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing lines are not documents; skip them
                # instead of letting json.loads raise.
                continue
            obj = json.loads(line)

            doc_id = obj.get("document_id") or obj.get("doc_id") or obj.get("id")

            # LangExtract typically stores extractions under something like 'extractions'
            extractions = obj.get("extractions") or obj.get("annotations") or []
            for ex in extractions:
                rows.append({
                    "document_id": doc_id,
                    "extraction_class": ex.get("extraction_class") or ex.get("class"),
                    "extraction_text": ex.get("extraction_text") or ex.get("text"),
                    # Missing or null attributes are normalized to an empty dict.
                    "attributes": ex.get("attributes") or {},
                })

    return pd.DataFrame(rows, columns=columns)

# Build the entity table only when the extraction file is present; otherwise
# entities_df stays None and nlq() returns retrieval hits only.
entities_df = None
if not EXTRACTIONS_JSONL.exists():
    print("No extraction_results.jsonl found at:", EXTRACTIONS_JSONL)
else:
    entities_df = load_extractions(EXTRACTIONS_JSONL)
    print("Loaded entities:", len(entities_df))
    display(entities_df.head(10))

Loaded entities: 249


Unnamed: 0,document_id,extraction_class,extraction_text,attributes
0,page_1,measure,Kidney Health Evaluation for Patients with Dia...,{'context': 'Medicare Stars'}
1,page_1,weight_change,,{}
2,page_1,measure,Improving or Maintaining Physical Health2,{'context': 'Medicare Stars'}
3,page_1,weight_change,,{}
4,page_1,measure,Improving or Maintaining Mental Health2,{'context': 'Medicare Stars'}
5,page_1,weight_change,,{}
6,page_1,measure,,{}
7,page_1,weight_change,will have a weight of 1 for the 2026 Star Ratings,{'year': '2026'}
8,page_1,measure,,{}
9,page_1,weight_change,a weight of 3 beginning with the 2027 Star Rat...,{'year': '2027'}


In [23]:
def nlq(question: str, top_k: int = 5):
    """
    Minimal natural-language query over the indexed chunks.

    Always retrieves the ``top_k`` most similar chunks for grounding. When the
    module-level ``entities_df`` is populated, keyword matching on the
    lower-cased question additionally selects extraction rows:

    * "measure" selects rows whose class contains "measure";
    * "weight", "increase", "decrease" or "change" selects rows whose class
      contains "weight_change";
    * a year of the form 20xx keeps only rows whose stringified attributes
      contain that year.

    Returns a ``(hits, structured)`` tuple. ``structured`` is ``None`` when no
    entity table is available or no keyword matched; otherwise it is the
    selected rows, de-duplicated on class and text and capped at 25 rows.
    """
    hits = retrieve(question, k=top_k)

    # No entity table available: retrieval results only.
    if entities_df is None or len(entities_df) == 0:
        return hits, None

    q = question.lower()

    def _rows_with_class(pattern: str) -> pd.DataFrame:
        # Case-insensitive match on the extraction class; always a fresh copy.
        class_names = entities_df["extraction_class"].astype(str)
        return entities_df[class_names.str.contains(pattern, case=False, na=False)].copy()

    selected = []
    if "measure" in q:
        selected.append(_rows_with_class("measure"))
    if any(w in q for w in ["weight", "increase", "decrease", "change"]):
        selected.append(_rows_with_class("weight_change"))

    if not selected:
        return hits, None

    # A single selection keeps its original index; two selections are stacked
    # with a fresh one.
    if len(selected) == 1:
        structured = selected[0]
    else:
        structured = pd.concat(selected, ignore_index=True)

    # Narrow to a year when the question mentions one.
    year_match = re.search(r"\b(20\d{2})\b", q)
    if year_match:
        year = year_match.group(1)
        has_year = structured["attributes"].astype(str).str.contains(year, na=False)
        structured = structured[has_year]

    if len(structured) > 0:
        structured = structured.drop_duplicates(subset=["extraction_class", "extraction_text"]).head(25)

    return hits, structured

In [25]:
q1 = "What measures changed weight in 2026?"
hits, structured = nlq(q1, top_k=5)

print("QUESTION:", q1)
display(hits[["score","doc_name","page_number","text"]].head(3))

if structured is not None:
    display(structured)
else:
    print("No structured entity table available (missing extraction_results.jsonl).")

QUESTION: What measures changed weight in 2026?


Unnamed: 0,score,doc_name,page_number,text
330,0.508577,2026_tech_notes_2025_09_25.pdf,91,(Last Updated 09/25/2025) Page 83 Title Descri...
289,0.473848,2026_tech_notes_2025_09_25.pdf,77,signed. Data Time Frame: 03/2025 – 05/2025 Gen...
0,0.468935,2026 Star Ratings Measures.pdf,1,2026 Star Ratings Measures and Weights There a...


Unnamed: 0,document_id,extraction_class,extraction_text,attributes
176,page_1,weight_change,will have a weight of 1 for the 2026 Star Ratings,{'year': '2026'}
179,page_1,weight_change,decrease from 4 to 2 beginning with the 2026 S...,{'year': '2026'}
180,page_2,weight_change,considered a new measure,{'year': '2026'}
183,page_3,weight_change,2,{'year': '2026'}
186,page_3,weight_change,5,{'year': '2026'}
189,page_3,weight_change,1,{'year': '2026'}
190,page_3,weight_change,3,{'year': '2026'}
