In [2]:
from pathlib import Path
import json
import textwrap
import pandas as pd
import langextract as lx

In [21]:
from IPython.display import display

In [3]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [4]:
from dotenv import load_dotenv
from pathlib import Path
import os

env_path = Path.cwd().parent / ".env"   # project root
loaded = load_dotenv(dotenv_path=env_path, override=True)

print("DOTENV LOADED?", loaded)

key = os.getenv("LANGEXTRACT_API_KEY")
print("KEY FOUND?", key is not None)
print("KEY LENGTH:", 0 if key is None else len(key))
print("KEY PREFIX:", None if key is None else key[:6] + "...")

DOTENV LOADED? True
KEY FOUND? True
KEY LENGTH: 39
KEY PREFIX: AIzaSy...


In [5]:
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"

RAW_PAGES_PATH = DATA_DIR / "raw_pages" / "raw_pages.jsonl"
EXTRACTIONS_DIR = DATA_DIR / "extractions"
EXTRACTIONS_DIR.mkdir(exist_ok=True)

In [6]:
RAW_PAGES_PATH.exists()

True

In [7]:
pages = []
with open(RAW_PAGES_PATH, "r", encoding="utf-8") as f:
    for line in f:
        pages.append(json.loads(line))

len(pages), pages[0].keys()

(229, dict_keys(['doc_name', 'relative_path', 'page_number', 'page_text']))

In [8]:
prompt_description = textwrap.dedent("""
Extract Medicare Stars-related items from the text.

Rules:
- Use exact spans from the source text (no paraphrasing).
- Do not overlap entities.
- Extract in order of appearance.
- Add helpful attributes when available (e.g., year, context, category).

Entity classes:
- measure: the name of a measure
- weight_change: any phrase indicating a change in weight (increase/decrease/new/removed, etc.)
""").strip()

examples = [
    lx.data.ExampleData(
        text="New measure: Breast Cancer Screening Process Measure (Weight increases from 1 to 2 in 2026).",
        extractions=[
            lx.data.Extraction(
                extraction_class="measure",
                extraction_text="Breast Cancer Screening Process Measure",
                attributes={"context": "Medicare Stars"},
            ),
            lx.data.Extraction(
                extraction_class="weight_change",
                extraction_text="Weight increases from 1 to 2 in 2026",
                attributes={"year": "2026"},
            ),
        ],
    )
]

In [9]:
docs = [
    lx.data.Document(
        document_id=f"page_{i+1}",
        text=(p.get("page_text") or "")
    )
    for i, p in enumerate(pages)
    if (p.get("page_text") or "").strip()
]

len(docs), docs[0].document_id, docs[0].text[:120]

(229,
 'page_1',
 '2026 Star Ratings Measures and Weights There are three new measures being added beginning with the 2026 Star Ratings - K')

In [10]:
results = lx.extract(
    text_or_documents=docs[:10],         # 10 pages
    prompt_description=prompt_description,
    examples=examples,
    model_id="gemini-2.5-flash",
)

# results is a list of AnnotatedDocument objects
len(results), type(results[0])

[94m[1mLangExtract[0m: model=[92mgemini-2.5-flash[0m, current=[92m7,524[0m chars, processed=[92m33,573[0m chars:  [00:47]


(10, langextract.core.data.AnnotatedDocument)

In [15]:
project_root = Path.cwd().parent          
out_dir = project_root / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)

jsonl_path = out_dir / "extraction_results.jsonl"

lx.io.save_annotated_documents(
    results,                             
    output_dir=str(out_dir),             
    output_name=jsonl_path.name          
)

print("Saved JSONL to:", jsonl_path)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 10 docs [00:00, 1192.92 docs/s]

[92m✓[0m Saved [1m10[0m documents to [92mextraction_results.jsonl[0m
Saved JSONL to: /Users/macbook/Documents/medicare-stars-nlq/outputs/extraction_results.jsonl





In [17]:
# Interactive HTML visualization
html_content = lx.visualize(str(jsonl_path))

# Write it to a file
html_path = out_dir / "visualization.html"
with open(html_path, "w", encoding="utf-8") as f:
    if hasattr(html_content, "data"):
        f.write(html_content.data)
    else:
        f.write(html_content)

print("Saved visualization to:", html_path)

[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█| 107k/107k [00:00<00:00, 2[0m

[92m✓[0m Loaded [1m10[0m documents from [92mextraction_results.jsonl[0m
Saved visualization to: /Users/macbook/Documents/medicare-stars-nlq/outputs/visualization.html





In [19]:
display(html_content)