In [1]:
from pathlib import Path
import re
import json
import pandas as pd
from tqdm.auto import tqdm

from pypdf import PdfReader

# Directory layout, anchored on the parent of the kernel's working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_PDFS_DIR = DATA_DIR.joinpath("raw_pdfs")
RAW_PAGES_DIR = DATA_DIR.joinpath("raw_pages")

# Create the pages directory (and any missing parents); a no-op if it exists.
RAW_PAGES_DIR.mkdir(exist_ok=True, parents=True)

(RAW_PDFS_DIR, RAW_PAGES_DIR)

  from .autonotebook import tqdm as notebook_tqdm


(PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pdfs'),
 PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pages'))

In [3]:
# Gather the PDFs sitting directly in RAW_PDFS_DIR, ordered by path.
pdfs = list(RAW_PDFS_DIR.glob("*.pdf"))
pdfs.sort()
pdfs

[PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pdfs/2026 Star Ratings Measures.pdf'),
 PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pdfs/2026_tech_notes_2025_09_25.pdf'),
 PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pdfs/2027-star-ratings-measures.pdf')]

In [4]:
def clean_text(s: str) -> str:
    """Flatten a text fragment into single-spaced text.

    ``None`` yields an empty string. Otherwise NUL characters are replaced
    with spaces, every run of whitespace is collapsed to one space, and
    leading/trailing whitespace is removed.
    """
    if s is None:
        return ""
    # NULs are turned into spaces first so they are folded into the
    # whitespace collapse below.
    nul_free = s.replace("\x00", " ")
    return re.sub(r"\s+", " ", nul_free).strip()

def extract_pdf_pages(pdf_path: Path) -> list[dict]:
    """Extract the cleaned text of every page of one PDF.

    Args:
        pdf_path: Location of the PDF. It must lie under ``PROJECT_ROOT`` so
            that a project-relative path can be recorded.

    Returns:
        One dict per page, in page order, with the keys ``doc_name`` (file
        name), ``relative_path`` (path relative to ``PROJECT_ROOT``, written
        with forward slashes), ``page_number`` (1-based) and ``page_text``
        (the page text passed through ``clean_text``).

    Raises:
        ValueError: If ``pdf_path`` is not located under ``PROJECT_ROOT``.
    """
    reader = PdfReader(str(pdf_path))

    # Both values are identical for every page, so compute them once rather
    # than once per page. as_posix() keeps the stored path the same on every
    # operating system (str() would use backslashes on Windows).
    doc_name = pdf_path.name
    relative_path = pdf_path.relative_to(PROJECT_ROOT).as_posix()

    rows = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Fall back to an empty string when extract_text() returns a falsy value.
        text = page.extract_text() or ""
        rows.append(
            {
                "doc_name": doc_name,
                "relative_path": relative_path,
                "page_number": page_number,
                "page_text": clean_text(text),
            }
        )
    return rows

In [5]:
# Parse each PDF in turn and pool the per-page records into one flat list.
all_rows = []
for pdf in tqdm(pdfs, desc="Parsing PDFs"):
    all_rows += extract_pdf_pages(pdf)

# One row per extracted page, across all documents.
df_pages = pd.DataFrame(all_rows)

(df_pages.shape, df_pages.head(3))

Parsing PDFs: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.14s/it]


((229, 4),
                          doc_name  \
 0  2026 Star Ratings Measures.pdf   
 1  2026 Star Ratings Measures.pdf   
 2  2026 Star Ratings Measures.pdf   
 
                                   relative_path  page_number  \
 0  data/raw_pdfs/2026 Star Ratings Measures.pdf            1   
 1  data/raw_pdfs/2026 Star Ratings Measures.pdf            2   
 2  data/raw_pdfs/2026 Star Ratings Measures.pdf            3   
 
                                            page_text  
 0  2026 Star Ratings Measures and Weights There a...  
 1  Table 1. 2026 Star Ratings Part C Measures and...  
 2  Table 2. 2026 Star Ratings Part D Measures and...  )

In [6]:
# highest page number recorded for each document, largest first
(
    df_pages
    .groupby("doc_name")["page_number"]
    .max()
    .sort_values(ascending=False)
)

doc_name
2026_tech_notes_2025_09_25.pdf    223
2026 Star Ratings Measures.pdf      3
2027-star-ratings-measures.pdf      3
Name: page_number, dtype: int64

In [7]:
# share of pages whose cleaned text has length zero
empty_rate = df_pages["page_text"].str.len().eq(0).mean()
empty_rate

np.float64(0.0)

In [8]:
# Preview the first (up to) five pages of whichever document appears first.
first_doc_name = df_pages["doc_name"].iloc[0]
sample = df_pages.loc[df_pages["doc_name"] == first_doc_name].head(5)
for _, r in sample.iterrows():
    banner = "\n" + "=" * 90
    title = f"{r['doc_name']} — page {r['page_number']}"
    # Only the first 800 characters of each page are shown.
    print(banner, title, r["page_text"][:800], sep="\n")


2026 Star Ratings Measures.pdf — page 1
2026 Star Ratings Measures and Weights There are three new measures being added beginning with the 2026 Star Ratings - Kidney Health Evaluation for Patients with Diabetes1, Improving or Maintaining Physical Health2, and Improving or Maintaining Mental Health2. The Improving or Maintaining Physical Health and Improving or Maintaining Mental Health measures are returning to the Star Ratings after a substantive specification change and are treated as new measures. They will have a weight of 1 for the 2026 Star Ratings and a weight of 3 beginning with the 2027 Star Ratings. The weight of Patients’ Experience and Complaints, and Measures Capturing Access measures will decrease from 4 to 2 beginning with the 2026 Star Ratings 1. 1 Contract Year 2024 Policy and Technical Changes to the Medicare A

2026 Star Ratings Measures.pdf — page 2
Table 1. 2026 Star Ratings Part C Measures and Measure Weights *Measure has a weight of 1 for the 2026 Star Ratings b

In [9]:
jsonl_path = RAW_PAGES_DIR / "raw_pages.jsonl"
csv_path = RAW_PAGES_DIR / "raw_pages.csv"

# JSONL: one JSON object per page, non-ASCII characters written as-is
with jsonl_path.open("w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(row, ensure_ascii=False) + "\n"
        for row in df_pages.to_dict(orient="records")
    )

# CSV copy of the same frame, without the index column
df_pages.to_csv(csv_path, index=False)

(jsonl_path, csv_path)

(PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pages/raw_pages.jsonl'),
 PosixPath('/Users/macbook/Documents/medicare-stars-nlq/data/raw_pages/raw_pages.csv'))

In [10]:
# Confirm both files were written and report the JSONL size in bytes.
(
    jsonl_path.exists(),
    csv_path.exists(),
    jsonl_path.stat().st_size,
)

(True, True, 578136)

In [11]:
# Distribution of cleaned page-text lengths, in characters.
page_text_lengths = df_pages["page_text"].str.len()
page_text_lengths.describe()

count     229.000000
mean     2353.192140
std      1209.718242
min        74.000000
25%      1639.000000
50%      2217.000000
75%      2944.000000
max      9253.000000
Name: page_text, dtype: float64