In [40]:
# Cell 1: Imports and Path Setup
import os
import fitz  # PyMuPDF
import json
import re
from pathlib import Path
from tqdm import tqdm
from difflib import get_close_matches

RAW_DATA_DIR = Path("D:/Technical_projects/PSAI/raw_data/PSR")
INDEX_JSON = Path("D:/Technical_projects/PSAI/code/psrindex.json")
CHUNKS_DIR = Path("D:/Technical_projects/PSAI/chunks")
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

In [43]:
# Cell 2: Load and Normalize Index

def normalize_index(index_path):
    with open(index_path, "r", encoding="utf-8") as f:
        raw = json.load(f)
        index_data = raw["report_index"] if isinstance(raw, dict) and "report_index" in raw else raw

    normalized = {}
    for entry in index_data:
        month, day, year = entry["date"].split("/")
        key = f"{year.zfill(4)}-{month.zfill(2)}"
        normalized[key] = {
            "articles": entry["articles"],
            "subjects": entry["subjects"]
        }
    return normalized

index_lookup = normalize_index(INDEX_JSON)

In [44]:
# Cell 3: Extract YYYY-MM from Filename

def extract_date_key_from_filename(filename):
    match = re.search(r"(\d{6})\.pdf", filename)
    if match:
        yyyymm = match.group(1)
        return f"{yyyymm[:4]}-{yyyymm[4:]}"
    return None

In [45]:
# Cell 4: Split Text by Article Titles

def split_by_titles(text, titles):
    article_chunks = []
    current_title = None
    current_text = ""

    lines = text.splitlines()
    for line in lines:
        stripped = line.strip()
        match = get_close_matches(stripped, titles, n=1, cutoff=0.85)

        if match:
            # Save current chunk
            if current_text:
                article_chunks.append((current_title or "Untitled", current_text.strip()))
            # Start new chunk
            current_title = match[0]
            current_text = stripped + "\n"
        else:
            current_text += stripped + "\n"

    if current_text:
        article_chunks.append((current_title or "Untitled", current_text.strip()))

    return article_chunks

In [46]:
# Cell 5: Main Chunking Function

def process_pdf_with_index(pdf_path, index):
    date_key = extract_date_key_from_filename(pdf_path.name)
    if date_key not in index:
        print(f"Skipping {pdf_path.name}: No index entry for date {date_key}")
        return []

    entry = index[date_key]
    articles = entry["articles"]
    subjects = entry["subjects"]
    year, month = date_key.split("-")
    month_names = ["January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December"]
    readable_date = f"{month_names[int(month)-1]}, {year}"

    doc = fitz.open(pdf_path)
    full_text = ""
    page_map = []

    for page_number, page in enumerate(doc, start=1):
        page_text = page.get_text()
        page_map.append((page_number, len(full_text)))
        full_text += page_text + "\n"

    chunks = []
    article_chunks = split_by_titles(full_text, articles)

    for title, text in article_chunks:
        # Estimate page number from character offset
        first_char_offset = full_text.find(text[:30])
        page_number = next((pn for pn, offset in reversed(page_map) if first_char_offset >= offset), 1)

        chunks.append({
            "text": text.strip(),
            "metadata": {
                "title": title,
                "date": readable_date,
                "author": "Phyllis Schlafly",
                "subjects": subjects,
                "page_number": page_number,
                "source_file": pdf_path.name,
                "doc_type": "Phyllis Schlafly Report"
            }
        })

    return chunks

In [47]:
# Cell 6: Process All PDFs and Save Output

def chunk_all_psrs():
    all_chunks = []
    pdf_files = sorted(RAW_DATA_DIR.glob("*.pdf"))

    for pdf in tqdm(pdf_files):
        if "(1)" in pdf.stem:
            print(f"Skipping duplicate file: {pdf.name}")
            continue
        chunks = process_pdf_with_index(pdf, index_lookup)
        all_chunks.extend(chunks)

    output_path = CHUNKS_DIR / "all_chunks.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved {len(all_chunks)} chunks to {output_path}")

In [48]:
chunk_all_psrs()

 29%|██▉       | 171/593 [00:06<00:12, 34.47it/s]

Skipping duplicate file: PSCA_PSR_14_06_198101(1).pdf
Skipping duplicate file: PSCA_PSR_14_07_198102(1).pdf
Skipping duplicate file: PSCA_PSR_14_08_198103(1).pdf
Skipping duplicate file: PSCA_PSR_14_09_198104(1).pdf
Skipping duplicate file: PSCA_PSR_14_10_198105(1).pdf


 30%|██▉       | 177/593 [00:07<00:11, 37.19it/s]

Skipping duplicate file: PSCA_PSR_14_11_198106(1).pdf
Skipping duplicate file: PSCA_PSR_14_12_198107(1).pdf
Skipping duplicate file: PSCA_PSR_15_01_198108(1).pdf
Skipping duplicate file: PSCA_PSR_15_02_198109(1).pdf
Skipping duplicate file: PSCA_PSR_15_03_198110(1).pdf
Skipping duplicate file: PSCA_PSR_15_04_198111(1).pdf


 32%|███▏      | 188/593 [00:07<00:10, 40.50it/s]

Skipping duplicate file: PSCA_PSR_15_05_198112(1).pdf


 63%|██████▎   | 373/593 [00:13<00:05, 38.01it/s]

Skipping duplicate file: PSCA_PSR_31_06_199801(1).pdf
Skipping duplicate file: PSCA_PSR_31_07_199802(1).pdf
Skipping duplicate file: PSCA_PSR_31_08_199803(1).pdf
Skipping duplicate file: PSCA_PSR_31_09_199804(1).pdf
Skipping duplicate file: PSCA_PSR_31_10_199805(1).pdf
Skipping duplicate file: PSCA_PSR_31_11_199806(1).pdf
Skipping duplicate file: PSCA_PSR_31_12_199807(1).pdf


 65%|██████▍   | 385/593 [00:14<00:05, 41.47it/s]

Skipping duplicate file: PSCA_PSR_32_01_199808(1).pdf
Skipping duplicate file: PSCA_PSR_32_02_199809(1).pdf
Skipping duplicate file: PSCA_PSR_32_03_199810(1).pdf
Skipping duplicate file: PSCA_PSR_32_04_199811(1).pdf
Skipping duplicate file: PSCA_PSR_32_05_199812(1).pdf


 84%|████████▍ | 497/593 [00:17<00:02, 40.28it/s]

Skipping duplicate file: PSCA_PSR_41_06_200801(1).pdf
Skipping duplicate file: PSCA_PSR_41_07_200802(1).pdf
Skipping duplicate file: PSCA_PSR_41_08_200803(1).pdf
Skipping duplicate file: PSCA_PSR_41_09_200804(1).pdf
Skipping duplicate file: PSCA_PSR_41_10_200805(1).pdf
Skipping duplicate file: PSCA_PSR_41_11_200806(1).pdf


 86%|████████▌ | 509/593 [00:18<00:01, 47.97it/s]

Skipping duplicate file: PSCA_PSR_41_12_200807(1).pdf
Skipping duplicate file: PSCA_PSR_42_01_200808(1).pdf
Skipping duplicate file: PSCA_PSR_42_02_200809(1).pdf
Skipping duplicate file: PSCA_PSR_42_03_200810(1).pdf
Skipping duplicate file: PSCA_PSR_42_04_200811(1).pdf
Skipping duplicate file: PSCA_PSR_42_05_200812(1).pdf


 96%|█████████▌| 569/593 [00:19<00:00, 49.04it/s]

Skipping duplicate file: PSCA_PSR_47_06_201401(1).pdf
Skipping PSCA_PSR_47_06_201401(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_07_201402(1).pdf
Skipping PSCA_PSR_47_07_201402(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_08_201403(1).pdf
Skipping PSCA_PSR_47_08_201403(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_09_201404(1).pdf
Skipping PSCA_PSR_47_09_201404(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_10_201405(1).pdf
Skipping PSCA_PSR_47_10_201405(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_11_201406(1).pdf
Skipping PSCA_PSR_47_11_201406(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_47_12_201407(1).pdf
Skipping PSCA_PSR_47_12_201407(2).pdf: No index entry for date None


100%|██████████| 593/593 [00:20<00:00, 65.64it/s]

Skipping duplicate file: PSCA_PSR_48_01_201408(1).pdf
Skipping PSCA_PSR_48_01_201408(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_48_02_201409(1).pdf
Skipping PSCA_PSR_48_02_201409(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_48_03_201410(1).pdf
Skipping PSCA_PSR_48_03_201410(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_48_04_201411(1).pdf
Skipping PSCA_PSR_48_04_201411(2).pdf: No index entry for date None
Skipping duplicate file: PSCA_PSR_48_05_201412(1).pdf
Skipping PSCA_PSR_48_05_201412(2).pdf: No index entry for date None


100%|██████████| 593/593 [00:20<00:00, 29.37it/s]


✅ Saved 1232 chunks to D:\Technical_projects\PSAI\chunks\all_chunks.json


In [49]:
import json
from pathlib import Path

def summarize_titles(all_chunks_path, output_path=None):
    """
    Extracts just the date and title for each chunk from all_chunks.json.
    Optionally saves to a .jsonl file or prints sample output.
    """
    all_chunks_path = Path(all_chunks_path)
    
    with open(all_chunks_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    summary = [
        {
            "date": chunk["metadata"]["date"],
            "title": chunk["metadata"]["title"],
            "source_file": chunk["metadata"]["source_file"]
        }
        for chunk in chunks
    ]

    if output_path:
        with open(output_path, "w", encoding="utf-8") as out:
            json.dump(summary, out, indent=2, ensure_ascii=False)
        print(f"✅ Summary saved to {output_path}")
    else:
        # Just print a sample
        for entry in summary[:10]:
            print(entry)

    return summary


In [50]:
summary = summarize_titles(
    "D:/Technical_projects/PSAI/chunks/all_chunks.json",
    output_path="D:/Technical_projects/PSAI/chunks/title_summary.json"
)


✅ Summary saved to D:/Technical_projects/PSAI/chunks/title_summary.json


In [53]:
def replace_title_with_all_titles(chunks_path, index_path, output_path):
    # Load chunks
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    # Load and normalize the index
    with open(index_path, "r", encoding="utf-8") as f:
        raw_index = json.load(f)["report_index"]

    index_lookup = {}
    for entry in raw_index:
        m, d, y = entry["date"].split("/")
        key = f"{y.zfill(4)}-{m.zfill(2)}"
        index_lookup[key] = entry["articles"]

    # Update each chunk
    updated = 0
    for chunk in chunks:
        filename = chunk["metadata"]["source_file"]
        match = re.search(r"(\d{6})\.pdf", filename)
        if match:
            yyyymm = match.group(1)
            key = f"{yyyymm[:4]}-{yyyymm[4:]}"
            if key in index_lookup:
                chunk["metadata"]["title"] = index_lookup[key]
                updated += 1

    # Save the updated version
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=2, ensure_ascii=False)

    print(f"✅ Updated {updated} chunks with full article title lists.")
    print(f"📄 Output saved to: {output_path}")


In [54]:
replace_title_with_all_titles(
    chunks_path="D:/Technical_projects/PSAI/chunks/all_chunks.json",
    index_path="D:/Technical_projects/PSAI/code/psrindex.json",
    output_path="D:/Technical_projects/PSAI/chunks/all_chunks_titlelists.json"
)

✅ Updated 1232 chunks with full article title lists.
📄 Output saved to: D:/Technical_projects/PSAI/chunks/all_chunks_titlelists.json


In [55]:
def split_large_chunks(chunks, max_len=1000):
    new_chunks = []

    for chunk in chunks:
        text = chunk["text"]
        metadata = chunk["metadata"]

        if len(text) <= max_len:
            new_chunks.append(chunk)
        else:
            # Split by paragraph breaks first
            parts = text.split("\n\n")
            buffer = ""

            for part in parts:
                if len(buffer) + len(part) < max_len:
                    buffer += part.strip() + "\n\n"
                else:
                    if buffer.strip():
                        new_chunks.append({
                            "text": buffer.strip(),
                            "metadata": metadata.copy()
                        })
                    buffer = part.strip() + "\n\n"

            if buffer.strip():
                new_chunks.append({
                    "text": buffer.strip(),
                    "metadata": metadata.copy()
                })

    print(f"✅ Split into {len(new_chunks)} chunks (was {len(chunks)})")
    return new_chunks


In [56]:
# Load the previously updated chunk file
with open("D:/Technical_projects/PSAI/chunks/all_chunks_titlelists.json", "r", encoding="utf-8") as f:
    original_chunks = json.load(f)

# Split them
smaller_chunks = split_large_chunks(original_chunks, max_len=1000)

# Save to new file
with open("D:/Technical_projects/PSAI/chunks/all_chunks_final.json", "w", encoding="utf-8") as f:
    json.dump(smaller_chunks, f, indent=2, ensure_ascii=False)

print("📄 Final chunk file written.")


✅ Split into 4621 chunks (was 1232)
📄 Final chunk file written.
