# Full-text extraction: OCR a PDF and dump the result to a .txt file

In [1]:

from pathlib import Path
from page_text_extractor import extract_pdf_to_txt, ExtractConfig

# Source PDF that will be OCR'd.
PDF_PATH = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.pdf")
# The text dump goes next to the PDF: same stem, ".txt" suffix.
OUT_PATH = PDF_PATH.with_suffix(".txt")

# Extraction settings, passed as a single config object to the extractor.
CFG = ExtractConfig(
    pdf_path=PDF_PATH,
    out_path=OUT_PATH,
    # Render resolution for OCR.
    dpi=300,
    # Tesseract language pack(s).
    lang="eng",
    # Replace an existing .txt if one is already present.
    overwrite=True,
    show_progress=True,
)
print(CFG)

# %% [markdown]
# ## 3 · Run extraction

# %%
# extract_pdf_to_txt returns the location of the text file it wrote.
output_txt = extract_pdf_to_txt(CFG)
print(f"✅ Saved OCR text → {output_txt}")

# %% [markdown]
# ## 4 · Preview first ~1,000 characters

# %%
preview_len = 1000
# Read only the leading characters instead of loading the whole dump.
with open(output_txt, encoding="utf-8") as txt_file:
    snippet = txt_file.read(preview_len)
print(snippet)



ExtractConfig(pdf_path=PosixPath('/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.pdf'), out_path=PosixPath('/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.txt'), dpi=300, lang='eng', overwrite=True, show_progress=True)
INFO | Opening /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.pdf


OCR pages: 100%|██████████| 21/21 [01:06<00:00,  3.16s/page]

INFO | Wrote 21 pages → /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.txt (43.7 KB)
✅ Saved OCR text → /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/02.txt
Phyllis Schlafly Commentaries on Christian Radio

Week of February 1, 2002:

02-24 Court Puts a Lid on Nosy Questionnaires Nosy Questions
Week of February 3:
02-25 Reform Our Visa System Now Terrorism
02-26 Key Supreme Cases in 2001 EF Website
02-27 The English Language Is the Tie that Binds English Language
02-28 What Is Fuzzy Math? Education Reporter I
02-29 Constitutional Convention R.I.P. EF Brochure
Week of February 10:
02-30 More Federal Control of Education Outcome Based Education
02-31 Some Congressmen Tried to Improve the Education Law Education Video
02-32 We Need a Time-Out on Immigration Immigration
02-33 What Free Trade with China Really Means Free Trade
02-34 Pro-Gay Curricula Is Coming to a School Near You Education Reporter II
Week of Fe




In [14]:
import pandas as pd

# Read the commentary index workbook; the first spreadsheet row is the header.
excel_path = '/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/commindex.xlsx'
df = pd.read_excel(excel_path, header=0)

# Show every value of the first column (dates) next to its Python type,
# which makes mixed datetime/string cells easy to spot.
first_column = df.iloc[:, 0]
for row_number, cell in enumerate(first_column):
    print(f"Row {row_number}: {cell} ({type(cell)})")
    # Safety cap so a very long sheet does not flood the output.
    if row_number >= 10001:
        break


Row 0: 1983-10-03 00:00:00 (<class 'datetime.datetime'>)
Row 1: 1983-10-04 00:00:00 (<class 'datetime.datetime'>)
Row 2: 1983-10-05 00:00:00 (<class 'datetime.datetime'>)
Row 3: 1983-10-06 00:00:00 (<class 'datetime.datetime'>)
Row 4: 1983-10-07 00:00:00 (<class 'datetime.datetime'>)
Row 5: 1983-10-10 00:00:00 (<class 'datetime.datetime'>)
Row 6: 1983-10-11 00:00:00 (<class 'datetime.datetime'>)
Row 7: 1983-10-12 00:00:00 (<class 'datetime.datetime'>)
Row 8: 1983-10-13 00:00:00 (<class 'datetime.datetime'>)
Row 9: 1983-10-14 00:00:00 (<class 'datetime.datetime'>)
Row 10: 1983-10-17 00:00:00 (<class 'datetime.datetime'>)
Row 11: 1983-10-18 00:00:00 (<class 'datetime.datetime'>)
Row 12: 1983-10-19 00:00:00 (<class 'datetime.datetime'>)
Row 13: 1983-10-20 00:00:00 (<class 'datetime.datetime'>)
Row 14: 1983-10-21 00:00:00 (<class 'datetime.datetime'>)
Row 15: 1983-10-24 00:00:00 (<class 'datetime.datetime'>)
Row 16: 1983-10-25 00:00:00 (<class 'datetime.datetime'>)
Row 17: 1983-10-26 00:00

# Create the per-year .json skeleton (metadata only, empty text)

In [24]:
import pandas as pd
import json

# === CONFIGURATION ===
excel_path = '/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/commindex.xlsx'
target_year = 2013
output_json_path = f'/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{target_year}.json'

# No sheet_name is given, so pandas reads the workbook's first sheet.
df = pd.read_excel(excel_path)

# One skeleton record per index row dated in target_year; "text" stays empty
# here and is filled in later by the OCR step.
entries = []

for idx, row in df.iterrows():
    raw_date = row.iloc[0]   # Column A (Date)
    raw_title = row.iloc[2]  # Column C (Title)

    # A blank title cell arrives as NaN; str(NaN) would store the literal
    # text "nan" in the JSON, so map missing titles to an empty string.
    title = "" if pd.isna(raw_title) else str(raw_title).strip()

    try:
        # Let pandas auto-parse the date; unparsable values become NaT.
        parsed_date = pd.to_datetime(raw_date, errors='coerce')

        if pd.isna(parsed_date):
            continue  # skip unparsable

        if parsed_date.year == target_year:
            # MM-DD-YYYY: the OCR step selects a month by the leading "MM".
            formatted_date = parsed_date.strftime('%m-%d-%Y')
            entry = {
                "metadata": {
                    "title": title,
                    "speaker": "Phyllis Schlafly",
                    "date": formatted_date
                },
                "text": ""
            }
            entries.append(entry)

    except Exception as e:
        # Report and move on so one bad row does not abort the whole export.
        print(f"Row {idx} skipped due to error: {e}")

# Write to JSON. The encoding is explicit because the OCR step reopens this
# file as UTF-8; relying on the platform default could disagree with that.
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(entries, f, indent=4)

print(f"✅ Saved {len(entries)} entries to {output_json_path}")


✅ Saved 261 entries to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/2013.json


# OCR a whole month's PDF and insert the text into the year's .json

In [6]:
import json
from pathlib import Path
import fitz  # PyMuPDF
from PIL import Image
import pytesseract

def render_page(page, dpi=300):
    """Rasterize one PDF page into a PIL image.

    Args:
        page: Page object exposing ``get_pixmap`` (as returned by
            ``doc.load_page`` elsewhere in this notebook).
        dpi: Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        A PIL ``Image`` built from the rendered pixmap's raw samples.
    """
    # Same scale factor on both axes, relative to a 72-dpi baseline.
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)

    # Pick the PIL mode from the number of components per pixel.
    if pixmap.n < 4:
        pil_mode = "RGB"
    else:
        pil_mode = "RGBA"

    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def ocr_page(page, dpi=300, lang="eng"):
    """Render a PDF page and return its OCR text with surrounding whitespace removed.

    Args:
        page: Page object accepted by ``render_page``.
        dpi: Resolution passed through to ``render_page``.
        lang: Language code(s) handed to ``pytesseract.image_to_string``.

    Returns:
        The recognized text, stripped of leading and trailing whitespace.
    """
    page_image = render_page(page, dpi)
    raw_text = pytesseract.image_to_string(page_image, lang=lang)
    return raw_text.strip()

def update_json_with_ocr(pdf_path: Path, json_path: Path, month: int, dpi=300, lang="eng"):
    """OCR one month's PDF and store each page's text in the year's JSON file.

    Page 0 of the PDF is treated as an index page and skipped. Every remaining
    page is matched, in order, to the JSON entries whose ``metadata.date``
    starts with the zero-padded month, and its OCR text is written into that
    entry's ``"text"`` field. The JSON file is then rewritten in full.

    Args:
        pdf_path: PDF containing one index page followed by one page per entry.
        json_path: JSON file holding a list of entries with
            ``metadata.date`` and ``text`` keys.
        month: Month number; formatted as two digits for the date prefix match.
        dpi: Render resolution passed to ``ocr_page``.
        lang: OCR language passed to ``ocr_page``.

    Raises:
        ValueError: If the number of entries for the month differs from the
            number of non-index PDF pages. Nothing is written in that case.
    """
    # Step 1: Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Step 2: Filter entries for this month. The selected dicts are the same
    # objects held in `data`, so editing them updates `data` as well.
    month_str = f"{month:02d}"
    monthly_entries = [entry for entry in data if entry["metadata"]["date"].startswith(month_str)]

    # Step 3: Open PDF and extract text page-by-page (skip index).
    # The context manager closes the document even when the mismatch check
    # raises or OCR fails part-way through.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        expected_count = total_pages - 1  # page 0 is index

        if len(monthly_entries) != expected_count:
            raise ValueError(f"Mismatch: {len(monthly_entries)} JSON entries but {expected_count} PDF pages (excluding index).")

        for i in range(1, total_pages):  # skip first page (index)
            page = doc.load_page(i)
            ocr_text = ocr_page(page, dpi=dpi, lang=lang)
            monthly_entries[i - 1]["text"] = ocr_text  # match order

    # Step 4: Write back full updated JSON (only reached if every page succeeded)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"✅ Updated {len(monthly_entries)} entries for month {month_str} in {json_path.name}")


# === USAGE ===

# Inputs for one run: the year's JSON, the month's PDF, and the month number.
json_path = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/2002.json")
pdf_path = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/10.pdf")
target_month = 10

# Fill the matching month's entries in the JSON with OCR text from the PDF.
update_json_with_ocr(
    pdf_path=pdf_path,
    json_path=json_path,
    month=target_month,
)

✅ Updated 23 entries for month 10 in 2002.json


# Variant that can skip extra (non-commentary) pages

In [7]:
def update_json_with_ocr(
    pdf_path: Path,
    json_path: Path,
    month: int,
    dpi=300,
    lang="eng",
    skip_pages: list[int] = None
):
    """OCR one month's PDF into the year's JSON, optionally skipping pages.

    Page 0 of the PDF is treated as an index page and is always skipped, as
    are any page indices listed in ``skip_pages``. The remaining pages are
    matched, in order, to the JSON entries whose ``metadata.date`` starts with
    the zero-padded month; each page's OCR text is stored in that entry's
    ``"text"`` field and the JSON file is rewritten in full.

    Args:
        pdf_path: PDF containing an index page followed by commentary pages.
        json_path: JSON file holding a list of entries with
            ``metadata.date`` and ``text`` keys.
        month: Month number; formatted as two digits for the date prefix match.
        dpi: Render resolution passed to ``ocr_page``.
        lang: OCR language passed to ``ocr_page``.
        skip_pages: Page indices (same numbering as ``doc.load_page``, so the
            index page is 0) to leave out. ``None`` means skip nothing extra.

    Raises:
        ValueError: If the number of entries for the month differs from the
            number of usable PDF pages. Nothing is written in that case.
    """
    if skip_pages is None:
        skip_pages = []
    # Set copy for fast membership tests; the original list is kept for the
    # status message so its printed form is unchanged.
    skipped = set(skip_pages)

    # Step 1: Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Step 2: Filter entries for this month. The selected dicts are the same
    # objects held in `data`, so editing them updates `data` as well.
    month_str = f"{month:02d}"
    monthly_entries = [entry for entry in data if entry["metadata"]["date"].startswith(month_str)]

    # Step 3: Open PDF and extract text, skipping index + any skipped pages.
    # The context manager closes the document even when the mismatch check
    # raises or OCR fails part-way through.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count

        # Build the list of usable pages (page 0 is the index)
        usable_pages = [i for i in range(1, total_pages) if i not in skipped]

        if len(monthly_entries) != len(usable_pages):
            raise ValueError(
                f"Mismatch: {len(monthly_entries)} JSON entries but {len(usable_pages)} usable PDF pages "
                f"(after skipping index and custom skip_pages)."
            )

        for json_index, pdf_page_index in enumerate(usable_pages):
            page = doc.load_page(pdf_page_index)
            ocr_text = ocr_page(page, dpi=dpi, lang=lang)
            monthly_entries[json_index]["text"] = ocr_text

    # Step 4: Write back full updated JSON (only reached if every page succeeded)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"✅ Updated {len(monthly_entries)} entries for month {month_str} in {json_path.name} (skipped pages: {skip_pages})")


In [10]:
# December 2002: page index 18 is excluded in addition to the index page (0).
update_json_with_ocr(
    Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/2002/12.pdf"),
    Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/2002.json"),
    12,
    skip_pages=[18],
)


✅ Updated 21 entries for month 12 in 2002.json (skipped pages: [18])


In [13]:
# 2003: OCR each monthly PDF into 2003.json.
year = "2003"
comm_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/{year}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{year}.json")

# (month, extra page indices to skip), processed in this order.
# Month 6 is not listed; presumably 06.pdf is missing or handled
# separately — confirm before adding it.
month_jobs = [
    # (1, []),  # disabled — presumably already processed in an earlier run; confirm before re-enabling
    # (2, []),  # disabled — presumably already processed in an earlier run; confirm before re-enabling
    (3, []),
    (4, []),
    (5, []),
    (7, []),
    (8, []),
    (9, []),
    (10, []),
    (11, []),
    (12, []),
]

# A failure in one month stops the loop, so later months are not touched.
for month, skip_pages in month_jobs:
    update_json_with_ocr(
        pdf_path=comm_dir / f"{month:02d}.pdf",
        json_path=year_json,
        month=month,
        skip_pages=skip_pages,
    )

✅ Updated 14 entries for month 03 in 2003.json (skipped pages: [])
✅ Updated 22 entries for month 04 in 2003.json (skipped pages: [])
✅ Updated 22 entries for month 05 in 2003.json (skipped pages: [])
✅ Updated 23 entries for month 07 in 2003.json (skipped pages: [])
✅ Updated 21 entries for month 08 in 2003.json (skipped pages: [])
✅ Updated 22 entries for month 09 in 2003.json (skipped pages: [])
✅ Updated 23 entries for month 10 in 2003.json (skipped pages: [])
✅ Updated 19 entries for month 11 in 2003.json (skipped pages: [])
✅ Updated 23 entries for month 12 in 2003.json (skipped pages: [])


In [15]:
# 2004: OCR each monthly PDF into 2004.json.
year = "2004"
comm_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/{year}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{year}.json")

# (month, extra page indices to skip), processed in this order.
month_jobs = [
    (1, []),
    (2, []),
    (3, []),
    (4, []),
    (5, [21]),
    (6, [23]),
    (7, [13]),
    (8, [23]),
    (9, [23]),
    (10, [21]),
    (11, [23]),
    (12, [24]),
]

# A failure in one month stops the loop, so later months are not touched.
for month, skip_pages in month_jobs:
    update_json_with_ocr(
        pdf_path=comm_dir / f"{month:02d}.pdf",
        json_path=year_json,
        month=month,
        skip_pages=skip_pages,
    )

✅ Updated 22 entries for month 01 in 2004.json (skipped pages: [])
✅ Updated 20 entries for month 02 in 2004.json (skipped pages: [])
✅ Updated 23 entries for month 03 in 2004.json (skipped pages: [])
✅ Updated 21 entries for month 04 in 2004.json (skipped pages: [])
✅ Updated 20 entries for month 05 in 2004.json (skipped pages: [21])
✅ Updated 22 entries for month 06 in 2004.json (skipped pages: [23])
✅ Updated 12 entries for month 07 in 2004.json (skipped pages: [13])
✅ Updated 22 entries for month 08 in 2004.json (skipped pages: [23])
✅ Updated 22 entries for month 09 in 2004.json (skipped pages: [23])
✅ Updated 20 entries for month 10 in 2004.json (skipped pages: [21])
✅ Updated 22 entries for month 11 in 2004.json (skipped pages: [23])
✅ Updated 23 entries for month 12 in 2004.json (skipped pages: [24])


In [17]:
# 2005: OCR each monthly PDF into 2005.json.
year = "2005"
comm_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/{year}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{year}.json")

# (month, extra page indices to skip), processed in this order.
month_jobs = [
    (1, [22]),
    (2, [21]),
    (3, [24]),
    (4, [22]),
    (5, [23]),
    (6, [23]),
    (7, []),
    (8, [24]),
    (9, [23]),
    (10, [22]),
    (11, [23]),
    (12, [10, 11, 25]),
]

# A failure in one month stops the loop, so later months are not touched.
for month, skip_pages in month_jobs:
    update_json_with_ocr(
        pdf_path=comm_dir / f"{month:02d}.pdf",
        json_path=year_json,
        month=month,
        skip_pages=skip_pages,
    )

✅ Updated 21 entries for month 01 in 2005.json (skipped pages: [22])
✅ Updated 20 entries for month 02 in 2005.json (skipped pages: [21])
✅ Updated 23 entries for month 03 in 2005.json (skipped pages: [24])
✅ Updated 21 entries for month 04 in 2005.json (skipped pages: [22])
✅ Updated 22 entries for month 05 in 2005.json (skipped pages: [23])
✅ Updated 22 entries for month 06 in 2005.json (skipped pages: [23])
✅ Updated 21 entries for month 07 in 2005.json (skipped pages: [])
✅ Updated 23 entries for month 08 in 2005.json (skipped pages: [24])
✅ Updated 22 entries for month 09 in 2005.json (skipped pages: [23])
✅ Updated 21 entries for month 10 in 2005.json (skipped pages: [22])
✅ Updated 22 entries for month 11 in 2005.json (skipped pages: [23])
✅ Updated 22 entries for month 12 in 2005.json (skipped pages: [10, 11, 25])


In [19]:
# 2006: OCR each monthly PDF into 2006.json.
year = "2006"
comm_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/{year}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{year}.json")

# (month, extra page indices to skip), processed in this order.
month_jobs = [
    # (1, [23]),  # disabled — presumably already processed in an earlier run; confirm before re-enabling
    # (2, [21]),  # disabled — presumably already processed in an earlier run; confirm before re-enabling
    (3, []),
    (4, [21]),
    (5, [24]),
    (6, [23]),
    (7, [22]),
    (8, [24]),
    (9, [22]),
    (10, [23]),
    (11, [23]),
    (12, [22]),
]

# A failure in one month stops the loop, so later months are not touched.
for month, skip_pages in month_jobs:
    update_json_with_ocr(
        pdf_path=comm_dir / f"{month:02d}.pdf",
        json_path=year_json,
        month=month,
        skip_pages=skip_pages,
    )

✅ Updated 14 entries for month 03 in 2006.json (skipped pages: [])
✅ Updated 20 entries for month 04 in 2006.json (skipped pages: [21])
✅ Updated 23 entries for month 05 in 2006.json (skipped pages: [24])
✅ Updated 22 entries for month 06 in 2006.json (skipped pages: [23])
✅ Updated 21 entries for month 07 in 2006.json (skipped pages: [22])
✅ Updated 23 entries for month 08 in 2006.json (skipped pages: [24])
✅ Updated 21 entries for month 09 in 2006.json (skipped pages: [22])
✅ Updated 22 entries for month 10 in 2006.json (skipped pages: [23])
✅ Updated 22 entries for month 11 in 2006.json (skipped pages: [23])
✅ Updated 21 entries for month 12 in 2006.json (skipped pages: [22])


In [20]:
# 2007: OCR each monthly PDF into 2007.json.
year = "2007"
comm_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/{year}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/{year}.json")

# (month, extra page indices to skip), processed in this order.
month_jobs = [
    (1, [24]),
    (2, [21]),
    (3, [23]),
    (4, [22]),
    (5, [24]),
    (6, [22]),
    (7, [23]),
    (8, [24]),
    (9, [21]),
    (10, [24]),
    (11, [23]),
    (12, [22]),
]

# A failure in one month stops the loop, so later months are not touched.
for month, skip_pages in month_jobs:
    update_json_with_ocr(
        pdf_path=comm_dir / f"{month:02d}.pdf",
        json_path=year_json,
        month=month,
        skip_pages=skip_pages,
    )

✅ Updated 23 entries for month 01 in 2007.json (skipped pages: [24])
✅ Updated 20 entries for month 02 in 2007.json (skipped pages: [21])
✅ Updated 22 entries for month 03 in 2007.json (skipped pages: [23])
✅ Updated 21 entries for month 04 in 2007.json (skipped pages: [22])
✅ Updated 23 entries for month 05 in 2007.json (skipped pages: [24])
✅ Updated 21 entries for month 06 in 2007.json (skipped pages: [22])
✅ Updated 22 entries for month 07 in 2007.json (skipped pages: [23])
✅ Updated 23 entries for month 08 in 2007.json (skipped pages: [24])
✅ Updated 20 entries for month 09 in 2007.json (skipped pages: [21])
✅ Updated 23 entries for month 10 in 2007.json (skipped pages: [24])
✅ Updated 22 entries for month 11 in 2007.json (skipped pages: [23])
✅ Updated 21 entries for month 12 in 2007.json (skipped pages: [22])


In [25]:
# 2008 (DONE): push the OCR text of every monthly PDF into 2008.json.
yy = "08"
raw_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/20{yy}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/20{yy}.json")

# month number -> skip_pages value handed to update_json_with_ocr for that month's PDF
pages_to_skip = {
    1: [24],
    2: [22],
    3: [22],
    4: [23],
    5: [23],
    6: [22],
    7: [24],
    8: [22],
    9: [23],
    10: [24],
    11: [21],
    12: [24],
}

# Dict order is insertion order, so months are processed January -> December.
for month_no, skipped in pages_to_skip.items():
    update_json_with_ocr(
        pdf_path=raw_dir / f"{month_no:02d}.pdf",
        json_path=year_json,
        month=month_no,
        skip_pages=skipped,
    )

✅ Updated 23 entries for month 01 in 2008.json (skipped pages: [24])
✅ Updated 21 entries for month 02 in 2008.json (skipped pages: [22])
✅ Updated 21 entries for month 03 in 2008.json (skipped pages: [22])
✅ Updated 22 entries for month 04 in 2008.json (skipped pages: [23])
✅ Updated 22 entries for month 05 in 2008.json (skipped pages: [23])
✅ Updated 21 entries for month 06 in 2008.json (skipped pages: [22])
✅ Updated 23 entries for month 07 in 2008.json (skipped pages: [24])
✅ Updated 21 entries for month 08 in 2008.json (skipped pages: [22])
✅ Updated 22 entries for month 09 in 2008.json (skipped pages: [23])
✅ Updated 23 entries for month 10 in 2008.json (skipped pages: [24])
✅ Updated 20 entries for month 11 in 2008.json (skipped pages: [21])
✅ Updated 23 entries for month 12 in 2008.json (skipped pages: [24])


In [None]:
# 2009: push the OCR text of every monthly PDF into 2009.json.
yy = "09"
raw_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/20{yy}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/20{yy}.json")

# month number -> skip_pages value handed to update_json_with_ocr for that month's PDF
pages_to_skip = {
    1: [23],
    2: [21],
    3: [23],
    4: [23],
    5: [22],
    6: [23],
    7: [24],
    8: [22],
    9: [23],
    10: [23],
    11: [22],
    12: [24],
}

# Dict order is insertion order, so months are processed January -> December.
for month_no, skipped in pages_to_skip.items():
    update_json_with_ocr(
        pdf_path=raw_dir / f"{month_no:02d}.pdf",
        json_path=year_json,
        month=month_no,
        skip_pages=skipped,
    )

In [None]:
# 2010: push the OCR text of every monthly PDF into 2010.json.
yy = "10"
raw_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/20{yy}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/20{yy}.json")

# month number -> skip_pages value handed to update_json_with_ocr for that month's PDF
pages_to_skip = {
    1: [22],
    2: [21],
    3: [24],
    4: [23],
    5: [22],
    6: [23],
    7: [23],
    8: [23],
    9: [23],
    10: [22],
    11: [23],
    12: [24],
}

# Dict order is insertion order, so months are processed January -> December.
for month_no, skipped in pages_to_skip.items():
    update_json_with_ocr(
        pdf_path=raw_dir / f"{month_no:02d}.pdf",
        json_path=year_json,
        month=month_no,
        skip_pages=skipped,
    )

In [None]:
# 2011: push the OCR text of every monthly PDF into 2011.json.
yy = "11"
raw_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/20{yy}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/20{yy}.json")

# month number -> skip_pages value handed to update_json_with_ocr for that month's PDF
pages_to_skip = {
    1: [22],
    2: [21],
    3: [24],
    4: [22],
    5: [23],
    6: [23],
    7: [22],
    8: [24],
    9: [23],
    10: [22],
    11: [24],
    12: [23],
}

# Dict order is insertion order, so months are processed January -> December.
for month_no, skipped in pages_to_skip.items():
    update_json_with_ocr(
        pdf_path=raw_dir / f"{month_no:02d}.pdf",
        json_path=year_json,
        month=month_no,
        skip_pages=skipped,
    )

In [None]:
# 2012: push the OCR text of every monthly PDF into 2012.json.
yy = "12"
raw_dir = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/code/data/raw/comm/20{yy}")
year_json = Path(f"/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/20{yy}.json")

# month number -> skip_pages value handed to update_json_with_ocr for that month's PDF.
# Several 2012 months skip more than one page.
pages_to_skip = {
    1: [1, 18, 20, 26],
    2: [1, 23],
    3: [1, 24],
    4: [1, 23],
    5: [24],
    6: [22],
    7: [23],
    8: [20, 21, 22, 27],
    9: [4, 22],
    10: [24],
    11: [20, 24],
    12: [22],
}

# Dict order is insertion order, so months are processed January -> December.
for month_no, skipped in pages_to_skip.items():
    update_json_with_ocr(
        pdf_path=raw_dir / f"{month_no:02d}.pdf",
        json_path=year_json,
        month=month_no,
        skip_pages=skipped,
    )

# For .docx text extraction

In [None]:
from pathlib import Path
import json
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
from docx import Document


def render_page(page, dpi=300):
    """Rasterise one PDF page into a PIL image.

    Args:
        page: Page object exposing ``get_pixmap`` (callers in this file pass
            the result of ``doc.load_page``).
        dpi: Target resolution; converted to a zoom factor relative to 72.

    Returns:
        A ``PIL.Image`` built from the rendered pixmap's raw samples.
    """
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)

    # Choose the PIL mode from the number of components per pixel.
    if pixmap.n < 4:
        pil_mode = "RGB"
    else:
        pil_mode = "RGBA"

    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def ocr_page(page, dpi=300, lang="eng"):
    """Render a page to an image and return its OCR text.

    Args:
        page: Page object accepted by ``render_page``.
        dpi: Render resolution forwarded to ``render_page``.
        lang: Language code(s) forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    rendered = render_page(page, dpi)
    recognised = pytesseract.image_to_string(rendered, lang=lang)
    return recognised.strip()

def extract_texts_from_pdf(pdf_path: Path, skip_pages: list[int], dpi=300, lang="eng") -> list[str]:
    """OCR the pages of a PDF and return one text string per kept page.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        skip_pages: Page indices to leave out. They are compared against the
            same index that is passed to ``doc.load_page``.
        dpi: Render resolution forwarded to ``ocr_page``.
        lang: Tesseract language code(s) forwarded to ``ocr_page``.

    Returns:
        OCR text for every page index from 1 up to ``page_count - 1`` that is
        not in ``skip_pages``, in page order. Page index 0 is never processed
        because iteration starts at 1.
    """
    # A set gives cheap membership tests; an empty/None value means "skip nothing".
    excluded = set(skip_pages or ())

    # The context manager closes the document even if OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        return [
            ocr_page(doc.load_page(i), dpi=dpi, lang=lang)
            for i in range(1, doc.page_count)
            if i not in excluded
        ]

def extract_texts_from_docx(docx_path: Path, skip_pages: list[int]) -> list[str]:
    """Return the non-empty paragraphs of a .docx file, minus skipped ones.

    Each non-empty paragraph is treated as one "page"; no page-break
    detection is done.

    Args:
        docx_path: Path of the .docx file to read.
        skip_pages: Positions to drop, counted from 0 over the non-empty
            paragraphs only (blank paragraphs do not consume an index).

    Returns:
        The stripped text of every kept paragraph, in document order.
    """
    kept = []
    position = 0  # index among non-empty paragraphs
    for paragraph in Document(docx_path).paragraphs:
        stripped = paragraph.text.strip()
        if not stripped:
            continue
        if position not in skip_pages:
            kept.append(stripped)
        position += 1
    return kept

def update_json_with_texts(
    file_path: Path,
    json_path: Path,
    month: int,
    dpi=300,
    lang="eng",
    skip_pages: list[int] = None
):
    """Fill the ``"text"`` field of one month's JSON entries from a PDF or .docx.

    The JSON file is read, every entry whose ``metadata.date`` starts with the
    zero-padded month is selected, and the extracted text blocks are assigned
    to those entries in order. The whole JSON file is then rewritten.

    Args:
        file_path: Source document; ``.pdf`` is OCR'd, ``.docx`` is read by
            paragraph. The suffix check is case-insensitive.
        json_path: JSON file holding a list of entries; updated in place.
        month: Month number used to select entries (formatted as two digits).
        dpi: OCR render resolution (PDF sources only).
        lang: OCR language code(s) (PDF sources only).
        skip_pages: Page/paragraph indices to leave out; ``None`` means none.

    Raises:
        ValueError: If the file suffix is unsupported, or if the number of
            selected entries differs from the number of extracted text blocks.
    """
    pages_to_skip = [] if skip_pages is None else skip_pages

    # Read the current JSON contents.
    with open(json_path, "r", encoding="utf-8") as src:
        records = json.load(src)

    # Pick out the entries belonging to the requested month.
    prefix = f"{month:02d}"
    targets = []
    for record in records:
        if record["metadata"]["date"].startswith(prefix):
            targets.append(record)

    # Choose the extractor from the file extension.
    kind = file_path.suffix.lower()
    if kind == ".pdf":
        texts = extract_texts_from_pdf(file_path, skip_pages=pages_to_skip, dpi=dpi, lang=lang)
    elif kind == ".docx":
        texts = extract_texts_from_docx(file_path, skip_pages=pages_to_skip)
    else:
        raise ValueError(f"Unsupported file type: {file_path.suffix}")

    # Entries and text blocks are paired by position, so counts must agree.
    n_entries, n_texts = len(targets), len(texts)
    if n_entries != n_texts:
        raise ValueError(
            f"Mismatch: {n_entries} JSON entries vs. {n_texts} text blocks from {file_path.name}"
        )

    for record, text in zip(targets, texts):
        record["text"] = text

    # The selected entries are the same objects held in `records`, so dumping
    # `records` persists the new text.
    with open(json_path, "w", encoding="utf-8") as dst:
        json.dump(records, dst, indent=4)

    print(f"✅ Updated {n_entries} entries from {file_path.name} → {json_path.name}")


In [None]:
# Fill the March 2002 entries from a .docx source.
docx_source = Path("/Users/mason/Desktop/comm/2002/03.docx")
target_json = Path("/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/commentaries/2002.json")

update_json_with_texts(
    file_path=docx_source,
    json_path=target_json,
    month=3,
    skip_pages=[7],  # index 7, i.e. the 8th paragraph/page-equivalent, is left out
)
