# In [None]:
import PyPDF2
import pytesseract
import json
import re
import os
from pdf2image import convert_from_path
from PIL import Image
from datetime import date, timedelta

# --- Configuration ---

# IMPORTANT: Update this path if Tesseract is not in your system's PATH
# Example for Windows: pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Example for Linux/macOS (if installed in a non-standard location): pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
# If Tesseract is already in your PATH, leave the line below commented out.
# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'

# --- IMPORTANT: SET THESE PATHS ---
# Input PDF. The main block checks that this file exists and then hands it to
# extract_text_from_pdf_ocr() for page-by-page OCR.
PDF_FILE_PATH = '01.pdf'  # The path to your input PDF file
# Output file. The main block writes the list of parsed commentaries here as
# UTF-8 encoded, indented JSON.
OUTPUT_JSON_PATH = 'schlafly_commentaries_jan_2002.json' # The path for the output JSON file
# Path to Poppler binaries (required by pdf2image) - uncomment and set if not in PATH
# POPPLER_PATH = r"C:\path\to\poppler-xx.xx.x\bin" # Example for Windows
# Forwarded unchanged as the poppler_path argument of convert_from_path().
# NOTE(review): None presumably lets pdf2image locate Poppler via PATH - confirm
# against the pdf2image documentation.
POPPLER_PATH = None # Set this if needed, otherwise keep as None

# --- Helper Functions ---

def is_weekday(d):
    """Return True when date `d` falls on Monday through Friday."""
    # date.weekday() numbers the days Monday=0 ... Sunday=6, so Saturday is 5.
    saturday = 5
    weekday_index = d.weekday()
    return weekday_index < saturday

def get_next_weekday(d):
    """Return the first Monday-to-Friday date strictly after `d`."""
    one_day = timedelta(days=1)
    candidate = d + one_day
    # Step forward one day at a time until is_weekday() accepts the date,
    # i.e. skip over any Saturday/Sunday.
    while True:
        if is_weekday(candidate):
            return candidate
        candidate += one_day

def calculate_date_from_number(commentary_num_str):
    """
    Map a commentary number such as '02-01' to an ISO date string.

    Number '02-01' corresponds to Wednesday, January 2, 2002; every following
    number is assigned the next weekday (Saturdays and Sundays are skipped).

    Returns the date formatted as 'YYYY-MM-DD', or the string "Unknown Date"
    when the number is not of the form '02-XX', when XX is below 1, or when
    XX is not an integer (a warning is printed in that last case).
    """
    unknown = "Unknown Date"
    try:
        pieces = commentary_num_str.split('-')
        # Only the two-part form with the '02' prefix is understood.
        if len(pieces) != 2 or pieces[0] != '02':
            return unknown

        # Number 01 is the start date itself, so it needs zero steps.
        weekdays_to_advance = int(pieces[1]) - 1
        if weekdays_to_advance < 0:
            return unknown

        # Start date is Wednesday, January 2, 2002 (based on PDF first page)
        result = date(2002, 1, 2)
        for _ in range(weekdays_to_advance):
            result = get_next_weekday(result)

        return result.strftime('%Y-%m-%d')

    except (ValueError, IndexError):
        print(f"Warning: Could not parse date for commentary number {commentary_num_str}")
        return unknown


def extract_text_from_pdf_ocr(pdf_path, poppler_path=None):
    """
    OCR every page of a PDF and return the recognised text per page.

    The PDF is rendered to images with pdf2image's convert_from_path() and
    each image is passed to pytesseract.image_to_string().

    Returns a list with one string per page, in page order. A page whose OCR
    raises an ordinary exception contributes an empty string. Returns None
    when the PDF cannot be converted or when Tesseract is not found.
    """
    print(f"Starting OCR process for {pdf_path}...")
    page_texts = []
    try:
        # Render the pages at 300 dpi; adjust if OCR quality needs tuning.
        page_images = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)
        page_count = len(page_images)
        print(f"Converted {page_count} pages to images.")

        for page_no, page_image in enumerate(page_images, start=1):
            print(f"Performing OCR on page {page_no}/{page_count}...")
            try:
                # lang='eng' selects Tesseract's English language data.
                recognised = pytesseract.image_to_string(page_image, lang='eng')
                page_texts.append(recognised)
                print(f"Page {page_no} OCR complete.")
            except pytesseract.TesseractNotFoundError:
                # No point trying further pages without the OCR engine.
                print("\n--- TESSERACT NOT FOUND ---")
                print("Error: Tesseract is not installed or not in your PATH.")
                print("Please install Tesseract OCR and configure the path in the script if necessary.")
                print("Installation guide: https://github.com/tesseract-ocr/tesseract#installing-tesseract")
                return None
            except Exception as page_err:
                # Keep one entry per page so list positions still match pages.
                print(f"Error during OCR on page {page_no}: {page_err}")
                page_texts.append("")

    except Exception as conversion_err:
        print(f"Error processing PDF or converting pages: {conversion_err}")
        mentions_poppler = "poppler" in str(conversion_err).lower()
        if mentions_poppler:
            print("\n--- POPPLER ERROR ---")
            print("This might be due to Poppler not being installed or not found.")
            print("Please install Poppler and provide the path in the POPPLER_PATH variable if needed.")
        return None

    print("OCR process finished.")
    return page_texts

def _non_empty_lines(block):
    """Return the stripped, non-empty lines of `block` in their original order."""
    stripped = (line.strip() for line in block.split('\n'))
    return [line for line in stripped if line]


def parse_commentaries(all_pages_text):
    """
    Parse OCR page text into a list of commentary records.

    The pages are joined with newlines and split at "number markers": lines
    that consist solely of a commentary number such as 02-01. For each marker:

    * title - the last non-empty line before the marker (page layout
      "Title / Number" or "Category / Title / Number"). If nothing precedes
      the marker, the first non-empty line after it is used instead.
    * text  - the non-empty lines between this marker and the next one,
      excluding any line that was consumed as a title (this commentary's own
      title, or the title of the commentary that follows).

    Args:
        all_pages_text: list of strings, one per page.

    Returns:
        A list of dicts with the keys 'Author', 'title', 'date',
        'commentary_number' and 'text'. Markers with no following text, or
        whose text is empty once title lines are removed, are skipped with a
        printed warning.
    """
    print("Parsing extracted text for commentaries...")
    commentaries = []
    full_text = "\n".join(all_pages_text)  # Combine all pages for easier parsing

    # A commentary number (e.g. 02-01) alone on its line, optionally surrounded
    # by whitespace. Only the number itself is captured.
    pattern = re.compile(r"^\s*(\d{2}-\d{2})\s*$", re.MULTILINE)

    matches = list(pattern.finditer(full_text))
    print(f"Found {len(matches)} potential commentary number markers.")

    for i, match in enumerate(matches):
        commentary_num = match.group(1)
        has_next = (i + 1) < len(matches)

        # Text belonging to this marker runs up to the next marker (or the end).
        end_index = matches[i + 1].start() if has_next else len(full_text)
        lines = _non_empty_lines(full_text[match.end():end_index])

        if not lines:
            print(f"Warning: No text content found for commentary {commentary_num}")
            continue

        # Everything between the previous marker and this one is a candidate
        # header; the title is expected to be its last line.
        prev_end_index = matches[i - 1].end() if i > 0 else 0
        header_lines = _non_empty_lines(full_text[prev_end_index:match.start()])

        if header_lines:
            title = header_lines[-1]
            body_lines = lines
        else:
            # Nothing before the marker: fall back to the first line after it
            # and keep that line out of the body.
            title = lines[0]
            body_lines = lines[1:]

        # The block after this marker is also the header block of the next
        # marker, whose title is taken from its last line. Drop that line here
        # so the same line is not emitted both as the next commentary's title
        # and as the tail of this commentary's text.
        # NOTE: a category line above that title cannot be told apart from
        # body text and is therefore left in place.
        if has_next:
            body_lines = body_lines[:-1]

        # Lines are already stripped and non-empty, so no further whitespace
        # consolidation is needed after joining.
        commentary_text = "\n".join(body_lines)

        commentary_data = {
            'Author': "Phyllis Schlafly",
            'title': title,
            'date': calculate_date_from_number(commentary_num),
            'commentary_number': commentary_num,
            'text': commentary_text,
        }

        # Basic validation
        if title != "Unknown Title" and commentary_text:
            commentaries.append(commentary_data)
            print(f"Successfully parsed commentary: {commentary_num} - {title[:30]}...")
        else:
            print(f"Warning: Could not fully parse commentary {commentary_num}. Title: '{title}', Text found: {bool(commentary_text)}")

    print(f"Successfully parsed {len(commentaries)} commentaries.")
    return commentaries

# --- Main Execution ---
if __name__ == "__main__":
    # Pipeline: OCR the PDF -> parse the commentaries -> write them as JSON.
    # Each stage runs only when the previous one produced a non-empty result.
    if not os.path.exists(PDF_FILE_PATH):
        print(f"Error: PDF file not found at '{PDF_FILE_PATH}'")
    else:
        # Stage 1: OCR (None or an empty list both count as failure).
        pages_text = extract_text_from_pdf_ocr(PDF_FILE_PATH, POPPLER_PATH)

        if not pages_text:
            print("\nText extraction failed. Cannot proceed with parsing.")
        else:
            # Stage 2: split the OCR text into commentary records.
            parsed_data = parse_commentaries(pages_text)

            if not parsed_data:
                print("\nNo commentaries could be successfully parsed from the PDF.")
            else:
                # Stage 3: persist the records; ensure_ascii=False keeps
                # non-ASCII characters readable in the UTF-8 output file.
                try:
                    with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f:
                        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
                    print(f"\nSuccessfully wrote {len(parsed_data)} commentaries to '{OUTPUT_JSON_PATH}'")
                except IOError as write_err:
                    print(f"\nError writing JSON file: {write_err}")

