In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import os

# --- Config ---
# Input PDF and output JSON locations used by process_pdf() in this cell.
# NOTE(review): both are absolute paths under one user's home directory, so they
# will not resolve on another machine. Later cells rebind PDF_PATH, OUTPUT_JSON
# and BOOK_METADATA, so re-run this cell before calling process_pdf() again.
PDF_PATH = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/A Choice Not An Echo 2014 7-23-14.pdf"
OUTPUT_JSON = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/choice_not_echo_2014.json"

# Bibliographic fields handed to structure_chunks(), which copies them into
# every chunk record next to the chunk's "text".
BOOK_METADATA = {
    "author": "Phyllis Schlafly",
    "book_title": "A Choice Not An Echo, 2014 Edition",
    "publication_year": 2014
}

# --- Step 1: Load PDF using LangChain ---
def load_pdf(pdf_path):
    """Load the PDF at ``pdf_path`` with ``PyPDFLoader`` and return its documents.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` yields.
    """
    return PyPDFLoader(pdf_path).load()

# --- Step 2: Split into chunks using LangChain text splitter ---
def chunk_documents(docs, chunk_size=1000, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents to split (as returned by ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of the splitter's ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return text_splitter.split_documents(docs)

# --- Step 3: Structure chunks with metadata ---
def structure_chunks(chunks, metadata):
    """Convert splitter chunks into plain dict records.

    Every record is a copy of ``metadata`` plus a ``"text"`` key holding the
    chunk's ``page_content`` with leading/trailing whitespace removed. If
    ``metadata`` already has a ``"text"`` entry, the chunk text replaces it.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        metadata: Mapping copied into each record; it is not modified.

    Returns:
        A list with one dict per chunk, in input order.
    """
    records = []
    for piece in chunks:
        record = {**metadata}
        record["text"] = piece.page_content.strip()
        records.append(record)
    return records

# --- Step 4: Save as JSON ---
def write_json(data, output_path):
    """Serialize ``data`` to ``output_path`` as indented UTF-8 JSON.

    Missing parent directories are created first. When ``output_path`` is a
    bare file name (no directory component) the directory step is skipped,
    because ``os.makedirs("")`` raises ``FileNotFoundError``; the file is then
    written relative to the current working directory.

    Args:
        data: JSON-serializable object to write.
        output_path: Destination file; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters literal instead of \uXXXX escapes.
        json.dump(data, f, indent=2, ensure_ascii=False)

# --- Full Pipeline ---
def process_pdf(pdf_path=None, output_json=None, metadata=None):
    """Run the load -> chunk -> structure -> write pipeline for a text-based PDF.

    Called with no arguments this behaves as before and uses the notebook
    globals. Passing explicit values makes the run independent of whichever
    cell last rebound ``PDF_PATH`` / ``OUTPUT_JSON`` / ``BOOK_METADATA``.

    Args:
        pdf_path: PDF to read. ``None`` means the current ``PDF_PATH``.
        output_json: Destination JSON file. ``None`` means the current ``OUTPUT_JSON``.
        metadata: Mapping copied into every chunk record. ``None`` means the
            current ``BOOK_METADATA``.
    """
    # Resolve defaults at call time (not definition time) so the zero-argument
    # call keeps reading the globals exactly as the original did.
    pdf_path = PDF_PATH if pdf_path is None else pdf_path
    output_json = OUTPUT_JSON if output_json is None else output_json
    metadata = BOOK_METADATA if metadata is None else metadata

    print("Loading PDF...")
    docs = load_pdf(pdf_path)

    print("Chunking...")
    chunks = chunk_documents(docs)
    print(f"Total chunks: {len(chunks)}")

    print("Structuring output...")
    structured = structure_chunks(chunks, metadata)

    print("Writing JSON...")
    write_json(structured, output_json)
    print(f"Done! Output saved to: {output_json}")


In [5]:
# Run the PyPDFLoader-based pipeline with the configuration defined above.
process_pdf()

Loading PDF...
Chunking...
Total chunks: 547
Structuring output...
Writing JSON...
Done! Output saved to: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/choice_not_echo_2014.json


In [10]:
import os
import pytesseract
from pdf2image import convert_from_path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json

# --- Config ---
# Input PDF for the OCR pipeline in this cell and the JSON file its chunks are
# written to.
# NOTE(review): absolute, user-specific paths; these assignments also replace the
# PDF_PATH / OUTPUT_JSON / BOOK_METADATA values set by the earlier cell.
PDF_PATH = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/ALL001.pdf"
OUTPUT_JSON = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/allegiance.json"

# Bibliographic fields handed to structure_chunks(), which copies them into
# every chunk record next to the chunk's "text".
BOOK_METADATA = {
    "author": "Phyllis Schlafly",
    "book_title": "Allegiance: Briefing Book on American Independence & Sovereignty",
    "publication_year": 2000
}

# --- Step 1: OCR PDF pages ---
def ocr_pdf(pdf_path):
    """OCR every page of a PDF and return the recognized text as one string.

    The PDF is first converted to page images with ``convert_from_path``; each
    image is then passed to ``pytesseract.image_to_string``. A progress line is
    printed after every page.

    Args:
        pdf_path: Path of the PDF to OCR.

    Returns:
        The per-page texts concatenated in page order, each followed by a
        newline.
    """
    print("Converting PDF pages to images...")
    page_images = convert_from_path(pdf_path)
    page_count = len(page_images)

    print("Running OCR on each page...")
    page_texts = []
    for page_number, page_image in enumerate(page_images, start=1):
        page_texts.append(pytesseract.image_to_string(page_image))
        print(f"OCR done for page {page_number}/{page_count}")

    # Same result as appending `text + "\n"` to a running string for each page.
    return "".join(page_text + "\n" for page_text in page_texts)

# --- Step 2: Chunk the text ---
def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Split one raw text string into overlapping chunk documents.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of the splitter's ``create_documents([text])``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    single_text_batch = [text]
    return text_splitter.create_documents(single_text_batch)

# --- Step 3: Structure chunks with metadata ---
def structure_chunks(chunks, metadata):
    """Build one plain dict per chunk: the metadata fields plus the chunk text.

    The ``"text"`` value is the chunk's ``page_content`` stripped of
    surrounding whitespace; it overrides any ``"text"`` key in ``metadata``.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.
        metadata: Mapping copied into each record; it is not modified.

    Returns:
        A list of dict records in the same order as ``chunks``.
    """
    def to_record(chunk):
        # Copy first so every record owns its own dict.
        record = {**metadata}
        record["text"] = chunk.page_content.strip()
        return record

    return list(map(to_record, chunks))

# --- Step 4: Save as JSON ---
def write_json(data, output_path):
    """Write ``data`` to ``output_path`` as pretty-printed UTF-8 JSON.

    Parent directories are created when they do not exist. A path without a
    directory part (for example ``"out.json"``) is written to the current
    working directory; ``os.makedirs`` is not called for it because an empty
    directory name makes ``os.makedirs`` raise ``FileNotFoundError``.

    Args:
        data: JSON-serializable object to write.
        output_path: Destination file; an existing file is overwritten.
    """
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters literal instead of \uXXXX escapes.
        json.dump(data, f, indent=2, ensure_ascii=False)

# --- Run full process ---
def process_non_ocr_pdf(pdf_path=None, output_json=None, metadata=None):
    """Run the OCR -> chunk -> structure -> write pipeline for one PDF.

    Despite the name, this function does run OCR (via ``ocr_pdf``) on the PDF.

    Called with no arguments this behaves as before and uses the notebook
    globals. Passing explicit values makes the run independent of whichever
    cell last rebound ``PDF_PATH`` / ``OUTPUT_JSON`` / ``BOOK_METADATA``.

    Args:
        pdf_path: PDF to OCR. ``None`` means the current ``PDF_PATH``.
        output_json: Destination JSON file. ``None`` means the current ``OUTPUT_JSON``.
        metadata: Mapping copied into every chunk record. ``None`` means the
            current ``BOOK_METADATA``.
    """
    # Resolve defaults at call time so the zero-argument call keeps reading the
    # globals exactly as the original did.
    pdf_path = PDF_PATH if pdf_path is None else pdf_path
    output_json = OUTPUT_JSON if output_json is None else output_json
    metadata = BOOK_METADATA if metadata is None else metadata

    print("Starting OCR + Chunking Pipeline...")
    text = ocr_pdf(pdf_path)
    print("Chunking text...")
    chunks = chunk_text(text)
    print(f"Total chunks: {len(chunks)}")

    print("Structuring output...")
    structured = structure_chunks(chunks, metadata)

    print("Writing JSON...")
    write_json(structured, output_json)
    print(f"Done! Output saved to: {output_json}")

In [11]:
# Run the OCR pipeline; ocr_pdf() prints one progress line per page.
process_non_ocr_pdf()

Starting OCR + Chunking Pipeline...
Converting PDF pages to images...
Running OCR on each page...
OCR done for page 1/50
OCR done for page 2/50
OCR done for page 3/50
OCR done for page 4/50
OCR done for page 5/50
OCR done for page 6/50
OCR done for page 7/50
OCR done for page 8/50
OCR done for page 9/50
OCR done for page 10/50
OCR done for page 11/50
OCR done for page 12/50
OCR done for page 13/50
OCR done for page 14/50
OCR done for page 15/50
OCR done for page 16/50
OCR done for page 17/50
OCR done for page 18/50
OCR done for page 19/50
OCR done for page 20/50
OCR done for page 21/50
OCR done for page 22/50
OCR done for page 23/50
OCR done for page 24/50
OCR done for page 25/50
OCR done for page 26/50
OCR done for page 27/50
OCR done for page 28/50
OCR done for page 29/50
OCR done for page 30/50
OCR done for page 31/50
OCR done for page 32/50
OCR done for page 33/50
OCR done for page 34/50
OCR done for page 35/50
OCR done for page 36/50
OCR done for page 37/50
OCR done for page 38/50

In [12]:
import re
import json
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Config ---
# Interview-transcript PDF read by load_pdf_text() and the JSON file that
# process_interview_pdf() writes the chunk records to.
# NOTE(review): absolute, user-specific paths; these assignments also replace the
# PDF_PATH / OUTPUT_JSON values set by earlier cells.
PDF_PATH = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Schlafly_Phy_4FNL.pdf"
OUTPUT_JSON = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/interview.json"

# --- LangChain Loader ---
def load_pdf_text(pdf_path):
    """Load a PDF with ``PyPDFLoader`` and return all of its text as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines in
        load order.
    """
    loaded_docs = PyPDFLoader(pdf_path).load()
    page_texts = (loaded.page_content for loaded in loaded_docs)
    return "\n".join(page_texts)

# --- Strip copyright and reader note section ---
def remove_front_matter(text):
    """Drop everything up to and including the reader-note boilerplate.

    The front matter is recognized by a ``"COPYRIGHT"`` marker followed, later
    in the text, by the phrase ``"reviewed by the interviewee"``. When both are
    present in that order, the text after the closing phrase is returned with
    surrounding whitespace stripped. Note that anything before ``"COPYRIGHT"``
    is dropped as well. If either marker is missing, ``text`` is returned
    unchanged.

    The closing phrase is searched for only after the ``"COPYRIGHT"`` marker,
    so an earlier occurrence of the phrase cannot cut the text in the wrong
    place.

    Args:
        text: Full transcript text.

    Returns:
        The text with the front matter removed, or ``text`` itself.
    """
    start_marker = "COPYRIGHT"
    end_marker = "reviewed by the interviewee"

    start = text.find(start_marker)
    if start == -1:
        return text

    # Anchor the search after the start marker so the two markers are ordered.
    end = text.find(end_marker, start + len(start_marker))
    if end == -1:
        return text

    return text[end + len(end_marker):].strip()

# --- Extract interviews from text ---
def extract_interviews(text):
    """Split a transcript into per-interview segments keyed by their headers.

    A header looks like ``Interview # <n>: <date ending in a 4-digit year>``
    and is matched case-insensitively. Each segment runs from its header to the
    start of the next header (or to the end of ``text``). Text that precedes
    the first header is not part of any segment and is therefore not returned.

    The number and date are captured by the same pattern that finds the
    header, so they stay consistent with it (including for upper-case
    headers). The date part is matched lazily so it stops at the first
    four-digit year instead of running on into following lines.

    Args:
        text: Transcript text to scan.

    Returns:
        A list of dicts with keys ``"interview_number"`` (int),
        ``"interview_date"`` (str) and ``"text"`` (str, stripped), in document
        order. Empty when no header is found.
    """
    header_pattern = re.compile(
        r"Interview #\s*(?P<number>\d+):\s*(?P<date>[\w\s,]+?\d{4})",
        re.IGNORECASE,
    )
    headers = list(header_pattern.finditer(text))

    interviews = []
    for index, header in enumerate(headers):
        # A segment ends where the next header begins, or at the end of the text.
        if index + 1 < len(headers):
            segment_end = headers[index + 1].start()
        else:
            segment_end = len(text)

        interviews.append({
            "interview_number": int(header.group("number")),
            "interview_date": header.group("date").strip(),
            "text": text[header.start():segment_end].strip()
        })

    return interviews

# --- Chunk each interview ---
def chunk_and_structure(interviews, chunk_size=1000, chunk_overlap=100):
    """Split each interview's text into chunks and wrap them as records.

    Args:
        interviews: Iterable of dicts with ``"text"``, ``"interview_number"``
            and ``"interview_date"`` keys (as produced by
            ``extract_interviews``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``. The
            default matches the previously hard-coded value.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``. The
            default matches the previously hard-coded value.

    Returns:
        A flat list of dict records, one per chunk, in interview order. Each
        record carries fixed author/interviewer/source fields, the interview's
        number and date, and the stripped chunk text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    return [
        {
            "author": "Phyllis Schlafly",
            "interview_number": interview["interview_number"],
            "interview_date": interview["interview_date"],
            "interviewer": "Mark DePue",
            "source_type": "Interview Transcript",
            "text": chunk.strip()
        }
        for interview in interviews
        for chunk in splitter.split_text(interview["text"])
    ]

# --- Write JSON ---
def write_json(data, path):
    """Write ``data`` to ``path`` as indented UTF-8 JSON.

    Missing parent directories are created first. A bare file name has no
    directory component, and ``os.makedirs("")`` raises ``FileNotFoundError``,
    so directory creation is skipped in that case and the file is written
    relative to the current working directory.

    Args:
        data: JSON-serializable object to write.
        path: Destination file; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters literal instead of \uXXXX escapes.
        json.dump(data, f, indent=2, ensure_ascii=False)

# --- Main pipeline ---
def process_interview_pdf(pdf_path=None, output_json=None):
    """Load the transcript PDF, split it per interview, chunk it and write JSON.

    Called with no arguments this behaves as before and uses the notebook
    globals. Passing explicit values makes the run independent of whichever
    cell last rebound ``PDF_PATH`` / ``OUTPUT_JSON``.

    Args:
        pdf_path: Transcript PDF to read. ``None`` means the current ``PDF_PATH``.
        output_json: Destination JSON file (overwritten). ``None`` means the
            current ``OUTPUT_JSON``.
    """
    # Resolve defaults at call time so the zero-argument call keeps reading the
    # globals exactly as the original did.
    pdf_path = PDF_PATH if pdf_path is None else pdf_path
    output_json = OUTPUT_JSON if output_json is None else output_json

    print("Loading PDF...")
    full_text = load_pdf_text(pdf_path)

    print("Removing front matter...")
    cleaned_text = remove_front_matter(full_text)

    print("Extracting interviews...")
    interviews = extract_interviews(cleaned_text)
    print(f"Found {len(interviews)} interviews.")

    print("Chunking...")
    structured_chunks = chunk_and_structure(interviews)

    print("Saving to JSON...")
    write_json(structured_chunks, output_json)
    print(f"Done! Saved to {output_json}")

In [13]:
# Run the interview pipeline; it reports how many interview headers were found.
process_interview_pdf()

Loading PDF...
Removing front matter...
Extracting interviews...
Found 6 interviews.
Chunking...
Saving to JSON...
Done! Saved to /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/interview.json


In [14]:
import json
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Config ---
# Same transcript PDF and JSON file as the previous interview cell; here the
# JSON file is extended by append_to_json() rather than rewritten.
# NOTE(review): absolute, user-specific paths that will not resolve on another
# machine.
PDF_PATH = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/raw/Schlafly_Phy_4FNL.pdf"
OUTPUT_JSON = "/Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/interview.json"

# --- Load full text using LangChain ---
def load_pdf_text(pdf_path):
    """Read a PDF through ``PyPDFLoader`` and flatten it to a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Each loaded document's ``page_content``, joined by newline characters
        in load order.
    """
    pdf_loader = PyPDFLoader(pdf_path)
    contents = []
    for loaded_doc in pdf_loader.load():
        contents.append(loaded_doc.page_content)
    return "\n".join(contents)

# --- Remove boilerplate front matter ---
def remove_front_matter(text):
    """Remove the boilerplate that ends with the reader note.

    Looks for ``"COPYRIGHT"`` and then, after it, for the phrase
    ``"reviewed by the interviewee"``. If both are found in that order, the
    text following the phrase is returned, stripped of surrounding whitespace
    (anything before ``"COPYRIGHT"`` is dropped too). Otherwise ``text`` is
    returned unchanged.

    Searching for the closing phrase only after ``"COPYRIGHT"`` prevents an
    earlier occurrence of the phrase from cutting the text in the wrong place.

    Args:
        text: Full transcript text.

    Returns:
        The text with the front matter removed, or ``text`` itself.
    """
    start_marker = "COPYRIGHT"
    end_marker = "reviewed by the interviewee"

    start = text.find(start_marker)
    if start == -1:
        return text

    # Anchor the search after the start marker so the two markers are ordered.
    end = text.find(end_marker, start + len(start_marker))
    if end == -1:
        return text

    return text[end + len(end_marker):].strip()

# --- Extract text from start until "Interview # 2" ---
def extract_interview_one(text):
    """Return the part of ``text`` that comes before the "Interview # 2" header.

    The header is matched case-insensitively as ``Interview #``, optional
    whitespace, ``2``, optional whitespace, then ``:`` or ``-``.

    Args:
        text: Transcript text to cut.

    Returns:
        The stripped text preceding the header, or ``text`` unchanged (not
        stripped) when the header does not occur.
    """
    second_header = re.compile(r"Interview #\s*2\s*[:\-]", re.IGNORECASE).search(text)
    if second_header is None:
        # Marker missing: hand the input back untouched.
        return text
    return text[:second_header.start()].strip()

# --- Chunk and structure Interview #1 ---
def chunk_interview_one(text):
    """Chunk the first interview's text and label every chunk.

    The text is split with ``RecursiveCharacterTextSplitter`` (chunk size 1000,
    overlap 100). Each chunk becomes a dict with fixed author, interviewer and
    source fields, interview number 1 and date "January 5, 2011".

    Args:
        text: Text of the first interview.

    Returns:
        A list of dict records, one per chunk, in split order.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

    records = []
    for piece in text_splitter.split_text(text):
        records.append({
            "author": "Phyllis Schlafly",
            "interview_number": 1,
            "interview_date": "January 5, 2011",
            "interviewer": "Mark DePue",
            "source_type": "Interview Transcript",
            "text": piece.strip()
        })
    return records

# --- Append to existing JSON ---
def append_to_json(new_data, json_path):
    """Append records to the JSON list stored at ``json_path``.

    If the file exists its contents are loaded and the new records are added
    after them; otherwise the file is created (together with any missing
    parent directory). Records from ``new_data`` that are already stored in the
    file are skipped, so running the same cell again does not duplicate data.

    Args:
        new_data: List of JSON-serializable records to add.
        json_path: JSON file holding a list of records.
    """
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    else:
        existing_data = []
        parent_dir = os.path.dirname(json_path)
        # An empty dirname means "current directory"; os.makedirs("") would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def _fingerprint(record):
        # Canonical serialization makes records comparable regardless of key order.
        return json.dumps(record, sort_keys=True, ensure_ascii=False)

    already_stored = {_fingerprint(record) for record in existing_data}
    fresh_records = [
        record for record in new_data
        if _fingerprint(record) not in already_stored
    ]

    combined = existing_data + fresh_records

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2, ensure_ascii=False)

# --- Run the pipeline ---
def process_interview_one(pdf_path=None, output_json=None):
    """Extract Interview #1 from the transcript PDF and append its chunks to JSON.

    Interview #1 is taken to be the text between the end of the front matter
    and the "Interview # 2" header (see ``extract_interview_one``).

    Called with no arguments this behaves as before and uses the notebook
    globals. Passing explicit values makes the run independent of whichever
    cell last rebound ``PDF_PATH`` / ``OUTPUT_JSON``.

    Args:
        pdf_path: Transcript PDF to read. ``None`` means the current ``PDF_PATH``.
        output_json: JSON file to extend. ``None`` means the current ``OUTPUT_JSON``.
    """
    # Resolve defaults at call time so the zero-argument call keeps reading the
    # globals exactly as the original did.
    pdf_path = PDF_PATH if pdf_path is None else pdf_path
    output_json = OUTPUT_JSON if output_json is None else output_json

    print("Loading PDF...")
    full_text = load_pdf_text(pdf_path)

    print("Removing front matter...")
    cleaned_text = remove_front_matter(full_text)

    print("Extracting Interview #1...")
    interview_one_text = extract_interview_one(cleaned_text)

    print("Chunking Interview #1...")
    chunks = chunk_interview_one(interview_one_text)

    print(f"Appending {len(chunks)} chunks to JSON...")
    append_to_json(chunks, output_json)

    print(f"✅ Done! Interview #1 appended to: {output_json}")

# Run the Interview #1 pipeline; its chunks are appended to OUTPUT_JSON.
process_interview_one()


Loading PDF...
Removing front matter...
Extracting Interview #1...
Chunking Interview #1...
Appending 80 chunks to JSON...
✅ Done! Interview #1 appended to: /Users/mason/Desktop/Technical_Projects/PYTHON_Projects/PSAI/chunks/interview.json
