# In [1]:
import os
import json
import docx
import fitz  # PyMuPDF
from pathlib import Path
from textwrap import wrap

def extract_text_docx(filepath):
    """Return the text of a .docx file, one non-blank paragraph per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with newline characters.
    """
    document = docx.Document(filepath)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

def extract_text_pdf(filepath):
    """Return the text of every page of a PDF, joined with newlines.

    The document is opened as a context manager so the underlying file
    handle is released as soon as the text has been extracted, even if
    reading a page raises.
    """
    with fitz.open(filepath) as doc:
        # The join consumes the generator before the document is closed.
        return "\n".join(page.get_text() for page in doc)

def chunk_text(text, max_length=1000):
    """Group the non-blank lines of *text* into chunks of bounded size.

    The text is split on newlines; each line is stripped and blank lines
    are dropped. Consecutive lines are packed into a chunk while the
    running length (each line plus one newline) stays within
    ``max_length``. Lines inside a chunk are joined with ``"\\n"``.

    A single line longer than ``max_length`` is not split: it becomes a
    chunk of its own that exceeds the limit.

    Args:
        text: The raw text to chunk.
        max_length: Upper bound on the packed length of a chunk, counting
            one newline per line.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).
    """
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []
    current_parts = []
    current_length = 0  # sum of len(part) + 1 for every part in the chunk

    for para in paragraphs:
        needed = len(para) + 1
        if current_length + needed <= max_length:
            current_parts.append(para)
            current_length += needed
            continue
        # Only flush when something has been collected; otherwise an
        # oversized first paragraph would produce an empty chunk.
        if current_parts:
            chunks.append("\n".join(current_parts))
        current_parts = [para]
        current_length = needed

    if current_parts:
        chunks.append("\n".join(current_parts))

    return chunks

def process_book(filepath, book_title, year, output_folder,
                 author="Phyllis Schlafly"):
    """Extract a book's text, chunk it, and save the chunks as JSON.

    The output file is written to ``output_folder`` and named after
    ``book_title`` with spaces replaced by underscores plus ``.json``.
    Each JSON record holds the author, book title, publication year and
    one chunk of text.

    Args:
        filepath: Path (``str`` or ``Path``) to a ``.docx`` or ``.pdf`` file.
        book_title: Title stored in every record and used for the file name.
        year: Publication year stored in every record.
        output_folder: Directory (``str`` or ``Path``) for the JSON file.
        author: Author name stored in every record.

    Raises:
        ValueError: If the file extension is neither ``.docx`` nor ``.pdf``.
    """
    # Accept plain strings as well as Path objects.
    filepath = Path(filepath)
    output_folder = Path(output_folder)

    ext = filepath.suffix.lower()
    if ext == ".docx":
        text = extract_text_docx(filepath)
    elif ext == ".pdf":
        text = extract_text_pdf(filepath)
    else:
        raise ValueError(f"Unsupported file type: {filepath.name}")

    json_chunks = [
        {
            "author": author,
            "book_title": book_title,
            "publication_year": year,
            "text": chunk,
        }
        for chunk in chunk_text(text)
    ]

    output_path = output_folder / f"{book_title.replace(' ', '_')}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(json_chunks, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(json_chunks)} chunks to {output_path}")

# Example usage: convert each source manuscript into a JSON chunk file.

# Source and destination directories; the destination is created on demand.
input_folder = Path(r"D:\Technical_projects\PSAI\raw_data\books")
output_folder = Path(r"D:\Technical_projects\PSAI\chunks\books")
output_folder.mkdir(parents=True, exist_ok=True)

# (source file name, book title, publication year)
book_info = [
    ("LifeBook2015 - MANUSCRIPT.docx", "How the Republican Party Became Pro-Life", 2015),
    ("Who Killed the American Family - EDITED.docx", "Who Killed the American Family", 2014),
    ("The_Supremacists_by_Phyllis_Schlafly.pdf", "The Supremacists", 2004),
]

for filename, title, year in book_info:
    source_path = input_folder / filename
    process_book(source_path, title, year, output_folder)


# Output:
# Saved 69 chunks to D:\Technical_projects\PSAI\chunks\books\How_the_Republican_Party_Became_Pro-Life.json
# Saved 217 chunks to D:\Technical_projects\PSAI\chunks\books\Who_Killed_the_American_Family.json
# Saved 407 chunks to D:\Technical_projects\PSAI\chunks\books\The_Supremacists.json
