In [None]:
# ================================
# Parse FIA Docs (Australia 2025)
# ================================

import os
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Settings ---
RACE = "australia_2025"
RAW_DIR = os.path.join("data", "raw", RACE)
OUTPUT_DIR = "processed"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"fia_docs_{RACE}.txt")


def find_pdf_files(directory):
    """Return the sorted paths of all PDF files directly inside *directory*.

    The ``.pdf`` suffix is matched case-insensitively, so ``.pdf``, ``.PDF``
    and ``.Pdf`` are all found. Each file is returned exactly once, also on
    case-insensitive filesystems where globbing for ``*.pdf`` and ``*.PDF``
    separately would list every file twice.

    Names starting with a dot are skipped, matching what a ``*.pdf`` glob
    pattern does. A missing or unreadable directory yields an empty list,
    again matching glob, so the caller can treat it as "no PDFs".
    """
    try:
        names = os.listdir(directory)
    except OSError:
        # Directory absent or unreadable: same outcome as an empty folder.
        return []
    # Sorted so the chunk order in the output file is reproducible.
    return sorted(
        os.path.join(directory, name)
        for name in names
        if not name.startswith(".") and name.lower().endswith(".pdf")
    )


# --- Find PDFs (case-insensitive) ---
pdf_files = find_pdf_files(RAW_DIR)

print(f"Looking for PDFs in: {os.path.abspath(RAW_DIR)}")
if not pdf_files:
    # Nothing to parse: still produce the output file (empty) so that
    # downstream steps can rely on it existing. No exit() here -- in a
    # notebook that would shut the kernel down; plain branching is enough.
    print("⚠️ No PDFs found. Creating an empty output file.")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8"):
        pass  # opening in "w" mode creates/truncates the file
    print(f"✅ Empty file created at {OUTPUT_FILE}")
else:
    # --- Load PDFs ---
    docs = []
    for pdf in pdf_files:
        try:
            loaded = PyPDFLoader(pdf).load()
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"⚠️ Could not load {pdf}: {e}")
        else:
            docs.extend(loaded)
            print(f"Loaded {len(loaded)} docs from {os.path.basename(pdf)}")

    print(f"Total documents loaded: {len(docs)}")

    # --- Split into chunks ---
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    print(f"Total chunks created: {len(split_docs)}")

    # --- Preview ---
    if split_docs:
        print("\nExample chunk:\n", split_docs[0].page_content[:500])

    # --- Save chunks to txt ---
    # Chunks are separated by a blank line in the output file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.writelines(d.page_content.strip() + "\n\n" for d in split_docs)

    print(f"✅ FIA docs exported to {OUTPUT_FILE}")


⚠️ No PDFs found at data/raw/australia_2025/*.pdf. Please check the folder and file names.
