## 🪨 PDF Processing Script using **Unstructured** Library  
### Filter pages containing mining resource keywords and tables

---

### 🧭 Overview
This notebook automates the extraction of relevant pages from mining-related PDF documents (e.g., ASX announcements, JORC or MRE reports).  
It uses the **Unstructured** library to identify and keep only the pages that contain **all three** of the following:
- **Main resource keywords** — “MRE”, “mineral resource”, “ore reserve”  
- **Table indicators** — “indic”, “meas”, “infer”, “prove”, “prob”  
- **Mineral codes/names** — “Au”, “Cu”, “Fe”, “gold”, “lithium”, etc.  

Filtered pages are then exported as both `.json` and `.txt` files, and a summary report is saved as a `.csv`.

---

### ⚙️ Key Features
- 🔍 **Keyword and table detection** using Unstructured’s high-resolution PDF parsing  
- 🧩 **Page-level filtering** based on combined text and structural features  
- 💾 **Output formats**: JSON (structured), TXT (readable), CSV (summary)  
- 📁 **Automatic output directory setup** — `unstructured/filtered_pages/` is created if missing; input PDFs are read from an existing `pdf_downloads/` folder  

---

### 📦 Requirements
Install these packages before running:
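```bash
pip install unstructured pypdfium2 pandas
```

> **Note:** the script calls `partition_pdf` with `strategy="hi_res"`; Unstructured's PDF extras (`pip install "unstructured[pdf]"`) may also be required for that strategy.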

In [None]:
# PDF Processing Script using Unstructured Library
# Filter pages containing mining resource keywords and tables   

# Import required libraries
import os
import re
import json
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements

# --- Page-filter vocabularies -------------------------------------------
# Phrases that mark a page as discussing a resource/reserve statement.
KEYWORDS = [
    "mre",
    "mineral resource",
    "ore reserve",
]

# Short word stems looked for on a page (presumably covering the
# Indicated / Measured / Inferred / Proved / Probable categories).
TABLE_KEYWORDS = [
    "indic",
    "meas",
    "infer",
    "prove",
    "prob",
]

# Commodity symbols; MINERAL_NAMES below lists the matching names in the
# same order.
MINERAL_CODES = [
    "Au", "Ag", "Pt", "Pd", "Cu",
    "Pb", "Zn", "Ni", "Co", "Fe",
    "Mn", "Al", "Li", "Sn", "W",
    "Mo", "Cr", "U", "V", "REE",
]

# Commodity names, index-aligned with MINERAL_CODES.
MINERAL_NAMES = [
    "gold", "silver", "platinum", "palladium", "copper",
    "lead", "zinc", "nickel", "cobalt", "iron",
    "manganese", "aluminium", "lithium", "tin", "tungsten",
    "molybdenum", "chromium", "uranium", "vanadium", "rare earth",
]

In [None]:
def setup_directories():
    """Return the ``(input_dir, output_dir)`` paths used by the script.

    Only the output directory is created (including missing parents); the
    input directory is returned as-is and may not exist.
    """
    target_dir = Path("unstructured/filtered_pages")
    target_dir.mkdir(parents=True, exist_ok=True)  # no-op when already present

    source_dir = Path("pdf_downloads")
    return source_dir, target_dir

def contains_keywords(text: str, keywords: List[str], case_sensitive: bool = False) -> bool:
    """Report whether any keyword occurs as a substring of ``text``.

    Args:
        text: The text to search.
        keywords: Candidate substrings.
        case_sensitive: When False (the default) both ``text`` and the
            keywords are lower-cased before comparing.

    Returns:
        True as soon as one keyword is found, otherwise False.
    """
    if case_sensitive:
        haystack, needles = text, keywords
    else:
        haystack = text.lower()
        needles = [needle.lower() for needle in keywords]

    for needle in needles:
        if needle in haystack:
            return True
    return False

def has_table_content(elements: List[Any]) -> bool:
    """Report whether any ``Table`` element mentions a table keyword.

    An element counts as a table when it has a ``category`` attribute equal
    to ``'Table'``; its ``str()`` form is then searched for TABLE_KEYWORDS.
    Returns False when there are no table elements at all.
    """
    table_texts = (
        str(candidate)
        for candidate in elements
        if getattr(candidate, "category", None) == "Table"
    )
    return any(contains_keywords(table_text, TABLE_KEYWORDS) for table_text in table_texts)

def filter_page_content(elements: List[Any]) -> bool:
    """Decide whether a page should be kept.

    A page is kept only when its combined text satisfies all three criteria:

    1. it contains a main keyword (KEYWORDS), case-insensitively;
    2. it contains a table keyword stem (TABLE_KEYWORDS), case-insensitively;
    3. it mentions a mineral, either by name (MINERAL_NAMES,
       case-insensitive substring) or by code (MINERAL_CODES).

    Mineral codes are matched case-sensitively and only as standalone
    tokens: not preceded by a letter and not followed by a lowercase letter.
    This keeps "Au", "Au g/t", "AuEq", "Li2O" and "U3O8" matching while
    ignoring the letters inside ordinary words ("Unit", "Also", "value").
    Matching codes as case-insensitive substrings would make one-letter
    codes such as "U", "V" and "W" match almost any page.

    Args:
        elements: The elements of a single page; each is converted with
            ``str()`` and the results are joined with spaces.

    Returns:
        True when all three criteria are met, otherwise False.
    """
    page_text = " ".join(str(elem) for elem in elements)

    # Cheap substring checks first; bail out as soon as one criterion fails.
    if not contains_keywords(page_text, KEYWORDS):
        return False
    if not contains_keywords(page_text, TABLE_KEYWORDS):
        return False
    if contains_keywords(page_text, MINERAL_NAMES):
        return True

    # An empty alternation would match everywhere, so guard against it.
    if not MINERAL_CODES:
        return False

    code_alternatives = "|".join(re.escape(code) for code in MINERAL_CODES)
    code_pattern = rf"(?<![A-Za-z])(?:{code_alternatives})(?![a-z])"
    return re.search(code_pattern, page_text) is not None

In [None]:
def _element_text(elem: Any) -> str:
    """Return the element's ``text`` if non-empty, else ``str(elem)``, else ``""``."""
    text = getattr(elem, "text", None)
    if text:
        return text
    try:
        return str(elem) or ""
    except Exception:
        return ""


def _element_metadata(elem: Any) -> Dict[str, Any]:
    """Return ``elem.metadata.to_dict()``, or ``{}`` when unavailable or failing."""
    to_dict = getattr(getattr(elem, "metadata", None), "to_dict", None)
    if not callable(to_dict):
        return {}
    try:
        return to_dict()
    except Exception:
        return {}


def _write_filtered_outputs(
    pdf_path: Path,
    output_dir: Path,
    filtered_pages: Dict[int, List[Any]],
    total_pages: int,
) -> None:
    """Write ``<stem>_filtered.json`` and ``<stem>_filtered.txt`` for one PDF."""
    page_numbers = list(filtered_pages)

    json_data = {
        "source_file": pdf_path.name,
        "filtered_pages": {
            str(page_num): [
                {
                    "text": _element_text(elem),
                    "category": getattr(elem, "category", "Unknown"),
                    "metadata": _element_metadata(elem),
                }
                for elem in page_elements
            ]
            for page_num, page_elements in filtered_pages.items()
        },
        "summary": {
            "total_pages": total_pages,
            "filtered_pages": len(filtered_pages),
            "filtered_page_numbers": page_numbers,
        },
    }

    output_file = output_dir / f"{pdf_path.stem}_filtered.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)

    # Human-readable companion of the JSON file.
    text_file = output_dir / f"{pdf_path.stem}_filtered.txt"
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(f"Source: {pdf_path.name}\n")
        f.write(f"Filtered Pages: {page_numbers}\n")
        f.write("=" * 50 + "\n\n")

        for page_num, page_elements in filtered_pages.items():
            f.write(f"PAGE {page_num}\n")
            f.write("-" * 20 + "\n")
            for elem in page_elements:
                f.write(f"{_element_text(elem)}\n\n")
            f.write("\n" + "=" * 50 + "\n\n")


def process_pdf_file(pdf_path: Path, output_dir: Path) -> Dict[str, Any]:
    """Partition one PDF, keep the pages that pass the filter, and save them.

    The PDF is partitioned with Unstructured, its elements are grouped by
    ``metadata.page_number``, and every page accepted by
    ``filter_page_content`` is written to ``<stem>_filtered.json`` and
    ``<stem>_filtered.txt`` in ``output_dir``. Nothing is written when no
    page matches.

    Elements without a page number are grouped under the key ``-1``. That
    bucket is still run through the filter but is not counted in
    ``total_pages``, because it is not a real page (page-break markers
    presumably land there; verify against the installed Unstructured
    version).

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Existing directory that receives the output files.

    Returns:
        A summary dict with keys ``file``, ``total_pages``,
        ``filtered_pages``, ``filtered_page_numbers`` (ascending) and
        ``status``. ``status`` is ``"success"`` or ``"error: <message>"``;
        any exception is caught and reported through ``status`` so that one
        bad file does not abort a batch run.
    """
    print(f"Processing: {pdf_path.name}")
    no_page = -1  # bucket key for elements that carry no page number

    try:
        elements = partition_pdf(
            filename=str(pdf_path),
            # Table structure inference is documented as applying to the
            # "hi_res" strategy, so keep it when changing other options.
            strategy="hi_res",
            languages=["eng"],
            include_metadata=True,
            infer_table_structure=True,
            extract_images_in_pdf=False,
            include_page_breaks=True,
        )

        # Group elements by page.
        pages: Dict[int, List[Any]] = {}
        for element in elements:
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = no_page
            pages.setdefault(page_num, []).append(element)

        total_pages = sum(1 for page_num in pages if page_num != no_page)

        # Visit pages in ascending order so every report lists them sorted.
        filtered_pages: Dict[int, List[Any]] = {}
        for page_num in sorted(pages):
            if filter_page_content(pages[page_num]):
                filtered_pages[page_num] = pages[page_num]
                print(f"  ✓ Page {page_num} matches criteria")

        if filtered_pages:
            _write_filtered_outputs(pdf_path, output_dir, filtered_pages, total_pages)

        return {
            "file": pdf_path.name,
            "total_pages": total_pages,
            "filtered_pages": len(filtered_pages),
            "filtered_page_numbers": list(filtered_pages),
            "status": "success",
        }

    except Exception as e:
        print(f"  ✗ Error processing {pdf_path.name}: {str(e)}")
        return {
            "file": pdf_path.name,
            "total_pages": 0,
            "filtered_pages": 0,
            "filtered_page_numbers": [],
            "status": f"error: {str(e)}",
        }


In [None]:
if __name__ == "__main__":
    # Resolve the input folder and make sure the output folder exists.
    input_dir, output_dir = setup_directories()

    # The input folder is never created automatically; stop if it is missing.
    # SystemExit is raised directly because the bare exit() helper is only
    # injected by the site module / interactive shells.
    if not input_dir.exists():
        print(f"Input directory '{input_dir}' does not exist. Please add PDF files.")
        raise SystemExit(1)

    # glob() yields files in arbitrary order; sort so runs (and the summary
    # CSV) are reproducible.
    pdf_files = sorted(input_dir.glob("*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in '{input_dir}'")
        raise SystemExit(1)

    print(f"Found {len(pdf_files)} PDF files to process.\n")

    # Process PDFs one by one; each call returns a summary dict and reports
    # its own errors through the "status" field.
    results = [process_pdf_file(pdf_file, output_dir) for pdf_file in pdf_files]

    # Print summary
    total_files = len(results)
    successful_files = sum(1 for r in results if r["status"] == "success")
    total_filtered_pages = sum(r["filtered_pages"] for r in results)

    print("\nProcessing Summary")
    print("=" * 50)
    print(f"Total files processed: {total_files}")
    print(f"Successful: {successful_files}")
    print(f"Failed: {total_files - successful_files}")
    print(f"Total filtered pages: {total_filtered_pages}")

    # Save one row per processed PDF.
    df_results = pd.DataFrame(results)
    summary_file = output_dir / "processing_summary.csv"
    df_results.to_csv(summary_file, index=False)
    print(f"\nSummary saved to: {summary_file}")