In [21]:
!pip install unstructured pypdf python-docx nltk --quiet  # Install Unstructured to extract structured content from PDFs
!pip install pdfminer.six --quiet
!pip install "unstructured[pdf]" --quiet
# Download NLTK tokenizer for sentence splitting (used later)
import nltk
nltk.download("punkt")# Required for sentence tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
#Import and Load PDF

from unstructured.partition.pdf import partition_pdf
import os
from collections import Counter
import re


# Place path for PDF application
pdf_path = "/content/NoMarkUP30%_LA_Ch1_General Info_FINAL.pdf"

# Use Unstructured's partition_pdf to break the PDF into structured elements
elements = partition_pdf(filename=pdf_path)

# Preview total elements found
print(f"Total elements extracted: {len(elements)}")
print("First element preview:")
print(elements[0])

Total elements extracted: 1099
First element preview:
Eagle RockIKE Enrichment Facility


### Need to return and categorize table of content entries


In [23]:
# Step 1: Collect all short blocks from the document
# We're focusing on blocks with <10 words — likely to be headers, footers, or labels


from collections import Counter

short_blocks = [
    el.text.strip()
    for el in elements
    if hasattr(el, "text") and 0 < len(el.text.strip().split()) < 10
]

#  Count how often each short block appears
footer_counts = Counter(short_blocks)

# If something appears too often (e.g., >5 times), it's probably a footer/header
# You can tune this threshold depending on document size
common_footers = {
    text for text, count in footer_counts.items() if count > 5
}



In [3]:

#  Define a helper function to detect bullet points
def is_bullet_point(text):
    """
    Returns True if a line looks like a bullet point.
    Matches:
      - • Conduct...
      - - Maintain...
      - * Submit...
      - 1. Evaluate...
      - 2) Review...
    """
    text = text.strip()
    return bool(re.match(r"^(\s*[\*\-•]\s+|\d+[\.\)]\s+)", text))


# Step 2: Filter out unwanted text
# Keep text blocks if they are either:
# - Long paragraphs (>15 words), OR
# - Bullet points (even short ones)

filtered_chunks = []
discarded_chunks = []

for el in elements:
    if hasattr(el, "text"):
        text = el.text.strip()
        word_count = len(text.split())

        if word_count > 15 or is_bullet_point(text):
            filtered_chunks.append(text)
        else:
            discarded_chunks.append(text)

# ---- Step 3: Show basic stats ----
print(f" Retained: {len(filtered_chunks)} meaningful paragraphs")
print(f" Discarded: {len(discarded_chunks)} short or noisy blocks")

# Optional: Preview a few filtered results
print("\n Sample Clean Paragraph:\n")
print(filtered_chunks[0][:500])

# Optional: Preview what was discarded (debugging)
print("\n Sample Discarded Text:\n")
print(discarded_chunks[:5])


Jello


In [19]:
import re

# --- Step 1: Extract TOC entries from top of document ---
toc_entries = []
toc_pattern = re.compile(r"^(\d+(\.\d+)+)\s+(.+?)\.{3,}\s+(\S+)$")

for i, el in enumerate(elements[:150]):
    if hasattr(el, "text"):
        text = el.text.strip()
        match = toc_pattern.match(text)
        if match:
            section = match.group(1)
            title = match.group(3).strip()
            page = match.group(4).strip()
            toc_entries.append({"section": section, "title": title, "toc_page": page})

# --- Step 2: Scan entire doc for actual body sections ---
body_sections = []
header_pattern = re.compile(r"^(\d+(\.\d+)+)\s+(.+)$")

for i, el in enumerate(elements):
    if hasattr(el, "text"):
        text = el.text.strip()
        match = header_pattern.match(text)
        if match:
            section = match.group(1)
            title = match.group(3).strip()
            body_sections.append({"section": section, "title": title, "index": i})

# --- Step 3: Create DocMap ---
doc_map = {}

for idx, entry in enumerate(body_sections):
    section = entry["section"]
    doc_map[section] = {
        "title": entry["title"],
        "start_index": entry["index"],
        "end_index": (
            body_sections[idx + 1]["index"] - 1
            if idx + 1 < len(body_sections)
            else len(elements) - 1
        )
    }

# Optional: Add TOC page numbers
for toc in toc_entries:
    if toc["section"] in doc_map:
        doc_map[toc["section"]]["toc_page"] = toc["toc_page"]

# --- Preview ---
import pprint
pprint.pprint(dict(list(doc_map.items())[:5]))


{'1.1.1': {'end_index': 34,
           'start_index': 34,
           'title': 'Facility Location, Site Layout, and Surrounding '
                    'Characteristics ......... 1.1-3',
           'toc_page': '1.1-3'},
 '1.1.2': {'end_index': 35,
           'start_index': 35,
           'title': 'Facilities Description '
                    '......................................................................... '
                    '1.1-3',
           'toc_page': '1.1-3'},
 '1.1.3': {'end_index': 38,
           'start_index': 36,
           'title': 'Process Descriptions '
                    '......................................................................... '
                    '1.1-7',
           'toc_page': '1.1-7'},
 '1.1.4': {'end_index': 12,
           'start_index': 10,
           'title': 'Descriptive Summary of Licensed Material '
                    '...................................... 1.1-16',
           'toc_page': '1.1-16'},
 '1.2.1': {'end_index': 39,
      

In [None]:
# === Save cleaned output to file (optional) ===
with open("clean_application.txt", "w", encoding="utf-8") as f:
    for para in filtered_chunks:
        f.write(para + "\n\n")



In [25]:
# === Utility Code: Extract full text for any section ===
def get_section_text(section_number, elements, doc_map):
    """Return full paragraph text for a given section number using doc_map."""
    s = doc_map[section_number]["start_index"]
    e = doc_map[section_number]["end_index"]
    return "\n".join(
        el.text.strip() for el in elements[s:e+1]
        if hasattr(el, "text") and len(el.text.strip()) > 0
    )

# ✅ Example usage:
print(get_section_text("1.1.3", elements, doc_map)[:1000])  # Preview first 1000 characters of Section 1.1.3


1.1.3 Process Descriptions ......................................................................... 1.1-7
1.2
INSTITUTIONAL INFORMATION .................................................................. 1.2-1
