## Phase 1: Setup & Ingestion of the License Application

In [None]:
!pip install unstructured pypdf python-docx nltk --quiet  # Install Unstructured to extract structured content from PDFs
!pip install pdfminer.six --quiet
!pip install "unstructured[pdf]" --quiet
# Download the NLTK "punkt" tokenizer data (sentence splitting).
import nltk
nltk.download("punkt")# Required for sentence tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Import and Load PDF

from unstructured.partition.pdf import partition_pdf
import os
from collections import Counter
import re


# Path to the license-application PDF
pdf_path = "/content/NoMarkUP30%_LA_Ch1_General Info_FINAdsL.pdf"

# partition_pdf returns the document as a list of elements; the preview below
# reads each element's .category and .text
elements = partition_pdf(filename=pdf_path)

# Quick sanity check on what came back
print(f"Total elements extracted: {len(elements)}")
print("First element preview:")
print(elements[0])


# Show category and text for the first ten elements.
# Observation from the original run: many elements come back as UncategorizedText.
for i, el in enumerate(elements[:10]):
    print(
        f"\n=== Element {i} ===",
        f"Category: {el.category}",
        f"Text:\n{el.text.strip()}",
        sep="\n",
    )


Total elements extracted: 918
First element preview:
IKE Enrichment Facility

=== Element 0 ===
Category: Title
Text:
IKE Enrichment Facility

=== Element 1 ===
Category: Title
Text:
License Application

=== Element 2 ===
Category: Title
Text:
TABLE OF CONTENTS

=== Element 3 ===
Category: Title
Text:
Page

=== Element 4 ===
Category: UncategorizedText
Text:
1.0

=== Element 5 ===
Category: UncategorizedText
Text:
GENERAL INFORMATION ............................................................................................. 1

=== Element 6 ===
Category: UncategorizedText
Text:
1.1

=== Element 7 ===
Category: UncategorizedText
Text:
FACILITY AND PROCESS OVERVIEW ........................................................ 1.1-1

=== Element 8 ===
Category: UncategorizedText
Text:
1.1.1 Facility Layout Description ................................................................. 1.1-3

=== Element 9 ===
Category: UncategorizedText
Text:
1.1.2 Process Overview ............................

## Phase 2:  Structure Detection & Mapping && Embedding & Vector Search

TODO: return to this phase and categorize the table-of-contents entries.


In [None]:
# Step 1: Collect all short blocks from the document
# We're focusing on blocks with <10 words — likely to be headers, footers, or labels  -- THough this is likely an error and will need to be shortened or readdressed in a new way


from collections import Counter

# Non-empty text blocks with fewer than ten words, stripped once each
short_blocks = [
    stripped
    for stripped in (el.text.strip() for el in elements if hasattr(el, "text"))
    if 0 < len(stripped.split()) < 10
]

# Frequency of every distinct short block
footer_counts = Counter(short_blocks)

# Anything repeated more than five times is treated as a running header/footer.
# The threshold can be tuned to the size of the document.
common_footers = {
    text for text, occurrences in footer_counts.items() if occurrences > 5
}



In [None]:

#  Define a helper function to detect bullet points
def is_bullet_point(text):
    """
    Tell whether a line starts with a list marker.

    Leading/trailing whitespace is ignored. A marker is either one of
    ``*``, ``-``, ``•`` or a number followed by ``.`` or ``)``; in both
    cases the marker must be followed by whitespace, e.g.:
      - • Conduct...
      - - Maintain...
      - * Submit...
      - 1. Evaluate...
      - 2) Review...

    Returns:
        bool: True when the stripped text begins with such a marker.
    """
    stripped = text.strip()
    marker = re.match(r"^(\s*[\*\-•]\s+|\d+[\.\)]\s+)", stripped)
    return marker is not None


# Step 2: Filter out unwanted text
# A block is retained when it is a long paragraph (>15 words) or looks like a
# bullet point (any length); every other block goes to the discard list.

filtered_chunks = []
discarded_chunks = []

for el in elements:
    if not hasattr(el, "text"):
        continue

    text = el.text.strip()
    word_count = len(text.split())

    is_meaningful = word_count > 15 or is_bullet_point(text)
    bucket = filtered_chunks if is_meaningful else discarded_chunks
    bucket.append(text)

# ---- Step 3: Show basic stats ----
print(f" Retained: {len(filtered_chunks)} meaningful paragraphs")
print(f" Discarded: {len(discarded_chunks)} short or noisy blocks")

# Optional: first 500 characters of the first retained paragraph
print("\n Sample Clean Paragraph:\n")
print(filtered_chunks[0][:500])

# Optional: first five discarded blocks (debugging aid)
print("\n Sample Discarded Text:\n")
print(discarded_chunks[:5])


Jello


In [None]:
import re

# --- Step 1: Extract TOC entries from top of document ---
# A TOC line looks like "1.1.1 Facility Layout Description ........ 1.1-3":
# dotted section number, title, a run of dot leaders, then the page label.
TOC_SCAN_LIMIT = 150  # only the first 150 elements are searched for TOC lines
toc_entries = []
toc_pattern = re.compile(r"^(\d+(\.\d+)+)\s+(.+?)\.{3,}\s+(\S+)$")

for i, el in enumerate(elements[:TOC_SCAN_LIMIT]):
    if hasattr(el, "text"):
        match = toc_pattern.match(el.text.strip())
        if match:
            toc_entries.append({
                "section": match.group(1),
                "title": match.group(3).strip(),
                "toc_page": match.group(4).strip(),
            })

# --- Step 2: Scan entire doc for section-numbered lines ---
# header_pattern also matches TOC lines ("1.1.1 Title ..... 1.1-3"). Those are
# kept (they still bound the element ranges below) but are tagged with
# "from_toc" and have the dot leaders / page label removed from their title,
# so a TOC line is never mistaken for a clean body heading.
body_sections = []
header_pattern = re.compile(r"^(\d+(\.\d+)+)\s+(.+)$")
dot_leader_pattern = re.compile(r"\s*\.{3,}")

for i, el in enumerate(elements):
    if not hasattr(el, "text"):
        continue
    match = header_pattern.match(el.text.strip())
    if not match:
        continue
    title = match.group(3).strip()
    leader = dot_leader_pattern.search(title)
    if leader:
        # Drop everything from the dot leaders onward (leaders + page label).
        title = title[:leader.start()]
    body_sections.append({
        "section": match.group(1),
        "title": title,
        "index": i,
        "from_toc": leader is not None,
    })

# --- Step 3: Create DocMap ---
# Each section runs from its own line up to the element before the next
# section-numbered line (or the end of the document). When a section number
# appears both as a TOC-style line and as a genuine heading, the genuine
# heading wins regardless of where the TOC-style line sits in the document;
# a TOC-style line is used only as a fallback when no heading was found.
sections_with_heading = {
    entry["section"] for entry in body_sections if not entry["from_toc"]
}
last_element_index = len(elements) - 1
doc_map = {}

for idx, entry in enumerate(body_sections):
    if entry["from_toc"] and entry["section"] in sections_with_heading:
        continue
    if idx + 1 < len(body_sections):
        end_index = body_sections[idx + 1]["index"] - 1
    else:
        end_index = last_element_index
    doc_map[entry["section"]] = {
        "title": entry["title"],
        "start_index": entry["index"],
        "end_index": end_index,
    }

# Merge TOC page references into doc_map
for toc in toc_entries:
    if toc["section"] in doc_map:
        doc_map[toc["section"]]["toc_page"] = toc["toc_page"]

# --- Preview ---
import pprint
pprint.pprint(dict(list(doc_map.items())[:5]))


{'1.1.1': {'end_index': 34,
           'start_index': 34,
           'title': 'Facility Location, Site Layout, and Surrounding '
                    'Characteristics ......... 1.1-3',
           'toc_page': '1.1-3'},
 '1.1.2': {'end_index': 35,
           'start_index': 35,
           'title': 'Facilities Description '
                    '......................................................................... '
                    '1.1-3',
           'toc_page': '1.1-3'},
 '1.1.3': {'end_index': 38,
           'start_index': 36,
           'title': 'Process Descriptions '
                    '......................................................................... '
                    '1.1-7',
           'toc_page': '1.1-7'},
 '1.1.4': {'end_index': 12,
           'start_index': 10,
           'title': 'Descriptive Summary of Licensed Material '
                    '...................................... 1.1-16',
           'toc_page': '1.1-16'},
 '1.2.1': {'end_index': 39,
      

In [None]:
### For tables and figure mapping

# A caption starts with "Table" or "Figure" (any case), then a number such as
# "1.1-2", then whitespace and the caption text.
caption_pattern = re.compile(r"^(Table|Figure)\s+\d+[\.\d\-]*\s+(.*)", re.IGNORECASE)

# Record the element index and stripped text of every caption-like element.
table_figure_entries = [
    {"index": i, "text": el.text.strip()}
    for i, el in enumerate(elements)
    if hasattr(el, "text") and caption_pattern.match(el.text.strip())
]

print(f"✅ Found {len(table_figure_entries)} tables/figures")
for entry in table_figure_entries[:5]:
    print(entry)


In [None]:


# === Save cleaned output to file (optional) ===
# Each retained paragraph is written followed by a blank line.
with open("clean_application.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{para}\n\n" for para in filtered_chunks)



In [None]:
#### Save code to JSON

import json

# Serialise the section map as indented JSON.
with open("docmap.json", "w") as f:
    f.write(json.dumps(doc_map, indent=2))


In [None]:
# === Utility Code: Extract full text for any section ===
def get_section_text(section_number, elements, doc_map):
    """Join the text of every element that belongs to one section.

    Args:
        section_number: Key into ``doc_map`` (e.g. "1.1.3").
        elements: Sequence of parsed elements; items without a ``text``
            attribute are ignored.
        doc_map: Mapping of section number to a dict holding inclusive
            ``start_index`` / ``end_index`` positions into ``elements``.

    Returns:
        str: The stripped, non-empty element texts joined with newlines.

    Raises:
        KeyError: If ``section_number`` is not present in ``doc_map``.
    """
    span = doc_map[section_number]
    first, last = span["start_index"], span["end_index"]

    lines = []
    for element in elements[first:last + 1]:
        if not hasattr(element, "text"):
            continue
        cleaned = element.text.strip()
        if cleaned:
            lines.append(cleaned)
    return "\n".join(lines)

# Example usage (raises KeyError if "1.1.3" is not a key of doc_map):
print(get_section_text("1.1.3", elements, doc_map)[:1000])  # Preview first 1000 characters of Section 1.1.3


1.1.3 Process Descriptions ......................................................................... 1.1-7
1.2
INSTITUTIONAL INFORMATION .................................................................. 1.2-1


## Phase 3: Annotation, Entity Mapping, & Semantic Consistency Checks

In [None]:
# Install/upgrade spaCy and download the small English pipeline loaded in the next cell.
!pip install -U spacy
!python -m spacy download en_core_web_sm


In [None]:
## Load the small English spaCy pipeline; `nlp` is used for named-entity recognition below
import spacy
from pprint import pprint

nlp = spacy.load("en_core_web_sm")


In [None]:
#Process each cleaned paragraph


# One record per retained paragraph: the paragraph itself plus the text and
# label of every entity spaCy found in it.
ner_results = [
    {
        "paragraph": para,
        "entities": [
            {"text": ent.text, "label": ent.label_}
            for ent in nlp(para).ents
        ],
    }
    for para in filtered_chunks
]

print(f" Processed {len(ner_results)} paragraphs with NER")


In [None]:
# Inspect the NER output: the first record on its own, then the first two records.

pprint(ner_results[0])

print("space")  # literal separator line between the two previews
pprint(ner_results[:2])

In [None]:
## Global Reference table


from collections import defaultdict

# Group the distinct entity texts under their label
entities_by_label = defaultdict(set)

for item in ner_results:
    for ent in item["entities"]:
        entities_by_label[ent.get("label") or ent.get("label_")].add(ent["text"])

# Final table: label -> alphabetically sorted list of unique entity texts
global_reference_table = {
    label: sorted(texts) for label, texts in entities_by_label.items()
}

# Display
print("Global Reference Table of Entities:")
for label, entries in global_reference_table.items():
    print(f"\n{label}:")
    for e in entries:
        print(f" - {e}")


In [None]:
# Parse dates and times
from dateutil.parser import parse as date_parse
from dateutil.parser import ParserError

# Parsed DATE entities, plus the raw text of every DATE entity that could not
# be parsed (kept so failures are visible instead of silently dropped).
normalized_dates = []
unparsed_dates = []

# Loop over all items and process DATE entities
for item in ner_results:
    para = item["paragraph"]
    for ent in item["entities"]:
        label = ent.get("label") or ent.get("label_")
        if label != "DATE":
            continue

        text = ent["text"]
        try:
            # NOTE(review): with fuzzy=True dateutil ignores tokens it does not
            # understand and, per its docs, fills missing fields from a default
            # datetime (today's date when none is given) — so relative phrases
            # tagged DATE may parse to a date that depends on the run date.
            dt = date_parse(text, fuzzy=True)
        except (ParserError, ValueError, OverflowError):
            # dateutil documents OverflowError for out-of-range values in
            # addition to ParserError (itself a ValueError subclass).
            unparsed_dates.append(text)
            continue

        normalized_dates.append({
            "original": text,
            "standardized": dt.isoformat(),
            "paragraph": para[:100]  # short excerpt for context
        })

# Display
print(f"\n Parsed {len(normalized_dates)} dates.")
print(f" Skipped {len(unparsed_dates)} DATE entities that could not be parsed.")
for d in normalized_dates:
    print(f"- '{d['original']}' standardized to {d['standardized']} (excerpt: '{d['paragraph']}...')")


In [None]:
# Pretty-print the first three normalized date records.
pprint(normalized_dates[:3])

In [None]:
##
##
## Acronym Library
## Maps each acronym to its expansion; the acronym-normalization cell replaces
## whole-word occurrences of each key with its value.
# NOTE(review): "ERIEF" may be a typo for "EREF" — confirm the spelling used in
# the application text, otherwise this entry will never match.



acronym_dict = {
    "ERIEF": "Eagle Rock Enrichment Facility",
    "SAR": "Safety Analysis Report",
    "UF6": "Uranium Hexafluoride",
    "NRC": "Nuclear Regulatory Commission"
}




In [None]:
###Create a long text /acronym replacement version of each paragraph:

# Compile one whole-word pattern per acronym, once, instead of once per
# paragraph. re.escape keeps keys containing regex metacharacters (e.g. a
# dotted acronym) from being read as pattern syntax.
acronym_patterns = [
    (re.compile(rf"\b{re.escape(short)}\b"), long)
    for short, long in acronym_dict.items()
]

normalized_paragraphs = []

for para in filtered_chunks:
    norm_para = para
    for pattern, expansion in acronym_patterns:
        # A function replacement inserts the expansion literally; a plain
        # string would have backslashes interpreted as escapes/group refs.
        norm_para = pattern.sub(lambda _match, expansion=expansion: expansion, norm_para)
    normalized_paragraphs.append(norm_para)

print(" Acronym normalization complete.")


In [None]:
# Compare the first 200 characters of the first paragraph before and after
# acronym normalization.

print("Original:")
print(filtered_chunks[0][:200])

print("\nNormalized:")
print(normalized_paragraphs[0][:200])


In [None]:

# Reference map for cross-reference questions, e.g. "see Table 2.3" or "Appendix A".
#
# Group 1 is the reference type (matched case-insensitively), group 2 the target.
#   - the leading \b stops matches inside longer words ("Configure 2")
#   - a target must either contain a digit and end on a word character, so
#     sentence punctuation is not captured ("Table 2.3." -> "2.3"), or be one or
#     two capital letters ("Appendix A"); this also rejects "Table of Contents"

reference_pattern = re.compile(
    r"\b((?i:Table|Figure|Appendix))\s+((?=[\w.\-]*\d)[\w.\-]*\w|[A-Z]{1,2}\b)"
)

# For every paragraph that mentions at least one Table/Figure/Appendix, store
# the paragraph's position in filtered_chunks and the references found in it.
cross_references = []

for i, para in enumerate(filtered_chunks):
    refs = [
        {"type": m[0], "ref": m[1]}
        for m in reference_pattern.findall(para)
    ]
    if not refs:
        continue
    cross_references.append({"paragraph_index": i, "references": refs})

print(f"✅ Found {len(cross_references)} cross-reference mentions")


In [None]:
## Verify Doc Map is WOrking correctly
# Extract numeric parts for sorting
def section_key(s):
    """Turn a dotted section number such as "1.10.2" into [1, 10, 2].

    Comparing these integer lists orders sections numerically ("1.2" before
    "1.10") rather than as strings. Raises ValueError if any dot-separated
    part is not an integer.
    """
    return list(map(int, s.split(".")))

# Order the section numbers numerically (iterating a dict yields its keys)
sorted_sections = sorted(doc_map, key=section_key)

# List every section with its title, in numeric order
print("✅ Section order verification:")
for s in sorted_sections:
    print(f"{s} - {doc_map[s]['title']}")


## Phase 4:  Ingest and Chunk NUREG Document

In [None]:
# These packages are also installed by the Phase 1 setup cell.
!pip install unstructured pdfminer.six nltk --quiet


In [None]:
# Load the NUREG from a plain-text file into `nureg_text`.
# NOTE: the next cell rebuilds `nureg_text` from the PDF, replacing this value.
nureg_path = "/content/NUREG-1520.txt"  # Change to your filepath

with open(nureg_path, "r", encoding="utf-8") as f:
    nureg_text = f.read()

print(f"✅ NUREG loaded: {len(nureg_text):,} characters")


In [None]:
from unstructured.partition.pdf import partition_pdf

# Use NUREG-specific names here. `elements` and `pdf_path` still describe the
# license application, and doc_map's start/end indices point into `elements`,
# so rebinding them would make get_section_text() read NUREG text.
nureg_pdf_path = "/content/NUREG-1520.pdf"  # Update path
nureg_elements = partition_pdf(filename=nureg_pdf_path)

# Extract text: one blank line between consecutive elements
nureg_text = "\n\n".join([el.text for el in nureg_elements if hasattr(el, "text")])
print(f"✅ PDF parsed into {len(nureg_elements)} elements")


In [None]:
#######
###### CHUNKing

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the NUREG text into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # max chunk length in characters (the splitter's default length function is len), not words
    chunk_overlap=150,      # characters shared between neighbouring chunks, for better context
    separators=["\n\n", "\n", ".", " "]  # fallback breaks, tried in this order
)

nureg_chunks = splitter.split_text(nureg_text)

print(f"✅ Total NUREG Chunks: {len(nureg_chunks)}")
print("\n🔹 Sample Chunk:\n", nureg_chunks[0][:500])


##  Create Semantic Vector Store

In [None]:
# NOTE(review): versions are unpinned, and the cells below import from the legacy
# `langchain.embeddings` / `langchain.vectorstores` paths — confirm the installed
# LangChain release still provides them.
!pip install faiss-cpu sentence-transformers langchain --quiet


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings  ## LANGCHAINNN

# Embedding model shared by the vector-store build and reload cells below.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [None]:
##
## Build a Facebook AI Similarity Search (FAISS) Vector Store

## TODO: look at chromadb as an alternative store

from langchain.vectorstores import FAISS

# Build vector index from the NUREG chunks, embedded with `embedding_model`
vectorstore = FAISS.from_texts(nureg_chunks, embedding_model)

print("✅ Vector store built.")


In [None]:
# Persist the index to the "nureg_faiss_index" folder, then reload it into `vectorstore`.
vectorstore.save_local("nureg_faiss_index")   ## Come back
vectorstore = FAISS.load_local("nureg_faiss_index", embedding_model)  # must pass the same embedding model used for saving,
#or else queries might not match as expected.

# Per the LangChain FAISS docs, save_local() writes index.faiss and index.pkl into
# that folder; both files must be present for load_local() to work.
# NOTE(review): newer LangChain releases reportedly require
# allow_dangerous_deserialization=True in load_local() because index.pkl is a
# pickle — confirm against the installed version.
