In [16]:
# !pip install easyocr
# !pip install pdf2image
# !pip install pymupdf
# !pip install bitsandbytes

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import easyocr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch


working_dir = "/content/drive/MyDrive/shorline_permit"
out_dir = os.path.join(working_dir, "data/outputs")

In [None]:
# --- CONFIGURATION ---
pdf_path = os.path.join(working_dir, "data/23_1913_JPA.pdf")
output_folder = os.path.join(working_dir, "data/test")
os.makedirs(output_folder, exist_ok=True)

In [None]:
# OCR from PDF
doc = fitz.open(pdf_path)
reader = easyocr.Reader(['en'])  # You can add languages if needed

all_text = ""
for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    pix = page.get_pixmap(dpi=300)
    image_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
    pix.save(image_path)
    result = reader.readtext(image_path, detail=0)
    page_text = "\n".join(result)
    all_text += f"\n--- Page {page_num + 1} ---\n{page_text}"

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `llm-rag` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-auth

In [None]:
# Few-shot Prompt Template
prompt_template = f"""
You are an assistant that extracts structured data from construction permit documents.
Extract the following fields from the input text and return them in JSON format:
- applicant_name
- property_address
- permit_type
- structure_type
- application_date
- approval_date

Input:
\"\"\"
{all_text}
\"\"\"

Output (as JSON):
"""

# -------- Step 4: Run LLM Extraction --------
print("Running LLM extraction...")
response = llm(prompt_template, max_new_tokens=512, do_sample=False)[0]['generated_text']


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import textwrap


class HuggingFacePermitExtractor:
    def __init__(
        self,
        model_name: str,
        device_map: str = "auto",
        load_in_4bit: bool = True,
        torch_dtype=torch.bfloat16,
        default_generate_kwargs: dict = None,
        max_input_tokens: int = 3500,  # stay under model limit
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=device_map,
            load_in_4bit=load_in_4bit,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
        self.default_generate_kwargs = default_generate_kwargs or {
            "do_sample": False,
            "max_new_tokens": 1000,
            "temperature": 0.3,
        }
        self.max_input_tokens = max_input_tokens

    def _truncate_input(self, text: str) -> str:
        tokens = self.tokenizer.encode(text, truncation=True, max_length=self.max_input_tokens)
        return self.tokenizer.decode(tokens)

    def build_prompt(self, unstructured_text: str) -> list:
        truncated_text = self._truncate_input(unstructured_text)
        prompt = (
            "You are an expert assistant that extracts structured information from permit applications. "
            "Given the following text from a coastal structure permit application, extract the relevant details in JSON format, "
            "with keys such as: applicant_name, project_title, project_location, permit_id (if any), date_received, "
            "proposed_structure, purpose\n\n"
            f"Permit Text:\n{truncated_text}\n\n"
            "JSON Output:"
        ) #and any other important fields.
        return [{"role": "user", "content": prompt}]

    def extract_structured_data(self, permit_text: str, generate_kwargs: dict = None) -> str:
        messages = self.build_prompt(permit_text)
        model_inputs = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, return_tensors="pt"
        ).to(self.model.device)

        input_length = model_inputs.shape[1]

        gen_args = self.default_generate_kwargs.copy()
        if generate_kwargs:
            gen_args.update(generate_kwargs)

        with torch.no_grad():
            generated_ids = self.model.generate(model_inputs, **gen_args)

        output_text = self.tokenizer.decode(
            generated_ids[:, input_length:].squeeze(), skip_special_tokens=True
        )
        return output_text


# Example usage
if __name__ == "__main__":
    llm_extractor = HuggingFacePermitExtractor(
        model_name="HuggingFaceH4/zephyr-7b-alpha",
        default_generate_kwargs={
            "do_sample": False,
            "max_new_tokens": 1000,
            "temperature": 0.2,
        },
    )

    # Replace with your real permit application content
    raw_text = """
    Applicant: John Doe
    Location: 1025 Coastal Road, Virginia Beach, VA
    Proposed Project: Construction of a riprap revetment along 80 feet of shoreline
    Date Submitted: February 12, 2024
    Purpose: Shoreline erosion protection
    Permit ID: VMRC-2024-0912
    """

    output = llm_extractor.extract_structured_data(raw_text)
    print(output)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


{
  "applicant_name": "John Doe",
  "project_title": "Construction of a riprap revetment along 80 feet of shoreline",
  "project_location": "1025 Coastal Road, Virginia Beach, VA",
  "permit_id": "VMRC-2024-0912",
  "date_received": "February 12, 2024",
  "purpose": "Shoreline erosion protection"
}


In [None]:

raw_text = all_text
output = llm_extractor.extract_structured_data(raw_text)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{
  "applicant_name": "Yz ounce of slow-release fertilizer per plant placed in hole with roots.",
  "project_title": "PrQpOSEd Rip Rap Revetment fOr ROnald COENE",
  "project_location": "Eastern BanCh Corrotoman River, PO. BOX 315, Site Street, AdOress: SCGIE: KilmarnOCK, VA 679 Mastons WHarf ROad 22482",
  "permit_id": "10999",
  "date_received": "August 16, 2023",
  "proposed_structure": "PrQpOSEd Rip Rap Revetment",
  "purpose": "To stabilize the shoreline and prevent erosion.",
  "draft": "5'-8'",
  "registration_number": "5",
  "for_marinas_commercial_piers_governmental_piers_community_piers_and_other_non_private_piers": "Provide the following information:
A) Have you obtained approval for sanitary facilities from the Virginia Department of Health? (required pursuant to Section 28.2-1205 C of the Code of Virginia).",
  "petroleum_products_or_other_hazardous_materials_are_stored_or_handled_at_your_facility?": "No.",
  "will_petroleum_products_or_other_hazardous_materials_be_stored_

In [None]:
all_text

'\n--- Page 1 ---\nFrom:\nJoey Scott\nTo:\nMRC\njpa Permits\nCc:\nWoodward_Jay (MRC); Olivia Hall; Alex Green; Kelsey English\nSubject:\nCoene\nRap Project\nDate:\nWednesday, August 16, 2023 2:13.28 PM\nAttachments:\nCoeneJPAVMRCFinalSig pdf\nAttached is a rip rap\nproject in Lancaster county.\nI\nbelieve the entire project is subaqueous, however, there is & portion of nourishment that will\noccur in the inter-tidal area.\nbut I do not believe that wetlands approval is needed to nourish\nan\nexisting inter-tidal area.\nLet me know if that is not the case.\nNo\ndeal_ just need to\ncollect the fee if a hearing is required.\nThanks!\nReceived by VMRC August 16,2023\n/blh\nRip\nbig\nJoey\n--- Page 2 ---\nDEQ: Permit application fees required for Virginia Water Protection permits\nwhile detailed in\n9VAC25-20\n3\nare\nconveyed to the applicant by the applicable DEQ office\n(http:/Iwww deg virginia gov/Locations aspx):\nComplete the Permit Application Fee Form and\nsubmit it per the instruct

In [None]:
import re

def clean_text(text):
    # Remove headers, footers, and page numbers if patterns are known
  text = re.sub(r'Page \d+ of \d+', '', text)
  text = re.sub(r'\s+', ' ', text)  # collapse whitespace
  text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)  # fix broken lines
  return text.strip()

In [None]:
clean_text = clean_text(all_text)

In [17]:
clean_text

'--- Page 1 --- From: Joey Scott To: MRC jpa Permits Cc: Woodward_Jay (MRC); Olivia Hall; Alex Green; Kelsey English Subject: Coene Rap Project Date: Wednesday, August 16, 2023 2:13.28 PM Attachments: CoeneJPAVMRCFinalSig pdf Attached is a rip rap project in Lancaster county. I believe the entire project is subaqueous, however, there is & portion of nourishment that will occur in the inter-tidal area. but I do not believe that wetlands approval is needed to nourish an existing inter-tidal area. Let me know if that is not the case. No deal_ just need to collect the fee if a hearing is required. Thanks! Received by VMRC August 16,2023 /blh Rip big Joey --- Page 2 --- DEQ: Permit application fees required for Virginia Water Protection permits while detailed in 9VAC25-20 3 are conveyed to the applicant by the applicable DEQ office (http:/Iwww deg virginia gov/Locations aspx): Complete the Permit Application Fee Form and submit it per the instructions to the address listed on the form. Inst

In [None]:
raw_text = clean_text
output = llm_extractor.extract_structured_data(raw_text)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{
  "applicant_name": "",
  "project_title": "",
  "project_location": "",
  "permit_id": "",
  "date_received": "August 16, 2023",
  "proposed_structure": "Any applicable local ordinances, such as piers include an attached boat lift and an open-sided roof designed to shelter a single boat slip or boat lift: In cases in which open-sided roofs designed to shelter a single boat, boat slip or boat lift will exceed 700 square feet in coverage or the open-sided shelter roofs or gazebo structures exceed 400 square feet, and in cases in which an adjoining property owner objects to a proposed roof structure, permits shall be required as provided in $ 28.2-1204.",
  "purpose": "",
  "justification": "",
  "type": "",
  "size": "",
  "registration_number": "",
  "for_marinas_commercial_piers_governmental_piers_community_piers_other_non_private_piers": {
    "have_you_obtained_approval_for_sanitary_facilities_from_the_virginia_department_of_health?": "Required pursuant to Section 28.2-1205 C of t