In [8]:
import fitz  # PyMuPDF
import re
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

def get_year_and_week():
    """Give the current local year and week number, both as strings.

    Returns:
        A ``(year, week)`` tuple: ``year`` is the four-digit year (``%Y``) and
        ``week`` is the zero-padded week-of-year number as produced by ``%U``.
    """
    current = datetime.now()
    # Format specs on a datetime are forwarded to strftime.
    return f"{current:%Y}", f"{current:%U}"

def extract_report_data(pdf_path):
    """Extract key sections of a safety-report PDF and save them as a text file.

    The PDF text is collected block by block in visual order, then three parts
    are pulled out with regular expressions:

    * the text between "Information" and "DESCRIPTION" (its second non-empty
      line is used as the project name),
    * the text after "General Information" up to the next known section
      heading,
    * the text between "Corrective Action Required:" and "Additional Photos",
      treated as the incidents section.

    The result is written to ``<year>-<week>-Safety-<project_name>.txt`` in the
    current working directory and a confirmation line is printed.

    Args:
        pdf_path: Path of the PDF file; passed directly to ``fitz.open``.

    Returns:
        None.

    Raises:
        Whatever ``fitz.open`` or ``open`` raise for an unreadable PDF or an
        unwritable output file; nothing is caught here.
    """

    def extract_text_sorted(pdf_path):
        """Return the document text with blocks ordered top-to-bottom, left-to-right."""
        doc = fitz.open(pdf_path)
        try:
            kept = []
            for page in doc:
                blocks = page.get_text("blocks")  # Extract text blocks
                # Sort by y (top-to-bottom), then x
                for block in sorted(blocks, key=lambda b: (b[1], b[0])):
                    text = block[4]
                    stripped = text.strip()
                    # Drop blocks starting with "FORM ID" or a page number
                    if stripped.startswith("FORM ID") or re.match(r"^Page \d+", stripped):
                        continue
                    kept.append(text + "\n")
            return "".join(kept)
        finally:
            # Always release the document, even if text extraction fails.
            doc.close()

    full_text = extract_text_sorted(pdf_path)

    # Extract project name as the second non-empty line under Information section
    information_match = re.search(r"Information\n(.*?)(?=DESCRIPTION)", full_text, re.S)
    information = information_match.group(1).strip() if information_match else "No Information Found"
    information_lines = [line.strip() for line in information.splitlines() if line.strip()]
    project_name = information_lines[1] if len(information_lines) > 1 else "No_Project_Name"

    # Extract all text from General Information section
    general_info_match = re.search(
        r"General Information\n(.*?)(?=Housekeeping|Fall Protection|Incidents)",
        full_text,
        re.S,
    )
    general_info = general_info_match.group(1).strip() if general_info_match else "No General Information Found"

    # Extract incidents - include everything in the incidents section and calculate count
    incidents_match = re.search(
        r"Corrective Action Required:\n(.*?)(?=Additional Photos)", full_text, re.S
    )
    incidents = "No Incidents Reported"
    num_incidents = 0
    if incidents_match:
        incidents_section = incidents_match.group(1).strip()
        if "Description\nNo Response" not in incidents_section:
            incidents = incidents_section  # Include everything from the incidents section
            # An incident is counted only when all three of its labels are present,
            # hence the minimum of the three label counts.
            num_incidents = min(
                incidents_section.count("Description"),
                incidents_section.count("Responsible Party"),
                incidents_section.count("Completion Date"),
            )

    # Define the output file name using the project name.  Characters that are
    # not allowed in file names (path separators, ':' and so on) are replaced
    # so that an unusual project name cannot make open() fail or point elsewhere.
    year, week = get_year_and_week()
    safe_project_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", project_name.replace(' ', '_'))
    output_txt_path = f"{year}-{week}-Safety-{safe_project_name}.txt"

    report = "".join([
        "Health and Safety Weekly Report\n\n",
        "Information:\n",
        f"{information}\n\n",
        "General Information:\n",
        f"{general_info}\n\n",
        "Incidents:\n",
        f"Number of Incidents: {num_incidents}\n\n",
        f"{incidents}\n",
    ])
    # Explicit UTF-8 so non-ASCII text from the PDF cannot fail under a
    # platform-specific default encoding.
    with open(output_txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(report)

    print(f"Report generated and saved to {output_txt_path}")

def run_batch_processing():
    """Show an upload widget and turn every uploaded PDF into a text report.

    Displays a ``FileUpload`` widget, a "Process Files" button and an output
    area.  When the button is clicked, each uploaded file is written to a
    temporary ``.pdf`` file and handed to ``extract_report_data``; per-file
    errors are printed to the output area and do not stop the batch.

    Returns:
        None.
    """
    # Local imports keep this function self-contained.
    import os
    import tempfile

    output = widgets.Output()

    # Widgets for input and output selection
    pdf_upload = widgets.FileUpload(accept='.pdf', multiple=True)
    process_button = widgets.Button(description="Process Files", button_style='success')

    display(widgets.VBox([
        widgets.Label("Upload PDF Files:"), pdf_upload,
        widgets.HBox([process_button]),
        output
    ]))

    def iter_uploads(value):
        """Yield ``(file name, raw content)`` for either FileUpload value layout."""
        if isinstance(value, dict):
            # ipywidgets 7.x layout: {file name: {"metadata": ..., "content": bytes}}
            for name, info in value.items():
                yield name, info["content"]
        else:
            # ipywidgets 8.x layout: tuple of entries with "name" and "content"
            for entry in value:
                yield entry["name"], entry["content"]

    def process_files(b):
        """Button callback: process every uploaded file and report the outcome."""
        with output:
            if not pdf_upload.value:
                print("No PDF files uploaded.")
                return
            total = 0
            succeeded = 0
            for name, content in iter_uploads(pdf_upload.value):
                total += 1
                tmp_path = None
                try:
                    # extract_report_data expects a path, but the widget only
                    # holds the file content in memory, so spill it to disk.
                    fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
                    with os.fdopen(fd, "wb") as tmp_file:
                        tmp_file.write(content)
                    extract_report_data(tmp_path)
                    succeeded += 1
                except Exception as e:
                    # UI boundary: report the failure and continue with the next file.
                    print(f"Error processing file {name}: {e}")
                finally:
                    if tmp_path is not None:
                        try:
                            os.remove(tmp_path)
                        except OSError:
                            pass  # best-effort cleanup of the temporary copy
            if succeeded == total:
                print("Reports generated successfully!")
            else:
                print(f"{succeeded} of {total} reports generated; see errors above.")

    process_button.on_click(process_files)

# Build and display the upload UI only when this code runs as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    run_batch_processing()



VBox(children=(Label(value='Upload PDF Files:'), FileUpload(value=(), accept='.pdf', description='Upload', mul…