In [None]:
import pdfplumber
import re
import pandas as pd

def sanitize_text(text):
    """Return *text* with ASCII control characters removed.

    Characters in the ranges 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F are dropped.
    Tab (0x09), line feed (0x0A) and carriage return (0x0D) fall outside
    those ranges and are therefore preserved.
    """
    control_chars = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    return control_chars.sub('', text)

def _first_group_or_default(pattern, text, default="Not found"):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def extract_info_from_pdf(pdf_path):
    """Extract principal, e-mail and grade span from every page of a PDF.

    Each page's text is sanitized and searched for the labels
    ``Principal:``, ``E-mail:`` and ``Grade Span:``. Only the first
    occurrence of each label on a page is used.

    Args:
        pdf_path: Path to the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list with one dict per page, with keys ``'Principal'``,
        ``'Email'`` and ``'Grade Span'``. A field whose label is not found
        on the page is set to ``"Not found"``.
    """
    # Compiled once here rather than re-resolved for every page.
    # Insertion order determines the key order of each result dict.
    field_patterns = {
        'Principal': re.compile(r"Principal:\s*(.+)"),
        'Email': re.compile(r"E-mail:\s*(\S+)"),
        'Grade Span': re.compile(r"Grade Span:\s*(.+)"),
    }

    all_data = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() may return None for a page without extractable
            # text (e.g. a scanned image) in some pdfplumber versions; treat
            # that as empty text so the page yields "Not found" fields
            # instead of raising TypeError inside re.sub.
            text = page.extract_text() or ""
            sanitized_text = sanitize_text(text)

            all_data.append({
                column: _first_group_or_default(pattern, sanitized_text)
                for column, pattern in field_patterns.items()
            })

    return all_data

# Input PDF and output workbook locations.
pdf_path = "school.pdf"
excel_file_path = "final-list.xlsx"

# Collect one record per PDF page.
all_info = extract_info_from_pdf(pdf_path)

# Tabulate the records and write them out without the index column.
df = pd.DataFrame(data=all_info)
df.to_excel(excel_file_path, index=False)

print("Data saved to:", excel_file_path)
