In [2]:
import requests
import pdfplumber
from io import BytesIO
import openai
from openai import OpenAI
import json
import re
import pandas as pd
from dotenv import load_dotenv
import os
from llama_parse import LlamaParse
import nest_asyncio
nest_asyncio.apply()

In [8]:
# Instruction text for the enrollment-extraction step. The driver loop passes this
# string as the `prompt` argument of data_extraction(), which places it in front of
# the combined table text before calling the chat model.
#
# NOTE(review): the text below is sent to the model verbatim, so editing it changes
# model behaviour. Known issues left untouched here on purpose:
#   - the JSON example is not valid JSON ("48000." ends with a period instead of a
#     comma, and the "2022" entries have no value);
#   - the ordering rule refers to an "all the fields" part that does not exist in
#     this prompt (the list is headed "Fields marked as most important");
#   - minor typos ("combined", "enrollement").
# TODO confirm with the prompt owner before correcting any of these.
enrollment_template_prompt = """
You are given tables extracted from a school's PDF disclosure document.

Your task is to extract enrollment-related data from these tables. The fields of interest include, but are not limited to, the following:

Fields marked as most important:
- Undergraduate Headcount
- Graduate Headcount
- Professional Headcount (Tip: combined the enrollement of the profession related school such as med school, law school, etc)
- Total Headcount
- Undergraduate FTE
- Graduate FTE
- Professional FTE
- Total Full-Time Equivalent Students
- Applications Rcvd
- Acceptances
- Matriculants
- Retention Rate %
- Full-Time Employee Equivalents
- Tuition
- Room & Board (20 meals)

IF the year is not included in the table, use the pre context to figure out the school year the table is responsible for.

DO NOT assume the year of the data. 

Extract data STRICTLY for the years 2019 through 2024, if available (although not all data going to be in that order).

Important:
**Do not assume** that the document’s publication or posting year is the same as the data year. Carefully read and extract the actual years provided in the tables themselves. For example, a document published in 2025 might contain data for the fiscal year ending in 2024.

### Output Format:
Return a JSON object where each key is a field name and each value is a dictionary of {year: value}, such as:

{
  "Undergraduate Headcount": {
    "2024": 4107,  
    "2023": 4100,
    "2022":
    ...
  },
  "Graduate Headcount": {
    "2024": 49800,
    "2023": 48000.
    "2022":
    ...
  }
}
The output format order HAS to follow the order of the fields I listed above at the "all the fields" part.

If a field is not found, omit it. I would prefer no data over faulty data. Return **only** the JSON.
"""


In [24]:
def extract_tables(llama_api_key, filePath):
    """Parse a PDF with LlamaParse and keep the documents that look like table output.

    Args:
        llama_api_key: API key forwarded to ``LlamaParse``.
        filePath: Path handed to the parser's ``load_data``.

    Returns:
        list: The parsed documents, in their original order, whose ``text``
        (a) matches neither of the two rejection phrases and
        (b) either starts (after leading whitespace) with
        "Here is the extracted information" or has a line beginning with four
        digits followed by whitespace.
    """
    parser = LlamaParse(
        api_key=llama_api_key,
        result_type="markdown",
        system_prompt="""
        This is the a school financial report. I would need you to extract information which are related to enrollment, headcount, FTE, admission, faculty, tuition and fees, student characteristics(Graduation rate, ACT, SAT scores etc.)
        tips: Sometimes Graduate could be written as GR or Grad or, some university has multiple graduate schools, such as school of business, college of liberal artsm or GR Non-degree. you should include these data as well.
        tips: Include the according year of the information, sometimes it's no inlucded in the table, so look around it.
        """
        )
    parsed_documents = parser.load_data(filePath)

    # Phrases that disqualify a document outright (presumably the parser's
    # "nothing to extract" replies -- TODO confirm against real output).
    rejection_patterns = (
        r"It seems that the text provided is not",
        r"does not contain",
    )

    def _is_table_like(page_text):
        # A rejection phrase wins over every acceptance rule.
        if any(re.search(pattern, page_text) for pattern in rejection_patterns):
            return False
        if page_text.lstrip().startswith("Here is the extracted information"):
            return True
        # Otherwise accept text containing a line that opens with a 4-digit number.
        return re.search(r"^\d{4}\s+", page_text, re.MULTILINE) is not None

    return [doc for doc in parsed_documents if _is_table_like(doc.text)]

def data_extraction(tables, prompt, max_chars = 100000):
    """Ask the chat model to extract data from the parsed tables.

    The texts of ``tables`` are joined with blank lines, appended to ``prompt``
    under a "### Tables Extracted:" heading, and sent as a single user message
    through the module-level ``client``.

    Args:
        tables: Iterable of objects exposing a ``.text`` attribute.
        prompt: Instruction text placed before the tables.
        max_chars: Kept for backward compatibility. It is not applied to the
            request in this function. TODO(review): decide whether the combined
            text should be capped at this length.

    Returns:
        The ``content`` of the first choice of the completion response.

    Raises:
        Exception: Whatever the API call (or reading its response) raises. The
            error is printed and re-raised unchanged so the caller can skip the
            file. The previous handler referenced an undefined ``filename`` and
            then returned an unbound ``results``, which masked the real error.
    """
    comb_tables = "\n\n".join(doc.text for doc in tables)
    input_prompt = f"{prompt}\n\n### Tables Extracted:\n\n{comb_tables}"

    try:
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": input_prompt}],
            temperature=0.2,
            max_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error extracting data from tables: {e}")
        # Propagate the original exception instead of hiding it.
        raise

def clean_output(raw_text):
    """Parse the model's reply into Python data, tolerating common JSON slip-ups.

    Cleaning steps, in order:
      1. Remove Markdown code-fence markers and surrounding whitespace.
      2. Remove ``//`` comments that run to the end of a line.
      3. Remove trailing commas that sit directly before ``}`` or ``]``.

    Steps 2 and 3 skip over JSON string literals, so values such as
    "https://example.com" or "a, }" are left intact. The earlier plain regexes
    rewrote such strings and could corrupt or invalidate the payload.

    Args:
        raw_text: The text returned by the model.

    Returns:
        The object produced by ``json.loads`` on the cleaned text.

    Raises:
        json.JSONDecodeError: If the cleaned text is still not valid JSON.
        TypeError: If ``raw_text`` is not a string.
    """
    # A double-quoted JSON string, including backslash escapes.
    json_string = r'"(?:\\.|[^"\\])*"'

    def _keep_strings(match):
        # String literals are put back untouched; anything else the pattern
        # matched (a comment or a trailing comma) is dropped.
        token = match.group(0)
        return token if token.startswith('"') else ""

    cleaned = re.sub(r"```json|```", "", raw_text).strip()

    # Comments must go first: a comma followed by a comment and then a closing
    # bracket only becomes recognisable as trailing once the comment is gone.
    cleaned = re.sub(json_string + r"|//[^\n]*", _keep_strings, cleaned)
    cleaned = re.sub(json_string + r"|,(?=\s*[}\]])", _keep_strings, cleaned)

    return json.loads(cleaned)

In [12]:
# Load environment configuration first (python-dotenv; presumably reads a local .env file).
load_dotenv()

# API keys are read from the environment; either is None when its variable is unset.
llama_api_key = os.getenv("LLAMAPARSE_API_KEY")
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Shared chat client referenced as a global by data_extraction().
client = OpenAI()

In [46]:
# Batch driver: for every school sub-folder of `pdf_folder`, run each PDF through
# extract_tables -> data_extraction -> clean_output and save the result as an
# Excel workbook named after the school folder.
pdf_folder = "pdfs"
output_folder = "output"

os.makedirs(output_folder, exist_ok=True)

# sorted() makes the processing order deterministic across runs.
for school_folder in sorted(os.listdir(pdf_folder)):
    school_path = os.path.join(pdf_folder, school_folder)
    # Skip plain files and hidden directories such as ".ipynb_checkpoints",
    # which are not school folders.
    if school_folder.startswith(".") or not os.path.isdir(school_path):
        continue

    print(f"\n Processing: {school_folder}")

    for file in sorted(os.listdir(school_path)):
        # Accept ".pdf" in any letter case.
        if not file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(school_path, file)
        print(f"File: {file}")

        try:
            tables = extract_tables(llama_api_key, pdf_path)
            gpt_output = data_extraction(tables, enrollment_template_prompt)
            data = clean_output(gpt_output)

            # Fields become rows, years become columns.
            df = pd.DataFrame(data).T
            df.columns.name = "Year"
        except Exception as e:
            print(f"Skipping {file} due to error: {e}")
            # Without this, the code below would write the previous file's
            # data (or hit an undefined name) for the file that just failed.
            continue

        # NOTE(review): the workbook is named after the school folder, so a
        # folder with several PDFs keeps only the last one processed.
        output_file = os.path.join(output_folder, f"{school_folder}.xlsx")
        df.to_excel(output_file, index=True)
        print(f"Saved: {output_file}")



 Processing: MT_ST_MARY
File: Mt St Mary's fall 24 continuing disclosure.pdf
Started parsing the file under job_id e1f11cd4-c0f9-4fda-8a8b-91f4e5d243d1
Saved: output/MT_ST_MARY.xlsx

 Processing: U_La_Verne
File: U La Verne CA fall 23 continuing disclosure.pdf
Started parsing the file under job_id 0038a2fd-f000-41d3-973e-63a24826516b
Saved: output/U_La_Verne.xlsx

 Processing: .ipynb_checkpoints
