In [50]:
import requests
import pdfplumber
from io import BytesIO
import openai
from openai import OpenAI
import json
import re
import pandas as pd
from dotenv import load_dotenv
import os
from llama_parse import LlamaParse
import nest_asyncio
nest_asyncio.apply()

In [52]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Set the module-level key on the `openai` package from the environment
# (None when OPENAI_API_KEY is not defined).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Shared client object; data_extraction() issues its chat completion
# request through it.
client = OpenAI()

In [129]:
# def extract_tables(pdf_path, api_key=None, use_ocr=True):
#     if not os.path.isfile(pdf_path):
#         raise FileNotFoundError(f"File not found: {pdf_path}")
    
#     parser = LlamaParse(
#         api_key=api_key or os.getenv("LLAMA_CLOUD_API_KEY"),
#         result_type="markdown",
#         extract_charts=True,
#         auto_mode=True,
#         auto_mode_trigger_on_image_in_page=True,
#         auto_mode_trigger_on_table_in_page=True,
#     )

#     docs = parser.load_data(pdf_path)
#     tables = []

#     for doc in docs:
#         if hasattr(doc, 'metadata') and 'tables' in doc.metadata:
#             tables = doc.metadata['tables']
#         else:
#             lines = doc.text.split('\n')
#             start_idxs, end_idxs = [], []
#             in_table, start = False, 0

#             for i, line in enumerate(lines):
#                 line = line.strip()
#                 if line.startswith('|') and '|' in line[1:]:
#                     if not in_table:
#                         in_table, start = True, i
#                 elif in_table and '---' in line and '|' in line:
#                     continue
#                 elif in_table and (not line.startswith('|') or line == ''):
#                     in_table = False
#                     start_idxs.append(start)
#                     end_idxs.append(i)

#             if in_table:
#                 start_idxs.append(start)
#                 end_idxs.append(len(lines))

#             for s, e in zip(start_idxs, end_idxs):
#                 content = '\n'.join(lines[s:e])
#                 tables.append({
#                     'content': content,
#                     'type': 'table',
#                     'page_number': None,
#                 })

#         if not tables and use_ocr and hasattr(doc, 'pages'):
#             for i, page in enumerate(doc.pages):
#                 text = page.text if hasattr(page, 'text') else ""
#                 if text.count('|') > 10:
#                     lines = text.split('\n')
#                     t_lines = []
#                     in_block = False

#                     for line in lines:
#                         if '|' in line and line.count('|') >= 3:
#                             if not in_block:
#                                 in_block = True
#                                 t_lines = [line]
#                             else:
#                                 t_lines.append(line)
#                         elif in_block and len(line.strip()) < 3:
#                             in_block = False
#                             if len(t_lines) >= 3:
#                                 tables.append({
#                                     'content': '\n'.join(t_lines),
#                                     'type': 'table',
#                                     'page_number': i + 1,
#                                     'extracted_via': 'heuristic'
#                                 })
#                             t_lines = []
#                         elif in_block:
#                             t_lines.append(line)

#                     if in_block and len(t_lines) >= 3:
#                         tables.append({
#                             'content': '\n'.join(t_lines),
#                             'type': 'table',
#                             'page_number': i + 1,
#                             'extracted_via': 'heuristic'
#                         })

#     return tables

In [204]:
def _line_start_offsets(lines):
    """Return the offset at which each line starts inside '\\n'.join(lines)."""
    offsets = []
    position = 0
    for line in lines:
        offsets.append(position)
        position += len(line) + 1  # +1 for the newline removed by split('\n')
    return offsets


def _leading_context(text, position, context_chars):
    """Return up to `context_chars` characters of `text` that precede `position`, stripped."""
    return text[max(0, position - context_chars):position].strip()


def _markdown_table_spans(lines):
    """Return (start, end) line-index pairs (end exclusive) of markdown pipe tables."""
    spans = []
    in_table, start = False, 0

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line.startswith('|') and '|' in line[1:]:
            if not in_table:
                in_table, start = True, i
        elif in_table and '---' in line and '|' in line:
            # Separator row written without a leading pipe: still part of the table.
            continue
        elif in_table and not line.startswith('|'):
            in_table = False
            spans.append((start, i))

    if in_table:
        # Table runs to the end of the text.
        spans.append((start, len(lines)))

    return spans


def _heuristic_pipe_tables(page_text, page_number, context_chars):
    """Find table-like blocks (lines with at least three '|') in plain page text."""
    lines = page_text.split('\n')
    offsets = _line_start_offsets(lines)

    blocks = []  # (index of first line, list of lines in the block)
    block_start, block_lines = None, []

    for index, line in enumerate(lines):
        if line.count('|') >= 3:
            if block_start is None:
                block_start, block_lines = index, [line]
            else:
                block_lines.append(line)
        elif block_start is not None:
            if len(line.strip()) < 3:
                # A (nearly) blank line terminates the current block.
                blocks.append((block_start, block_lines))
                block_start, block_lines = None, []
            else:
                # Non-pipe line inside a block is kept as part of it.
                block_lines.append(line)

    if block_start is not None:
        blocks.append((block_start, block_lines))

    # The context is taken from the block's real position. Searching for the
    # block text instead would return the first textual match, which is the
    # wrong place whenever the same block appears twice on a page.
    return [
        {
            'content': '\n'.join(rows),
            'type': 'table',
            'page_number': page_number,
            'context_before': _leading_context(page_text, offsets[start], context_chars),
            'extracted_via': 'heuristic'
        }
        for start, rows in blocks
        if len(rows) >= 3  # blocks shorter than three lines are discarded
    ]


def extract_tables(pdf_path, api_key=None, use_ocr=True, context_chars=100):
    """Parse a PDF with LlamaParse and return the tables found in its markdown.

    Args:
        pdf_path: Path to an existing PDF file.
        api_key: LlamaParse API key. When omitted, the LLAMA_CLOUD_API_KEY
            environment variable is used. No key is embedded in the code.
        use_ocr: When True, a document in which no markdown table was found
            and which exposes a `pages` attribute is scanned page by page
            with a pipe-count heuristic.
        context_chars: Number of characters preceding each table to return
            as `context_before`.

    Returns:
        A list of dicts with keys 'content', 'type', 'page_number' and
        'context_before'; heuristic matches also carry 'extracted_via'.
        'page_number' is None for tables taken from the markdown text.

    Raises:
        FileNotFoundError: `pdf_path` is not an existing file.
        ValueError: no API key was passed and LLAMA_CLOUD_API_KEY is unset.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    resolved_key = api_key or os.getenv("LLAMA_CLOUD_API_KEY")
    if not resolved_key:
        raise ValueError(
            "No LlamaParse API key: pass api_key= or set LLAMA_CLOUD_API_KEY"
        )

    parser = LlamaParse(
        api_key=resolved_key,
        result_type="markdown",
        extract_charts=True,
        auto_mode=True,
        auto_mode_trigger_on_image_in_page=True,
        auto_mode_trigger_on_table_in_page=True,
    )

    docs = parser.load_data(pdf_path)
    tables = []

    for doc in docs:
        full_text = doc.text or ""
        lines = full_text.split('\n')
        offsets = _line_start_offsets(lines)

        doc_tables = [
            {
                'content': '\n'.join(lines[start:end]),
                'type': 'table',
                'page_number': None,
                'context_before': _leading_context(full_text, offsets[start], context_chars),
            }
            for start, end in _markdown_table_spans(lines)
        ]

        # The fallback is decided per document, so a table found in an earlier
        # document does not disable it for later ones.
        if not doc_tables and use_ocr and hasattr(doc, 'pages'):
            for page_index, page in enumerate(doc.pages):
                page_text = page.text if hasattr(page, 'text') else ""
                if page_text.count('|') > 10:
                    doc_tables.extend(
                        _heuristic_pipe_tables(page_text, page_index + 1, context_chars)
                    )

        tables.extend(doc_tables)

    return tables


In [285]:
# Instruction prompt sent to the model ahead of the extracted tables.
# The example in "Output Format" is kept as valid JSON so the model is not
# shown a malformed sample to imitate.
enrollment_template_prompt = """
You are given tables extracted from a school's PDF disclosure document.

Your task is to extract enrollment-related data from these tables. The fields of interest include, but are not limited to, the following:

Fields marked as most important:
- Undergraduate Headcount
- Graduate Headcount
- Professional Headcount (Tip: combine the enrollment of profession-related schools such as med school, law school, etc.)
- Total Headcount
- Undergraduate FTE
- Graduate FTE
- Professional FTE
- Total Full-Time Equivalent Students
- Applications Rcvd
- Acceptances
- Matriculants
- Retention Rate %
- Full-Time Employee Equivalents
- Tuition
- Room & Board (20 meals)

If the year is not included in the table, use the context preceding the table to figure out the school year the table covers.

DO NOT assume the year of the data.

Extract data STRICTLY for the years 2019 through 2024, if available (although not all data will appear in that order).

Important:
**Do not assume** that the document’s publication or posting year is the same as the data year. Carefully read and extract the actual years provided in the tables themselves. For example, a document published in 2025 might contain data for the fiscal year ending in 2024.

### Output Format:
Return a JSON object where each key is a field name and each value is a dictionary of {year: value}, such as:

{
  "Undergraduate Headcount": {
    "2024": 4107,
    "2023": 4100,
    "2022": 4050
  },
  "Graduate Headcount": {
    "2024": 49800,
    "2023": 48000,
    "2022": 47500
  }
}

The numbers in the example above are placeholders that only illustrate the format; never copy them into your answer.

The output order HAS to follow the order of the fields listed above.

If a field is not found, omit it. I would prefer no data over faulty data. Return **only** the JSON.
"""


In [287]:
def data_extraction(tables, prompt, max_chars = 100000):
    """Send the extracted tables to the chat model and return its raw reply.

    Args:
        tables: Iterable of table records. Only dict entries that have a
            'content' key are used; anything else is skipped. When a record
            has a non-empty 'context_before', that text is placed directly
            above the table so the model can see what preceded it.
        prompt: Instruction text placed ahead of the tables.
        max_chars: Accepted for backward compatibility. It is not applied:
            the tables are sent in full and nothing is truncated.

    Returns:
        The text content of the first completion choice, or None when the
        request fails (the error is printed).
    """
    sections = []
    for table in tables:
        if not (isinstance(table, dict) and 'content' in table):
            continue
        context = table.get('context_before')
        if context:
            sections.append(f"Context before table: {context}\n{table['content']}")
        else:
            sections.append(table['content'])

    combined_table_text = "\n\n".join(sections)
    input_prompt = f"{prompt}\n\n### Tables Extracted:\n\n{combined_table_text}"

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": input_prompt}],
            temperature=0.2,
            max_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: report and let the caller handle the missing result.
        # The handler must not reference names that do not exist in this
        # scope, and the function must still return a defined value.
        print(f"Error extracting data from tables: {e}")
        return None

In [289]:
def clean_output(raw_text):
    """Parse the model's reply into a Python object.

    Markdown code fences are removed first. If the remaining text is already
    valid JSON it is parsed as is. Otherwise `//` line comments and trailing
    commas are removed and parsing is retried. Both clean-ups skip over
    double-quoted strings, so values such as "http://example.com" survive.

    Args:
        raw_text: The reply text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        TypeError: `raw_text` is not a string (for example None).
        json.JSONDecodeError: the text is not valid JSON even after clean-up.
    """
    if not isinstance(raw_text, str):
        raise TypeError(
            f"expected model output as str, got {type(raw_text).__name__}"
        )

    cleaned = re.sub(r"```(?:json)?", "", raw_text, flags=re.IGNORECASE).strip()

    # Do not touch text that already parses.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # A double-quoted JSON string, including escaped characters.
    string_literal = r'"(?:\\.|[^"\\])*"'

    def keep_strings(match):
        # Group 1 is set when a string literal matched: keep it unchanged.
        # Otherwise the match is a comment or trailing comma: drop it.
        return match.group(1) or ''

    # Remove inline comments: // anything after, outside of strings.
    cleaned = re.sub('(' + string_literal + r')|//[^\n]*', keep_strings, cleaned)

    # Remove commas directly before a closing } or ], outside of strings.
    cleaned = re.sub('(' + string_literal + r')|,(?=\s*[}\]])', keep_strings, cleaned)

    return json.loads(cleaned)


In [291]:
# Pipeline run for one document: parse the PDF, ask the model for the
# enrollment fields, decode the reply, and save it as JSON.
# `pdf_folder` holds the path of a single PDF file, despite its name.
pdf_folder = "pdfs/Mt St Mary's fall 24 continuing disclosure.pdf"
all_tables = extract_tables(pdf_folder)
gpt_output = data_extraction(all_tables, enrollment_template_prompt)

# An unparseable reply is reported and replaced by an empty result so the
# remaining cells still run.
try:
    results = clean_output(gpt_output)
except Exception as parse_error:
    print("JSON parsing failed:", parse_error)
    print("GPT Raw Output:\n", gpt_output)
    results = {}

with open("enrollment_data.json", "w") as out_file:
    json.dump(results, out_file, indent=2)

Started parsing the file under job_id 0bed4249-c8b5-412c-a01d-0f355be9d413


In [292]:
# Tabulate the parsed result held in `results`; the bare name on the last
# line displays the frame.
df1 = pd.DataFrame(data=results)
df1

Unnamed: 0,Undergraduate Headcount,Graduate Headcount,Professional Headcount,Total Headcount,Undergraduate FTE,Graduate FTE,Professional FTE,Total Full-Time Equivalent Students,Applications Rcvd,Acceptances,Matriculants,Retention Rate %,Full-Time Employee Equivalents,Tuition,Room & Board (20 meals)
2024,1830.0,2408.0,2408.0,2408.0,1830.0,2134.0,2134.0,2134.0,6041.0,4459.0,545.0,77.0,,48630,15050
2023,1873.0,2499.0,2499.0,2499.0,1873.0,2255.0,2255.0,2255.0,5437.0,3896.0,482.0,75.0,,47240,14750
2022,1896.0,2456.0,2456.0,2456.0,1896.0,2248.0,2248.0,2248.0,4346.0,3458.0,459.0,79.0,,45870,14320
2021,2055.0,2570.0,2570.0,2570.0,2055.0,2367.0,2367.0,2367.0,4620.0,3777.0,532.0,74.0,,44750,13960
2020,2072.0,2561.0,2561.0,2561.0,2072.0,2379.0,2379.0,2379.0,6442.0,5221.0,647.0,76.0,,43650,13630
2019,,,,,,,,,,,,,,42590,13330


In [293]:
# Same pipeline for a second document. This reuses the names `pdf_folder`,
# `all_tables`, `gpt_output` and `results`, and writes to the same
# "enrollment_data.json" as the earlier run cell, replacing its contents.
pdf_folder = "pdfs/U La Verne CA fall 23 continuing disclosure.pdf"
all_tables = extract_tables(pdf_folder)
gpt_output = data_extraction(all_tables, enrollment_template_prompt)

# An unparseable reply is reported and replaced by an empty result so the
# remaining cells still run.
try:
    results = clean_output(gpt_output)
except Exception as parse_error:
    print("JSON parsing failed:", parse_error)
    print("GPT Raw Output:\n", gpt_output)
    results = {}

with open("enrollment_data.json", "w") as out_file:
    json.dump(results, out_file, indent=2)

Started parsing the file under job_id 98c8cc93-fbbc-419c-88be-6b8bda6be9ca


In [294]:
# Tabulate the parsed result held in `results` after the second run; the
# bare name on the last line displays the frame.
df2 = pd.DataFrame(data=results)
df2

Unnamed: 0,Undergraduate Headcount,Graduate Headcount,Professional Headcount,Total Headcount,Undergraduate FTE,Graduate FTE,Professional FTE,Total Full-Time Equivalent Students,Applications Rcvd,Acceptances,Matriculants,Tuition,Room & Board (20 meals)
2024,3833,2215,620,8277,3121,1419,544,4984,,,,,
2023,3379,1731,633,7343,3088,1167,428,4683,5994.0,5659.0,734.0,48300.0,15750.0
