In [8]:
import fitz  # PyMuPDF
import pandas as pd

# Path to the PDF file
pdf_path = r'C:\Users\Foxhound\OneDrive\Documents\UMGC\Summit\CDERAA.pdf'

# Open the PDF
pdf_document = fitz.open(pdf_path)

# Extract text from all pages
page_text = ""
for page_num in range(pdf_document.page_count):
    page = pdf_document[page_num]
    page_text += page.get_text("text")

# Split the text into lines
lines = page_text.split('\n')

# Define headers for the DataFrame
headers = [
    "Application Number", "Proprietary Name", "Established Name", "Applicant", 
    "FDA Received Date", "Accelerated Approval Date", "Total Time to Accelerated Approval (Months)", 
    "Accelerated Approval Indication", "Conversion-Withdrawal Status", "Full Approval Conversion-Withdrawal Date"
]

# Initialize an empty list to store the table rows
table_data = []

# Temporary row to store current row values
current_row = []

# Function to check if a line starts a new entry
def starts_new_entry(line):
    return line.startswith("NDA") or line.startswith("BLA")

# Iterate over the lines and extract the rows
for line in lines:
    if starts_new_entry(line):
        if current_row:
            table_data.append(current_row)
        current_row = line.split(maxsplit=3)
    else:
        # Continue appending the rest of the columns to the current row
        current_row.extend(line.split())

# Don't forget to append the last collected row
if current_row:
    table_data.append(current_row)

# Clean rows to match header length by truncating or padding with None
cleaned_data = []
for row in table_data:
    if len(row) > len(headers):
        cleaned_data.append(row[:len(headers)])
    else:
        cleaned_data.append(row + [None] * (len(headers) - len(row)))

# Convert the list of rows into a DataFrame
df = pd.DataFrame(cleaned_data, columns=headers)

# Display the DataFrame
print(df)

# Optionally, save to a CSV file
df.to_csv("CDER_Accelerated_Approvals.csv", index=False)


    Application Number Proprietary Name Established Name       Applicant  \
0          Application           Number      Proprietary            Name   
1                  NDA           203469       Supplement              37   
2                  NDA           217785        REZDIFFRA      RESMETIROM   
3                  NDA           213217       Supplement              11   
4                  NDA           216059       Supplement               1   
..                 ...              ...              ...             ...   
309                NDA           020412            ZERIT       STAVUDINE   
310                NDA           050697         Original               1   
311                NDA           050698           BIAXIN  CLARITHROMYCIN   
312                BLA           103471        BETASERON      INTERFERON   
313                NDA           020199            HIVID     ZALCITABINE   

    FDA Received Date Accelerated Approval Date  \
0         Established               