In [8]:
from tabula import read_pdf
import pandas as pd

# Location of the source PDF
pdf_path = "CDERAA.pdf"

# Extract every table on every page; read_pdf returns a list of DataFrames
dfs = read_pdf(pdf_path, pages="all", multiple_tables=True, encoding="utf-8")

# Stack the extracted tables into one frame, then discard the first four rows
# (positions 0-3) so that the row holding the column titles comes first
df_combined = pd.concat(dfs, ignore_index=True).iloc[4:].reset_index(drop=True)

# Promote that first row to the column headers and remove it from the data
df_combined.columns = df_combined.iloc[0]
df_combined = df_combined.iloc[1:].reset_index(drop=True)

# Drop the final column by position. Per the original author's note it is an
# empty column whose title is NaN.
df_combined = df_combined.drop(columns=df_combined.columns[-1])

# Normalise the header text in one pass:
#   1. carriage returns become spaces, new lines are removed
#   2. leading/trailing whitespace is stripped
#   3. runs of whitespace collapse to a single space
df_combined.columns = (
    df_combined.columns
    .str.replace('\r', ' ')
    .str.replace('\n', '')
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Disabled: removal of a parsed 'index' column
# df_combined = df_combined.drop(columns='index')

In [9]:
# Split "Application Number" into the application number and its supplement.
# Regex \d means 'match any digit'.
df_combined['Supplement Number'] = df_combined['Application Number'].str.extract(r'(Supplement \d+)', expand=False)

# Clean the original "Application Number" column to remove the "Supplement N" part
df_combined['Application Number'] = df_combined['Application Number'].str.replace(r'\s*Supplement \d+', '', regex=True)

# Drop rows that contain NaN in Proprietary Name.
# Proprietary name should always have a value in this list, so a missing one
# is used as the marker for rows that are junk text from the PDF.
df_combined = df_combined.dropna(subset=['Proprietary Name'])

# Strip every character that is not a digit or a decimal point so the column
# can later be cast to a number.
# Regex: [^\d.] matches any character that is neither a digit nor '.'.
# The decimal point is kept on purpose: stripping with \D would turn a value
# such as "5.9" into "59".
# NOTE(review): assumes the months column can hold fractional values - confirm
# against the source PDF.
df_combined['Total Time to Accelerated Approval (Months)'] = df_combined['Total Time to Accelerated Approval (Months)'].str.replace(r'[^\d.]', '', regex=True)
# NOTE(review): this filter can never match, because the replace above has
# already removed every '†'. It is kept unchanged so no rows are dropped that
# were previously retained. If rows marked with '†' really should be excluded,
# move this line above the replace.
df_combined = df_combined[~df_combined['Total Time to Accelerated Approval (Months)'].astype(str).str.contains('†', regex=False)]

# The column contains dates followed by extra numbers, which makes them invalid.
# Keep only the first whitespace-separated token (the date itself).
df_combined['Full Approval Conversion- Withdrawal Date'] = df_combined['Full Approval Conversion- Withdrawal Date'].str.split().str[0]

# Count missing values before the datetime conversion
numofnans = df_combined['Full Approval Conversion- Withdrawal Date'].isna().sum()
print(numofnans)

# Convert to datetime. errors='coerce' turns any value that cannot be parsed
# into NaT; it does not remove rows.
df_combined['Full Approval Conversion- Withdrawal Date'] = pd.to_datetime(df_combined['Full Approval Conversion- Withdrawal Date'], errors='coerce')

# Count again: a higher number than before means the conversion coerced
# values that were not missing into NaT
numofnans = df_combined['Full Approval Conversion- Withdrawal Date'].isna().sum()
print(numofnans)

87
87


In [14]:
# Change column dtypes in a single astype call.
# The date columns are parsed from their text values and the months column is
# cast to float.
column_dtypes = {
    'Application Number': "string",
    'Proprietary Name': "string",
    'Established Name': "string",
    'Applicant': "string",
    'FDA Received Date': "datetime64[ns]",
    'Accelerated Approval Date': "datetime64[ns]",
    'Total Time to Accelerated Approval (Months)': "float",
    'Accelerated Approval Indication': "string",
    'Conversion-Withdrawal Status': "string",
    'Full Approval Conversion- Withdrawal Date': "datetime64[ns]",
    'Supplement Number': "string",
}
df_combined = df_combined.astype(column_dtypes)

# Define desired column order. Each column appears exactly once: listing a
# name twice would duplicate that column in the result, and selecting it by
# name would then return a DataFrame instead of a Series.
desired_order = [
    'Application Number',
    'Supplement Number',
    'Proprietary Name',
    'Established Name',
    'Applicant',
    'FDA Received Date',
    'Accelerated Approval Date',
    'Total Time to Accelerated Approval (Months)',
    'Accelerated Approval Indication',
    'Conversion-Withdrawal Status',
    'Full Approval Conversion- Withdrawal Date',
]
# Reorder the columns of the dataframe
df_combined = df_combined[desired_order]



In [11]:
# Check datatypes: display the dtype of every column in df_combined
df_combined.dtypes

0
Application Number                             string[python]
Proprietary Name                               string[python]
Established Name                               string[python]
Applicant                                      string[python]
FDA Received Date                              datetime64[ns]
Accelerated Approval Date                      datetime64[ns]
Total Time to Accelerated Approval (Months)           float64
Accelerated Approval Indication                string[python]
Conversion-Withdrawal Status                   string[python]
Full Approval Conversion- Withdrawal Date      datetime64[ns]
Supplement Number                              string[python]
dtype: object

In [12]:
# Check columns: display the column labels of df_combined in their current order
df_combined.columns

Index(['Application Number', 'Proprietary Name', 'Established Name',
       'Applicant', 'FDA Received Date', 'Accelerated Approval Date',
       'Total Time to Accelerated Approval (Months)',
       'Accelerated Approval Indication', 'Conversion-Withdrawal Status',
       'Full Approval Conversion- Withdrawal Date', 'Supplement Number'],
      dtype='object', name=0)