In [None]:
import pandas as pd
import tabula

# Path to the source PDF, relative to the notebook's working directory
pdf_path = 'CDERAA.pdf'

# Extract every table from every page. lattice=True selects tabula's
# lattice-mode extraction, which relies on ruling lines between cells.
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True)

# Fail early with a message that names the file; otherwise pd.concat would
# raise a generic "No objects to concatenate" error on an empty list.
if len(tables) == 0:
    raise ValueError(f"No tables could be extracted from {pdf_path!r}")

# Stack the per-table DataFrames into one frame with a fresh 0..n-1 index
df_combined = pd.concat(tables, ignore_index=True)

Preprocessing Stage

In [None]:
# Discard the leading rows that come before the real header row.
# NOTE(review): the slice starts at position 4, so four rows (positions 0-3)
# are dropped, although this step was previously described as removing three.
# Confirm which count is intended.
body_rows = df_combined.iloc[4:].reset_index(drop=True)

# Promote the first remaining row to column labels, then drop it from the data
header_row = body_rows.iloc[0]
body_rows.columns = header_row
df_combined = body_rows.iloc[1:].reset_index(drop=True)

# Collapse every run of whitespace inside a column label into a single space
df_combined.columns = df_combined.columns.str.replace(r'\s+', ' ', regex=True)


In [None]:
# Show the rows whose 'Established Name' value is missing (NaN)
missing_established = df_combined['Established Name'].isna()
nan_rows = df_combined.loc[missing_established]
print(nan_rows)

In [None]:
# Drop rows that contain NaN in 'Established Name'.
# Established name should always have a value in this list, so a missing
# value is an accurate marker for rows that are junk text from the PDF.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not act on a possible slice of the unfiltered
# frame (the situation pandas reports as SettingWithCopyWarning).
df_combined = df_combined.dropna(subset=['Established Name']).copy()

In [None]:
# Some App #s have a superscript. Let's remove it.

def remove_appnum_hangingdigit(row):
    """Return the row's 'Application Number' without a known stray trailing digit.

    Only two products are handled: 'KEYTRUDA' (trailing '4') and 'RUBRACA'
    (trailing '5'). For those, one final character is removed when the
    application number ends with the listed digit.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide the keys 'Proprietary Name' and 'Application Number'.

    Returns
    -------
    The cleaned application number string. Non-string values (for example NaN
    or None for a missing cell) are returned unchanged instead of raising.

    NOTE(review): the match is a plain suffix test, so a value for these two
    products that legitimately ends in the listed digit is shortened as well.
    Confirm against the source data.
    """
    # Product name -> the digit that gets appended to its application number
    hanging_digits = {'KEYTRUDA': '4', 'RUBRACA': '5'}

    app_number = row['Application Number']
    # Values such as NaN have no .endswith(); pass them through untouched
    if not isinstance(app_number, str):
        return app_number

    digit = hanging_digits.get(row['Proprietary Name'])
    if digit is not None and app_number.endswith(digit):
        return app_number[:-1]
    return app_number


# Run the clean-up once per row (axis=1) and overwrite the column with the result
cleaned_app_numbers = df_combined.apply(remove_appnum_hangingdigit, axis=1)
df_combined['Application Number'] = cleaned_app_numbers

In [None]:
# Split Application Number and Supplement Number.
# Regex \d+ matches one or more digits, so the capture group is "Supplement <digits>".
df_combined['Supplement Number'] = df_combined['Application Number'].str.extract(r'(Supplement \d+)', expand=False)

# Clean the original "Application Number" column by removing the "Supplement <digits>" part
df_combined['Application Number'] = df_combined['Application Number'].str.replace(r'\s*Supplement \d+', '', regex=True)

# Strip stray characters (footnote markers, spaces, ...) from the months column.
# Regex [^\d.] matches any character that is neither a digit nor a decimal point.
# The decimal point must be kept: removing it with \D would turn "5.9" into
# "59" before the later float conversion.
months_col = 'Total Time to Accelerated Approval (Months)'
df_combined[months_col] = df_combined[months_col].str.replace(r'[^\d.]', '', regex=True)

# NOTE(review): this filter runs after the strip above, which has already
# removed every '†', so it can no longer match any row. If rows carrying '†'
# are meant to be dropped, the filter must run before the strip. Confirm the
# intent before reordering, because that would remove rows.
# .copy() keeps the later column assignments off a slice of the previous frame.
df_combined = df_combined[~df_combined[months_col].astype(str).str.contains('†', regex=False)].copy()

# The date column contains values with extra trailing text, making them invalid.
# Keep only the first whitespace-separated token of each value.
date_col = 'Full Approval Conversion- Withdrawal Date'
df_combined[date_col] = df_combined[date_col].str.split().str[0]

# Count missing values before parsing
numofnans = df_combined[date_col].isna().sum()
print(numofnans)

# Convert to datetime. errors='coerce' turns values that cannot be parsed into
# NaT; the rows themselves are kept.
df_combined[date_col] = pd.to_datetime(df_combined[date_col], errors='coerce')

# Count again: a higher number than before means some non-missing values
# failed to parse and were coerced to NaT
numofnans = df_combined[date_col].isna().sum()
print(numofnans)

In [None]:
# Change column dtypes in one pass (column name -> target dtype)
column_dtypes = {
    'Application Number': "string",
    'Proprietary Name': "string",
    'Established Name': "string",
    'Applicant': "string",
    'FDA Received Date': "datetime64[ns]",
    'Accelerated Approval Date': "datetime64[ns]",
    'Total Time to Accelerated Approval (Months)': "float",
    'Accelerated Approval Indication': "string",
    'Conversion-Withdrawal Status': "string",
    'Full Approval Conversion- Withdrawal Date': "datetime64[ns]",
    'Supplement Number': "string",
}
df_combined = df_combined.astype(column_dtypes)

# Define desired order.
# Each column is listed exactly once: repeating 'Supplement Number' at the end
# would make the selection below return that column twice.
desired_order = [
    'Application Number',
    'Supplement Number',
    'Proprietary Name',
    'Established Name',
    'Applicant',
    'FDA Received Date',
    'Accelerated Approval Date',
    'Total Time to Accelerated Approval (Months)',
    'Accelerated Approval Indication',
    'Conversion-Withdrawal Status',
    'Full Approval Conversion- Withdrawal Date',
]
# Re-order the columns; .copy() gives an independent frame for the later
# column assignments.
df_combined = df_combined[desired_order].copy()



In [None]:
# Show a single space instead of a missing-value marker for rows without a supplement number
supplement_numbers = df_combined['Supplement Number']
df_combined['Supplement Number'] = supplement_numbers.fillna(' ')

# Swap missing month values for None (null), leaving present values untouched.
# NOTE(review): on a float column pandas may coerce None straight back to NaN,
# which would make this step a no-op. Check the resulting values and dtype if
# a real None is required downstream.
months_to_approval = df_combined['Total Time to Accelerated Approval (Months)']
df_combined['Total Time to Accelerated Approval (Months)'] = months_to_approval.where(months_to_approval.notna(), None)


In [None]:
# Check datatypes: show the dtype of every column in df_combined
df_combined.dtypes

In [None]:
# Check columns: show the column labels of df_combined in their current order
df_combined.columns