In [None]:
import requests
import pandas as pd
import re
from configparser import ConfigParser
import time

# Read the config file
config = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means nothing was loaded. Fail here
# with a clear message instead of an opaque KeyError on the lookup below.
if not config.read('config.conf'):
    raise FileNotFoundError(
        "Could not read 'config.conf'; it must contain an [api] section with an api_key entry."
    )

# Extract the secret from the config file
api_key = config['api']['api_key']

# Example endpoint (replace with the specific endpoint you need)
base_url = 'https://api.fda.gov/drug/drugsfda.json'

# Application numbers (NDA / BLA) to pull from the endpoint.
# Kept as a list so entries can be added or removed without editing one
# enormous string literal (which previously broke across two lines).
application_numbers = [
    "NDA203469", "NDA217785", "NDA213217", "NDA216059", "BLA125514", "NDA214938",
    "NDA212725", "NDA218550", "BLA761345", "BLA761342", "BLA761309", "BLA761324",
    "NDA215887", "BLA761137", "BLA761334", "NDA216403", "NDA213411", "BLA761269",
    "BLA761263", "NDA216340", "BLA761310", "BLA761291", "NDA214801", "NDA213246",
    "BLA761139", "NDA216387", "NDA202806", "NDA204114", "NDA215039", "NDA208712",
    "NDA213137", "NDA216157", "NDA215935", "NDA215358", "BLA761208", "NDA215310",
    "BLA761223", "BLA761178", "NDA214622", "NDA214665", "BLA761210", "BLA761196",
    "BLA761174", "BLA761115", "BLA125557", "NDA214383", "NDA213026", "BLA761097",
    "NDA213176", "NDA214096", "BLA761145", "NDA214701", "BLA761171", "NDA213721",
    "NDA212154", "NDA213464", "BLA761158", "BLA761163", "NDA212306", "NDA213400",
    "NDA213702", "NDA204384", "NDA212269", "NDA209115", "NDA204026", "NDA213591",
    "NDA213736", "NDA208574", "BLA125377", "BLA125554", "NDA211723", "NDA211970",
    "NDA206947", "NDA212726", "BLA761121", "NDA212018", "BLA761034", "NDA210861",
    "NDA211710", "NDA208573", "NDA210868", "NDA207356", "NDA211155", "NDA208623",
    "NDA021462", "NDA210563", "NDA203341", "NDA210259", "NDA209936", "NDA209570",
    "NDA207968", "BLA761078", "BLA761069", "NDA208772", "BLA761049", "NDA205552",
    "BLA761038", "NDA206488", "NDA207999", "NDA204630", "NDA208434", "BLA761036",
    "NDA208065", "BLA761025", "NDA208030", "NDA206910", "NDA205353", "NDA207103",
    "NDA206162", "NDA205858", "NDA206256", "NDA205755", "NDA203202", "BLA125409",
    "BLA125151", "NDA021882", "NDA203585", "NDA203985", "NDA202497", "NDA202714",
    "NDA022334", "NDA021825", "NDA202570", "BLA125388", "NDA022393", "NDA021945",
    "NDA021986", "NDA022068", "NDA022059", "BLA125326", "NDA022468",
    "BLA125085", "NDA021588", "NDA022273", "NDA022291", "NDA020634", "NDA020635",
    "NDA021721", "NDA022187", "NDA022145", "NDA022128", "BLA125147", "NDA021976",
    "NDA021430", "NDA021968", "NDA020726", "NDA021877", "NDA021814", "NDA021673",
    "BLA125011", "NDA021272", "BLA125104", "NDA021322", "NDA021677", "NDA021752",
    "BLA125084", "NDA021335", "NDA021602", "NDA021399", "BLA103979", "NDA021481",
    "NDA020541", "NDA021492", "BLA125019", "NDA021356", "BLA103948", "NDA021205",
    "NDA021226", "NDA021251", "NDA019858", "NDA020780", "NDA019537", "NDA019847",
    "NDA019857", "NDA021174", "NDA021156", "NDA050747", "NDA021029", "NDA050718",
    "NDA021007", "NDA021039", "NDA021041", "BLA103767", "NDA020977", "NDA020978",
    "NDA020972", "NDA020636", "NDA020933", "BLA103772", "NDA021024", "NDA019832",
    "NDA020896", "NDA020705", "NDA020778", "NDA020779", "NDA019815", "NDA020604",
    "NDA020571", "NDA020449", "NDA020221", "NDA020685", "NDA020659", "NDA020680",
    "NDA020628", "NDA020564", "NDA020596", "NDA020498", "NDA020212", "NDA020412",
    "NDA050697", "NDA050698", "BLA103471", "NDA020199",
]

# Parameters for the API request
params = {
    'api_key': api_key,
    # PULL IN SPECIFIC APP NUMBERS
    # Produces: application_number:("NDA203469" OR "NDA217785" OR ... OR "NDA020199")
    'search': 'application_number:(' + ' OR '.join(f'"{number}"' for number in application_numbers) + ')',
    # Records per request. NOTE(review): per the openFDA docs the per-request
    # maximum is 1000, and 240 is the per-minute request quota for API-key
    # holders (not a page size) — confirm against the current documentation.
    'limit': 99,
}

# Initiate where we're going to store stuff
all_results = []
skip = 0
requests_per_minute = 240
delay = 60 / requests_per_minute

# Page through the endpoint until every matching record has been collected.
while True:
    params['skip'] = skip
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(base_url, params=params, timeout=30)

    if response.status_code == 429: # Too many requests
        # Back off, then retry the same page (skip is left unchanged).
        print("Rate limit exceeded.")
        time.sleep(60)
        continue

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        break

    data = response.json()
    batch = data.get('results')
    if not batch:
        # No 'results' key, or an empty page: nothing more to collect.
        break

    all_results.extend(batch)
    # Advance by the number of records actually received. The previous fixed
    # step of 100 did not match the page size of 99 and silently dropped one
    # record at every page boundary.
    skip += len(batch)
    print(f"Retrieved {len(batch)} records, total records so far: {len(all_results)}")

    # Stop as soon as the last page has been seen, rather than issuing one
    # more request that is bound to fail. The total is read defensively, so a
    # response without that metadata falls back to the short-page check.
    total = data.get('meta', {}).get('results', {}).get('total')
    if len(batch) < params['limit'] or (total is not None and skip >= total):
        break

    time.sleep(delay) # Delay between requests to avoid rate limit

# Normalize the main part of the data
df_main = pd.json_normalize(all_results, sep='_', errors='ignore')

# Create a new column with a space between letters and numbers in application_number
# (e.g. "NDA203469" -> "NDA 203469").
# Guarded because an empty result set produces a frame without this column,
# and indexing it unconditionally would raise KeyError.
if 'application_number' in df_main.columns:
    # Vectorized regex replace; missing values stay missing instead of raising.
    df_main['application_number_with_space'] = df_main['application_number'].str.replace(
        r'([A-Za-z]+)(\d+)', r'\1 \2', regex=True
    )

def _collect_nested(records, field):
    """Return one dict per nested entry of *field*, tagged with its parent application_number.

    *field* may hold either a list of dicts (one row per element) or a single
    dict (one row). Entries are copied, so the raw API records in *records*
    are left unmodified; an 'application_number' key already present in an
    entry is replaced by the parent record's value in the copy.
    """
    rows = []
    for record in records:
        if field not in record:
            continue
        nested = record[field]
        entries = nested if isinstance(nested, list) else [nested]
        for entry in entries:
            rows.append({**entry, 'application_number': record['application_number']})
    return rows


def _attach_nested(main, rows, prefix):
    """Left-join the normalized, prefixed *rows* onto *main* by application_number.

    Returns a ``(merged_main, nested_frame)`` tuple.
    """
    nested_frame = pd.json_normalize(rows, sep='_').add_prefix(prefix)
    merged = main.merge(
        nested_frame,
        left_on='application_number',
        right_on=f'{prefix}application_number',
        how='left',
    )
    return merged, nested_frame


# NOTE: each left merge below repeats a main row once per matching nested
# entry, so an application with several submissions and several products ends
# up with one row per (submission, product) combination.

# Check and normalize the 'submissions' field
submissions_list = _collect_nested(all_results, 'submissions')
if submissions_list:
    df_main, df_submissions = _attach_nested(df_main, submissions_list, 'submissions_')

# Check and normalize the 'openfda' field
# NOTE(review): pd.json_normalize on the main records presumably already
# expands the 'openfda' dict into openfda_* columns, in which case this merge
# yields duplicated columns with _x/_y suffixes — confirm before relying on
# those column names.
openfda_list = _collect_nested(all_results, 'openfda')
if openfda_list:
    df_main, df_openfda = _attach_nested(df_main, openfda_list, 'openfda_')

# Check and normalize the 'products' field
products_list = _collect_nested(all_results, 'products')
if products_list:
    df_main, df_products = _attach_nested(df_main, products_list, 'products_')

# Remove the raw nested columns; names that are absent are skipped silently
df_main = df_main.drop(['submissions', 'openfda', 'products'], axis=1, errors='ignore')

# Preview the first five rows of the result
print(df_main.head(5))

In [None]:
# Persist the openFDA dataframe to disk as a parquet file (row index omitted)

output_file = 'openfda.parquet'
df_main.to_parquet(output_file, index=False)
print("Data has been saved to {}".format(output_file))