# Open FDA Initial Data Pre-Processing
## Authors: 
    1. Lam Ho
    2. Jonah Breslow
    3. Jeffrey Kagan
## Purpose:
The purpose of this notebook is to do some preliminary cleanup/filtering of the [OpenFDA Drug Data](https://open.fda.gov/apis/drug/) data. Primarily, we flatten the JSON data into a tabular format before using it for entity matching.

### Importing modules

In [18]:
import json
import os
import pandas as pd
import re

### Parameters


In [19]:
run_flatten = True

### Importing JSON data

In [20]:
if run_flatten == True:
        
    # IMPORTING JSON 
    files = os.listdir('Data/openfda_json')
    files = [each for each in files if each.endswith('.json')]
    file_paths = ['Data/openfda_json/' + each for each in files]
    
    
    # Flattening JSON file into a dataframe
    # NOTE: This took nearly 7-9 hours to run.
    my_dict = {}
    df = pd.DataFrame()
    for idx, file in enumerate(file_paths):
        print(f'Working on file {idx}')
        with open(file, 'r') as open_file:
            json_tmp = json.load(open_file)
        for entry in json_tmp['results']:
            df = df.append(entry['openfda'], ignore_index=True)
            
            
    # Save dataframe to a CSV
    df.to_csv('../Data/Outputs_Cleanup/FDA/openfda.csv', index=False)

Parameters Work!!


## Read in dataframe and clean it up
#### Separate step to not repeat 9 hour execution above

In [21]:
df = pd.read_csv('Data/Outputs_Cleanup/FDA/openfda.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Data/Outputs_Cleanup/FDA/openfda.csv'

In [4]:
df.dropna(subset=['brand_name', 'generic_name', 'manufacturer_name', 'substance_name'], inplace=True)
try:
    del df['Unnamed: 0']
except:
    pass

In [5]:
columns = df.columns

# Remove []
for column in columns:
    df[column] = df[column].apply(lambda x: str(x)[1:-1] if str(x).startswith('[') else str(x))

# Remove ''
for column in columns:
    df[column] = df[column].apply(lambda x: x.replace('\'', ''))
    
df.reset_index(inplace=True)
df.rename(columns = {'index':'fda_drug_id'}, inplace=True)
display(df.head())

Unnamed: 0,fda_drug_id,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,spl_set_id,...,nui,pharm_class_epc,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa
0,6,Dentox,"BERBERIS VULGARIS, GLYCYRRHIZA GLABRA, LAPPA M...","BioActive Nutritional, Inc.",43857-0039,HUMAN OTC DRUG,ORAL,"BERBERIS VULGARIS ROOT BARK, GLYCYRRHIZA GLABR...",e4462f59-9ba5-43a9-9b2b-3e5a807545dd,ae51c063-63b4-45cc-b8d0-e98584b65a66,...,"N0000185371, N0000185375, N0000175629, N000018...",Non-Standardized Food Allergenic Extract [EPC]...,"Increased Histamine Release [PE], Cell-mediate...","Allergens [CS], Dietary Proteins [CS], Plant P...","1TH8Q20J0U, 2788Z9758H, 597E9BI3Z3, 11E6VI8VEG...",,,,,
1,19,Cactus Cinis Avenae,CACTUS CINIS AVENAE,Uriel Pharmacy Inc.,48951-3239,HUMAN OTC DRUG,ORAL,"MAGNESIUM PHOSPHATE, TRIBASIC, PENTAHYDRATE, S...",b6c13079-0ed0-0d79-e053-2a95a90a8d7c,b6c13079-0ecf-0d79-e053-2a95a90a8d7c,...,,,,,"453COF7817, 7114SV0MYK, 6OM09RPY36, OZ0E5Y15PZ",,,,,
2,22,Zinc Oxide,ZINC OXIDE,Ultra Seal Corporation,42213-365,HUMAN OTC DRUG,TOPICAL,ZINC OXIDE,b4c9e519-4ee3-4cba-9338-148b9cade2ac,e45ac82c-508c-4c41-ba2c-f48d36bc63fe,...,,,,,SOI2LOH54Z,part347,198911.0,1500365569.0,,
3,23,good sense nasal,OXYMETAZOLINE HCL,L. Perrigo Company,0113-0388,HUMAN OTC DRUG,NASAL,OXYMETAZOLINE HYDROCHLORIDE,244eff85-348f-42a5-a26e-aaaee89bfc69,5d42adb9-3625-4072-a6b1-29a229f255c8,...,,,,,K89MJ0S5VY,part341,1000990.0,,,
4,33,Amantadine Hydrochloride,AMANTADINE HYDROCHLORIDE,Strides Pharma Inc.,42543-493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05b79d14-440a-4526-bccf-8bc26a2ec8a0,e4d8f36f-d668-4728-8dca-b1c22bd9aedb,...,,,,,M6Q1EO9TD0,ANDA209047,849389.0,342543493018.0,,


### Pre-Processing the FDA data

In [6]:
def remove_punc(string):
    '''
    takes in a string and removed all punctuation.
    '''
    return re.sub(r'[^\w\s]','',string)

def preProcess(df):
    '''
    takes a dataframe, applies the remove_punc function
    and then upper cases all text columns
    '''
    df = df.applymap(lambda s:remove_punc(s).upper() if isinstance(s, str) else s)
    return df

### Outputting the Open FDA processed data

In [7]:
df['manufacturer_name_normalized'] = preProcess(df[['manufacturer_name']])
manu = df[['manufacturer_name_normalized']].copy().drop_duplicates()
manu.reset_index(inplace=True)
manu.columns = ['fda_manuf_id', 'manufacturer_name_normalized']
#manu['fda_manuf_id'] = manu['fda_manuf_id'].apply(lambda x: 'fda' + str(x))
df = df.merge(manu, how='left', on='manufacturer_name_normalized')
df.to_csv('Data/Outputs_Cleanup/FDA/openfda_processed.csv', index=False)