# Dedupe Preparation
Description: Convert the processed openFDA table into input for deduplication by keeping the unique drug records from openFDA, 
each identified by an ID column (`fda_drug_id`) in the resulting dataframe.

INPUT: openfda_processed.csv (From openfda json to table)

OUTPUT: Openfda_Drug_Deduplicated.csv and Openfda_Drug_Deduplicated.p (previously listed as dataID.csv)

In [32]:
import pandas as pd
import numpy as np
import pickle
import re
import os

In [33]:
def remove_punc(string):
    '''
    Strip punctuation from a string.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted; all other characters are kept
    in their original order. Note that underscores therefore survive.
    '''
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', string)

def preProcess(df):
    '''
    Normalize the text cells of a dataframe.

    Every cell that holds a str has its punctuation removed with
    remove_punc and is then upper-cased. Cells of any other type
    (numbers, NaN, ...) are passed through untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the normalized cells.
    '''
    def normalize(cell):
        if isinstance(cell, str):
            return remove_punc(cell).upper()
        return cell

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated. Prefer the new name and fall back to the old
    # one so the notebook keeps working on older pandas installs.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(normalize)
    return df.applymap(normalize)

In [34]:
# Location of the processed openFDA table (the notebook's input).
file = '../../Data/Outputs_Cleanup/FDA/openfda_processed.csv'
# Read it with pandas' default parsing and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(5)

Unnamed: 0,fda_drug_id,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,spl_set_id,...,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa,manufacturer_name_normalized,fda_manuf_id
0,6,Dentox,"BERBERIS VULGARIS, GLYCYRRHIZA GLABRA, LAPPA M...","BioActive Nutritional, Inc.",43857-0039,HUMAN OTC DRUG,ORAL,"BERBERIS VULGARIS ROOT BARK, GLYCYRRHIZA GLABR...",e4462f59-9ba5-43a9-9b2b-3e5a807545dd,ae51c063-63b4-45cc-b8d0-e98584b65a66,...,"Increased Histamine Release [PE], Cell-mediate...","Allergens [CS], Dietary Proteins [CS], Plant P...","1TH8Q20J0U, 2788Z9758H, 597E9BI3Z3, 11E6VI8VEG...",,,,,,BIOACTIVE NUTRITIONAL INC,0
1,19,Cactus Cinis Avenae,CACTUS CINIS AVENAE,Uriel Pharmacy Inc.,48951-3239,HUMAN OTC DRUG,ORAL,"MAGNESIUM PHOSPHATE, TRIBASIC, PENTAHYDRATE, S...",b6c13079-0ed0-0d79-e053-2a95a90a8d7c,b6c13079-0ecf-0d79-e053-2a95a90a8d7c,...,,,"453COF7817, 7114SV0MYK, 6OM09RPY36, OZ0E5Y15PZ",,,,,,URIEL PHARMACY INC,1
2,22,Zinc Oxide,ZINC OXIDE,Ultra Seal Corporation,42213-365,HUMAN OTC DRUG,TOPICAL,ZINC OXIDE,b4c9e519-4ee3-4cba-9338-148b9cade2ac,e45ac82c-508c-4c41-ba2c-f48d36bc63fe,...,,,SOI2LOH54Z,part347,198911.0,1500365569.0,,,ULTRA SEAL CORPORATION,2
3,23,good sense nasal,OXYMETAZOLINE HCL,L. Perrigo Company,0113-0388,HUMAN OTC DRUG,NASAL,OXYMETAZOLINE HYDROCHLORIDE,244eff85-348f-42a5-a26e-aaaee89bfc69,5d42adb9-3625-4072-a6b1-29a229f255c8,...,,,K89MJ0S5VY,part341,1000990.0,,,,L PERRIGO COMPANY,3
4,33,Amantadine Hydrochloride,AMANTADINE HYDROCHLORIDE,Strides Pharma Inc.,42543-493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05b79d14-440a-4526-bccf-8bc26a2ec8a0,e4d8f36f-d668-4728-8dca-b1c22bd9aedb,...,,,M6Q1EO9TD0,ANDA209047,849389.0,342543493018.0,,,STRIDES PHARMA INC,4


In [35]:
# fda_drug_id is already a column of the loaded table, so no index reset or
# rename is needed here.
# Build the three row filters up front and apply them in a single step. Taking
# an explicit copy means the later column assignment writes to an independent
# frame instead of a slice of the loaded one (no SettingWithCopyWarning).
not_otc = df.product_type != 'HUMAN OTC DRUG'
has_brand = df.brand_name.notna()
# Missing flags compare unequal to True, so those rows are excluded as well.
from_original_packager = df.is_original_packager == True
df = df.loc[not_otc & has_brand & from_original_packager].copy()
# Every remaining row passed the packager filter, so the 0/1 flag is 1 throughout.
df['is_original_packager'] = 1
display(df.head())
print(df.columns)

Unnamed: 0,fda_drug_id,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,spl_set_id,...,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa,manufacturer_name_normalized,fda_manuf_id
4,33,Amantadine Hydrochloride,AMANTADINE HYDROCHLORIDE,Strides Pharma Inc.,42543-493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05b79d14-440a-4526-bccf-8bc26a2ec8a0,e4d8f36f-d668-4728-8dca-b1c22bd9aedb,...,,,M6Q1EO9TD0,ANDA209047,849389,342543493018.0,,,STRIDES PHARMA INC,4
8,56,cholestyramine light,CHOLESTYRAMINE LIGHT,"Eon Labs, Inc.",0185-0939,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,a060c45e-83ef-4c9f-b54a-89e5241706e2,a7dd335a-6413-446e-8098-d4f97e935986,...,,,4B33BGI082,ANDA074558,1801279,301850939978.0,,Bile-acid Binding Activity [MoA],EON LABS INC,8
10,59,Sulfacetamide Sodium,SULFACETAMIDE SODIUM,Bausch & Lomb Incorporated,24208-670,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,98ac569a-010a-4804-9b78-a28b54899c99,6c1b2658-e756-4f31-9bc8-f99a0baa4637,...,,,4NRT660KJQ,ANDA040066,1006120,,,,BAUSCH LOMB INCORPORATED,10
11,67,Dicyclomine hydrochloride,DICYCLOMINE HYDROCHLORIDE,Hikma Pharmaceuticals USA Inc.,0143-3126,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,008f2526-f7fc-4c9b-939d-04661bcec6be,1415ad6f-8d92-415c-ae80-5af0795a78c0,...,,,CQ903KQA31,ANDA040204,991061,,,,HIKMA PHARMACEUTICALS USA INC,11
19,116,Magnesium Sulfate in Dextrose,MAGNESIUM SULFATE IN DEXTROSE,"Hospira, Inc.",0409-6727,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,1a9001bb-2fa0-4d2b-bd9c-19c3c2705a01,03ebeabb-8386-4af4-3086-bdf3c3fc4a5a,...,,,SK47B8698T,NDA020488,829757,,,,HOSPIRA INC,19


Index(['fda_drug_id', 'brand_name', 'generic_name', 'manufacturer_name',
       'product_ndc', 'product_type', 'route', 'substance_name', 'spl_id',
       'spl_set_id', 'package_ndc', 'is_original_packager', 'nui',
       'pharm_class_epc', 'pharm_class_pe', 'pharm_class_cs', 'unii',
       'application_number', 'rxcui', 'upc', 'original_packager_product_ndc',
       'pharm_class_moa', 'manufacturer_name_normalized', 'fda_manuf_id'],
      dtype='object')


In [36]:
# Strip punctuation from, and upper-case, every text cell; then preview the
# result and report its (rows, columns) size.
df = df.pipe(preProcess)
display(df.head())
df.shape

Unnamed: 0,fda_drug_id,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,spl_set_id,...,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa,manufacturer_name_normalized,fda_manuf_id
4,33,AMANTADINE HYDROCHLORIDE,AMANTADINE HYDROCHLORIDE,STRIDES PHARMA INC,42543493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05B79D14440A4526BCCF8BC26A2EC8A0,E4D8F36FD66847288DCAB1C22BD9AEDB,...,,,M6Q1EO9TD0,ANDA209047,849389,342543493018.0,,,STRIDES PHARMA INC,4
8,56,CHOLESTYRAMINE LIGHT,CHOLESTYRAMINE LIGHT,EON LABS INC,1850939,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,A060C45E83EF4C9FB54A89E5241706E2,A7DD335A6413446E8098D4F97E935986,...,,,4B33BGI082,ANDA074558,1801279,301850939978.0,,BILEACID BINDING ACTIVITY MOA,EON LABS INC,8
10,59,SULFACETAMIDE SODIUM,SULFACETAMIDE SODIUM,BAUSCH LOMB INCORPORATED,24208670,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,98AC569A010A48049B78A28B54899C99,6C1B2658E7564F319BC8F99A0BAA4637,...,,,4NRT660KJQ,ANDA040066,1006120,,,,BAUSCH LOMB INCORPORATED,10
11,67,DICYCLOMINE HYDROCHLORIDE,DICYCLOMINE HYDROCHLORIDE,HIKMA PHARMACEUTICALS USA INC,1433126,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,008F2526F7FC4C9B939D04661BCEC6BE,1415AD6F8D92415CAE805AF0795A78C0,...,,,CQ903KQA31,ANDA040204,991061,,,,HIKMA PHARMACEUTICALS USA INC,11
19,116,MAGNESIUM SULFATE IN DEXTROSE,MAGNESIUM SULFATE IN DEXTROSE,HOSPIRA INC,4096727,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,1A9001BB2FA04D2BBD9C19C3C2705A01,03EBEABB83864AF43086BDF3C3FC4A5A,...,,,SK47B8698T,NDA020488,829757,,,,HOSPIRA INC,19


(15186, 24)

In [37]:
# Attributes that together define a distinct drug record.
groupby_cols = ['brand_name', 'generic_name', 'product_type', 'route',
       'substance_name', 'manufacturer_name', 'pharm_class_epc', 'pharm_class_pe',
       'pharm_class_cs', 'pharm_class_moa']

# Columns kept in the output: the record id followed by the attributes above.
# Deriving the list keeps the two definitions from drifting apart.
columns = ['fda_drug_id'] + groupby_cols

# Missing pharmacologic classes are replaced by the literal two-character
# string "" (two double quotes).
fill_na_cols = ['pharm_class_epc', 'pharm_class_pe', 'pharm_class_cs', 'pharm_class_moa']

# Build the result as a new frame instead of calling drop_duplicates in place
# on a column slice, which raises SettingWithCopyWarning. Rows that agree on
# every attribute collapse to their first occurrence, whose fda_drug_id is kept.
df = (
    df.fillna({col: '""' for col in fill_na_cols})
    .loc[:, columns]
    .drop_duplicates(subset=groupby_cols)
)
display(df)
df.columns

Unnamed: 0,fda_drug_id,brand_name,generic_name,product_type,route,substance_name,manufacturer_name,pharm_class_epc,pharm_class_pe,pharm_class_cs,pharm_class_moa
4,33,AMANTADINE HYDROCHLORIDE,AMANTADINE HYDROCHLORIDE,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,STRIDES PHARMA INC,"""""","""""","""""",""""""
8,56,CHOLESTYRAMINE LIGHT,CHOLESTYRAMINE LIGHT,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,EON LABS INC,BILE ACID SEQUESTRANT EPC,"""""","""""",BILEACID BINDING ACTIVITY MOA
10,59,SULFACETAMIDE SODIUM,SULFACETAMIDE SODIUM,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,BAUSCH LOMB INCORPORATED,"""""","""""","""""",""""""
11,67,DICYCLOMINE HYDROCHLORIDE,DICYCLOMINE HYDROCHLORIDE,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,HIKMA PHARMACEUTICALS USA INC,"""""","""""","""""",""""""
19,116,MAGNESIUM SULFATE IN DEXTROSE,MAGNESIUM SULFATE IN DEXTROSE,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,HOSPIRA INC,"""""","""""","""""",""""""
...,...,...,...,...,...,...,...,...,...,...,...
98029,193085,EZETIMIBE,EZETIMIBE,HUMAN PRESCRIPTION DRUG,ORAL,EZETIMIBE,ZYDUS PHARMACEUTICALS USA INC,DIETARY CHOLESTEROL ABSORPTION INHIBITOR EPC,DECREASED CHOLESTEROL ABSORPTION PE,"""""",""""""
98041,193097,ABACAVIR,ABACAVIR,HUMAN PRESCRIPTION DRUG,ORAL,ABACAVIR SULFATE,XLCARE PHARMACEUTICALS INC,"""""","""""","""""",""""""
98046,193102,ALBUTEROL,ALBUTEROL,HUMAN PRESCRIPTION DRUG,ORAL,ALBUTEROL SULFATE,VIRTUS PHARMACEUTICALS LLC,"""""","""""","""""",""""""
98065,193121,NITROGEN,NITROGEN,HUMAN PRESCRIPTION DRUG,RESPIRATORY INHALATION,NITROGEN,HELGET GAS PRODUCTS,"""""","""""","""""",""""""


Index(['fda_drug_id', 'brand_name', 'generic_name', 'product_type', 'route',
       'substance_name', 'manufacturer_name', 'pharm_class_epc',
       'pharm_class_pe', 'pharm_class_cs', 'pharm_class_moa'],
      dtype='object')

In [38]:
# Collapse the frame to one row per brand name: each remaining column holds
# the list of values found for that brand. (A per-brand count that was
# computed here and immediately discarded has been removed; it had no effect.)
df_grouped = df.groupby('brand_name').agg(list)

In [39]:
#df[df.brand_name == 'LIDOCAINE HYDROCHLORIDE']

In [41]:
# Persist the grouped frame in two forms. The pickle stores the DataFrame
# object itself; the CSV is a plain-text export of the same frame.
# The context manager guarantees the pickle file is flushed and closed, which
# the bare open(...) call did not.
with open("../../Data/Outputs_Cleanup/FDA/Openfda_Drug_Deduplicated.p", "wb") as pickle_file:
    pickle.dump(df_grouped, pickle_file)
df_grouped.to_csv('../../Data/Outputs_Cleanup/FDA/Openfda_Drug_Deduplicated.csv')

AttributeError: 'DataFrame' object has no attribute 'compute'