# Dedupe Preparation
Description: Convert openfda into a dedupe input by obtaining unique values from openfda, 
and assigning an Id column to the resulting dataframe.

INPUT: openfda_processed.csv (From openfda json to table)

OUTPUT: Openfda_Drug_Deduplicated.csv

In [12]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
def remove_punc(string):
    '''
    Return `string` with every character that is neither a word
    character nor whitespace deleted (underscores therefore survive).
    '''
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', string)

def preProcess(df):
    '''
    Strip punctuation from, and upper-case, every string cell of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame of the same shape; cells that are not `str`
        (numbers, NaN, ...) are passed through unchanged.
    '''
    def _normalise(cell):
        # Only str cells are cleaned; everything else is returned as is.
        return remove_punc(cell).upper() if isinstance(cell, str) else cell

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and now
    # emits a FutureWarning, so prefer the new name when the installed pandas
    # has it. The lookup is done on the class (not the instance) so that a
    # column that happens to be called "map" cannot shadow the method.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    return elementwise(df, _normalise)

In [3]:
# Load the flattened openFDA table. reset_index() turns the default row
# index into a column, which is then renamed to serve as the row id.
file = 'openfda_processed.csv'
df = (
    pd.read_csv(file)
    .reset_index()
    .rename(columns={"index": "fda_drug_id"})
)

# Drop rows whose product_type is 'HUMAN OTC DRUG' and rows with no
# brand_name. Each step builds a new frame instead of mutating a filtered
# slice in place, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
df = df[df.product_type != 'HUMAN OTC DRUG']
df = df.dropna(subset=['brand_name'])

# Recode the flag to 1/0 (anything that does not compare equal to True,
# including missing values, becomes 0) and keep only the rows flagged 1.
df = df.assign(is_original_packager=np.where(df.is_original_packager == True, 1, 0))
df = df[df.is_original_packager == 1]

display(df.head())
print(df.columns)

Unnamed: 0.1,fda_drug_id,Unnamed: 0,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,...,nui,pharm_class_epc,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa
4,4,33,Amantadine Hydrochloride,AMANTADINE HYDROCHLORIDE,Strides Pharma Inc.,42543-493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05b79d14-440a-4526-bccf-8bc26a2ec8a0,...,,,,,M6Q1EO9TD0,ANDA209047,849389,342543493018.0,,
8,8,56,cholestyramine light,CHOLESTYRAMINE LIGHT,"Eon Labs, Inc.",0185-0939,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,a060c45e-83ef-4c9f-b54a-89e5241706e2,...,"N0000180292, N0000175365",Bile Acid Sequestrant [EPC],,,4B33BGI082,ANDA074558,1801279,301850939978.0,,Bile-acid Binding Activity [MoA]
10,10,59,Sulfacetamide Sodium,SULFACETAMIDE SODIUM,Bausch & Lomb Incorporated,24208-670,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,98ac569a-010a-4804-9b78-a28b54899c99,...,,,,,4NRT660KJQ,ANDA040066,1006120,,,
11,11,67,Dicyclomine hydrochloride,DICYCLOMINE HYDROCHLORIDE,Hikma Pharmaceuticals USA Inc.,0143-3126,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,008f2526-f7fc-4c9b-939d-04661bcec6be,...,,,,,CQ903KQA31,ANDA040204,991061,,,
19,19,116,Magnesium Sulfate in Dextrose,MAGNESIUM SULFATE IN DEXTROSE,"Hospira, Inc.",0409-6727,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,1a9001bb-2fa0-4d2b-bd9c-19c3c2705a01,...,,,,,SK47B8698T,NDA020488,829757,,,


Index(['fda_drug_id', 'Unnamed: 0', 'brand_name', 'generic_name',
       'manufacturer_name', 'product_ndc', 'product_type', 'route',
       'substance_name', 'spl_id', 'spl_set_id', 'package_ndc',
       'is_original_packager', 'nui', 'pharm_class_epc', 'pharm_class_pe',
       'pharm_class_cs', 'unii', 'application_number', 'rxcui', 'upc',
       'original_packager_product_ndc', 'pharm_class_moa'],
      dtype='object')


In [4]:
# Normalise every text cell (punctuation stripped, upper-cased) before the
# duplicate removal that follows.
df = df.pipe(preProcess)
display(df.head())
df.shape

Unnamed: 0.1,fda_drug_id,Unnamed: 0,brand_name,generic_name,manufacturer_name,product_ndc,product_type,route,substance_name,spl_id,...,nui,pharm_class_epc,pharm_class_pe,pharm_class_cs,unii,application_number,rxcui,upc,original_packager_product_ndc,pharm_class_moa
4,4,33,AMANTADINE HYDROCHLORIDE,AMANTADINE HYDROCHLORIDE,STRIDES PHARMA INC,42543493,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,05B79D14440A4526BCCF8BC26A2EC8A0,...,,,,,M6Q1EO9TD0,ANDA209047,849389,342543493018.0,,
8,8,56,CHOLESTYRAMINE LIGHT,CHOLESTYRAMINE LIGHT,EON LABS INC,1850939,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,A060C45E83EF4C9FB54A89E5241706E2,...,N0000180292 N0000175365,BILE ACID SEQUESTRANT EPC,,,4B33BGI082,ANDA074558,1801279,301850939978.0,,BILEACID BINDING ACTIVITY MOA
10,10,59,SULFACETAMIDE SODIUM,SULFACETAMIDE SODIUM,BAUSCH LOMB INCORPORATED,24208670,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,98AC569A010A48049B78A28B54899C99,...,,,,,4NRT660KJQ,ANDA040066,1006120,,,
11,11,67,DICYCLOMINE HYDROCHLORIDE,DICYCLOMINE HYDROCHLORIDE,HIKMA PHARMACEUTICALS USA INC,1433126,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,008F2526F7FC4C9B939D04661BCEC6BE,...,,,,,CQ903KQA31,ANDA040204,991061,,,
19,19,116,MAGNESIUM SULFATE IN DEXTROSE,MAGNESIUM SULFATE IN DEXTROSE,HOSPIRA INC,4096727,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,1A9001BB2FA04D2BBD9C19C3C2705A01,...,,,,,SK47B8698T,NDA020488,829757,,,


(15186, 23)

In [5]:
# Columns carried forward; `fda_drug_id` is the row id.
columns = ['fda_drug_id', 'brand_name', 'generic_name', 'product_type', 'route',
       'substance_name', 'manufacturer_name', 'pharm_class_epc', 'pharm_class_pe',
       'pharm_class_cs', 'pharm_class_moa']

# Two rows count as duplicates when they agree on every column except the id.
# Derived from `columns` so the two lists cannot drift apart.
groupby_cols = [name for name in columns if name != 'fda_drug_id']

# Missing pharm-class values are replaced with the literal two-character
# string "" (a pair of double quotes).
fill_na_cols = ['pharm_class_epc', 'pharm_class_pe', 'pharm_class_cs', 'pharm_class_moa']
for col in fill_na_cols:
    df[col] = df[col].fillna('""')

# Non-inplace drop_duplicates: the frame being de-duplicated is a column
# slice, and mutating such a slice in place raises SettingWithCopyWarning.
# The first occurrence of each duplicate group is kept (pandas default).
df = df[columns].drop_duplicates(subset=groupby_cols)
display(df)
df.columns

Unnamed: 0,fda_drug_id,brand_name,generic_name,product_type,route,substance_name,manufacturer_name,pharm_class_epc,pharm_class_pe,pharm_class_cs,pharm_class_moa
4,4,AMANTADINE HYDROCHLORIDE,AMANTADINE HYDROCHLORIDE,HUMAN PRESCRIPTION DRUG,ORAL,AMANTADINE HYDROCHLORIDE,STRIDES PHARMA INC,"""""","""""","""""",""""""
8,8,CHOLESTYRAMINE LIGHT,CHOLESTYRAMINE LIGHT,HUMAN PRESCRIPTION DRUG,ORAL,CHOLESTYRAMINE,EON LABS INC,BILE ACID SEQUESTRANT EPC,"""""","""""",BILEACID BINDING ACTIVITY MOA
10,10,SULFACETAMIDE SODIUM,SULFACETAMIDE SODIUM,HUMAN PRESCRIPTION DRUG,OPHTHALMIC,SULFACETAMIDE SODIUM,BAUSCH LOMB INCORPORATED,"""""","""""","""""",""""""
11,11,DICYCLOMINE HYDROCHLORIDE,DICYCLOMINE HYDROCHLORIDE,HUMAN PRESCRIPTION DRUG,ORAL,DICYCLOMINE HYDROCHLORIDE,HIKMA PHARMACEUTICALS USA INC,"""""","""""","""""",""""""
19,19,MAGNESIUM SULFATE IN DEXTROSE,MAGNESIUM SULFATE IN DEXTROSE,HUMAN PRESCRIPTION DRUG,INTRAVENOUS,MAGNESIUM SULFATE HEPTAHYDRATE,HOSPIRA INC,"""""","""""","""""",""""""
...,...,...,...,...,...,...,...,...,...,...,...
98029,98029,EZETIMIBE,EZETIMIBE,HUMAN PRESCRIPTION DRUG,ORAL,EZETIMIBE,ZYDUS PHARMACEUTICALS USA INC,DIETARY CHOLESTEROL ABSORPTION INHIBITOR EPC,DECREASED CHOLESTEROL ABSORPTION PE,"""""",""""""
98041,98041,ABACAVIR,ABACAVIR,HUMAN PRESCRIPTION DRUG,ORAL,ABACAVIR SULFATE,XLCARE PHARMACEUTICALS INC,"""""","""""","""""",""""""
98046,98046,ALBUTEROL,ALBUTEROL,HUMAN PRESCRIPTION DRUG,ORAL,ALBUTEROL SULFATE,VIRTUS PHARMACEUTICALS LLC,"""""","""""","""""",""""""
98065,98065,NITROGEN,NITROGEN,HUMAN PRESCRIPTION DRUG,RESPIRATORY INHALATION,NITROGEN,HELGET GAS PRODUCTS,"""""","""""","""""",""""""


Index(['fda_drug_id', 'brand_name', 'generic_name', 'product_type', 'route',
       'substance_name', 'manufacturer_name', 'pharm_class_epc',
       'pharm_class_pe', 'pharm_class_cs', 'pharm_class_moa'],
      dtype='object')

In [6]:
# Collapse to one row per brand name; every other column becomes the list of
# values found across that brand's rows.
df_grouped = df.groupby('brand_name').agg(list)
df_grouped

Unnamed: 0_level_0,fda_drug_id,generic_name,product_type,route,substance_name,manufacturer_name,pharm_class_epc,pharm_class_pe,pharm_class_cs,pharm_class_moa
brand_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
011010 NIACINAMIDE 4 TRETINOIN 0025,[44946],[011010 NIACINAMIDE 4 TRETINOIN 0025],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[TRETINOIN NIACINAMIDE],[SINCERUS FLORIDA LLC],[RETINOID EPC],"[""""]",[RETINOIDS CS],"[""""]"
011013 NIACINAMIDE 4 TRETINOIN 0025,[28941],[011013 NIACINAMIDE 4 TRETINOIN 0025],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[NIACINAMIDE TRETINOIN],[SINCERUS FLORIDA LLC],[RETINOID EPC],"[""""]",[RETINOIDS CS],"[""""]"
011020 NIACINAMIDE 4 TRETINOIN 005,[20068],[011020 NIACINAMIDE 4 TRETINOIN 005],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[NIACINAMIDE TRETINOIN],[SINCERUS FLORIDA LLC],[RETINOID EPC],"[""""]",[RETINOIDS CS],"[""""]"
011021 NIACINAMIDE 4 TRETINOIN 005,[96571],[011021 NIACINAMIDE 4 TRETINOIN 005],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[NIACINAMIDE TRETINOIN],[SINCERUS FLORIDA LLC],[RETINOID EPC],"[""""]",[RETINOIDS CS],"[""""]"
011054 NIACINAMIDE 4 SPIRONOLACTONE 5,[82021],[011054 NIACINAMIDE 4 SPIRONOLACTONE 5],[HUMAN PRESCRIPTION DRUG],[TOPICAL],[NIACINAMIDE SPIRONOLACTONE],[SINCERUS FLORIDA LLC],[ALDOSTERONE ANTAGONIST EPC],"[""""]","[""""]",[ALDOSTERONE ANTAGONISTS MOA]
...,...,...,...,...,...,...,...,...,...,...
ZYPITAMAG,"[2202, 23559]","[PITAVASTATIN MAGNESIUM, PITAVASTATIN MAGNESIUM]","[HUMAN PRESCRIPTION DRUG, HUMAN PRESCRIPTION D...","[ORAL, ORAL]","[PITAVASTATIN, PITAVASTATIN]","[MEDICURE INTERNATIONAL INC, CADILA HEALTHCARE...","[HMGCOA REDUCTASE INHIBITOR EPC, HMGCOA REDUCT...","["""", """"]","["""", """"]",[HYDROXYMETHYLGLUTARYLCOA REDUCTASE INHIBITORS...
ZYPREXA,[75374],[OLANZAPINE],[HUMAN PRESCRIPTION DRUG],[ORAL],[OLANZAPINE],[ELI LILLY AND COMPANY],[ATYPICAL ANTIPSYCHOTIC EPC],"[""""]","[""""]","[""""]"
ZYTIGA,[38876],[ABIRATERONE ACETATE],[HUMAN PRESCRIPTION DRUG],[ORAL],[ABIRATERONE ACETATE],[JANSSEN BIOTECH INC],"[""""]","[""""]","[""""]","[""""]"
ZYVANA,[78025],[ZYVANA],[HUMAN PRESCRIPTION DRUG],[ORAL],[ASCORBIC ACID CHOLECALCIFEROL PYRIDOXINE HYDR...,[STERLINGKNIGHT PHARMACEUTICALS LLC],[VITAMIN C EPC VITAMIN D EPC],"[""""]",[ASCORBIC ACID CS VITAMIN D CS],"[""""]"


In [7]:
#df[df.brand_name == 'LIDOCAINE HYDROCHLORIDE']

In [15]:
# Persist the grouped table; `brand_name` is the index and is written as the
# first CSV column.
output_csv = 'Openfda_Drug_Deduplicated.csv'
df_grouped.to_csv(output_csv)