In [1]:
import pandas as pd

In [2]:
# Read the finished XMLProduct export -- the table that will receive a DrugBank ID (DBID) --
# and keep only the rows that actually carry label text.
DM = pd.read_csv('../data/output/XMLProduct.csv').dropna(subset=["Text"])
DM.head()

Unnamed: 0,Label_ID,Drug_Brand_Name,Active_ingredient,UNII_ID,Formatted_Text,Text
0,../DailyMedExtracter/prescription/temp_xml/000...,SRONYX,Ethinyl Estradiol|Levonorgestrel,423D2T571U|5W7SIA7YZW,Oral contraceptives are indicated for the prev...,['Oral contraceptives are indicated for the pr...
1,../DailyMedExtracter/prescription/temp_xml/000...,Diclofenac Sodium,DICLOFENAC,144O8QL0L1,Carefully consider the potential benefits and ...,Carefully consider the potential benefits and ...
2,../DailyMedExtracter/prescription/temp_xml/001...,Triamterene and Hydrochlorothiazide,HYDROCHLOROTHIAZIDE|TRIAMTERENE,0J48LPH2TH|WS821Z52LQ,\n |Triamterene and hydroc...,\n |This fixed combination...
3,../DailyMedExtracter/prescription/temp_xml/001...,Venlafaxine Hydrochloride,VENLAFAXINE,GRZ5RCB1QG,Venlafaxine hydrochloride extended-release cap...,Venlafaxine hydrochloride extended-release cap...
4,../DailyMedExtracter/prescription/temp_xml/001...,Bleomycin,BLEOMYCIN,40S1VHN69B,Bleomycin for injection should be considered a...,Bleomycin for injection should be considered a...


In [3]:
# Report how many DailyMed labels were loaded and how many of them remain when only one
# label per active-ingredient key (UNII_ID) is counted.
dailymed_labels = len(DM)
dailymed_unii = DM.drop_duplicates(subset='UNII_ID').shape[0]
print("The total number of labels gathered from the DailyMed website is:       ", dailymed_labels)
print("The total number of UNIQUE labels gathered from the DailyMed website is:", dailymed_unii)

The total number of labels gathered from the DailyMed website is:        36340
The total number of UNIQUE labels gathered from the DailyMed website is: 2455


In [4]:
# Read the DrugBank vocabulary export, which lists each DrugBank ID / drug name together
# with the UNII of its active ingredient.
drugbank_vocabulary_csv = '../data/input/drugbank vocabulary.csv'
DB = pd.read_csv(drugbank_vocabulary_csv)
DB.head()

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
0,DB00001,BIOD00024 | BTD00024,Lepirudin,138068-37-8,Y43GF64R34,Hirudin variant-1 | Lepirudin recombinant,
1,DB00002,BIOD00071 | BTD00071,Cetuximab,205923-56-4,PQX0D8J21J,Cetuximab | Cétuximab | Cetuximabum | Immunogl...,
2,DB00003,BIOD00001 | BTD00001,Dornase alfa,143831-71-4,953A26OA1Y,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,Denileukin diftitox,173146-27-5,25E79B5CTM,Denileukin | Interleukin-2/diptheria toxin fus...,
4,DB00005,BIOD00052 | BTD00052,Etanercept,185243-69-0,OP401G7OJC,Etanercept-szzs | RHU TNFR:FC | RHU-TNFR:FC | ...,


In [5]:
# Keep only the identifier, UNII and name columns, and give them the column names used on
# the DailyMed side (UNII_ID is the key the two tables are matched on later).
# rename() is chained so that a new frame is returned instead of mutating the freshly
# selected column subset in place.
DB = DB[['DrugBank ID', 'UNII', 'Common name']].rename(
    columns={'DrugBank ID': 'DB_ID', 'UNII': 'UNII_ID', 'Common name': 'Drug_name'}
)

In [6]:
# Report the number of rows in the DrugBank table and how many remain when only one row per
# UNII_ID is counted (i.e. the second figure counts distinct UNII_ID values).
# Rows with a missing UNII_ID, if any, have not been dropped yet at this point and are
# therefore still included in both figures.
drugbank_rows = len(DB)
drugbank_unii = DB.drop_duplicates(subset='UNII_ID').shape[0]
print("The total number of DBID gathered from the Drugbank website is:        ", drugbank_rows)
print("The total number of UNIQUE DBID gathered from the Drugbank website is: ", drugbank_unii)

The total number of DBID gathered from the Drugbank website is:         12112
The total number of UNIQUE DBID gathered from the Drugbank website is:  7791


In [7]:
# Discard DrugBank rows without a UNII_ID: that column is the key used to match DrugBank
# entries to DailyMed labels. The result is reassigned rather than mutating DB in place.
DB = DB.dropna(subset=["UNII_ID"])

In [8]:
# Count the whitespace-separated tokens in every label's Text and store the result as a
# WordCount column on the DailyMed frame.
# The vectorised string accessor replaces a row-by-row iterrows() loop; the resulting
# Series is aligned on DM's index, so each count lands on the row it was computed from.
DM['WordCount'] = DM['Text'].str.split().str.len()


In [9]:
# Order the labels from the highest WordCount to the lowest and preview the top rows.
DM = DM.sort_values("WordCount", ascending=False)
DM.head()

Unnamed: 0,Label_ID,Drug_Brand_Name,Active_ingredient,UNII_ID,Formatted_Text,Text,WordCount
5752,../DailyMedExtracter/prescription/temp_xml/2e8...,Omnipaque,Iohexol,4419T9MX03,OMNIPAQUE 300 is indicated for intrathecal adm...,['OMNIPAQUE 300 is indicated for intrathecal a...,4374
27955,../DailyMedExtracter/prescription/temp_xml/9fb...,Omnipaque,Iohexol,4419T9MX03,OMNIPAQUE 300 is indicated for intrathecal adm...,['OMNIPAQUE 300 is indicated for intrathecal a...,4374
13041,../DailyMedExtracter/prescription/temp_xml/5ef...,ESTROSTEP Fe,ETHINYL ESTRADIOL|NORETHINDRONE,423D2T571U|T18F433X4S,ESTROSTEP Fe is indicated for the prevention o...,['ESTROSTEP Fe is indicated for the prevention...,3856
34091,../DailyMedExtracter/prescription/temp_xml/d72...,Dexamethasone Sodium Phosphate,DEXAMETHASONE,7S5I7G3JQL,\n |\n ...,"['\n ', '\n ...",2481
32857,../DailyMedExtracter/prescription/temp_xml/cbb...,Ribavirin,RIBAVIRIN,49717AWG6K,Ribavirin capsules are indicated in combinatio...,['Ribavirin capsules are indicated in combinat...,2183


In [10]:
# Collapse DailyMed to a single label per UNII_ID. The frame was sorted by WordCount in
# descending order beforehand, so keeping the first occurrence retains, for each UNII_ID,
# a label with the highest word count.
DM = DM.drop_duplicates(subset='UNII_ID', keep='first')

In [11]:
# Match DailyMed labels to DrugBank rows on their shared UNII_ID column. The inner join
# keeps only labels whose UNII_ID also occurs in the DrugBank table and adds DrugBank's
# remaining columns to them.
# NOTE(review): some DailyMed rows hold several ids joined with '|' in UNII_ID
# (combination products); presumably these never equal a single DrugBank UNII and are
# dropped by this join -- confirm that this is intended.
newDM = pd.merge(DM, DB, how='inner', on='UNII_ID')
newDM.head()

Unnamed: 0,Label_ID,Drug_Brand_Name,Active_ingredient,UNII_ID,Formatted_Text,Text,WordCount,DB_ID,Drug_name
0,../DailyMedExtracter/prescription/temp_xml/2e8...,Omnipaque,Iohexol,4419T9MX03,OMNIPAQUE 300 is indicated for intrathecal adm...,['OMNIPAQUE 300 is indicated for intrathecal a...,4374,DB01362,Iohexol
1,../DailyMedExtracter/prescription/temp_xml/d72...,Dexamethasone Sodium Phosphate,DEXAMETHASONE,7S5I7G3JQL,\n |\n ...,"['\n ', '\n ...",2481,DB01234,Dexamethasone
2,../DailyMedExtracter/prescription/temp_xml/cbb...,Ribavirin,RIBAVIRIN,49717AWG6K,Ribavirin capsules are indicated in combinatio...,['Ribavirin capsules are indicated in combinat...,2183,DB00811,Ribavirin
3,../DailyMedExtracter/prescription/temp_xml/174...,Sertraline Hydrochloride,SERTRALINE,QUC7NX6WMB,\n | |Sertraline hydrochlo...,"[None, '\n ', 'Major Depre...",1674,DB01104,Sertraline
4,../DailyMedExtracter/prescription/temp_xml/0a1...,Ciprofloxacin,CIPROFLOXACIN,5E8K9I0O4U,Ciprofloxacin tablets are a fluoroquinolone an...,['Ciprofloxacin tablets\xa0are a fluoroquinolo...,1497,DB00537,Ciprofloxacin


In [17]:
# Show how many de-duplicated DailyMed labels there were and how many rows the DrugBank
# match produced.
labels_before, labels_after = len(DM), len(newDM)
print ("The number of drug labels are reduced from ", labels_before, " to", labels_after)

The number of drug labels are reduced from  2455  to 1452


In [18]:
# Persist the merged table as CSV; the row index is not written to the file.
output_csv = "../data/output/unlabeled_withDBID.csv"
newDM.to_csv(output_csv, index=False)