In [485]:

from lxml import etree
from datetime import date
import pandas as pd

#http://www.accessdata.fda.gov/spl/stylesheet/spl-common.xsl
namespaces={"v3":"urn:hl7-org:v3",}

def normalize_date(date_string):
    year = int(date_string[0:4])
    month = int(date_string[4:6])
    day = int(date_string[6:8])
    return date(year,month,day).strftime("%b %d, %Y")

class DrugLabel(object):
    """represents a Drug Label in the SPL format.
    takes one argument, spl_label, which can be either an url or a file path"""

    def __init__(self, spl_label):
        self.label_data = spl_label
        self.xml = etree.parse(spl_label)

    def actives(self): #UNII code
        """returns a list of active compounds"""
        #here converting to a set removes duplicates
        return sorted(list(set(active.text for active in self.xml.xpath("//v3:ingredientSubstance/v3:activeMoiety/v3:activeMoiety/v3:name",namespaces=namespaces))))
    actives.label = "active cmpds"


    def unii(self): #UNII code
        """returns the drug's NDC number"""
        #this xpath query is NOT from the SPL xsl file
        return sorted(list(set(self.xml.xpath("//v3:ingredientSubstance/v3:activeMoiety/v3:activeMoiety/v3:code/@code",namespaces=namespaces))))
    unii.label = "unii"

    def iau(self):
        """returns the indications and usage section"""
        #here converting to a set removes duplicates
        return sorted(list(set(active.text for active in self.xml.xpath("//v3:structuredBody/",namespaces=namespaces))))
    actives.label = "active cmpds"
    

    def start_date(self):
        """returns start marketing date as a strftime formatted python date object"""
        date_string = self.xml.xpath("//v3:subjectOf/v3:marketingAct/v3:effectiveTime/v3:low/@value",namespaces=namespaces)[0]
        return normalize_date(date_string)
    start_date.label = "marketing start date"

    # don't bother - it's None for all labels
    # end_date.label = "marketing end date"
    # def end_date(self):
    # 	"""returns end marketing date as a strftime formatted python date object or the string "None if not defined
    # 	refers to the expiration date of the last lot released to the market
    # 	(from http://spl-work-group.wikispaces.com/file/view/creating_otc_sp_documentsl.pdf)"""
    # 	try:
    # 		date_string = self.xml.xpath("//v3:subjectOf/v3:marketingAct/v3:effectiveTime/v3:high/@value",namespaces=namespaces)[0]
    # 		return normalize_date(datestring)
    # 	except:
    # 		return "None"

    def marketing_category(self):
        """returns the marketing category"""
        return self.xml.xpath("//v3:subjectOf/v3:approval/v3:code/@displayName",namespaces=namespaces)[0]
    marketing_category.label = "marketing category"

    def revision_date(self):
        """returns label revision date"""
        date_string = self.xml.xpath("/v3:document/v3:effectiveTime/@value",namespaces=namespaces)[0]
        return normalize_date(date_string)
    revision_date.label = "revision date"

    def label_type(self):
        """returns the drug label type, typically 'HUMAN OTC DRUG LABEL' or 'HUMAN PRESCRIPTION DRUG LABEL' """
        return self.xml.xpath("//v3:code/@displayName",namespaces=namespaces)[0]
    label_type.label = "label type"


    def ndc(self):
        """returns the drug's NDC number"""
        #this xpath query is NOT from the SPL xsl file
        return self.xml.xpath("//v3:manufacturedProduct/v3:manufacturedProduct/v3:code/@code",namespaces=namespaces)[0]
    ndc.label = "ndc"

    def name(self):
        """returns the drug's name"""
        return self.xml.xpath("//v3:manufacturedProduct/v3:manufacturedProduct/v3:name",namespaces=namespaces)[0].text.replace("\t","").replace("\n","")
    name.label = "name"

    def distributor(self):
        """returns the drug's distributor"""
        return self.xml.xpath("//v3:author/v3:assignedEntity/v3:representedOrganization/v3:name",namespaces=namespaces)[0].text
    distributor.label = "distributor"

    def dosage_form(self):
        """returns the drug's dosage form"""
        return self.xml.xpath("//v3:manufacturedProduct/v3:manufacturedProduct/v3:formCode/@displayName",namespaces=namespaces)[0]
    dosage_form.label = "dosage form"

    #just a helper function for the next two functions, so no label
    def _get_word_list(self, word):
        """returns a list of etree instances of all occurances of 'word','Word' or 'WORD' """
        word = str(word)
        #this query also NOT from the SPL xsl file
        query = "//*[text()[contains(.,'%s') or contains(.,'%s') or contains(.,'%s')]]" %(word.lower(),word.upper(),word.capitalize())
        return self.xml.xpath(query,namespaces=namespaces)

    def test_word(self, word):
        if self._get_word_list(word): return 1
        else: return 0
    #TODO test_word.label = "%s?" %self.test_word.word

    def get_word_section(self, word):
        #finds the first ancestor section and returns section/code/@displayName
        word_section_list = []
        for word in self._get_word_list(word):
            word_section_list.extend(word.xpath("ancestor::v3:section[1]/v3:code/@displayName",namespaces=namespaces))
        if not word_section_list:
            return "n/a"
        return list(set(word_section_list))

    def get_text(self):
        """returns the drug's label"""
        text = self.xml.xpath('//v3:section/v3:code[@code="34067-9"]/..//v3:paragraph',namespaces=namespaces)
        length = len(text)
        newText = []
        for row in range(length):
            newText.append(text[row].text)
        return newText
    
    def get_fullText(self):
        """returns the drug's label"""
        text = self.xml.xpath('//v3:section/v3:code[@code="34067-9"]/..//v3:text//*',namespaces=namespaces)
        length = len(text)
        newText = []
        for row in range(length):
            newText.append(text[row].text)
        return newText

    def get_word_time(self, word):
        """
        returns the LATEST effectiveTime/@date for all of the instances of "word" mentioned
        """
        word_time_list = []
        for word in self._get_word_list(word):
            word_time_list += word.xpath("ancestor::v3:section/v3:effectiveTime/@value",namespaces=namespaces)
        if not word_time_list:
            return "n/a"
        try:
            return max(list(set(normalize_date(date) for date in word_time_list)))
        except:
            return max(list(normalize_date(date) for date in word_time_list))

    def build_url(self):
        """helper function that builds and returns the accessdata.fda.gov URL given the XML file name/directory"""
        #maybe won't work on windows because slash direction?
        uuid = self.label_data.split("/")[-1].split(".")[0]
        return "http://www.accessdata.fda.gov/spl/data/%s/%s.xml" %(uuid,uuid)

    def indications(self, xml):
        indications = []
        drugN = self.name()
        activeC = self.actives()
        uniiC = self.unii()
        label = self.get_text()
        try:
            text = '|'.join(label)
        except:
            text = label
        
        full_Label = self.get_fullText()
        try:
            context = '|'.join(full_Label)
        except:
            context = full_Label
        
        data = [xml, drugN, activeC[0], uniiC[0], text, context]
        inidcations = indications.append(data)  
        return indications


In [487]:
DL = DrugLabel("001e2ecb-0849-475b-812f-33db6800f7c7.xml")
indications = DL.indications("001e2ecb-0849-475b-812f-33db6800f7c7.xml")

In [488]:
indications

[['001e2ecb-0849-475b-812f-33db6800f7c7.xml',
  'CARISOPRODOL',
  'CARISOPRODOL',
  '21925K482H',
  'CARISOPRODOL is indicated for the relief of discomfort associated with acute, painful musculoskeletal conditions in adults. CARISOPRODOL should only be used for short periods (up to two or three weeks) because adequate evidence of effectiveness for more prolonged use has not been established and because acute, painful musculoskeletal conditions are generally of short duration. [|CARISOPRODOL is indicated for the relief of discomfort associated with acute, painful musculoskeletal conditions. ',
  ['CARISOPRODOL is indicated for the relief of discomfort associated with acute, painful musculoskeletal conditions in adults. CARISOPRODOL should only be used for short periods (up to two or three weeks) because adequate evidence of effectiveness for more prolonged use has not been established and because acute, painful musculoskeletal conditions are generally of short duration. [',
   '\n      

In [459]:
pd.DataFrame(indications, columns=['Drug Brand Name ', 'Active ingredient', 'UNII ID', 'Text'])

Unnamed: 0,Drug Brand Name,Active ingredient,UNII ID,Text
0,CARISOPRODOL,CARISOPRODOL,21925K482H,CARISOPRODOL is indicated for the relief of di...


In [493]:
import os
import xmltodict
import pprint
import json
import xml.etree.ElementTree as ET

count = 0

for filename in os.listdir('.'):
    if filename.endswith('.xml'):
        
        count = count +1
        try:
            DL = DrugLabel(filename)
        except:
            pass
        
        indications = DL.indications(filename)
        
        if count == 1:
            ind = pd.DataFrame(indications, columns=['Label_ID','Drug Brand Name', 'Active ingredient', 'UNII ID', 'Formatted Text','Text'])
        else:
            newind = pd.DataFrame(indications, columns=['Label_ID','Drug Brand Name', 'Active ingredient', 'UNII ID', 'Formatted Text','Text'])
            ind = ind.append(newind)
            

            
#//component/section/code[@code='34067-9']/..//text

In [435]:
name = DL.name()
active = DL.actives()
uniiC = DL.unii()
ind = pd.DataFrame()
ind  = ind.append([[name, active[0], uniiC[0]]])

3229

In [495]:
ind.to_csv("DailyMedXML2CSV.csv")

In [303]:
text = DL.xml.xpath('//v3:section/v3:code[@code="34067-9"]/..//v3:paragraph',namespaces=namespaces)


'Because of the risks of addiction, abuse, and misuse, with opioids, even at recommended doses [see '

In [307]:
text = DL.get_text()
text[0].text

AttributeError: 'list' object has no attribute 'text'

In [272]:
DL.unii()

'CD35PMG570'

In [128]:
print(DL.actives())

['SEVELAMER']


In [129]:
print(DL.name())

Sevelamer Carbonate


In [130]:
DL.get_word_section("indicat")

['RECENT MAJOR CHANGES SECTION',
 'CONTRAINDICATIONS SECTION',
 'INDICATIONS & USAGE SECTION',
 'DESCRIPTION SECTION']

In [291]:
DL.ndc()

'10702-184'