## Import data and libraries

In [None]:
# get XML file
# !wget -P ../../target -c "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml"

In [4]:
#!tail -n 100 '../input/bioproject.xml'
#!head -n 100 '../input/bioproject.xml'

In [98]:
%%time
import lxml.etree as ET
import pandas as pd
from datetime import datetime

CPU times: user 34 µs, sys: 70 µs, total: 104 µs
Wall time: 116 µs


# Using lxml

https://lxml.de/api/lxml-module.html

In [99]:
%%time

def findTagInfo(fileName):
    count = 0
    col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']
    
    fn = '../../target/biosampleDescriptionDF.tsv'
    
    for _, elem in ET.iterparse(fileName, events=('end',),
                                   tag=('ArchiveID', 'ProjectDescr'),
                                   remove_blank_text=True):
        # Initialize variables
        studyName = None
        studyTitle = None
        studyDesc = None
        biosampleList = []
        df = pd.DataFrame(columns = col)
        
        # Capture information based on the tag
        # being iterated over.
        if elem.tag == 'ArchiveID':
            studyId = elem.get('accession')
        
        elif elem.tag == 'ProjectDescr':
            studyName = elem.findtext('Name')
            studyTitle = elem.findtext('Title')
            studyDesc = elem.findtext('Description')
            locus = elem.findall('LocusTagPrefix')
            for l in locus:
                if l.get('biosample_id') is not None:
                    biosampleList.append('BIOSAMPLE:'+l.get('biosample_id'))

        else:
            elem.clear() # Clear object to free up space.
        
        # If any information missing, treat them as stray.
        if None not in (studyId, studyName, studyTitle, studyDesc) and len(biosampleList)>0:
            count += 1
            for i,v in enumerate(biosampleList):
                df = df.append(pd.DataFrame([[studyId, studyName, studyTitle, studyDesc, v]],columns = col))
                
        # Export DataFrame into TSV
        if count == 1:
            df.to_csv(fn , sep='\t', index=False)
        else:
            df.to_csv(fn, sep='\t', mode='a+', header=False, index=False)

CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs
Wall time: 12.2 µs


In [100]:
%%time

xml = '../../target/bioproject.xml'
findTagInfo(xml)

CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s
Wall time: 1h 18min 57s


In [None]:
###################DEPRECATED############

# Using standard python library xml.etree.ElementTree (Slow)

'''%%time
xtree = et.parse('../../target/bioproject.xml')
xroot = xtree.getroot()

def nodeParse(node, col):
  
    df = pd.DataFrame(columns = col)
    biosampleIdList = []
    studyName =  None 
    studyTitle = None
    studyDesc = None
    
    
    for childNode in node:
        #print(childNode.tag, childNode.attrib)
        if childNode.tag == 'Name':
            studyName = childNode.text
            #print(studyName)
        if childNode.tag == 'Title':
            studyTitle = childNode.text
            #print(studyTitle)
        if childNode.tag == 'Description':
            studyDesc = childNode.text
            #print(studyDesc)
        if childNode.tag == 'LocusTagPrefix':
            #if 'assembly_id' in childNode.attrib:
                #assemblyId = childNode.attrib['assembly_id']
                #print(assemblyId)
                
            if 'biosample_id' in childNode.attrib:
                biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])
                
                for i,v in enumerate(biosampleIdList):
                    if None not in (studyName, studyTitle, studyDesc, v):
                        df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))

    return df

%timeit
count = 0
fn = '../../target/biosampleDescriptionDF.tsv'
dfCols = ['Name', 'Title', 'Description', 'BiosampleId']
dfMain = pd.DataFrame(columns = dfCols)
for n in xroot.iterparse('ProjectDescr'):
    count += 1
    dfMain = dfMain.append(nodeParse(n, dfCols))
    if count % 1000 == 0:
        print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        if count == 1000:
            dfMain.to_csv(fn , sep='\t', index=False)
        elif count > 1000:
            dfMain.to_csv(fn, sep='\t', mode='a+', header=False, index=False)
            
        dfMain = pd.DataFrame(columns = dfCols)
        #break;'''
