In [1]:
from lxml import etree
# Parse the Q9Y261 XML file
xml_file = open('Q9Y261.xml', 'rb')
tree     = etree.parse(xml_file)
tree

<lxml.etree._ElementTree at 0x249bd5f45c0>

In [2]:
# Define the namespace prefixes and URIs
namespace = {'uniprot': 'http://uniprot.org/uniprot'}

In [3]:
accessions   = tree.xpath('//uniprot:accession/text()', namespaces=namespace)
fullname     = tree.xpath('//uniprot:fullName/text()', namespaces=namespace)
shortname    = tree.xpath('//uniprot:shortName/text()', namespaces=namespace)
name         = tree.xpath('//uniprot:name/text()', namespaces=namespace)
gene         = tree.xpath('//uniprot:protein/gene/text()', namespaces=namespace)
#fullname    = tree.xpath('//uniprot:fullName/text()', namespaces=namespace)
#fullname    = tree.xpath('//uniprot:fullName/text()', namespaces=namespace)
#fullname    = tree.xpath('//uniprot:fullName/text()', namespaces=namespace)


In [7]:
name

['FOXA2_HUMAN', 'FOXA2', 'HNF3B', 'TCF3B', 'Homo sapiens', 'Human', '1', '2']

In [9]:
root  = tree.getroot()
entry = root.find('{http://uniprot.org/uniprot}entry')

In [None]:
protein = entry.findall('.//{http://uniprot.org/uniprot}protein')[0]

In [11]:
# Open the XML file
with open('Q9Y261.xml', 'rb') as f:
    xml = f.read()

# Parse the XML using lxml
root = etree.fromstring(xml)

# Iterate over the entry elements and extract the desired information
for entry in root.findall('.//entry'):
    protein   = entry.find('protein')
    gene      = entry.find('gene')
    organism  = entry.find('organism')
    reference = entry.find('reference')
    
    # Print the extracted information
    print('Protein:', protein.text)
    print('Gene:', gene.attrib['name'])
    print('Organism:', organism.find('name').text)
    print('Reference:', reference.attrib['key'])

In [12]:
[entry.find('protein') for entry in root.findall('.//entry')]

[]

In [17]:
import xml.etree.ElementTree as ET
# Parse the XML file and get the root element
tree = ET.parse('Q9Y261.xml')
root = tree.getroot()
root

<Element '{http://uniprot.org/uniprot}uniprot' at 0x00000249BEAB2A40>

In [16]:

# Loop through each uniprot element and extract the specified elements
for uniprot in root.findall('uniprot'):
    entry     = uniprot.find('entry')
    protein   = entry.find('protein').find('recommendedName').find('fullName').text
    gene      = entry.find('gene').find('name').text
    organism  = entry.find('organism').find('name').text
    reference = entry.find('reference').find('citation').find('title').text
    
    # Print the extracted elements
    print(f'Entry: {entry}')
    print(f'Protein: {protein}')
    print(f'Gene: {gene}')
    print(f'Organism: {organism}')
    print(f'Reference: {reference}\n')

## Using minidom

In [31]:
import xml.dom.minidom as minidom
# Open the XML file
with open('Q9Y261.xml', 'r') as f:
    xml_str = f.read()
# Parse the XML using minidom
dom = minidom.parseString(xml_str)
# Get the root element
root = dom.documentElement

# Iterate over the entry elements and extract the desired information
entries = root.getElementsByTagName('entry')
for entry in entries:
    protein   = entry.getElementsByTagName('protein')[0]
    gene      = entry.getElementsByTagName('gene')[0]
    organism  = entry.getElementsByTagName('organism')[0]
    reference = entry.getElementsByTagName('reference')[0]
    
    # Print the extracted information
    print('Protein:', protein.getElementsByTagName('fullName')[0].firstChild.nodeValue)
    print('Gene:', gene.getAttribute('name'))
    print('Organism:', organism.getElementsByTagName('name')[0].firstChild.nodeValue)
    print('Reference:', reference.getAttribute('key'))

Protein: Hepatocyte nuclear factor 3-beta
Gene: 
Organism: Homo sapiens
Reference: 1


In [None]:
### Add previous elements to graph data model

In [None]:
from py2neo import Graph, Node, Relationship
import xml.dom.minidom as minidom

# Connect to the Neo4j database
graph = Graph('bolt://localhost:7687', auth=('username', 'password'))

# Open the XML file
with open('Q9Y261.xml', 'r') as f:
    xml_str = f.read()
# Parse the XML using minidom
dom = minidom.parseString(xml_str)
# Get the root element
root = dom.documentElement

# Iterate over the entry elements and create nodes and relationships
entries = root.getElementsByTagName('entry')
for entry in entries:
    # Extract information from the entry
    protein   = entry.getElementsByTagName('protein')[0]
    gene      = entry.getElementsByTagName('gene')[0]
    organism  = entry.getElementsByTagName('organism')[0]
    reference = entry.getElementsByTagName('reference')[0]
    
    # Create nodes for the protein, gene, organism, and reference
    protein_node   = Node('Protein', name=protein.getElementsByTagName('fullName')[0].firstChild.nodeValue)
    gene_node      = Node('Gene', name=gene.getAttribute('name'))
    organism_node  = Node('Organism', name=organism.getElementsByTagName('name')[0].firstChild.nodeValue)
    reference_node = Node('Reference', key=reference.getAttribute('key'))
    
    # Create relationships between the nodes
    protein_gene_rel      = Relationship(protein_node, 'IS_PART_OF_GENE', gene_node)
    gene_organism_rel     = Relationship(gene_node, 'IS_PART_OF_ORGANISM', organism_node)
    protein_reference_rel = Relationship(protein_node, 'IS_REFERENCED_BY', reference_node)
    
    # Add the nodes and relationships to the database
    graph.create(protein_node)
    graph.create(gene_node)
    graph.create(organism_node)
    graph.create(reference_node)
    graph.create(protein_gene_rel)
    graph.create(gene_organism_rel)
    graph.create(protein_reference_rel)