In [13]:
import gzip
import pandas as pd
from lxml import etree
from lxml.etree import XMLSyntaxError
from time import time
from tqdm import tqdm
import logging
import sys

compressed_uniref50_xml = "./data/raw/uniref50/uniref50.xml.gz"

# ncbi taxonomy
# arabidopsis： 3702
# fly: 7227
# mouse: 10090
# worm: 6239
# yeast: 559292
# human: 9606
target_taxons = ["3702", "7227", "10090", "6239", "559292", "9606"]

In [14]:
# read xml.gz in a lazy way
def each_chunk(stream, separator):
    """
    Yield lines from `stream` until `separator`. Source: https://stackoverflow.com/a/47927374
    """
    buffer = ""
    while True:  # until EOF
        chunk = stream.read(65536).decode()  # read 2^16 bytes
        if not chunk:  # EOF?
            yield buffer
            break
        buffer += chunk
        while True:  # until no separator is found
            try:
                part, buffer = buffer.split(separator, 1)
            except ValueError:
                break
            else:
                yield part

Parse XML

In [15]:
label = "UniRef50"  # to remove the label in the end of document

# handle a entry
def handle_entry(xml):
    # -- parse the XML
    xml = xml.replace("</{}>\n".format(label), "")
    try:
        root = etree.fromstring(xml)
    except XMLSyntaxError:
        print(xml)
        return None

    clusterid = root.get("id")
    protname = root.xpath("/entry/name/text()")[0]
    mem_string_list = []

    # mem_cnt = int(root.xpath("/entry/property[@type='member count']/@value")[0])
    common_taxon = root.xpath("/entry/property[@type='common taxon']/@value")[0]
    common_taxon_id = root.xpath("/entry/property[@type='common taxon ID']/@value")[0]
   
    # representative member
    rep_mem_accessions = root.xpath("/entry/representativeMember/dbReference/property[@type='UniProtKB accession']/@value")
    rep_ncbi_taxon = root.xpath("/entry/representativeMember/dbReference/property[@type='NCBI taxonomy']/@value")
    assert len(rep_ncbi_taxon) == 1
    if rep_ncbi_taxon[0] in target_taxons:
        mem_string_list.append(",".join(rep_mem_accessions) + ":" + rep_ncbi_taxon[0])

    # members
    members_entries = root.xpath("/entry/member/dbReference[@type='UniProtKB ID']")
    for mem in members_entries:
        mem_accessions = mem.xpath("property[@type='UniProtKB accession']/@value")
        mem_ncbi_taxon = mem.xpath("property[@type='NCBI taxonomy']/@value")
        assert len(mem_ncbi_taxon) == 1
        if mem_ncbi_taxon[0] in target_taxons:
            mem_string_list.append(",".join(mem_accessions) + ":" + mem_ncbi_taxon[0])

    if len(mem_string_list) == 0:
        return None

    member_string = ";".join(mem_string_list)
    return f"{clusterid}\t{protname}\t{common_taxon}\t{common_taxon_id}\t{member_string}"

In [16]:
out_f = gzip.open("./cluster_member_six_module_orgas.tsv.gz", "wb")
out_f.write("Entry\tName\tCommon Taxon\tCommon Taxon ID\tMembers\n".encode())
out_f.flush()

start = time()
sep = "<entry"
in_f = gzip.open(compressed_uniref50_xml, "rb")
buffer_size = 1000
buffer = []

for idx, chunk in tqdm(enumerate(each_chunk(in_f, separator=sep)), mininterval=10):
    if idx == 0:        # skip the header
        continue

    # -- get the XML entry as text
    xml = sep + chunk  # separator has been dropped, add it to get a valid xml    

    cluster = handle_entry(xml)
    if cluster is not None:
        buffer.append(cluster)
    
        if len(buffer) >= buffer_size:
            out_f.write(("\n".join(buffer)+"\n").encode())
            buffer = []
            out_f.flush()

if len(buffer) > 0:
    out_f.write(("\n".join(buffer)+"\n").encode())
    out_f.flush()
    buffer = []

end = time()
in_f.close()
out_f.close()
print("{} entries converted in {:.2f} seconds".format(idx, end - start))

52523203it [4:25:57, 3291.39it/s]

<entry id="UniRef50_UPI001BB8EC77" updated="2022-04-27">
<name>Cluster: Uncharacterized protein</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Thermobacillus xylanilyticus"/>
<property type="common taxon ID" value="76633"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI001BB8EC77">
<property type="UniRef100 ID" value="UniRef100_UPI001BB8EC77"/>
<property type="UniRef90 ID" value="UniRef90_UPI001BB8EC77"/>
<property type="protein name" value="Uncharacterized protein"/>
<property type="source organism" value="Szabonella"/>
<property type="NCBI taxonomy" value="76633"/>
<property type="length" value="11"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="11" checksum="40EAC3F95AAEB9C7">MVLLFCDIVRV</sequence>
</representativeMember>
</entry>
</UniRef50>

52523202 entries converted in 15957.76 seconds



