In [1]:
#!/usr/bin/env python
#uni2pandas.py - This script is used to convert data from UniProt database format to pandas dataframe.
import click as ck
import numpy as np
import pandas as pd
import gzip
import logging
from utils import Ontology, is_exp_code, FUNC_DICT

logging.basicConfig(level=logging.INFO)

ORGS = set(['HUMAN', 'MOUSE', ])


C:\Users\Mohamed\Anaconda_3_v_1\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\Mohamed\Anaconda_3_v_1\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:

def main(go_file, uniprot_file, filter_exp, prop_annots, out_file,org):
    go = Ontology(go_file, with_rels=True)
    proteins, accessions, sequences, annotations, interpros, orgs, genes,gene_names = load_data(uniprot_file)
    df = pd.DataFrame({
        'proteins': proteins,
        'accessions': accessions,
        'sequences': sequences,
        'annotations': annotations,
        'interpros': interpros,
        'orgs': orgs,
        'genes': genes,
        'gene_names': gene_names
    })
    # Filter proteins
    df = df[df['orgs'] == org]
    logging.info('Filtering proteins with experimental annotations')
    index = []
    annotations = []
    iea_annotations = []
    for i, row in enumerate(df.itertuples()):
        annots = set()
        iea_annots = set()
        for annot in row.annotations:
            go_id, code = annot.split('|')
            anch_set = go.get_anchestors(go_id)
            if is_exp_code(code):
                annots |= anch_set
            iea_annots |= anch_set
        annots = list(annots)
        iea_annots = list(iea_annots)
        annotations.append(annots)
        iea_annotations.append(iea_annots)
    df['exp_annotations'] = annotations
    df['iea_annotations'] = iea_annotations

    df.to_pickle(out_file)
    logging.info('Successfully saved %d proteins' % (len(df),) )
    
def load_data(uniprot_file):
    print('....31')
    proteins = list()
    accessions = list()
    sequences = list()
    annotations = list()
    interpros = list()
    orgs = list()
    genes = list()
    gene_names = list()
    with gzip.open(uniprot_file, 'rt') as f:
        prot_id = ''
        prot_ac = ''
        seq = ''
        org = ''
        gene_id = ''
        names = list()
        annots = list()
        ipros = list()
        for line in f:
            items = line.strip().split('   ')
            if items[0] == 'ID' and len(items) > 1:
                if prot_id != '':
                    proteins.append(prot_id)
                    accessions.append(prot_ac)
                    sequences.append(seq)
                    annotations.append(annots)
                    interpros.append(ipros)
                    orgs.append(org)
                    genes.append(gene_id)
                    gene_names.append(names)
                prot_id = items[1]
                annots = list()
                ipros = list()
                names = list()
                seq = ''
                gene_id = ''
            elif items[0] == 'AC' and len(items) > 1:
                prot_ac = items[1]
            elif items[0] == 'OX' and len(items) > 1:
                if items[1].startswith('NCBI_TaxID='):
                    org = items[1][11:]
                    end = org.find(' ')
                    org = org[:end]
                else:
                    org = ''
            elif items[0] == 'GN' and len(items) > 1:
                items = items[1][:-1].split('; ')
                for item in items:
                    if item.startswith('Name='):
                        names.append(item[5:].split()[0])
                    elif item.startswith('Synonyms='):
                        for item in list(item[9:].split(', ')):
                            names.append(item.split()[0])
                    elif item.startswith('ORFNames='):
                        for item in list(item[9:].split(', ')):
                            names.append(item.split()[0])
                    elif item.startswith('OrderedLocusNames='):
                        for item in list(item[9:].split(', ')):
                            names.append(item.split()[0])
            elif items[0] == 'DR' and len(items) > 1:
                items = items[1].split('; ')
                if items[0] == 'GO':
                    go_id = items[1]
                    code = items[3].split(':')[0]
                    annots.append(go_id + '|' + code)
                if items[0] == 'InterPro':
                    ipro_id = items[1]
                    ipros.append(ipro_id)
                if items[0] == 'GeneID':
                    gene_id = items[1]
            elif items[0] == 'SQ':
                seq = next(f).strip().replace(' ', '')
                while True:
                    sq = next(f).strip().replace(' ', '')
                    if sq == '//':
                        break
                    else:
                        seq += sq

        proteins.append(prot_id)
        accessions.append(prot_ac)
        sequences.append(seq)
        annotations.append(annots)
        interpros.append(ipros)
        orgs.append(org)
        genes.append(gene_id)
        gene_names.append(names)
    return proteins, accessions, sequences, annotations, interpros, orgs, genes,gene_names




In [11]:
main('data/go.obo','data/uniprot_sprot.dat.gz',True,True,'data/My_Implementations/swissprot_version.pkl' ,'9606')

....31


INFO:root:Filtering proteins with experimental annotations
INFO:root:Successfully saved 20365 proteins


In [13]:
unpickled_df = pd.read_pickle('data/My_Implementations/swissprot_version.pkl')

In [14]:
unpickled_df

Unnamed: 0,proteins,accessions,sequences,annotations,interpros,orgs,genes,gene_names,exp_annotations,iea_annotations
321,1433B_HUMAN,P31946; A8K9K2; E1P616;,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...,"[GO:0005737|IDA, GO:0005829|IDA, GO:0070062|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7529,[YWHAB],"[GO:0008150, GO:0007006, GO:0035329, GO:000718...","[GO:0008150, GO:0007006, GO:0035329, GO:000718..."
338,1433E_HUMAN,P62258; B3KY71; D3DTH5; P29360; P42655; Q4VJB6...,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,"[GO:0005623|IEA, GO:0090724|IEA, GO:0005737|ID...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7531,[YWHAE],"[GO:0005261, GO:0043066, GO:0008150, GO:000700...","[GO:0005261, GO:0043066, GO:0008150, GO:000700..."
344,1433F_HUMAN,Q04917;,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...,"[GO:0005737|ISS, GO:0005829|TAS, GO:0070062|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7533,"[YWHAH, YWHA1]","[GO:0005261, GO:0086010, GO:0031958, GO:000815...","[GO:0005261, GO:0086010, GO:0051093, GO:000815..."
350,1433G_HUMAN,P61981; O70457; P35214; Q6FH52; Q9UDP2; Q9UN99;,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,"[GO:0005829|TAS, GO:0070062|HDA, GO:0005925|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7532,[YWHAG],"[GO:0008150, GO:0007006, GO:0032501, GO:000701...","[GO:0008150, GO:0007006, GO:0032501, GO:000701..."
356,1433S_HUMAN,P31947; Q6FH30; Q6FH51; Q96DH0;,MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...,"[GO:0005829|IDA, GO:0070062|HDA, GO:0005615|TA...","[IPR000308, IPR023409, IPR036815, IPR023410, I...",9606,2810,"[SFN, HME1]","[GO:0004860, GO:0043066, GO:0008150, GO:000700...","[GO:0043066, GO:0008150, GO:0007006, GO:003250..."
...,...,...,...,...,...,...,...,...,...,...
561865,ZY11A_HUMAN,Q6WRX3; A6NCK5;,MVHFLHPGHTPRNIVPPDAQKDALGCCVVQEEASPYTLVNICLNVL...,[GO:0031462|IBA],"[IPR011989, IPR016024, IPR032675]",9606,440590,"[ZYG11A, ZYG11]",[],"[GO:0044464, GO:1902494, GO:0005622, GO:000015..."
561867,ZY11B_HUMAN,Q9C0D3; Q8N2X3; Q9H8L8;,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,"[GO:0031462|IDA, GO:0032436|IMP, GO:0006515|IMP]","[IPR011989, IPR016024, IPR001611, IPR032675, I...",9606,79699,"[ZYG11B, KIAA1730]","[GO:0044267, GO:0008150, GO:0030163, GO:003016...","[GO:0044267, GO:0008150, GO:0030163, GO:003016..."
561880,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,"[GO:0005737|IBA, GO:0005829|IDA, GO:0005925|ID...",[IPR001781],9606,7791,[ZYX],"[GO:0030029, GO:0008150, GO:0043226, GO:000701...","[GO:0030029, GO:0008150, GO:0043226, GO:000701..."
561884,ZZEF1_HUMAN,O43149; A7MBM5; Q6NXG0; Q6ZRA1; Q6ZSF4; Q9NVB9;,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,"[GO:0005509|IEA, GO:0008270|IEA]","[IPR004939, IPR011992, IPR002048, IPR008979, I...",9606,23140,"[ZZEF1, KIAA0399]",[],"[GO:0003674, GO:0046914, GO:0005488, GO:004316..."


In [15]:
unpickled_df = pd.read_pickle('data/swissprot.pkl')

In [16]:
unpickled_df

Unnamed: 0,proteins,accessions,sequences,annotations,interpros,orgs,genes,gene_names,exp_annotations,iea_annotations
318,1433B_HUMAN,P31946; A8K9K2; E1P616;,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...,"[GO:0005737|IDA, GO:0005829|IDA, GO:0070062|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7529,[YWHAB],"[GO:0010648, GO:1905477, GO:0010608, GO:004426...","[GO:0010648, GO:0044260, GO:0008637, GO:007184..."
335,1433E_HUMAN,Q7M4R4;,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,"[GO:0090724|IEA, GO:0005737|IDA, GO:0005829|TA...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7531,[YWHAE],"[GO:0034220, GO:0086013, GO:0051051, GO:004426...","[GO:0034220, GO:0086013, GO:0051051, GO:004426..."
341,1433F_HUMAN,Q04917;,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...,"[GO:0005737|ISS, GO:0005829|TAS, GO:0070062|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7533,"[YWHAH, YWHA1]","[GO:1905477, GO:0034220, GO:0044260, GO:007088...","[GO:0034220, GO:0044260, GO:0050804, GO:005109..."
347,1433G_HUMAN,P61981; O70457; P35214; Q6FH52; Q9UDP2; Q9UN99;,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,"[GO:0005829|TAS, GO:0070062|HDA, GO:0005925|HD...","[IPR000308, IPR023409, IPR036815, IPR023410]",9606,7532,[YWHAG],"[GO:1905477, GO:0050804, GO:0000278, GO:000863...","[GO:0044260, GO:0050804, GO:0008637, GO:007184..."
353,1433S_HUMAN,P31947; Q6FH30; Q6FH51; Q96DH0;,MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSV...,"[GO:0005829|IDA, GO:0070062|HDA, GO:0005615|TA...","[IPR000308, IPR023409, IPR036815, IPR023410, I...",9606,2810,"[SFN, HME1]","[GO:1905477, GO:0097153, GO:0044260, GO:007242...","[GO:0044260, GO:0072422, GO:0045930, GO:003250..."
...,...,...,...,...,...,...,...,...,...,...
559588,ZY11A_HUMAN,Q6WRX3; A6NCK5;,MVHFLHPGHTPRNIVPPDAQKDALGCCVVQEEASPYTLVNICLNVL...,[GO:0031462|IBA],"[IPR011989, IPR016024, IPR032675]",9606,440590,"[ZYG11A, ZYG11]",[],"[GO:1990234, GO:1902494, GO:0032991, GO:004446..."
559590,ZY11B_HUMAN,Q9C0D3; Q8N2X3; Q9H8L8;,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,[GO:0031462|IBA],"[IPR011989, IPR016024, IPR001611, IPR032675, I...",9606,79699,"[ZYG11B, KIAA1730]",[],"[GO:1990234, GO:1902494, GO:0032991, GO:004446..."
559603,ZYX_HUMAN,Q15942; A4D2G6; B4DQX7; Q6I9S4;,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,"[GO:0005913|IDA, GO:0005829|IDA, GO:0005925|ID...",[IPR001781],9606,7791,[ZYX],"[GO:0044444, GO:0070887, GO:0071840, GO:005101...","[GO:0044444, GO:0051707, GO:0030139, GO:007088..."
559607,ZZEF1_HUMAN,O43149; A7MBM5; Q6NXG0; Q6ZRA1; Q6ZSF4; Q9NVB9;,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,"[GO:0005509|IEA, GO:0008270|IEA]","[IPR004939, IPR011992, IPR002048, IPR008979, I...",9606,23140,"[ZZEF1, KIAA0399]",[],"[GO:0005488, GO:0005509, GO:0046872, GO:004316..."


# Generating data for mouse

In [3]:
main('data/go.obo','data/uniprot_sprot.dat.gz',True,True,'data/My_Implementations/swissprot_version_mouse.pkl','10090' )

....31


INFO:root:Filtering proteins with experimental annotations
INFO:root:Successfully saved 17033 proteins


In [6]:
unpickled_df = pd.read_pickle('data/My_Implementations/swissprot_version_mouse.pkl')

In [7]:
unpickled_df

Unnamed: 0,proteins,accessions,sequences,annotations,interpros,orgs,genes,gene_names,exp_annotations,iea_annotations
323,1433B_MOUSE,Q9CQV8; O70455; Q3TY33; Q3UAN6;,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...,"[GO:0005737|ISO, GO:0005829|ISO, GO:0042470|IE...","[IPR000308, IPR023409, IPR036815, IPR023410]",10090,54401,[Ywhab],"[GO:0005575, GO:0046907, GO:0005623, GO:000367...","[GO:0044267, GO:0005575, GO:0046907, GO:000367..."
339,1433E_MOUSE,P62259; P29360; P42655; Q63631;,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,"[GO:0030424|ISO, GO:0005623|IEA, GO:0090724|IS...","[IPR000308, IPR023409, IPR036815, IPR023410]",10090,22627,[Ywhae],"[GO:0044267, GO:0005575, GO:0046907, GO:000367...","[GO:0044267, GO:0005575, GO:0046907, GO:004698..."
345,1433F_MOUSE,P68510; P11576; P70198; Q3TGZ9;,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...,"[GO:0005737|IDA, GO:0005829|TAS, GO:0014704|IC...","[IPR000308, IPR023409, IPR036815, IPR023410]",10090,22629,[Ywhah],"[GO:0005575, GO:0005911, GO:0046907, GO:000367...","[GO:0005575, GO:0031958, GO:0046982, GO:000367..."
351,1433G_MOUSE,P61982; O70457; P35214; Q3UFD6; Q4FK44; Q9UDP2...,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,"[GO:0005829|TAS, GO:0043209|HDA, GO:0098793|IS...","[IPR000308, IPR023409, IPR036815, IPR023410]",10090,22628,[Ywhag],"[GO:0005575, GO:0046907, GO:0044444, GO:000367...","[GO:0005575, GO:0046907, GO:0003674, GO:004846..."
357,1433S_MOUSE,O70456; Q3TEZ1;,MERASLIQKAKLAEQAERYEDMAAFMKSAVEKGEELSCEERNLLSV...,"[GO:0005737|IDA, GO:0005829|ISO, GO:0005576|IE...","[IPR000308, IPR023409, IPR036815, IPR023410, I...",10090,55948,"[Sfn, Mkrn3]","[GO:0044267, GO:0005575, GO:0046907, GO:000367...","[GO:0044267, GO:0005575, GO:0046907, GO:000367..."
362,1433T_MOUSE,P68254; P35216; Q3TW69; Q3UJN5; Q5SP76; Q5U423...,MEKTELIQKAKLAEQAERYDDMATCMKAVTEQGAELSNEERNLLSV...,"[GO:0005737|ISO, GO:0005829|TAS, GO:0032991|IS...","[IPR000308, IPR023409, IPR036815, IPR023410, I...",10090,22630,[Ywhaq],"[GO:0005575, GO:0051716, GO:0046907, GO:004444...","[GO:0005575, GO:0046907, GO:0003674, GO:003476..."
374,1433Z_MOUSE,P63101; P35215; P70197; P97286; Q3TSF1; Q5EBQ1;,MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSV...,"[GO:0005623|IEA, GO:0031252|ISO, GO:0005829|TA...","[IPR000308, IPR023409, IPR036815, IPR023410]",10090,22631,[Ywhaz],"[GO:0044267, GO:0005575, GO:0071944, GO:004690...","[GO:0044267, GO:0005575, GO:0008037, GO:004690..."
552,1A1L1_MOUSE,A2AIG8; Q8CHS6;,MFCLPQQESTAPTTCTGSASTQDMDSGYGDGLQGECLRKPDQTQPK...,"[GO:0003824|IEA, GO:0042802|ISO, GO:0030170|IE...","[IPR004839, IPR015424, IPR015422, IPR015421]",10090,329470,[Accs],[],"[GO:0003674, GO:0019842, GO:0009058, GO:000815..."
555,1A1L2_MOUSE,Q3UX83; Q3TQ30;,MSENRNEGSSQAAKANSDTQTPSHFKVTHPRLRDQLKKKSSKKKGF...,"[GO:0003824|IEA, GO:0030170|IEA, GO:0009058|IEA]","[IPR004839, IPR015424, IPR015422, IPR015421]",10090,381411,"[Accsl, Gm1967]",[],"[GO:0030170, GO:0005488, GO:0003674, GO:004803..."
653,2A5A_MOUSE,Q6PD03; Q8R1U7;,MSSPSPPAPVACAAISASEKVDGFTRKSVRKAQRQKRSQGSSQFRS...,"[GO:0005813|ISO, GO:0000775|IEA, GO:0005737|IS...","[IPR011989, IPR016024, IPR002554]",10090,226849,[Ppp2r5a],"[GO:0005575, GO:0099080, GO:0044444, GO:003143...","[GO:0044267, GO:0005575, GO:0003674, GO:003530..."
