In [3]:
# import click as ck
# import numpy as np
#import pandas as pd
#import gzip
# import logging
# from utils import Ontology, is_exp_code, is_cafa_target, FUNC_DICT
# from collections import Counter

import os
import sys
#sys.path.append('.')
import gzip

import click as ck
import numpy as np
import pandas as pd
from collections import Counter, deque
from utils import (
    Ontology, FUNC_DICT, NAMESPACES, MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS,
    CELLULAR_COMPONENT, HAS_FUNCTION,
    is_exp_code, is_cafa_target,)
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
#from deepgo.extract_esm import extract_esm

In [6]:
go = Ontology('go.obo', with_rels=True)

<utils.Ontology at 0x112d322f0>

## Uni2pandas.py

In [3]:
swissprot_file = "data/uniprot_sprot-only2021_04/uniprot_sprot.dat.gz"
#out_file = "plant.pkl"
device = "cpu"

In [25]:
swissprot_file = "data_plant/uniprot_sprot.dat.gz"

In [4]:
def load_data(swissprot_file):
    """
    Parses a gzip-compressed UniProtKB flat file and loads the proteins and
    their annotations into parallel lists (one element per entry).

    Args:
       swissprot_file (string): A path to the gzipped data file
    Returns:
       Tuple of 8 lists (proteins, accessions, sequences, annotations,
       string_ids, orgs, genes, interpros):
         * proteins    - content of the second field of the ID line
         * accessions  - raw content of the entry's AC line(s); several AC
                         lines are joined with a single space
         * sequences   - sequence with all whitespace removed
         * annotations - list of '<GO id>|<evidence code>' strings
         * string_ids  - list of identifiers from 'DR   STRING' lines
         * orgs        - value of NCBI_TaxID from the OX line ('' if absent)
         * genes       - identifier from the last 'DR   GeneID' line ('' if absent)
         * interpros   - list of identifiers from 'DR   InterPro' lines
       An input without any ID line yields eight empty lists.
    """
    proteins = list()
    accessions = list()
    sequences = list()
    annotations = list()
    string_ids = list()
    orgs = list()
    genes = list()
    interpros = list()

    def new_entry(prot_id=''):
        # Fresh per-entry state, so nothing leaks from the previous entry.
        return {'id': prot_id, 'ac': '', 'seq': '', 'org': '', 'gene': '',
                'annots': list(), 'strs': list(), 'iprs': list()}

    def store(entry):
        # Append one finished entry to every output list (keeps them aligned).
        proteins.append(entry['id'])
        accessions.append(entry['ac'])
        sequences.append(entry['seq'])
        annotations.append(entry['annots'])
        string_ids.append(entry['strs'])
        orgs.append(entry['org'])
        genes.append(entry['gene'])
        interpros.append(entry['iprs'])

    tax_prefix = 'NCBI_TaxID='
    entry = new_entry()
    with gzip.open(swissprot_file, 'rt') as f:
        for line in f:
            items = line.strip().split('   ')
            tag = items[0]
            has_value = len(items) > 1
            if tag == 'ID' and has_value:
                # A new ID line closes the previous entry (if there was one)
                if entry['id'] != '':
                    store(entry)
                entry = new_entry(items[1])
            elif tag == 'AC' and has_value:
                # Long accession lists wrap onto several AC lines; keep all of
                # them instead of letting the last line overwrite the first.
                if entry['ac']:
                    entry['ac'] += ' ' + items[1]
                else:
                    entry['ac'] = items[1]
            elif tag == 'OX' and has_value:
                if items[1].startswith(tax_prefix):
                    # The id is followed either by ';' or by ' {evidence};'
                    tax = items[1][len(tax_prefix):]
                    entry['org'] = tax.split(' ', 1)[0].rstrip(';')
                else:
                    entry['org'] = ''
            elif tag == 'DR' and has_value:
                fields = items[1].split('; ')
                db = fields[0]
                if db == 'GO':
                    # Fourth field looks like '<code>:<source>.'; keep the code
                    code = fields[3].split(':')[0]
                    entry['annots'].append(fields[1] + '|' + code)
                elif db == 'STRING':
                    entry['strs'].append(fields[1])
                elif db == 'GeneID':
                    entry['gene'] = fields[1]
                elif db == 'InterPro':
                    entry['iprs'].append(fields[1])
            elif tag == 'SQ':
                # Sequence lines follow until the '//' terminator; a truncated
                # file simply ends the sequence instead of raising StopIteration.
                chunks = []
                for seq_line in f:
                    chunk = seq_line.strip().replace(' ', '')
                    if chunk == '//':
                        break
                    chunks.append(chunk)
                entry['seq'] = ''.join(chunks)

    # Flush the last entry; skip it when the file contained no ID line at all
    if entry['id'] != '':
        store(entry)
    return proteins, accessions, sequences, annotations, string_ids, orgs, genes, interpros

In [26]:
proteins, accessions, sequences, annotations, string_ids, orgs, genes, interpros = load_data(swissprot_file)

In [10]:
len(proteins)

565928

In [None]:
len(proteins)

In [9]:
string_ids[:10]

[[], [], [], [], [], [], [], [], [], []]

In [27]:
len(proteins)

571609

In [52]:
len(accessions)

571609

In [28]:
df = pd.DataFrame({
    'proteins': proteins,
    'accessions': accessions,
    'genes': genes,
    'sequences': sequences,
    'annotations': annotations,
    'string_ids': string_ids,
    'orgs': orgs,
    'interpros': interpros
})

In [29]:
df.shape

(571609, 8)

In [14]:
df.head()

Unnamed: 0,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros
0,001R_FRG3G,Q6GZX4;,2947773,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,[GO:0046782|IEA],[],654924,[IPR007031]
1,002L_FRG3G,Q6GZX3;,2947774,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[GO:0033644|IEA, GO:0016021|IEA]",[],654924,[IPR004251]
2,002R_IIV3,Q197F8;,4156251,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,[],[],345201,[]
3,003L_IIV3,Q197F7;,4156252,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,[],[],345201,[]
4,003R_FRG3G,Q6GZX2;,2947775,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,[],[],654924,[]


In [32]:
df.string_ids

0         []
1         []
2         []
3         []
4         []
          ..
571604    []
571605    []
571606    []
571607    []
571608    []
Name: string_ids, Length: 571609, dtype: object

In [15]:
df[df['genes'] == '842929']

Unnamed: 0,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros
67698,CLE18_ARATH,Q3ECH9;,842929,MHLLKGGVVLIITLILFLITSSIVAIREDPSLIGVDRQIPTGPDPL...,"[GO:0048046|ISS, GO:0005615|IEA, GO:0033612|IP...",[3702.AT1G66145.1],3702,[]


In [9]:
#df.to_pickle("./data_plant/swissprot_full.pkl")
df[df['orgs'] == '3702'].shape

(16386, 8)

In [21]:
df = pd.read_pickle("./data_plant/swissprot_full.pkl")

In [13]:
df[df['orgs'] == '3702'].to_pickle("./data_plant/arab_full.pkl")

In [11]:
pd.concat([df[df['orgs'] == '39947'], df[df['orgs'] == '39946']]).shape

(5059, 8)

In [12]:
pd.concat([df[df['orgs'] == '39947'], df[df['orgs'] == '39946']]).to_pickle("./data_plant/rice_full.pkl")

In [14]:
pd.concat([df[df['orgs'] == '3702'], df[df['orgs'] == '39947'], df[df['orgs'] == '39946']]).to_pickle("./data_plant/plant_full.pkl")

In [59]:
df.shape

(571609, 8)

In [30]:
# Keep only the proteins that have at least one annotation whose evidence
# code is accepted by is_exp_code. `index` collects their row positions in
# `df`; `annotations` collects the accepted GO ids, aligned with `index`.
index, annotations = [], []
for pos, raw_annots in enumerate(df['annotations']):
    pairs = [entry.split('|') for entry in raw_annots]
    kept = [go_id for go_id, code in pairs if is_exp_code(code)]
    # Ignore proteins without experimental annotations
    if not kept:
        continue
    index.append(pos)
    annotations.append(kept)
# reset_index() keeps the original row label as an extra 'index' column
df2 = df.iloc[index].reset_index()

In [10]:
df2.shape

(77647, 9)

In [31]:
df2.shape

(80815, 9)

In [61]:
df2['exp_annotations'] = annotations

In [62]:
# Propagate annotations: for every protein, take the union of the ancestor
# sets returned by the ontology for each of its experimental GO terms.
prop_annotations = []
for exp_terms in df2['exp_annotations']:
    ancestors = set()
    for term in exp_terms:
        ancestors |= go.get_ancestors(term)
    prop_annotations.append(list(ancestors))
df2['prop_annotations'] = prop_annotations

In [63]:
df2.head(3)

Unnamed: 0,index,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros,exp_annotations,prop_annotations
0,242,11K_PAVHV,P0DJZ0;,,MQNNTTGMDTKSLKNCGQPKAVCTHCKHSPPCPQPGCVTKRPPVPP...,[GO:0030430|IDA],[],648237,[],[GO:0030430],"[GO:0005575, GO:0043656, GO:0030430, GO:001899..."
1,243,11S1_CARIL,B5KVH4;,,MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,"[GO:0019863|IEA, GO:0045735|IC, GO:0048316|IEP...",[],32201,"[IPR022379, IPR006044, IPR006045, IPR014710, I...","[GO:0045735, GO:0048316, GO:0010431]","[GO:0008150, GO:0032504, GO:0010154, GO:000300..."
2,245,11S1_MACIN,C0HLR7;,,LSTLNTHNLPLLRLVYLEREDLVAVHVDDLNNQANQLDQKLDGGLL...,"[GO:0019863|IDA, GO:0045735|IEA]",[],60698,[IPR011051],[GO:0019863],"[GO:0044877, GO:0003674, GO:0019865, GO:001986..."


In [12]:
df2[df2['orgs'] == '3702'].shape

(10679, 11)

In [13]:
df2[df2['orgs'] == '39947'].shape[0] + df2[df2['orgs'] == '39946'].shape[0]

1093

In [14]:
arab = df2[df2['orgs'] == '3702']
rice = pd.concat([df2[df2['orgs'] == '39947'], df2[df2['orgs'] == '39946']])

In [15]:
arab.shape

(10679, 11)

In [16]:
rice.shape

(1093, 11)

In [17]:
plant = pd.concat([arab, rice])

In [18]:
plant.shape

(11772, 11)

In [19]:
plant.to_pickle("data_plant/plant_exp.pkl")
rice.to_pickle("data_plant/rice_exp.pkl")
arab.to_pickle("data_plant/arab_exp.pkl")

In [66]:
df2.to_pickle("data/base.pkl")

In [64]:
# Flag each protein by whether is_cafa_target accepts its organism id.
cafa_target = [True if is_cafa_target(org) else False for org in df2['orgs']]
df2['cafa_target'] = cafa_target

In [27]:
os.path.splitext(swissprot_file)[0]

'data_plant/uniprot_sprot.dat'

In [27]:
# Extract ESM2 embeddings for the Arabidopsis subset (`arab`).
# Step 1: write every row of `arab` to a FASTA file (protein id as record id,
#         empty description).
# Step 2: run extract_esm on that file and store the result as column 'esm2'.
# NOTE(review): `extract_esm` is only imported in a commented-out line of the
# import cell, so this cell raises NameError on a fresh kernel unless that
# import is restored.
# NOTE(review): assumes extract_esm returns one embedding per FASTA record, in
# file order, so that the list lines up with the rows of `arab` - TODO confirm.
fasta_file = "data_plant/arab.fa"
with open(fasta_file, 'w') as f:
    for row in arab.itertuples():
        record = SeqRecord(
            Seq(row.sequences),
            id=row.proteins,
            description=''
        )
        SeqIO.write(record, f, 'fasta')
prots, esm2_data = extract_esm(fasta_file, device=device)
esm2_data = list(esm2_data)
# NOTE(review): `arab` was obtained by filtering `df2`; assigning a new column
# to it may emit pandas' SettingWithCopyWarning - consider taking a .copy().
arab['esm2'] = esm2_data

Read data_plant/arab.fa with 10679 sequences
Processing 1 of 1349 batches (58 sequences)
Processing 2 of 1349 batches (52 sequences)
Processing 3 of 1349 batches (48 sequences)
Processing 4 of 1349 batches (44 sequences)
Processing 5 of 1349 batches (42 sequences)
Processing 6 of 1349 batches (40 sequences)
Processing 7 of 1349 batches (39 sequences)
Processing 8 of 1349 batches (37 sequences)
Processing 9 of 1349 batches (36 sequences)
Processing 10 of 1349 batches (35 sequences)
Processing 11 of 1349 batches (34 sequences)
Processing 12 of 1349 batches (33 sequences)
Processing 13 of 1349 batches (32 sequences)
Processing 14 of 1349 batches (32 sequences)
Processing 15 of 1349 batches (31 sequences)
Processing 16 of 1349 batches (30 sequences)
Processing 17 of 1349 batches (30 sequences)
Processing 18 of 1349 batches (29 sequences)
Processing 19 of 1349 batches (29 sequences)
Processing 20 of 1349 batches (28 sequences)
Processing 21 of 1349 batches (28 sequences)
Processing 22 of 13

KeyboardInterrupt: 

In [None]:
# Extract ESM2 embeddings for the rice subset (`rice`): write its sequences to
# a FASTA file, run extract_esm on it and store the result as column 'esm2'.
# NOTE(review): same steps as the Arabidopsis cell; `extract_esm` is only
# imported in a commented-out line of the import cell.
fasta_file = "data_plant/rice.fa"
with open(fasta_file, 'w') as f:
    for row in rice.itertuples():
        record = SeqRecord(
            Seq(row.sequences),
            id=row.proteins,
            description=''
        )
        SeqIO.write(record, f, 'fasta')
prots, esm2_data = extract_esm(fasta_file, device=device)
esm2_data = list(esm2_data)
# NOTE(review): assumes one embedding per row of `rice`, in row order - TODO confirm.
rice['esm2'] = esm2_data

In [None]:
# Extract ESM2 embeddings for the combined set (`plant`): write its sequences
# to a FASTA file, run extract_esm on it and store the result as column 'esm2'.
# NOTE(review): same steps as the Arabidopsis and rice cells; `extract_esm` is
# only imported in a commented-out line of the import cell.
fasta_file = "data_plant/plant.fa"
with open(fasta_file, 'w') as f:
    for row in plant.itertuples():
        record = SeqRecord(
            Seq(row.sequences),
            id=row.proteins,
            description=''
        )
        SeqIO.write(record, f, 'fasta')
prots, esm2_data = extract_esm(fasta_file, device=device)
esm2_data = list(esm2_data)
# NOTE(review): assumes one embedding per row of `plant`, in row order - TODO confirm.
plant['esm2'] = esm2_data

#logging.info('Successfully saved %d proteins' % (len(df),) )

### PPI_data.py

In [None]:
string_db_actions_file = 
data_file = 

In [None]:
#df = pd.read_pickle(data_file)
proteins = df['proteins']
prot_idx = {v: k for k, v in enumerate(proteins)}

In [None]:
mapping = {}
for i, row in enumerate(df.itertuples()):
    for st_id in row.string_ids:
        mapping[st_id] = row.proteins
relations = {}
inters = {}

In [None]:
# Read the gzipped, tab-separated STRING actions file and collect, for every
# protein that can be mapped through `mapping`, the set of
# (relation, partner protein) pairs whose score is at least 700.
# NOTE(review): column meaning (0/1 = STRING ids, 2 = relation, 6 = score) is
# taken from how the columns are used here - confirm against the file header.
with gzip.open(string_db_actions_file, 'rt') as f:
    next(f)  # skip the first line (presumably the column header)
    for line in f:
        it = line.strip().split('\t')
        p1, p2 = it[0], it[1]
        # Both partners must be known proteins
        if p1 not in mapping or p2 not in mapping:
            continue
        score = int(it[6])
        if score < 700:
            continue
        p1, p2 = mapping[p1], mapping[p2]
        rel = it[2]
        # Give every relation name a stable integer id in order of appearance
        if rel not in relations:
            relations[rel] = len(relations)
        inters.setdefault(p1, set()).add((rel, p2))
interactions = []

In [None]:
for i, row in enumerate(df.itertuples()):
    p_id = row.proteins
    if p_id in inters:
        interactions.append(inters[p_id])
    else:
        interactions.append([])

In [None]:
df['interactions'] = interactions

In [None]:
df.to_pickle(data_file)

### Pkl2fasta.py

In [None]:
data_file = 'data/swissprot_exp.pkl'
out_file = 'data/swissprot_exp.fa'
# Load interpro data
df = pd.read_pickle(data_file)
print(len(df)) 
with open(out_file, 'w') as f:
    for row in df.itertuples():
        prot_id = row.proteins
        f.write('>' + prot_id + '\n')
        f.write(row.sequences + '\n')

In [2]:
cc = pd.read_pickle("data/cc/train_data.pkl")

AttributeError: Can't get attribute '_unpickle_block' on <module 'pandas._libs.internals' from '/Users/ngoc/anaconda3/envs/deepgoplus/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-darwin.so'>

### TRY TO GET MORE GO ANNOTATIONS FOR RICE

In [20]:
import os
import sys
#sys.path.append('.')
import gzip

import click as ck
import numpy as np
import pandas as pd
from collections import Counter, deque
from deepgo.utils import (
    Ontology, FUNC_DICT, NAMESPACES, MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS,
    CELLULAR_COMPONENT, HAS_FUNCTION,
    is_cafa_target,)
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
#from deepgo.extract_esm import extract_esm

def is_exp_code(code):
    """Return True when `code` is one of the accepted GO evidence codes.

    Accepted codes: EXP, IDA, IPI, IMP, IGI, IEP, TAS, IC, HTP, HDA, HMP,
    HGI, HEP.
    """
    # A broader selection that is currently disabled additionally accepted:
    #   'ISS', 'ISA', 'ISO', 'IBA', 'IBD', 'IKR', 'IRD'
    accepted = {
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'TAS',
        'IC', 'HTP', 'HDA', 'HMP', 'HGI', 'HEP',
    }
    return code in accepted

In [8]:
go = Ontology('data/go.obo', with_rels=True)

In [28]:
#rice = pd.read_pickle("./data_plant/rice_full.pkl")
plant = pd.read_pickle("./data_plant/arab_full.pkl")

In [29]:
#rice.shape
plant.shape

(16386, 8)

In [30]:
plant.head(3)

Unnamed: 0,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros
283,14310_ARATH,P48347; Q9LME5;,838837,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,"[GO:0005829|HDA, GO:0005739|HDA, GO:0005634|HD...",[3702.P48347],3702,"[IPR000308, IPR023409, IPR036815, IPR023410]"
285,14311_ARATH,Q9S9Z8; A0JQ87; F4HWN0; Q0WL19;,840380,MENERAKQVYLAKLNEQAERYDEMVEAMKKVAALDVELTIEERNLL...,"[GO:0005737|IEA, GO:0005634|HDA, GO:0051117|ID...",[3702.Q9S9Z8],3702,"[IPR000308, IPR023409, IPR036815, IPR023410]"
286,14312_ARATH,Q9C5W6; Q541X6; Q8LEN1; Q9FZD3;,839189,MSSSGSDKERETFVYMAKLSEQAERYDEMVETMKKVARVNSELTVE...,"[GO:0005737|IEA, GO:0005634|HDA, GO:0007165|IEA]",[3702.Q9C5W6],3702,"[IPR000308, IPR023409, IPR036815, IPR023410]"


In [31]:
rice = plant

In [32]:
index = []
annotations = []
for i, row in enumerate(rice.itertuples()):
    annots = []
    for annot in row.annotations:
        go_id, code = annot.split('|')
        if is_exp_code(code):
            annots.append(go_id)
    # Ignore proteins without experimental annotations
    if len(annots) == 0:
        continue
    index.append(i)
    annotations.append(annots)
rice = rice.iloc[index]
rice = rice.reset_index()

In [33]:
rice.shape

(11723, 9)

In [34]:
rice['exp_annotations'] = annotations

In [35]:
prop_annotations = []
for i, row in rice.iterrows():
    # Propagate annotations
    annot_set = set()
    annots = row['exp_annotations']
    for go_id in annots:
        annot_set |= go.get_ancestors(go_id)
    annots = list(annot_set)
    prop_annotations.append(annots)
rice['prop_annotations'] = prop_annotations

#### check obsolete terms

In [36]:
prop_annot_a = rice['prop_annotations'].values
prop_annot_a = list(map(lambda x: set(x), prop_annot_a))

In [37]:
# Drop every term that is not a key of the loaded ontology. Membership is
# tested directly on the dict instead of rebuilding list(go.ont.keys()) for
# each element, which made this cell quadratic in practice.
prop_cleaned_annots_a = [list({elem for elem in annot if elem in go.ont}) for annot in prop_annot_a]

In [38]:
# Same clean-up for the experimental annotations: keep only the terms that
# are keys of the loaded ontology (direct dict membership, no per-element
# list(go.ont.keys()) rebuild).
exp_annot_a = rice['exp_annotations'].values
exp_annot_a = list(map(lambda x: set(x), exp_annot_a))
exp_cleaned_annots_a = [list({elem for elem in annot if elem in go.ont}) for annot in exp_annot_a]

In [39]:
indx = []
for i in range(len(prop_annot_a)):
    uncleaned = sorted(list(prop_annot_a[i]))
    cleaned = sorted(prop_cleaned_annots_a[i])
    if uncleaned != cleaned:
        indx.append(i)
    else:
        continue
len(indx)

0

In [40]:
rice['prop_annotations'] = prop_cleaned_annots_a
rice['exp_annotations'] = exp_cleaned_annots_a

In [41]:
rice.shape

(11723, 11)

In [42]:
rice[rice.duplicated(subset='sequences')]

Unnamed: 0,index,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros,exp_annotations,prop_annotations
303,9170,ACT3_ARATH,P0CJ47; P10671; P53493; Q9M351;,824542,MADGEDIQPLVCDNGTGMVKAGFAGDDAPRAVFPSIVGRPRHTGVM...,"[GO:0005856|IEA, GO:0005829|HDA, GO:0005634|HD...",[3702.P0CJ47],3702,"[IPR004000, IPR020902, IPR004001, IPR043129]","[GO:0005200, GO:0009505, GO:0009506, GO:000582...","[GO:0005575, GO:0008150, GO:0009506, GO:001602..."
772,21945,ARF2B_ARATH,P0DH91; Q9S9K6; Q9SGY6; Q9SRC3;,843385,MGLSFAKLFSRLFAKKEMRILMVGLDAAGKTTILYKLKLGEIVTTI...,"[GO:0005794|HDA, GO:0005739|HDA, GO:0000325|HD...",[3702.P0DH91],3702,"[IPR045872, IPR027417, IPR005225, IPR006689]","[GO:0005525, GO:0005794, GO:0016004, GO:000372...","[GO:0005575, GO:0017076, GO:0008150, GO:001600..."
1015,33742,ATP6B_ARATH,P0DO45; A0A178V8C6; Q9SN96;,836081,MRLFDPWPVFFKREWKRCWPFLTGFAVTGVLITKLTAGLTEEDAKN...,"[GO:0005743|IEA, GO:0045263|IEA, GO:1902600|IE...",[],3702,[],"[GO:1902074, GO:0009409, GO:0009414]","[GO:0009266, GO:1901700, GO:0008150, GO:000110..."
1690,52207,CALM3_ARATH,P0DH98; P25069;,824847,MADQLTDDQISEFKEAFSLFDKDGDGCITTKELGTVMRSLGQNPTE...,"[GO:0005737|IDA, GO:0005829|HDA, GO:0000325|HD...",[3702.P0DH98],3702,"[IPR011992, IPR018247, IPR002048]","[GO:0000325, GO:0005829, GO:0005737, GO:0019722]","[GO:0005575, GO:0008150, GO:0050789, GO:001972..."
1778,55573,CB1B_ARATH,Q8VZ87; P04777; P83754;,839870,MAASTMALSSPAFAGKAVNLSPAASEVLGSGRVTMRKTVAKPKGPS...,"[GO:0009507|HDA, GO:0009941|IEA, GO:0009534|HD...",[3702.Q8VZ87],3702,"[IPR001344, IPR022796]","[GO:0009507, GO:0009535, GO:0005634, GO:001616...","[GO:0031090, GO:0005575, GO:0009507, GO:000952..."
3178,115599,EF1A2_ARATH,Q8W4H7; B9DGN1; P13905; Q8VZE8; Q94AD0; Q9ASU9...,837309,MGKEKFHINIVVIGHVDSGKSTTTGHLIYKLGGIDKRVIERFEKEA...,"[GO:0005739|HDA, GO:0009506|HDA, GO:0005773|HD...",[3702.Q8W4H7],3702,"[IPR004161, IPR031157, IPR027417, IPR000795, I...","[GO:0005739, GO:0003729, GO:0009506, GO:0005773]","[GO:0005575, GO:0009506, GO:0016020, GO:011016..."
3179,115616,EF1A3_ARATH,Q0WL56; A8MSE8; P13905; Q0WSD5;,837309,MGKEKFHINIVVIGHVDSGKSTTTGHLIYKLGGIDKRVIERFEKEA...,"[GO:0009507|HDA, GO:0005829|HDA, GO:0005739|HD...",[3702.Q0WL56],3702,"[IPR004161, IPR031157, IPR027417, IPR000795, I...","[GO:0005730, GO:0009507, GO:0009506, GO:000582...","[GO:0005575, GO:0009507, GO:0009506, GO:001602..."
3180,115622,EF1A4_ARATH,Q8GTY0; P13905; Q0WSD5; Q39093; Q9C5L4;,837309,MGKEKFHINIVVIGHVDSGKSTTTGHLIYKLGGIDKRVIERFEKEA...,"[GO:0005737|HDA, GO:0005739|HDA, GO:0005634|HD...",[3702.Q8GTY0],3702,"[IPR004161, IPR031157, IPR027417, IPR000795, I...","[GO:0005737, GO:0009506, GO:0005634, GO:000372...","[GO:0005575, GO:0009506, GO:0016020, GO:007194..."
3728,139645,FLZ18_ARATH,P0DO12; Q8GWT6;,841826,MTKISVGLQLVTRDSREKLNNIVIKSSLRLNRSNPNISELCFLKTC...,"[GO:0005737|IDA, GO:0005634|IDA, GO:0046872|IE...",[],3702,"[IPR044181, IPR007650]","[GO:0005737, GO:0005634, GO:0009749, GO:190558...","[GO:0005575, GO:0008150, GO:0016020, GO:011016..."
4637,180725,HIS6B_ARATH,P0DI07; Q6S4C1; Q6S4D6; Q949X3; Q9LD56;,843523,MGVINVQGSPSFSIHSSESNLRKSRALKKPFCSIRNRVYCAQSSSA...,"[GO:0009570|HDA, GO:0004400|IGI, GO:0030170|IE...",[3702.P0DI07],3702,"[IPR004839, IPR005861, IPR015424, IPR015421, I...","[GO:0004400, GO:0000105, GO:0009570]","[GO:0044283, GO:0005575, GO:0006082, GO:000950..."


### Remove duplicated sequences (because of memory issues)

In [43]:
rice_dupp = rice.drop_duplicates(subset='sequences', keep='first')
rice_dupp.reset_index(drop=True, inplace=True)

In [44]:
rice_dupp.shape

(11697, 11)

In [45]:
rice_dupp.to_pickle("data_plant/arab_pls_dup.pkl")

In [2]:
go_rels = Ontology('./data/fixed4/go.obo', with_rels=True)

### 19 June 2024 - Remove duplicates and obsolete terms from the benchmark dataset (control)

In [52]:
train = pd.read_pickle('/Volumes/T7/deepgoplus/0624github/control/train_data.pkl')
test = pd.read_pickle('/Volumes/T7/deepgoplus/0624github/control/test_data.pkl')

In [5]:
prop_annotations_a = train['prop_annotations'].values
prop_annotations_a = list(map(lambda x: set(x), prop_annotations_a))
prop_test_annotations_a = test['prop_annotations'].values
prop_test_annotations_a = list(map(lambda x: set(x), prop_test_annotations_a))

exp_annotations_a = train['exp_annotations'].values
exp_annotations_a = list(map(lambda x: set(x), exp_annotations_a))
exp_test_annotations_a = test['exp_annotations'].values
exp_test_annotations_a = list(map(lambda x: set(x), exp_test_annotations_a))

In [6]:
# Keep only the terms that are keys of the reference ontology `go_rels`.
# Membership is tested on the dict itself; the previous
# `elem in list(go_rels.ont.keys())` rebuilt a list of every ontology id for
# each single element of each annotation set.
known_terms = go_rels.ont
prop_cleaned_annots_a = [list({elem for elem in annot if elem in known_terms}) for annot in prop_annotations_a]
prop_cleaned_test_annots_a = [list({elem for elem in annot if elem in known_terms}) for annot in prop_test_annotations_a]
exp_cleaned_annots_a = [list({elem for elem in annot if elem in known_terms}) for annot in exp_annotations_a]
exp_cleaned_test_annots_a = [list({elem for elem in annot if elem in known_terms}) for annot in exp_test_annotations_a]

In [8]:
indx = []
for i in range(len(prop_annotations_a)):
    uncleaned = sorted(list(prop_annotations_a[i]))
    cleaned = sorted(prop_cleaned_annots_a[i])
    if uncleaned != cleaned:
        indx.append(i)
    else:
        continue

In [9]:
len(indx)

13291

In [10]:
removed_all = []
for i in indx:
    uncleaned = sorted(list(prop_annotations_a[i]))
    cleaned = sorted(prop_cleaned_annots_a[i])
    removed = list(set(uncleaned) - set(cleaned))
    removed_all.append(removed)
removed_all_flatten = [item for sublist in removed_all for item in sublist]
# Count the occurrences of each element
removed_all_flatten_counts = Counter(removed_all_flatten)

In [11]:
removed_all_flatten_counts

Counter({'GO:0141187': 8146,
         'GO:0141188': 1811,
         'GO:0170039': 723,
         'GO:0170035': 225,
         'GO:0170033': 764,
         'GO:0170040': 216,
         'GO:0141124': 3106,
         'GO:0170055': 84,
         'GO:0141193': 144,
         'GO:0170041': 255,
         'GO:0170038': 345,
         'GO:0170034': 351,
         'GO:0141137': 153,
         'GO:0170036': 70,
         'GO:0141154': 4,
         'GO:0141194': 58,
         'GO:0170044': 62,
         'GO:0170043': 63,
         'GO:0180039': 7,
         'GO:0180028': 5,
         'GO:0160128': 10,
         'GO:0141133': 1,
         'GO:0141147': 18,
         'GO:0141189': 1,
         'GO:0141118': 3,
         'GO:0180027': 4,
         'GO:0180030': 40,
         'GO:0170037': 8,
         'GO:0160133': 3,
         'GO:0160127': 1,
         'GO:0141152': 3,
         'GO:0160144': 3,
         'GO:0180031': 1,
         'GO:0170049': 10,
         'GO:0141119': 3,
         'GO:0141142': 6,
         'GO:0170046': 1,
  

In [13]:
train_df_cleaned_go_a = train.drop('prop_annotations', axis=1)
train_df_cleaned_go_a = train_df_cleaned_go_a.drop('exp_annotations', axis=1)
test_df_cleaned_go_a = test.drop('prop_annotations', axis=1)
test_df_cleaned_go_a = test_df_cleaned_go_a.drop('exp_annotations', axis=1)

In [14]:
train_df_cleaned_go_a['prop_annotations'] = prop_cleaned_annots_a
train_df_cleaned_go_a['exp_annotations'] = exp_cleaned_annots_a
test_df_cleaned_go_a['prop_annotations'] = prop_cleaned_test_annots_a
test_df_cleaned_go_a['exp_annotations'] = exp_cleaned_test_annots_a

In [15]:
train_df_cleaned_go_a.shape

(76774, 10)

In [34]:
train_df_cleaned_go_a['sequences'].nunique()

75693

In [17]:
test_df_cleaned_go_a.shape

(4041, 10)

In [18]:
test_df_cleaned_go_a['sequences'].nunique()

4039

In [3]:
#train_df_cleaned_go_a.to_pickle("/Volumes/T7/deepgoplus/0624github/control/train_cleaned.pkl")
#test_df_cleaned_go_a.to_pickle("/Volumes/T7/deepgoplus/0624github/control/test_cleaned.pkl")
train_df_cleaned_go_a = pd.read_pickle("/Volumes/T7/deepgoplus/0624github/control/train_cleaned.pkl")
test_df_cleaned_go_a = pd.read_pickle("/Volumes/T7/deepgoplus/0624github/control/test_cleaned.pkl")

In [61]:
# Count, over train + test, how many proteins carry each propagated term.
# `all_cleaned` is built here so the cell also works on a fresh kernel
# (it used to exist only as leftover kernel state).
all_cleaned = pd.concat([train_df_cleaned_go_a, test_df_cleaned_go_a])
min_count = 50  # a term is kept when at least this many proteins carry it
cnt = Counter()
for prop_terms in all_cleaned['prop_annotations']:
    cnt.update(prop_terms)

# Keep terms with at least min_count annotations, grouped by the identifier
# prefix before ':'
res = {}
for key, val in cnt.items():
    if val >= min_count:
        res.setdefault(key.split(':')[0], []).append(key)
terms = []
for key, val in res.items():
    print(key, len(val))
    terms += val

# Save the list of terms
terms_df = pd.DataFrame({'terms': terms})
# NOTE(review): the file name below says "_20" while the threshold used in
# this cell is 50 - confirm before re-enabling the write.
#terms_df.to_pickle('/Volumes/T7/deepgoplus/0624github/control/terms_cleaned_20.pkl')

GO 5725


### Confirming that duplicated sequences between species are real: the same protein in different species has the same propagated annotations

In [4]:
train_df_cleaned_go_a[train_df_cleaned_go_a.duplicated(subset='sequences')]

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,cafa_target,prop_annotations,exp_annotations
60061,411833,RR7_ORYNI,Q6ENC1;,MSRRGTAEKRTAKSDPIFRNRLVNMVVNRIMKDGKKSLAYQILYRA...,"[GO:0009507|IEA, GO:0009536|IC, GO:0015935|IEA...","[IPR000235, IPR005717, IPR020606, IPR023798, I...",4536,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
49891,310684,PETN_ORYSI,P0C396;,MDIVSLAWAALMVVFTFSLSLVVWGRSGL,"[GO:0009535|IEA, GO:0009512|IEA, GO:0009536|IC...","[IPR036143, IPR005497]",39946,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
72408,498522,TRP6_ONCFA,P86581;,APSMGFMGMR,"[GO:0005576|HDA, GO:0007218|IEA]",[],7536,False,"[GO:0110165, GO:0005576, GO:0005575]",[GO:0005576]
55741,350303,RAB3A_RAT,P63012; P05713;,MASATDSRYGQKESSDQNFDYMFKILIIGNSSVGKTSFLFRYADDS...,"[GO:0001669|ISO, GO:0030424|IDA, GO:0005829|IE...","[IPR027417, IPR037872, IPR005225, IPR001806]",10116,True,"[GO:0023052, GO:0006996, GO:0030659, GO:003256...","[GO:0032482, GO:0014059, GO:0030672, GO:003042..."
9298,52208,CALM3_HUMAN,Q96HK3;,MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTE...,"[GO:0034704|IDA, GO:1902494|IDA, GO:0005813|ID...","[IPR011992, IPR018247, IPR002048]",9606,True,"[GO:0050790, GO:0006811, GO:0015085, GO:003240...","[GO:0010881, GO:0035307, GO:0000922, GO:000551..."
...,...,...,...,...,...,...,...,...,...,...
4587,21976,ARF6_MOUSE,P62331; P26438;,MGKVLSKIFGNKEMRILMLGLDAAGKTTILYKLKLGQSVTTIPTVG...,"[GO:0005938|IEA, GO:0032154|ISS, GO:0005737|ID...","[IPR041838, IPR027417, IPR005225, IPR006689]",10090,True,"[GO:0120180, GO:0048869, GO:0032561, GO:000576...","[GO:0005768, GO:0051549, GO:0010975, GO:009016..."
72369,498467,TRP1_NEZVI,P86575;,GPSGFLGMR,"[GO:0005576|HDA, GO:0007218|IEA]",[],85310,False,"[GO:0110165, GO:0005576, GO:0005575]",[GO:0005576]
16172,88604,CYF_ORYSJ,P0C389; P07888; Q6QXT8; Q6QY65;,MENRNTFSWVKEQMTRSISVSIMIYVITRTSISNAYPIFAQQGYEN...,"[GO:0009535|IEA, GO:0009536|IC, GO:0009055|IEA...","[IPR024058, IPR002325, IPR024094, IPR036826, I...",39947,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
58107,367661,RK33_ORYSI,P0C456; P12141; Q6QY61; Q6Z503;,MAKGKDVRIRVILQCVSCVRKGANEESAGISRYSTQKNRHNTPGQL...,"[GO:0009507|IEA, GO:0009536|IC, GO:1990904|IEA...","[IPR001705, IPR018264, IPR038584, IPR011332]",39946,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]


In [36]:
train_df_cleaned_go_a[train_df_cleaned_go_a['sequences'] == 'MDIVSLAWAALMVVFTFSLSLVVWGRSGL']

Unnamed: 0,index,proteins,accessions,sequences,annotations,interpros,orgs,cafa_target,prop_annotations,exp_annotations
49892,310685,PETN_ORYSJ,P61042; P12178; P56789;,MDIVSLAWAALMVVFTFSLSLVVWGRSGL,"[GO:0009535|IEA, GO:0009512|IEA, GO:0009536|IC...","[IPR036143, IPR005497]",39947,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
49891,310684,PETN_ORYSI,P0C396;,MDIVSLAWAALMVVFTFSLSLVVWGRSGL,"[GO:0009535|IEA, GO:0009512|IEA, GO:0009536|IC...","[IPR036143, IPR005497]",39946,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
49889,310682,PETN_ORYNI,Q6ENI5;,MDIVSLAWAALMVVFTFSLSLVVWGRSGL,"[GO:0009535|IEA, GO:0009512|IEA, GO:0009536|IC...","[IPR036143, IPR005497]",4536,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]
49890,310683,PETN_ORYSA,P0C395;,MDIVSLAWAALMVVFTFSLSLVVWGRSGL,"[GO:0009535|IEA, GO:0009512|IEA, GO:0009536|IC...","[IPR036143, IPR005497]",4530,False,"[GO:0016020, GO:0005575, GO:0043227, GO:004323...",[GO:0009536]


In [47]:
len(set(tuple(x) for x in list(train_df_cleaned_go_a[train_df_cleaned_go_a['sequences'] == 'MDIVSLAWAALMVVFTFSLSLVVWGRSGL']['prop_annotations'])))

1

In [43]:
list(train_df_cleaned_go_a[train_df_cleaned_go_a['sequences'] == 'MDIVSLAWAALMVVFTFSLSLVVWGRSGL']['prop_annotations'])[1]

['GO:0016020',
 'GO:0005575',
 'GO:0043227',
 'GO:0043231',
 'GO:0005622',
 'GO:0043226',
 'GO:0005737',
 'GO:0043229',
 'GO:0009536',
 'GO:0110165']

In [6]:
dup = list(train_df_cleaned_go_a[train_df_cleaned_go_a.duplicated(subset='sequences')]['sequences'])

In [7]:
# Collect every duplicated sequence whose copies do NOT all share the same
# set of propagated annotations.
# The previous loop ignored its loop variable and always tested one
# hard-coded sequence, so it could never detect a mismatch.
# Annotation lists are compared as sets, so a different element order alone
# does not count as a difference.
dup_rows = train_df_cleaned_go_a[train_df_cleaned_go_a['sequences'].isin(dup)]
dup2 = [
    seq
    for seq, props in dup_rows.groupby('sequences', sort=False)['prop_annotations']
    if len({frozenset(p) for p in props}) > 1
]

In [8]:
dup2

[]

In [41]:
train_df_cleaned_go_a_dupp = train_df_cleaned_go_a.drop_duplicates(subset='sequences', keep='first')
train_df_cleaned_go_a_dupp.reset_index(drop=True, inplace=True)

In [42]:
test_df_cleaned_go_a_dupp = test_df_cleaned_go_a.drop_duplicates(subset='sequences', keep='first')
test_df_cleaned_go_a_dupp.reset_index(drop=True, inplace=True)

In [43]:
print(train_df_cleaned_go_a.shape)
print(train_df_cleaned_go_a_dupp.shape)

(76774, 10)
(75693, 10)


In [44]:
print(test_df_cleaned_go_a.shape)
print(test_df_cleaned_go_a_dupp.shape)

(4041, 10)
(4039, 10)


In [45]:
train_df_cleaned_go_a_dupp.to_pickle("/Volumes/T7/deepgoplus/0624github/control/train_cleaned_dupp.pkl")
test_df_cleaned_go_a_dupp.to_pickle("/Volumes/T7/deepgoplus/0624github/control/test_cleaned_dupp.pkl")

In [62]:
# Count, over the deduplicated train + test sets, how many proteins carry
# each propagated term. `all_cleaned_dupp` is built here so the cell also
# works on a fresh kernel (it used to exist only as leftover kernel state).
all_cleaned_dupp = pd.concat([train_df_cleaned_go_a_dupp, test_df_cleaned_go_a_dupp])
min_count = 10  # a term is kept when at least this many proteins carry it
cnt = Counter()
for prop_terms in all_cleaned_dupp['prop_annotations']:
    cnt.update(prop_terms)

# Keep terms with at least min_count annotations, grouped by the identifier
# prefix before ':'
res = {}
for key, val in cnt.items():
    if val >= min_count:
        res.setdefault(key.split(':')[0], []).append(key)
terms = []
for key, val in res.items():
    print(key, len(val))
    terms += val

# Save the list of terms
terms_df = pd.DataFrame({'terms': terms})
terms_df.to_pickle('/Volumes/T7/deepgoplus/0624github/control/terms_cleaned_dupp_10.pkl')

GO 13617


### Now checking for obsolete GO terms

In [48]:
terms = pd.read_pickle("/Volumes/T7/deepgoplus/0624github/control/terms.pkl")

In [50]:
len(list(terms['terms']))

5742

In [30]:
# Split the benchmark terms into those flagged obsolete in `go_rels` (`obs`)
# and those that are not present in it at all (`not_found`).
obs = []
not_found = []
for t in terms['terms']:
    # Direct dict membership; no list of all ontology ids is rebuilt per term
    if t in go_rels.ont:
        if go_rels.get_term(t)['is_obsolete']:
            obs.append(t)
    else:
        not_found.append(t)

In [27]:
go_rels.get_term('GO:0033643')['is_obsolete'] == True

False

In [29]:
go_rels.ont

{'GO:0000001': {'is_a': ['GO:0048308', 'GO:0048311'],
  'part_of': [],
  'regulates': [],
  'alt_ids': [],
  'is_obsolete': False,
  'id': 'GO:0000001',
  'name': 'mitochondrion inheritance',
  'namespace': 'biological_process',
  'children': set()},
 'GO:0000002': {'is_a': ['GO:0007005'],
  'part_of': [],
  'regulates': [],
  'alt_ids': [],
  'is_obsolete': False,
  'id': 'GO:0000002',
  'name': 'mitochondrial genome maintenance',
  'namespace': 'biological_process',
  'children': {'GO:0032042'}},
 'GO:0000003': {'is_a': ['GO:0008150'],
  'part_of': [],
  'regulates': [],
  'alt_ids': ['GO:0019952', 'GO:0050876'],
  'is_obsolete': False,
  'id': 'GO:0000003',
  'name': 'reproduction',
  'namespace': 'biological_process',
  'children': {'GO:0019953',
   'GO:0019954',
   'GO:0022414',
   'GO:0032504',
   'GO:0032505',
   'GO:0044702',
   'GO:0075325',
   'GO:0075326',
   'GO:0075327',
   'GO:1990277'}},
 'GO:0000006': {'is_a': ['GO:0005385'],
  'part_of': [],
  'regulates': [],
  'alt_i

In [31]:
obs

[]

In [37]:
sorted(not_found)

['GO:0141124',
 'GO:0141137',
 'GO:0141187',
 'GO:0141188',
 'GO:0141193',
 'GO:0141194',
 'GO:0170033',
 'GO:0170034',
 'GO:0170035',
 'GO:0170036',
 'GO:0170038',
 'GO:0170039',
 'GO:0170040',
 'GO:0170041',
 'GO:0170043',
 'GO:0170044',
 'GO:0170055']

In [35]:
removed_all_flatten_counts = {'GO:0141187': 8146, 'GO:0141188': 1811,'GO:0170039': 723, 'GO:0170035': 225, 'GO:0170033': 764, 'GO:0170040': 216, 'GO:0141124': 3106, 'GO:0170055': 84, 'GO:0141193': 144, 'GO:0170041': 255, 'GO:0170038': 345, 'GO:0170034': 351, 'GO:0141137': 153, 'GO:0170036': 70, 'GO:0141154': 4, 'GO:0141194': 58, 'GO:0170044': 62, 'GO:0170043': 63, 'GO:0180039': 7, 'GO:0180028': 5, 'GO:0160128': 10, 'GO:0141133': 1, 'GO:0141147': 18, 'GO:0141189': 1, 'GO:0141118': 3, 'GO:0180027': 4, 'GO:0180030': 40, 'GO:0170037': 8, 'GO:0160133': 3, 'GO:0160127': 1, 'GO:0141152': 3, 'GO:0160144': 3, 'GO:0180031': 1, 'GO:0170049': 10, 'GO:0141119': 3, 'GO:0141142': 6, 'GO:0170046': 1, 'GO:0170045': 1, 'GO:0170053': 4, 'GO:0160135': 1, 'GO:0170054': 3, 'GO:0141177': 2, 'GO:0160134': 1, 'GO:0170047': 3, 'GO:0160129': 1, 'GO:0141125': 2, 'GO:0141157': 2, 'GO:0141171': 1, 'GO:0141165': 2}

In [40]:
sorted(list(removed_all_flatten_counts.keys()))

['GO:0141118',
 'GO:0141119',
 'GO:0141124',
 'GO:0141125',
 'GO:0141133',
 'GO:0141137',
 'GO:0141142',
 'GO:0141147',
 'GO:0141152',
 'GO:0141154',
 'GO:0141157',
 'GO:0141165',
 'GO:0141171',
 'GO:0141177',
 'GO:0141187',
 'GO:0141188',
 'GO:0141189',
 'GO:0141193',
 'GO:0141194',
 'GO:0160127',
 'GO:0160128',
 'GO:0160129',
 'GO:0160133',
 'GO:0160134',
 'GO:0160135',
 'GO:0160144',
 'GO:0170033',
 'GO:0170034',
 'GO:0170035',
 'GO:0170036',
 'GO:0170037',
 'GO:0170038',
 'GO:0170039',
 'GO:0170040',
 'GO:0170041',
 'GO:0170043',
 'GO:0170044',
 'GO:0170045',
 'GO:0170046',
 'GO:0170047',
 'GO:0170049',
 'GO:0170053',
 'GO:0170054',
 'GO:0170055',
 'GO:0180027',
 'GO:0180028',
 'GO:0180030',
 'GO:0180031',
 'GO:0180039']

In [65]:
go_rels.get_term('GO:0141188') is None

True

In [None]:
go_rels

In [None]:
go_rels.