In [1]:
from Bio import SeqIO
from tqdm.notebook import tqdm as tqdm
import numpy as np
import portpred as pp
import pandas as pd

### How to generate embeddings
#### The next cells show how to generate the embeddings used in the PortPred paper.

In [31]:
# Load the example FASTA file through the PortPred helper. The result is
# iterated with .items() in the embedding loop, so it behaves like a mapping
# (presumably protein ID -> sequence; TODO confirm against pp.load_data).
data = pp.load_data('Transp_eg.fasta')

In [30]:
#pip install --upgrade bio-embeddings[all]
#from bio_embeddings.embed import ESM1bEmbedder,ProtTransBertBFDEmbedder
from bio_embeddings.embed import SeqVecEmbedder,UniRepEmbedder

In [32]:
# Instantiate the SeqVec embedder; the embedding loop calls embedder.embed(...)
# on every sequence.
embedder = SeqVecEmbedder()

In [36]:
def reduce_per_protein(embedding):
    """Collapse an embedding to a single vector.

    The input is summed along its first axis and the result is then averaged
    along its (new) first axis, so a 3-D input yields a 1-D output.

    NOTE(review): presumably the input is laid out as
    (layers, residues, features), as produced by SeqVec -- TODO confirm.
    """
    summed_over_first_axis = embedding.sum(0)
    return summed_over_first_axis.mean(0)

In [33]:
# Embed every entry of `data`, reduce each embedding to one vector with
# reduce_per_protein, and save it with np.save using the key as the file name
# (np.save appends ".npy" when the name does not already end with it).
# NOTE(review): the files are written relative to the current working
# directory, while the loading cell further down reads from 'seqvec/' --
# confirm the notebook is run from (or the files are moved to) that folder.
for k,v in tqdm(data.items()):
    emb=embedder.embed(v)
    emb=reduce_per_protein(emb)
    np.save(k,emb)

  0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
# Load the saved SeqVec embeddings from the 'seqvec/' folder. The helper's
# result is unpacked into two objects (presumably an ID -> vector dict and a
# list of arrays when dictio=True; TODO confirm against pp.load_embedding).
seqvec_dict,seqvec_emb=pp.load_embedding('seqvec','seqvec/',dictio=True)

In [35]:
# Join the loaded embeddings along the first axis (np.concatenate's default).
# NOTE(review): this rebinds seqvec_emb to its own output, so re-running this
# cell without first re-running the loading cell concatenates an array that
# has already been joined.
seqvec_emb=np.concatenate(seqvec_emb)

In [37]:
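# TO GENERATE OTHER EMBEDDINGS, UNCOMMENT THE LINES BELOW
# (ESM1bEmbedder AND ProtTransBertBFDEmbedder ALSO NEED THEIR COMMENTED-OUT IMPORT RESTORED)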
#embedder_esm1b = ESM1bEmbedder()
#embedder_unirep = UniRepEmbedder()
#embedder_protbert = ProtTransBertBFDEmbedder()

## PortPred Tool

In [38]:
## Load the data set: one embedding folder per embedder (ProtBert, ESM-1b,
## SeqVec, UniRep). Each call is unpacked into a (dict, embeddings) pair.
# NOTE(review): the tag passed for ESM-1b is 'esmb1' while the folder name
# uses 'esm1b' -- confirm this is the spelling pp.load_embedding expects.
protbert_dict_val,protbert_emb_val=pp.load_embedding('protbert','pero_trans/pero_trans/pero_trans_fasta_protbert/',dictio=True)
esmb1_dict_val,esmb1_emb_val=pp.load_embedding('esmb1','pero_trans/pero_trans/pero_trans_fasta_esm1b/',dictio=True)
seqvec_dict_val,seqvec_emb_val=pp.load_embedding('seqvec','pero_trans/pero_trans/pero_trans_fasta_seqvec/',dictio=True)
unirep_dict_val,unirep_emb_val=pp.load_unirep_embedding('pero_trans/pero_trans/pero_trans_fasta_unirep/',dictio=True)

In [39]:
# Combine the four per-embedder dictionaries (ProtBert, ESM-1b, SeqVec, UniRep,
# in that order) into one hybrid feature set.
full_dict, full_emb = pp.concatenate_embeddings(
    protbert_dict_val,
    esmb1_dict_val,
    seqvec_dict_val,
    unirep_dict_val,
)

In [43]:
def predict_transporter(X,model='HFE',classification='all',output='PortPred_results.csv'):
    """Score proteins with the PortPred binary classifier and save the results.

    Parameters
    ----------
    X : dict
        Mapping of protein ID -> feature vector. The vectors are turned into
        a DataFrame and restricted to the columns listed in ``pp.col_list_b``
        (presumably the concatenated hybrid embedding; TODO confirm).
    model : str
        Which model to use. Only ``'HFE'`` is implemented; a SeqVec-only
        model ('seqvec_model.sav') was sketched previously but is not enabled.
    classification : str
        Currently unused; kept so existing calls keep working.
    output : str
        Path of the CSV file the result table is written to.

    Returns
    -------
    pandas.DataFrame
        Columns ``protein_ID``, ``probability`` (second column of
        ``predict_proba``) and ``transporter`` (the strings ``'True'`` /
        ``'False'``, thresholded at 0.5).

    Raises
    ------
    ValueError
        If ``model`` is not ``'HFE'``. Previously such calls silently
        returned ``None`` without writing any file.
    """
    if model != 'HFE':
        raise ValueError(
            "Unsupported model %r: only 'HFE' is currently available." % (model,)
        )

    # NOTE(security): with allow_pickle=True, np.load unpickles the file,
    # which can execute arbitrary code. Only load model files from a trusted
    # source.
    m = np.load('binary_classification_LR_hybrid_features_model.sav', allow_pickle=True)

    # One row per protein, in the iteration order of X; keep only the
    # feature columns the model was trained on.
    df = pd.DataFrame(list(X.values()))
    df = df[pp.col_list_b]
    pred = m.predict_proba(df)

    # Rows of `pred` line up with X's keys; column 1 is the probability that
    # is reported.
    df_results = pd.DataFrame({
        'protein_ID': list(X.keys()),
        'probability': [p[1] for p in pred],
    })
    # The label is stored as a string, matching the existing CSV format.
    df_results.loc[df_results['probability'] >= 0.5, 'transporter'] = 'True'
    df_results.loc[df_results['probability'] < 0.5, 'transporter'] = 'False'
    df_results.to_csv(output)
    return df_results
        

In [44]:
# Run the prediction with the default arguments (model='HFE'); besides
# returning the table, this writes it to 'PortPred_results.csv'.
df_results=predict_transporter(full_dict)

In [45]:
# Display the prediction table (protein ID, probability, transporter label).
df_results

Unnamed: 0,protein_ID,probability,transporter
0,Q19951,0.649573,True
1,P18163,0.716822,True
2,P28288,0.997350,True
3,Q54MD1,0.905490,True
4,P38225,0.107356,False
...,...,...,...
162,Q9NP80,0.093685,False
163,Q9QXY9,0.110972,False
164,Q9ET67,0.477149,False
165,P55096,0.995768,True
