In [1]:
import os
import shutil
import logging
import requests
import numpy as np
import torch
import json
import pandas as pd
from torch.utils.data import DataLoader
import warnings
# warnings.filterwarnings("ignore")

from deepec.process_data import read_EC_actual_Fasta
from deepec.data_loader import DeepECDataset
from deepec.utils import argument_parser, run_neural_net, save_dl_result
from deepec.homology import run_blastp, read_best_blast_result, merge_predictions

  from .autonotebook import tqdm as notebook_tqdm


### input

In [2]:
# Input: base name of the FASTA file (without extension) located under example/
file_name = 'Saccharomyces_cerevisiae'
target_protein_fasta_file = f'example/{file_name}.fasta'

# Output: file name under which the prediction table is saved
deepprozyme_result_file = 'Sce_DeepECv2.txt'

## predict

### Prerequisite: `pip install protobuf==3.19.6`

In [3]:
def deepprozyme(target_protein_fasta_file, deepprozyme_result_file):
    """Run the pickled DeepEC model on a protein FASTA file and save the result table.

    The model is loaded from ``./model/model.pth`` and evaluated on CPU over
    the sequences read from ``target_protein_fasta_file``. The helper
    ``save_dl_result`` is given ``./example/results/tmp`` as its output
    directory; the ``DL_prediction_result.txt`` file found there afterwards is
    copied to ``./example/results/<deepprozyme_result_file>``.

    Parameters
    ----------
    target_protein_fasta_file : str
        Path to the input protein FASTA file.
    deepprozyme_result_file : str
        File name for the saved prediction table. It is joined onto
        ``./example/results``; an absolute path is used as given.

    Returns
    -------
    None
        The prediction table is written to disk; nothing is returned.
    """
    # Set the root logger to INFO level.
    logging.getLogger().setLevel(logging.INFO)

    output_dir = './example/results'
    tmp_dir = os.path.join(output_dir, 'tmp')
    device = 'cpu'  # passed as a string, hence the quotes
    batch_size = 128
    num_cpu = 2
    torch.set_num_threads(num_cpu)

    # Creates output_dir as well, since tmp_dir is nested inside it.
    os.makedirs(tmp_dir, exist_ok=True)

    # map_location keeps a checkpoint that was saved on GPU loadable on a
    # CPU-only machine.
    # NOTE(review): torch.load unpickles the whole model object; only load
    # checkpoints from a trusted source.
    model = torch.load('./model/model.pth', map_location=device)
    model = model.to(device)

    # Both attributes are read from the loaded model object.
    explainECs = model.explainECs
    pred_thrd = model.thresholds

    input_seqs, input_ids = read_EC_actual_Fasta(target_protein_fasta_file)
    # The dataset is constructed with labels even in prediction mode
    # (pred=True), so all-zero placeholders are supplied.
    pseudo_labels = np.zeros((len(input_seqs)))

    proteinDataset = DeepECDataset(data_X=input_seqs, data_Y=pseudo_labels, explainECs=explainECs, pred=True)
    proteinDataloader = DataLoader(proteinDataset, batch_size=batch_size, shuffle=False)

    y_pred, y_score = run_neural_net(model, proteinDataloader, pred_thrd, device=device)
    save_dl_result(y_pred, y_score, input_ids, explainECs, tmp_dir)

    # Copy straight to the final name inside output_dir. Joining the path
    # (rather than concatenating strings) keeps the file inside the results
    # directory instead of next to it.
    result_path = os.path.join(output_dir, deepprozyme_result_file)
    shutil.copy(os.path.join(tmp_dir, 'DL_prediction_result.txt'), result_path)

In [4]:
# Run the prediction with the input/output settings defined in the config cell.
deepprozyme(
    target_protein_fasta_file=target_protein_fasta_file,
    deepprozyme_result_file=deepprozyme_result_file,
)

INFO:root:Deep leanrning prediction starts on the dataset
 96%|█████████▌| 45/47 [09:34<00:25, 12.77s/it]