In [2]:
import pandas as pd
from functools import partial
from multiprocessing import Pool, cpu_count



def fasta_reader(file):
    '''Read a FASTA file into a pandas DataFrame.

    The file is cut into records on '>'. The first line of each record is
    stored in column 'Accession'; the remaining lines are joined (newlines
    removed), upper-cased and stored in column 'Protein'. Records whose
    header or sequence is missing, or whose sequence is empty, are dropped
    and a warning reports how many were removed.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path, buffer, ...).

    Returns
    -------
    pandas.DataFrame
        Columns 'Accession' and 'Protein' (accession is a regular column,
        not the index). Rows keep the index assigned by ``read_csv``.
    '''
    # Using '>' as both separator and line terminator makes every FASTA
    # record (header + sequence lines) a single cell in column 0.
    records = pd.read_csv(file, sep='>', lineterminator='>', header=None)
    total_seq = records.shape[0]

    # `n` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    # reindex guarantees both columns exist even if no record has a newline.
    fasta_df = (records[0].str.split('\n', n=1, expand=True)
                .reindex(columns=[0, 1])
                .rename(columns={0: 'Accession', 1: 'Protein'})
                .dropna())  # drop missing values before they are stringified

    # NOTE(review): Series.replace('U', 'T') only rewrites a value that is
    # exactly 'U'; it does not substitute U inside a sequence. Kept as in
    # the original - confirm whether a substring replacement was intended.
    cleaned = fasta_df['Protein'].replace('\n', '', regex=True).\
        astype(str).str.upper().replace('U', 'T')
    final_df = fasta_df.assign(Protein=cleaned)
    final_df = final_df[final_df['Protein'] != '']

    remained_seq = final_df.shape[0]
    if total_seq != remained_seq:
        warnings.warn("{} Proteins were removed due to inconsistencies in "
                      "provided file.".format(total_seq-remained_seq))
    return final_df

def progress(iteration, total, message=None):
    '''Draw a one-line, 50-character text progress bar on stdout.

    iteration : number of items finished so far.
    total     : number of items expected overall.
    message   : optional text appended after the counters.

    The line is rewritten in place using carriage returns; once
    ``iteration == total`` a final "Completed!" line is printed.
    '''
    label = '' if message is None else message
    fraction = float(iteration) / float(total)
    filled = int(fraction * 50.)
    bar = '█' * filled + "░" * (50 - filled)
    line = "\r|%-50s| %d%% (%s/%s) %s " % (bar, fraction * 100, iteration,
                                          total, label)
    print(line, end='\r', flush=True)

    if iteration == total:
        print('\nCompleted!')

def predict(obj):
    '''Run the three predictions of ``obj`` and concatenate their results.

    ``obj.predict()`` is always called first. ``obj.fungi()`` and
    ``obj.toxin()`` are then attempted in that order; if either raises an
    Exception its contribution becomes ['Not a Signal peptide'].

    Returns ``obj.predict() + fungi_part + toxin_part``.
    '''
    base = obj.predict()
    extras = []
    for method_name in ('fungi', 'toxin'):
        try:
            extras.append(getattr(obj, method_name)())
        except Exception:
            extras.append(['Not a Signal peptide'])
    fungi_part, toxin_part = extras
    return base + fungi_part + toxin_part


import time
import warnings
# Silence every warning for the rest of the session. Note that this also
# hides the "Proteins were removed ..." message that fasta_reader emits
# through warnings.warn.
warnings.filterwarnings("ignore")

from razor import RAZOR

In [4]:
# Load the reviewed UniProt FASTA, then keep only the proteins whose first
# 75 residues contain none of the letters B, J, O, X or Z.
rev_df_ = fasta_reader('../../Signal_Manuscript/Uniprot_all/uniprot-reviewed_yes.fasta.gz')
rev_df = rev_df_.loc[
    ~rev_df_['Protein'].str.slice(0, 75).str.contains('B|J|O|X|Z')
].copy()

In [5]:
# One RAZOR object per protein sequence. 60 is forwarded as RAZOR's second
# positional argument (NOTE(review): its meaning is defined in the razor
# package, not visible here - confirm).
seqs_obj = [RAZOR(sequence, 60) for sequence in rev_df['Protein']]

In [6]:



# Score every RAZOR object in parallel worker processes and time the run.
start_time = time.time()
# cpu_count() // 2 evaluates to 0 on a single-core machine and Pool(0)
# raises ValueError, so always request at least one worker.
pools = Pool(max(1, cpu_count() // 2))
results = []
try:
    # imap yields results in input order, so results[i] belongs to
    # seqs_obj[i].
    for res in pools.imap(predict, seqs_obj):
        results.append(res)
        progress(len(results), len(seqs_obj))
    pools.close()
except BaseException:
    # On any failure or interrupt stop the workers immediately instead of
    # leaving them running in the background.
    pools.terminate()
    raise
finally:
    pools.join()
# Sample the clock once so the reported duration and throughput agree.
elapsed = time.time() - start_time
print("--- %s seconds --- for %d sequences.\n %f sequences per second"%(elapsed, rev_df.shape[0], \
                                             rev_df.shape[0]/elapsed))


|██████████████████████████████████████████████████| 100% (561776/561776)  
Completed!
--- 1642.5613355636597 seconds --- for 561776 sequences.
 342.012167 sequences per second


In [7]:
# Store the collected prediction results as a new 'Preds' column (one list
# entry per row of rev_df) and write the whole frame to a pickle file in
# the current working directory.
rev_df['Preds'] = results
rev_df.to_pickle('uniprot_reviewed_scored.pkl.gz')