# Summarize the results of disorder prediction

In [1]:
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter
from tqdm.notebook import tqdm
from pyscripts.config import path2
from pyscripts.datasets import Metadata
# Project metadata handle; its `acc['refseq']` entries are the accessions
# processed by the aggregation cell further down.
metadata = Metadata()

In [2]:
def get_disorder_summary(gcf):
    """Tally `relfr` values of the IUPred2 predictions for one accession.

    Loads the noncoding and CDS intermediate pickles stored for `gcf` under
    `path2.data/'iupred2a'` and, for every (length threshold, probability
    threshold) pair, counts the `relfr` values at positions whose `iupred2`
    score reaches the probability threshold. Only entries whose `loc` has
    at least `len_thresh` elements contribute.

    Parameters
    ----------
    gcf : accession used to build the pickle file names
        (presumably a RefSeq assembly accession -- confirm with the caller).

    Returns
    -------
    dict
        `{'acc': gcf, 'summary': [...]}` where each summary item is a dict
        with the keys `len_thresh`, `prob_thresh` and `counts` (a `Counter`
        keyed by `relfr` value).
    """
    base_dir = path2.data/'iupred2a'
    file_name = f'{gcf}.pkl.bz2'
    nc = pd.read_pickle(base_dir/'noncoding-intermediates'/file_name)
    cd = pd.read_pickle(base_dir/'cds-intermediates'/file_name)

    length_cutoffs = (30, 100, 300)
    score_cutoffs = (0, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9)

    rows = []
    for len_thresh in length_cutoffs:
        for prob_thresh in score_cutoffs:
            # Noncoding entries first, then CDS entries, skipping short ones.
            long_enough = (
                entry for entry in chain(nc, cd)
                if len(entry['loc']) >= len_thresh
            )
            # Boolean-mask `relfr` by the score cutoff and flatten the
            # per-entry selections into one stream for counting.
            selected = chain.from_iterable(
                entry['relfr'][entry['iupred2'] >= prob_thresh]
                for entry in long_enough
            )
            rows.append({
                'len_thresh': len_thresh,
                'prob_thresh': prob_thresh,
                'counts': Counter(selected),
            })

    return {'acc': gcf, 'summary': rows}


In [3]:
from multiprocessing import Pool

# Number of worker processes (unchanged from the original hard-coded value).
N_WORKERS = 100
# Names of the three column levels of the resulting table.
COLUMN_LEVELS = ['gcf', 'len_thresh', 'prob_thresh']

accessions = metadata.acc['refseq']

# Gather every (accession, len_thresh, prob_thresh) column first and build the
# table in a single step afterwards.  Assigning Series one at a time into an
# initially empty DataFrame makes pandas adopt the index of the first
# non-empty Series and reindex all later ones to it, which silently drops any
# count key missing from that first column (and which column arrives first is
# arbitrary with `imap_unordered`).  Column-by-column insertion is also very
# slow for tens of thousands of columns.
columns = {}
with Pool(N_WORKERS) as pool:
    results = pool.imap_unordered(get_disorder_summary, accessions)
    # The progress total is taken from the same sequence that is iterated.
    for dat in tqdm(results, total=len(accessions)):
        gcf = dat['acc']
        for s in dat['summary']:
            key = (gcf, s['len_thresh'], s['prob_thresh'])
            columns[key] = pd.Series(s['counts'], dtype=pd.Int64Dtype())

if columns:
    # Outer-joins the row indexes, so every observed count key is kept;
    # combinations absent from a column become <NA> (nullable Int64).
    summary = pd.concat(columns, axis=1, names=COLUMN_LEVELS)
else:
    # `pd.concat` raises on an empty mapping; fall back to an empty table
    # with the same three-level column layout.
    summary = pd.DataFrame(
        columns=pd.MultiIndex(levels=[[], [], []], codes=[[], [], []], names=COLUMN_LEVELS),
        dtype=pd.Int64Dtype()
    )

summary = summary.sort_index(axis=0).sort_index(axis=1)
pd.to_pickle(summary, path2.data/'iupred2a'/'summary.pkl.bz2')

  0%|          | 0/2624 [00:00<?, ?it/s]