# Disorder prediction of translations from bacterial genomic sequences

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from Bio.Seq import reverse_complement
from Bio.SeqFeature import ExactPosition, FeatureLocation, CompoundLocation
from pyscripts.config import path2
from pyscripts.datasets import DatasetLoader, Metadata
from pyscripts.genomeutil import bac_translate, is_regular

# Shared project helpers used by the worker functions below:
# `dloader.load_genome(accession)` supplies the genome records and
# `metadata.acc['refseq']` supplies the list of accessions to process.
dloader  = DatasetLoader()
metadata = Metadata()

In [2]:
from pyscripts.iupred2a_lib_mod import iupred2a_predictor
# Single predictor instance created in 'long' mode; its `iupred` method is
# called on every translated peptide by the worker functions below.
pred = iupred2a_predictor('long')

# The original version of IUPred2A was reported by Balint Meszaros, Gabor Erdos, Zsuzsanna Dosztanyi in Nucleic Acids Research 2018;46(W1):W329-W337.
# The following is a copyright notice under a license from IUPred2A:
## © 2018-2020 Dr. Zsuzsanna Dosztányi, Bálint Mészáros and Gábor Erdős, and Eötvös Loránd University (ELTE)
## All rights are reserved for the authors and ELTE.

# 'iupred2a_lib_mod.py' is a modified version of 'iupred2a_lib.py' of IUPred2A by Shun Yamanouchi at Dept. of Biol. Sci., Grad. Sch. of Sci., the Univ. of Tokyo.
# Specifically, I wrapped the original functions in a class, and suppressed the file loading that occurs every time the iupred() function is called.
# No changes have been made to the core of the program.
# The source code is not available to the public because the license prohibits redistribution.

# IUPred2A: context-dependent prediction of protein disorder as a function of redox state and protein binding
# Balint Meszaros, Gabor Erdos, Zsuzsanna Dosztanyi
# Nucleic Acids Research 2018;46(W1):W329-W337.
#
# Prediction type: long

# modified by Shun Yamanouchi


## For noncoding reading frames

- The upstream sequences of all canonical ORFs and all non-stop reading frames that are not in-frame with the canonical ORFs were translated into peptides.
- Of these, those with a length of at least 30 amino acid residues were subjected to prediction of intrinsic disorder using IUPred2A.

In [3]:
from enum import IntEnum
class AbsFrame(IntEnum):
    """Bit flags for the six absolute reading frames of a sequence.

    ``P0``-``P2`` are the three frames on the plus strand and ``M0``-``M2``
    the three frames on the minus strand (see ``map2AbsFrame``, which keys
    them by ``(coordinate % 3, strand)``).  Each frame owns one bit so that
    several frames can be OR-ed together into a single per-nucleotide mask.
    """
    NONE = 0        # no frame set
    P0 = 1 << 0
    P1 = 1 << 1
    P2 = 1 << 2
    M0 = 1 << 3
    M1 = 1 << 4
    M2 = 1 << 5
    ALL = P0 | P1 | P2 | M0 | M1 | M2   # every frame on both strands

# The three absolute frames of each strand, in frame order 0, 1, 2.
plusAbs = [AbsFrame[f'P{offset}'] for offset in range(3)]
minusAbs = [AbsFrame[f'M{offset}'] for offset in range(3)]
    
class RelFrame(IntEnum):
    """Bit flags for a reading frame expressed relative to a coding frame.

    ``FW0``-``FW2`` and ``RC0``-``RC2`` are the values stored in
    ``map2RelFrame``; ``UNDET`` (zero) marks positions for which no single
    relative frame has been assigned.  Each value owns one bit, so the
    bitwise AND of differing values is ``UNDET``.
    """
    UNDET = 0       # undetermined / not assigned
    FW0 = 1 << 0
    FW1 = 1 << 1
    FW2 = 1 << 2
    RC0 = 1 << 3
    RC1 = 1 << 4
    RC2 = 1 << 5

# (coordinate % 3, strand) => absolute frame; strand is +1 (plus) or -1 (minus).
map2AbsFrame = {
    (offset, strand): AbsFrame[f'{prefix}{offset}']
    for strand, prefix in ((+1, 'P'), (-1, 'M'))
    for offset in range(3)
}

# Lookup table giving the frame of a reading frame *relative to* a coding frame.
# Key:   (absolute frame of the reading frame itself, absolute frame of the CDS)
# Value: RelFrame of the reading frame as seen from that CDS.
# Pattern visible in the table: pairs on the same strand map to FW0/FW1/FW2
# (identical frames give FW0), pairs on opposite strands map to RC0/RC1/RC2.
map2RelFrame = { # (self_abs, coding_abs) => self_relative_to_coding
    (AbsFrame.P0, AbsFrame.P0): RelFrame.FW0, (AbsFrame.P0, AbsFrame.M0): RelFrame.RC0, 
    (AbsFrame.P0, AbsFrame.P1): RelFrame.FW2, (AbsFrame.P0, AbsFrame.M1): RelFrame.RC1, 
    (AbsFrame.P0, AbsFrame.P2): RelFrame.FW1, (AbsFrame.P0, AbsFrame.M2): RelFrame.RC2, 

    (AbsFrame.P1, AbsFrame.P0): RelFrame.FW1, (AbsFrame.P1, AbsFrame.M0): RelFrame.RC2, 
    (AbsFrame.P1, AbsFrame.P1): RelFrame.FW0, (AbsFrame.P1, AbsFrame.M1): RelFrame.RC0, 
    (AbsFrame.P1, AbsFrame.P2): RelFrame.FW2, (AbsFrame.P1, AbsFrame.M2): RelFrame.RC1, 

    (AbsFrame.P2, AbsFrame.P0): RelFrame.FW2, (AbsFrame.P2, AbsFrame.M0): RelFrame.RC1, 
    (AbsFrame.P2, AbsFrame.P1): RelFrame.FW1, (AbsFrame.P2, AbsFrame.M1): RelFrame.RC2, 
    (AbsFrame.P2, AbsFrame.P2): RelFrame.FW0, (AbsFrame.P2, AbsFrame.M2): RelFrame.RC0, 

    (AbsFrame.M0, AbsFrame.P0): RelFrame.RC0, (AbsFrame.M0, AbsFrame.M0): RelFrame.FW0, 
    (AbsFrame.M0, AbsFrame.P1): RelFrame.RC2, (AbsFrame.M0, AbsFrame.M1): RelFrame.FW1, 
    (AbsFrame.M0, AbsFrame.P2): RelFrame.RC1, (AbsFrame.M0, AbsFrame.M2): RelFrame.FW2, 

    (AbsFrame.M1, AbsFrame.P0): RelFrame.RC1, (AbsFrame.M1, AbsFrame.M0): RelFrame.FW2, 
    (AbsFrame.M1, AbsFrame.P1): RelFrame.RC0, (AbsFrame.M1, AbsFrame.M1): RelFrame.FW0, 
    (AbsFrame.M1, AbsFrame.P2): RelFrame.RC2, (AbsFrame.M1, AbsFrame.M2): RelFrame.FW1, 

    (AbsFrame.M2, AbsFrame.P0): RelFrame.RC2, (AbsFrame.M2, AbsFrame.M0): RelFrame.FW1, 
    (AbsFrame.M2, AbsFrame.P1): RelFrame.RC1, (AbsFrame.M2, AbsFrame.M1): RelFrame.FW2, 
    (AbsFrame.M2, AbsFrame.P2): RelFrame.RC0, (AbsFrame.M2, AbsFrame.M2): RelFrame.FW0, 
}


In [4]:
import re
def iter_noncoding_reading_frames(record):    
    """Yield stop-free reading-frame segments that are not in-frame with a CDS.

    For each strand and each of its three frames the sequence is cut at
    (1) the start positions of CDS features accepted by ``is_regular`` and
    (2) every match of the stop-codon regex.  Each gap between two
    consecutive cut points of the same frame is skipped if it is empty or if
    any of its nucleotides is annotated as coding in that same frame;
    otherwise it is translated with ``bac_translate`` and yielded.

    After the interior gaps of a strand, the pieces at the two ends of the
    record are handled: joined across the origin when
    ``record.annotations['topology'] == 'circular'``, emitted separately
    otherwise.

    Parameters
    ----------
    record
        Object providing ``.seq``, ``.features`` (each with ``.type`` and
        ``.location``) and ``.annotations['topology']`` -- presumably a
        Biopython ``SeqRecord``; confirm against ``DatasetLoader``.

    Yields
    ------
    (loc, transl, relframe)
        ``loc`` is a ``FeatureLocation`` (or a ``CompoundLocation`` for a
        segment joined across the origin), ``transl`` is the value returned
        by ``bac_translate`` for the extracted nucleotides, and ``relframe``
        is an int8 array holding one ``RelFrame`` value per codon
        (``RelFrame.UNDET`` when the three nucleotides of the codon do not
        share one single coding frame).
    """
    seq = str(record.seq)
    # Per-nucleotide bitmask of the absolute frame(s) in which the position is
    # annotated as coding (OR of AbsFrame bits; NONE where nothing is annotated).
    coding_absframe = np.full(len(seq), AbsFrame.NONE, dtype='int8')
    
    start_plus, start_minus = [], []
    # record all start codons of canonical ORFs
    # record absolute RF of all ORFs
    for cds in filter(lambda feat: feat.type=='CDS', record.features):
        loc = cds.location
        if is_regular(cds, record):
            for pt in loc.parts:
                # frame of a part is keyed by (start coordinate mod 3, strand)
                coding_absframe[pt.start:pt.end] |= map2AbsFrame[(pt.start%3, pt.strand)] 
            if loc.strand > 0:
                start_plus.append(loc.start if len(loc.parts)==1 else loc.parts[0].start)
            else:
                # NOTE(review): for a multi-part minus-strand location this uses
                # parts[1].end; confirm that this is the part carrying the start
                # codon under Biopython's part ordering (only relevant if
                # is_regular ever accepts multi-part CDSs).
                start_minus.append(loc.end if len(loc.parts)==1 else loc.parts[1].end)
        else:
            # CDSs rejected by is_regular mark all six frames as coding over
            # their span, so no reading frame overlapping them is reported.
            for pt in loc.parts:
                coding_absframe[pt.start:pt.end] = AbsFrame.ALL
    
    # regex patterns of stop codons to record all of them
    # (the minus-strand pattern is the reverse complement of the plus-strand one;
    #  R and Y are the IUPAC purine / pyrimidine ambiguity codes)
    ptrn_stop_plus  = re.compile('TAA|TAG|TGA|TRA|TAR')
    ptrn_stop_minus = re.compile('TTA|CTA|TCA|TYA|YTA')
    
    # split by (1) start codons of canonical ORFs; (2) all stop codons on the genome
    # Each cut point is a span [s, e): CDS starts are zero-width (s == e),
    # stop codons are the 3-nt regex match spans.
    plus = pd.concat([
        pd.DataFrame({'s': start_plus, 'e': start_plus}, dtype=int),
        pd.DataFrame([
            m.span() 
            for m in re.finditer(ptrn_stop_plus, seq)
        ], columns=['s','e'], dtype=int)
    ])
    
    minus = pd.concat([
        pd.DataFrame({'s': start_minus, 'e': start_minus}, dtype=int),
        pd.DataFrame([
            m.span() 
            for m in re.finditer(ptrn_stop_minus, seq)
        ], columns=['s','e'], dtype=int)
    ])
    
    # find all non-stop and non-coding regions
    for sign, strand_pm in {+1: plus, -1: minus}.items():
        # per frame: (first cut-point start, last cut-point end), used below
        # for the pieces at the two ends of the record
        cross_pm = {}
        # group the cut points of this strand by frame (s % 3)
        for self_absfr, pm_fr in strand_pm.groupby((strand_pm['s']%3).apply(lambda st: map2AbsFrame[(st, sign)])):
            pm_fr = pm_fr.sort_values(by=['s','e']).reset_index(drop=True)
            cross_pm[self_absfr] = (pm_fr.iloc[0, 0], pm_fr.iloc[-1, 1])
            
            # walk the gaps: from the end of one cut point to the start of the next
            for s, e in zip(pm_fr['e'].values, pm_fr['s'].values[1:]):
                if s == e or (coding_absframe[s:e] & self_absfr).any(): 
                    ## ignore in-frame overlap with any (potential) coding region
                    continue  
                
                ## assign relframe to non-coding reading frames
                loc = FeatureLocation(ExactPosition(s), ExactPosition(e), sign)
                transl = bac_translate(loc.extract(seq))
                assert '*' not in transl
                
                # Per nucleotide: relative frame w.r.t. the CDS covering it.
                # Only positions whose mask equals exactly one AbsFrame are
                # assigned; everything else stays UNDET.
                self_relframe = np.full_like(coding_absframe[s:e], RelFrame.UNDET, dtype='int8')
                for coding_absfr in (plusAbs + minusAbs):
                    self_relframe[coding_absframe[s:e] == coding_absfr] = map2RelFrame[(self_absfr, coding_absfr)]
                # Collapse to one value per codon: the AND of three single-bit
                # values is non-zero only if all three agree.  [::sign]
                # reverses the array for the minus strand.
                self_relframe_transl = (self_relframe[0::3] & self_relframe[1::3] & self_relframe[2::3])[::sign]
                yield loc, transl, self_relframe_transl
                # NOTE(review): this check sits after the yield, so it only
                # runs when the consumer asks for the next item.
                assert (self_relframe_transl != RelFrame.FW0).all(), (s, e, loc, transl)
        
        ## Handling corner cases: ARFs that cross the boundary of a circular DNA record.
        ## Very troublesome...
        if record.annotations['topology'] == 'circular':
            # A frame running off the end of the record continues at the
            # beginning in a frame shifted by len(seq) % 3; pair_second is
            # pair_first rotated by that amount.
            mod = len(seq) % 3
            pair_first  = plusAbs if sign>0 else minusAbs
            pair_second = pair_first[-mod:] + pair_first[:-mod]

            for self_absfr1, self_absfr2 in zip(pair_first, pair_second):
                # s1: end of the last cut point in the tail frame;
                # e2: start of the first cut point in the matching head frame
                s1, e2 = cross_pm[self_absfr1][1], cross_pm[self_absfr2][0]
                coding_absframe1, coding_absframe2 = coding_absframe[s1:], coding_absframe[:e2]
                if (coding_absframe[s1:] & self_absfr1).any() or (coding_absframe[:e2] & self_absfr2).any(): 
                    ## ignore in-frame overlap with any (potential) coding region
                    continue
                self_relframe1 = np.full_like(coding_absframe1, RelFrame.UNDET, dtype='int8')
                self_relframe2 = np.full_like(coding_absframe2, RelFrame.UNDET, dtype='int8')    
                for coding_absfr in (plusAbs + minusAbs):
                    self_relframe1[coding_absframe1 == coding_absfr] = map2RelFrame[(self_absfr1, coding_absfr)]
                    self_relframe2[coding_absframe2 == coding_absfr] = map2RelFrame[(self_absfr2, coding_absfr)]
                # tail piece followed by head piece, then one value per codon
                self_relframe = np.hstack([self_relframe1, self_relframe2])
                self_relframe_transl = (self_relframe[0::3] & self_relframe[1::3] & self_relframe[2::3])[::sign]

                # tail + head joined across the origin; part order is reversed
                # for the minus strand
                loc = CompoundLocation([
                    FeatureLocation(ExactPosition(s1), ExactPosition(len(seq)), sign),
                    FeatureLocation(ExactPosition(0), ExactPosition(e2), sign)
                ][::sign])
                transl = bac_translate(loc.extract(seq))

                if (p := transl.find('*')) < 0:
                    yield loc, transl, self_relframe_transl
                    assert len(transl) == len(self_relframe_transl)
                else:
                    # A stop shows up only in the joined translation --
                    # presumably the codon straddling the origin, which the
                    # regex over the linear string cannot match.  Split into
                    # the whole-codon piece before the end of the record
                    # (loc1) and the whole-codon piece after the origin (loc2).
                    assert '*' not in transl[p+1:]
                    e1, s2 = s1 + (len(seq)-s1) // 3 * 3, e2 % 3
                    loc1 = FeatureLocation(ExactPosition(s1), ExactPosition(e1), sign)
                    loc2 = FeatureLocation(ExactPosition(s2), ExactPosition(e2), sign)
                    if sign > 0:
                        yield loc1, transl[:p],   self_relframe_transl[:p]
                        yield loc2, transl[p+1:], self_relframe_transl[p+1:]
                        assert len(loc2) == len(self_relframe_transl[p+1:]) * 3 and (len(seq)-s1) // 3 == p
                    else:
                        # on the minus strand the head piece (loc2) is
                        # translated first
                        yield loc2, transl[:p],   self_relframe_transl[:p]
                        yield loc1, transl[p+1:], self_relframe_transl[p+1:]
                        assert len(loc2) == p * 3 and (len(seq)-s1) // 3 == len(self_relframe_transl[p+1:])
        
        else:
            # Non-circular record: emit the piece after the last cut point and
            # the piece before the first cut point of every frame separately,
            # each trimmed to whole codons.
            # NOTE(review): unlike the branches above, no in-frame coding
            # overlap check and no stop check is applied here -- confirm that
            # this is intended.
            for self_absfr, (e2, s1) in cross_pm.items():
                e1, s2 = s1 + (len(seq)-s1) // 3 * 3, e2 % 3
                loc1 = FeatureLocation(ExactPosition(s1), ExactPosition(e1), sign)
                loc2 = FeatureLocation(ExactPosition(s2), ExactPosition(e2), sign)
                transl1 = bac_translate(loc1.extract(seq))
                transl2 = bac_translate(loc2.extract(seq))
                coding_absframe1 = coding_absframe[s1:e1]
                coding_absframe2 = coding_absframe[s2:e2]
                self_relframe1 = np.full_like(coding_absframe1, RelFrame.UNDET, dtype='int8')
                self_relframe2 = np.full_like(coding_absframe2, RelFrame.UNDET, dtype='int8')    
                for coding_absfr in (plusAbs + minusAbs):
                    self_relframe1[coding_absframe1 == coding_absfr] = map2RelFrame[(self_absfr, coding_absfr)]
                    self_relframe2[coding_absframe2 == coding_absfr] = map2RelFrame[(self_absfr, coding_absfr)]
                self_relframe_transl1 = (self_relframe1[0::3] & self_relframe1[1::3] & self_relframe1[2::3])[::sign]
                self_relframe_transl2 = (self_relframe2[0::3] & self_relframe2[1::3] & self_relframe2[2::3])[::sign]
                yield loc1, transl1, self_relframe_transl1
                yield loc2, transl2, self_relframe_transl2


In [5]:
def work1(gcf, min_residues=30):
    """Run IUPred2A on the non-coding reading frames of one genome.

    The longest record returned by ``dloader.load_genome(gcf)`` is scanned
    with ``iter_noncoding_reading_frames``; every translation of at least
    ``min_residues`` amino acid residues is scored with ``pred.iupred`` and
    the results are pickled to
    ``path2.data/'iupred2a'/'noncoding-intermediates'/f'{gcf}.pkl.bz2'``.

    Parameters
    ----------
    gcf
        Genome accession, used both to load the genome and to name the
        output file.
    min_residues : int, default 30
        Minimum peptide length, in amino acid residues, for a reading frame
        to be scored.

    Returns
    -------
    None
        The result is written to disk as a list of dicts with the keys
        ``loc``, ``transl``, ``relfr`` and ``iupred2``.
    """
    rec = max(dloader.load_genome(gcf), key=len)
    nc = [
        dict(loc=loc, transl=transl, relfr=relfr, iupred2=np.array(pred.iupred(transl)))
        for loc, transl, relfr in iter_noncoding_reading_frames(rec)
        # The threshold is in residues.  len(loc) is the nucleotide span, so
        # testing it against 30 let peptides as short as 10 residues through.
        if len(transl) >= min_residues
    ]
    pd.to_pickle(nc, path2.data/'iupred2a'/'noncoding-intermediates'/f'{gcf}.pkl.bz2')
    
from multiprocessing import Pool
# Process every accession in metadata.acc['refseq'] with 100 worker processes.
# work1 writes its own output file, so the iterator is drained only to drive
# the progress bar.
with Pool(100) as pool:
    for _ in tqdm(
        pool.imap_unordered(work1, metadata.acc['refseq']),
        total=len(metadata.acc),
    ):
        pass


  0%|          | 0/2624 [00:00<?, ?it/s]

## For coding sequences

- Disorder prediction by IUPred2A was performed for CDSs that were considered to be regular by the following criteria.
  - not annotated as pseudo
  - have a length that is a multiple of three
  - consist of a single part
  - start with a start codon and end with a stop codon
        
- The annotations of the CDSs followed those of NCBI RefSeq. Here, overlapping regions with other genes were also included in the total count.


In [6]:
def work2(gcf):
    """Run IUPred2A on the annotated translations of one genome's CDSs.

    The longest record returned by ``dloader.load_genome(gcf)`` is used.
    Every CDS feature that passes ``is_regular`` and carries a
    ``translation`` qualifier is scored with ``pred.iupred``; the results
    are pickled to
    ``path2.data/'iupred2a'/'cds-intermediates'/f'{gcf}.pkl.bz2'`` as a list
    of dicts with the keys ``loc``, ``transl``, ``relfr`` and ``iupred2``.
    """
    genome = max(dloader.load_genome(gcf), key=len)
    scored = []
    for feature in genome.features:
        if feature.type != 'CDS':
            continue
        if not is_regular(feature, genome):
            continue
        peptide = feature.qualifiers.get('translation', [None])[0]
        if peptide is None:
            # no annotated translation to score
            continue
        scored.append(dict(
            loc=feature.location,
            transl=peptide,
            # a CDS is in frame with itself at every residue
            relfr=np.full(len(peptide), RelFrame.FW0, dtype='int8'),
            iupred2=np.array(pred.iupred(peptide)),
        ))
    pd.to_pickle(scored, path2.data/'iupred2a'/'cds-intermediates'/f'{gcf}.pkl.bz2')
    
from multiprocessing import Pool
# Process every accession in metadata.acc['refseq'] with 100 worker processes.
# work2 writes its own output file, so the iterator is drained only to drive
# the progress bar.
with Pool(100) as pool:
    for _ in tqdm(
        pool.imap_unordered(work2, metadata.acc['refseq']),
        total=len(metadata.acc),
    ):
        pass


  0%|          | 0/2624 [00:00<?, ?it/s]