# Expression siganls in ARFs

Here, we count the genomic positions with both transcriptional and translational signals.
Genomic positions were considered to show transcriptional (translational) signals when they have >10% of the average transcriptional (translational) signals of ORFs in any of growth coditions and biological replicates. Note that only non-overlapping regions of canonical ORFs were under investigation. 


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itertools import chain
from tqdm.notebook import tqdm
from pathlib import Path
from pyscripts.config import path2
from pyscripts.datasets import DatasetLoader
from pyscripts.genomeutil import iter_cds_nonoverlapping_regions
dloader = DatasetLoader()

In [2]:
str5_refs = pd.read_csv(path2.metadata/'expression'/'Kim+20SciData_refs.tsv', sep='\t')
str5_runs = pd.read_csv(path2.metadata/'expression'/'Kim+20SciData_runs.tsv', sep='\t')

In [3]:
str5_workdir = path2.data/'expression'/'Kim+20SciData'
      
def read_plus_depth(tag):
    return tag, pd.read_pickle(str5_workdir/'depth'/f'{tag}.plus.pkl.bz2')

def read_minus_depth(tag):
    return tag, pd.read_pickle(str5_workdir/'depth'/f'{tag}.minus.pkl.bz2')

from multiprocessing import Pool
with Pool(80) as pool:
    str5_plus  = dict(tqdm(pool.imap_unordered(read_plus_depth , str5_runs['tag']), total=80))
    str5_minus = dict(tqdm(pool.imap_unordered(read_minus_depth, str5_runs['tag']), total=80))


  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

In [4]:
for sp, run_sp in str5_runs.groupby('species_code'):    
    print('=====', sp, '=====')
    expressed_sense = {}
    expressed_antisense = {}
    assert run_sp['reference_assembly'].nunique() == 1
    refname = run_sp['reference_assembly'].unique()[0]
    rec = max(dloader.load_genome(refname, dirname=path2.pubdata/'expression'/'Kim+20SciData'/'reference'), key=len)

    for (gp, rep), run_grp in run_sp.groupby(['growth_phase', 'replicate']):
        
        depth = {}
        for ty, tag in run_grp[['type', 'tag']].itertuples(index=False):
            plus, minus = str5_plus[tag][rec.id], str5_minus[tag][rec.id]
        
            if ty == 'Ribo':
                if rec.annotations['topology'] == 'linear':
                    plus  = sum([plus.shift(-p, fill_value=0) for p in range(28)])
                    minus = sum([minus.shift(p, fill_value=0) for p in range(28)])
                else:
                    plus  = sum([np.roll(plus, -p) for p in range(28)])
                    minus = sum([np.roll(minus, p) for p in range(28)])
            
            depth[ty] = pd.concat([
                pd.concat([plus[pt.start:pt.end], minus[pt.start:pt.end]][::pt.strand], axis=1, keys=['sense', 'antisense'])
                for loc_tag, loc in iter_cds_nonoverlapping_regions(rec)
                for pt in loc.parts
            ])

        expressed_sense[(gp, rep)] = (
            (depth['RNA' ]['sense'] >= depth['RNA' ]['sense'].mean() * 0.1) & 
            (depth['Ribo']['sense'] >= depth['Ribo']['sense'].mean() * 0.1)
        )
        expressed_antisense[(gp, rep)] = (
            (depth['RNA' ]['antisense'] >= depth['RNA' ]['sense'].mean() * 0.1) & 
            (depth['Ribo']['antisense'] >= depth['Ribo']['sense'].mean() * 0.1)
        )

        print(gp, rep, 'RNA  threshold:', depth['RNA' ]['sense'].mean() * 0.1)
        print(gp, rep, 'Ribo threshold:', depth['Ribo']['sense'].mean() * 0.1)
        
    expressed_sense = pd.DataFrame(expressed_sense)
    expressed_antisense = pd.DataFrame(expressed_antisense)
    
    print('----------------')
    print('replicon size:', len(rec), 'bp; of these:')
    print('  non-overlapping regions of canonical ORFs:', len(expressed_sense), 'bp; of these:')
    print('    positions with expression signals:')
    print('      sense:', expressed_sense.any(axis=1).sum(), 'bp')
    print('      antisense:', expressed_antisense.any(axis=1).sum(), 'bp')
    print('================')
    print()
    

===== Save =====
E 1 RNA  threshold: 12.522699465233758
E 1 Ribo threshold: 2.8248459860080297
E 2 RNA  threshold: 12.473614108742375
E 2 Ribo threshold: 0.67194713298165
L 1 RNA  threshold: 10.547976372608929
L 1 Ribo threshold: 1.6709532715493873
L 2 RNA  threshold: 11.913390410467521
L 2 Ribo threshold: 0.9019861405175107
S 1 RNA  threshold: 11.61679563465324
S 1 Ribo threshold: 1.448976335464004
S 2 RNA  threshold: 9.301637621882625
S 2 Ribo threshold: 0.8704583992253474
T 1 RNA  threshold: 13.314274218323929
T 1 Ribo threshold: 1.0234794550343242
T 2 RNA  threshold: 12.298080586251523
T 2 Ribo threshold: 0.6864374891800318
----------------
replicon size: 9025608 bp; of these:
  non-overlapping regions of canonical ORFs: 7457277 bp; of these:
    positions with expression signals:
      sense: 3818234 bp
      antisense: 137387 bp

===== Scla =====
E 1 RNA  threshold: 11.541056872088877
E 1 Ribo threshold: 3.879170128328726
E 2 RNA  threshold: 10.886039934403378
E 2 Ribo threshold:

In [5]:
ecol_refs = pd.read_csv(path2.metadata/'expression'/'HB17NAR_refs.tsv', sep='\t')
ecol_runs = pd.read_csv(path2.metadata/'expression'/'HB17NAR_runs.tsv', sep='\t')

In [6]:
ecol_workdir = path2.data/'expression'/'HB17NAR'
ecol_plus, ecol_minus = {}, {}
for tag in ecol_runs['tag']:
    ecol_plus[tag]  = pd.read_pickle(ecol_workdir/'depth'/f'{tag}.plus.pkl.bz2' ) 
    ecol_minus[tag] = pd.read_pickle(ecol_workdir/'depth'/f'{tag}.minus.pkl.bz2')    

In [7]:
for sp, run_sp in ecol_runs.groupby('species_code'):    
    print('=====', sp, '=====')
    expressed_sense = {}
    expressed_antisense = {}
    assert run_sp['reference_assembly'].nunique() == 1
    refname = run_sp['reference_assembly'].unique()[0]
    rec = max(dloader.load_genome(refname, dirname=path2.pubdata/'expression'/'HB17NAR'/'reference'), key=len)

    for rep, run_grp in run_sp.groupby('replicate'):
        
        depth = {}
        for ty, tag in run_grp[['type', 'tag']].itertuples(index=False):
            plus, minus = ecol_plus[tag][rec.id], ecol_minus[tag][rec.id]
            if ty == 'Ribo':
                if rec.annotations['topology'] == 'linear':
                    plus  = sum([plus.shift(-p, fill_value=0) for p in range(28)])
                    minus = sum([minus.shift(p, fill_value=0) for p in range(28)])
                else:
                    plus  = pd.Series(sum([np.roll(plus, -p) for p in range(28)]), index=plus.index)
                    minus = pd.Series(sum([np.roll(minus, p) for p in range(28)]), index=plus.index)
            
            depth[ty] = pd.concat([
                pd.concat([plus[pt.start:pt.end], minus[pt.start:pt.end]][::pt.strand], axis=1, keys=['sense', 'antisense'])
                for loc_tag, loc in iter_cds_nonoverlapping_regions(rec)
                for pt in loc.parts
            ])

        expressed_sense[rep] = (
            (depth['RNA' ]['sense'] >= depth['RNA' ]['sense'].mean() * 0.1) & 
            (depth['Ribo']['sense'] >= depth['Ribo']['sense'].mean() * 0.1)
        )
        expressed_antisense[rep] = (
            (depth['RNA' ]['antisense'] >= depth['RNA' ]['sense'].mean() * 0.1) & 
            (depth['Ribo']['antisense'] >= depth['Ribo']['sense'].mean() * 0.1)
        )

        print(rep, 'RNA  threshold:', depth['RNA' ]['sense'].mean() * 0.1)
        print(rep, 'Ribo threshold:', depth['Ribo']['sense'].mean() * 0.1)
        
    expressed_sense = pd.DataFrame(expressed_sense)
    expressed_antisense = pd.DataFrame(expressed_antisense)
    
    print('----------------')
    print('replicon size:', len(rec), 'bp; of these:')
    print('  non-overlapping regions of canonical ORFs:', len(expressed_sense), 'bp; of these:')
    print('    positions with expression signals:')
    print('      sense:', expressed_sense.any(axis=1).sum(), 'bp')
    print('      antisense:', expressed_antisense.any(axis=1).sum(), 'bp')
    print('================')
    print()


===== Ecol =====
2 RNA  threshold: 1.4842325247442112
2 Ribo threshold: 6.5310298506444955
3 RNA  threshold: 1.7094184962015229
3 Ribo threshold: 7.707296672679881
----------------
replicon size: 4641652 bp; of these:
  non-overlapping regions of canonical ORFs: 3959613 bp; of these:
    positions with expression signals:
      sense: 1784348 bp
      antisense: 4261 bp



In [8]:
saur_refs = pd.read_csv(path2.metadata/'expression'/'DGY14PNAS_refs.tsv', sep='\t')
saur_runs = pd.read_csv(path2.metadata/'expression'/'DGY14PNAS_runs.tsv', sep='\t')

In [9]:
saur_workdir = path2.data/'expression'/'DGY14PNAS'
saur_plus, saur_minus = {}, {}
for tag in saur_runs['tag']:
    saur_plus[tag]  = pd.read_pickle(saur_workdir/'depth'/f'{tag}.plus.pkl.bz2' ) 
    saur_minus[tag] = pd.read_pickle(saur_workdir/'depth'/f'{tag}.minus.pkl.bz2')    

In [10]:
for sp, run_sp in saur_runs.groupby('species_code'):    
    print('=====', sp, '=====')
    expressed_sense = {}
    expressed_antisense = {}
    assert run_sp['reference_assembly'].nunique() == 1
    refname = run_sp['reference_assembly'].unique()[0]
    rec = max(dloader.load_genome(refname, dirname=path2.pubdata/'expression'/'DGY14PNAS'/'reference'), key=len)

    for rep, run_grp in run_sp.groupby('replicate'):
        depth = {}
        i = 0
        for ty, tag in run_grp[['type', 'tag']].itertuples(index=False):
            plus, minus = saur_plus[tag][rec.id], saur_minus[tag][rec.id]
            if 'Ribo' in ty:
                if rec.annotations['topology'] == 'linear':
                    plus  = sum([plus.shift(-p, fill_value=0) for p in range(28)])
                    minus = sum([minus.shift(p, fill_value=0) for p in range(28)])
                else:
                    plus  = pd.Series(sum([np.roll(plus, -p) for p in range(28)]), index=plus.index)
                    minus = pd.Series(sum([np.roll(minus, p) for p in range(28)]), index=plus.index)
            if 'RNA' in ty:
                i += 1
                ty = (ty, i)
            depth[ty] = pd.concat([
                pd.concat([plus[pt.start:pt.end], minus[pt.start:pt.end]][::pt.strand], axis=1, keys=['sense', 'antisense'])
                for loc_tag, loc in iter_cds_nonoverlapping_regions(rec)
                for pt in loc.parts
            ])

        if rep == 1:
            expressed_sense[rep] = (
                ((depth[('RNA',1)]['sense'] >= depth[('RNA',1)]['sense'].mean() * 0.1) |
                 (depth[('RNA',2)]['sense'] >= depth[('RNA',2)]['sense'].mean() * 0.1) ) & 
                ((depth['Ribo-monosome']['sense'] >= depth['Ribo-monosome']['sense'].mean() * 0.1) |
                 (depth['Ribo-disome'  ]['sense'] >= depth['Ribo-disome'  ]['sense'].mean() * 0.1) )
            )
            expressed_antisense[rep] =  (
                ((depth[('RNA',1)]['antisense'] >= depth[('RNA',1)]['sense'].mean() * 0.1) |
                 (depth[('RNA',2)]['sense'] >= depth[('RNA',2)]['sense'].mean() * 0.1) ) & 
                ((depth['Ribo-monosome']['antisense'] >= depth['Ribo-monosome']['sense'].mean() * 0.1) |
                 (depth['Ribo-disome'  ]['antisense'] >= depth['Ribo-disome'  ]['sense'].mean() * 0.1) )
            )
            print(rep, 'RNA (run1)  threshold:', depth[('RNA',1)]['sense'].mean() * 0.1)
            print(rep, 'RNA (run2)  threshold:', depth[('RNA',2)]['sense'].mean() * 0.1)
            print(rep, 'Ribo-monosome thresh.:', depth['Ribo-monosome']['sense'].mean() * 0.1)
            print(rep, 'Ribo-disome   thresh.:', depth['Ribo-disome'  ]['sense'].mean() * 0.1)
            
        else:
            expressed_sense[rep] = (
                (depth[('RNA',1)]['sense'] >= depth[('RNA',1)]['sense'].mean() * 0.1) & 
                ((depth['Ribo-monosome']['sense'] >= depth['Ribo-monosome']['sense'].mean() * 0.1) |
                 (depth['Ribo-disome'  ]['sense'] >= depth['Ribo-disome'  ]['sense'].mean() * 0.1))
            )
            expressed_antisense[rep] =  (
                (depth[('RNA',1)]['antisense'] >= depth[('RNA',1)]['sense'].mean() * 0.1) & 
                ((depth['Ribo-monosome']['antisense'] >= depth['Ribo-monosome']['sense'].mean() * 0.1) |
                 (depth['Ribo-disome'  ]['antisense'] >= depth['Ribo-disome'  ]['sense'].mean() * 0.1))
            )    
            print(rep, 'RNA  threshold:', depth[('RNA',1)]['sense'].mean() * 0.1)
            print(rep, 'Ribo-monosome thresh.:', depth['Ribo-monosome']['sense'].mean() * 0.1)
            print(rep, 'Ribo-disome   thresh.:', depth['Ribo-disome'  ]['sense'].mean() * 0.1)
            
    expressed_sense = pd.DataFrame(expressed_sense)
    expressed_antisense = pd.DataFrame(expressed_antisense)
    
    print('----------------')
    print('replicon size:', len(rec), 'bp; of these:')
    print('  non-overlapping regions of canonical ORFs:', len(expressed_sense), 'bp; of these:')
    print('    positions with expression signals:')
    print('      sense:', expressed_sense.any(axis=1).sum(), 'bp')
    print('      antisense:', expressed_antisense.any(axis=1).sum(), 'bp')
    print('================')
    print()


===== Saur =====
1 RNA (run1)  threshold: 3.076647385848452
1 RNA (run2)  threshold: 0.3194238322196962
1 Ribo-monosome thresh.: 0.6021775656075681
1 Ribo-disome   thresh.: 1.4125729655659376
2 RNA  threshold: 1.826415319338819
2 Ribo-monosome thresh.: 1.6561112842153314
2 Ribo-disome   thresh.: 0.59244769794895
----------------
replicon size: 2821361 bp; of these:
  non-overlapping regions of canonical ORFs: 2339631 bp; of these:
    positions with expression signals:
      sense: 1038178 bp
      antisense: 6359 bp



- successfully reproduced our result