## CRIES
### Counting Reads for Intronic and Exonic Segments
https://github.com/csglab/CRIES




Steps 1–3 are run in my `alignment` conda environment.

**Step 1. Creating GTF annotation files**

Intron and exon annotations for `gencode.v34` were made in my notebook named `build-genome.ipynb`.

**Step 2. Mapping reads**

`STAR` 

**Step 3. Counting reads that map to intronic or exonic segments of each gene**

`featureCounts`
http://bioinf.wehi.edu.au/featureCounts/

In [6]:
cat scripts/intron_exon_counts.sh

# Loop featureCounts over every BAM file, writing intron and exon counts.
#
# Usage: bash intron_exon_counts.sh PDIR bamDIR countDIR logDIR JOBS
#   PDIR      project directory; the script changes into it first
#   bamDIR    directory holding the *.bam files
#   countDIR  directory for the featureCounts tables (created if missing)
#   logDIR    directory for the per-sample logs (created if missing)
#   JOBS      value passed to featureCounts -T

PDIR=$1
bamDIR=$2
countDIR=$3
logDIR=$4
JOBS=$5

# Abort rather than create output directories in the wrong place.
cd "$PDIR" || { echo "cannot cd to '$PDIR'" >&2; exit 1; }

# Use $HOME, not a quoted '~': tilde expansion does not happen inside quotes,
# so featureCounts would otherwise be handed a literal "~/genomes/..." path.
GTF_index="$HOME/genomes/hg38/hg38_ensemble_"

mkdir -p "$countDIR"
mkdir -p "$logDIR"

for f in "${bamDIR}"/*.bam; do
    # When nothing matches, the glob is left as a literal string; skip it.
    [ -e "$f" ] || continue
    base=$(basename "$f")
    sample=${base/.bam/}
    echo -e '----------------------- ' "$sample"  ' -----------------------'
    date
    featureCounts -M -T "$JOBS" -t intron -g gene_id -a "${GTF_index}introns.gtf" -o "${countDIR}/${sample}_introns.txt" "${f}" &> "${logDIR}/${sample}_introns.log"
    featureCounts -M -T "$JOBS" -t exon -g gene_id -a "${GTF_index}consExons.gtf" -o "${countDIR}/${sample}_exons.txt" "${f}" &> "${logDIR}/${sample}_exons.log"
    echo "$(date) Done!"
done


In [8]:
# nohup bash scripts/intron_exon_counts.sh . bam stbl-counts stbl-log 24 > stbl_count.out 

Note that `$countDIR` is `./stbl-counts`

### Run multiQC

In [2]:
# Build one MultiQC report per feature type (exons, introns) from the
# `*.txt.summary` files sitting next to the count tables in `stbl-counts`.
!multiqc stbl-counts/*_exons.txt.summary -f -n QC-reports/counts-exons-multiqc.html
!multiqc stbl-counts/*_introns.txt.summary -f -n QC-reports/counts-introns-multiqc.html

[1;30m[INFO   ][0m         multiqc : This is MultiQC v1.9
[1;30m[INFO   ][0m         multiqc : Template    : default
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/120h_DMSO_rep1_exons.txt.summary
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/120h_DMSO_rep2_exons.txt.summary
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/120h_treated_rep1_exons.txt.summary
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/120h_treated_rep2_exons.txt.summary
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/6h_DMSO_rep1_exons.txt.summary
[1;30m[INFO   ][0m         multiqc : Searching   : /data_gilbert/hom

**Step 4. Normalization** - REMBRANDTS

## Run REMBRANDTS
#### REMoving Bias from Rna-seq ANalysis of Differential Transcript Stability
> [REMBRANDTS](https://github.com/csglab/REMBRANDTS) is a package for analysis of RNA-seq data across multiple samples in order to obtain unbiased estimates of differential mRNA stability. It uses DESeq to obtain estimates of differential pre-mRNA and mature mRNA abundance across samples, and then estimates a gene-specific bias function that is then subtracted from Δexon–Δintron to provide unbiased differential mRNA stability measures.



In [1]:
import os
import subprocess
import glob
import warnings
import pandas as pd
import numpy as np

# from rpy2.rinterface import RRuntimeWarning
# warnings.filterwarnings("ignore")
# warnings.filterwarnings("ignore", category=RRuntimeWarning)

Prepare inputs:

In [4]:
def _read_count_tables(count_files, suffix):
    """Load several count tables into one gene-by-sample DataFrame.

    Each file is read with its first line skipped; the `Geneid` column gives
    the row labels and the last column gives the counts. The sample name is
    the file name with `suffix` cut off the end.

    Counts are matched to genes by ID, not by row position: a file listing the
    same genes in a different order is re-aligned to the first file's order,
    and a file listing a different set of genes raises `ValueError`.
    """
    gene_ids = None
    columns = {}
    for path in count_files:
        table = pd.read_table(path, skiprows=1)
        ids = table['Geneid'].tolist()
        counts = table.iloc[:, -1]
        if gene_ids is None:
            gene_ids = ids
        elif ids != gene_ids:
            if sorted(ids) != sorted(gene_ids):
                raise ValueError(
                    f'{path} does not list the same genes as {count_files[0]}'
                )
            counts = pd.Series(counts.to_numpy(), index=ids).reindex(gene_ids)
        sample = os.path.basename(path)[:-len(suffix)]
        columns[sample] = counts.to_numpy()
    return pd.DataFrame(columns, index=gene_ids)


def make_interon_exon_df(stbldir,samples):
    """Build matching exon and intron count tables for the given samples.

    Parameters
    ----------
    stbldir : str
        Directory containing `<sample>_exons.txt` and `<sample>_introns.txt`.
    samples : list of str
        Sample names; the returned columns follow this order.

    Returns
    -------
    (df_exon, df_intron) : tuple of pandas.DataFrame
        Gene-by-sample counts restricted to genes present in both tables.
        Both frames share the same row order (the exon table's order) and the
        same column order (`samples`).

    Raises
    ------
    FileNotFoundError
        If no exon or no intron count file is found in `stbldir`.
    KeyError
        If a requested sample has no exon or no intron count file.
    ValueError
        If count files of one kind do not list the same genes.
    """
    # sorted() makes the reference gene order independent of filesystem order
    exon_count_files = sorted(glob.glob(f'{stbldir}/*_exons.txt'))
    intron_count_files = sorted(glob.glob(f'{stbldir}/*_introns.txt'))
    if not exon_count_files or not intron_count_files:
        raise FileNotFoundError(
            f'no *_exons.txt / *_introns.txt count files found in {stbldir}'
        )

    df_exon = _read_count_tables(exon_count_files, '_exons.txt')
    df_intron = _read_count_tables(intron_count_files, '_introns.txt')

    # reorder into the same column order as the requested samples
    samples = list(samples)
    missing = [
        s for s in samples
        if s not in df_exon.columns or s not in df_intron.columns
    ]
    if missing:
        raise KeyError(f'samples without exon/intron count files: {missing}')
    df_exon = df_exon[samples]
    df_intron = df_intron[samples]

    # keep genes present in both tables, in the exon table's order
    # (a set intersection would give a different row order on every run)
    shared = df_exon.index[df_exon.index.isin(df_intron.index)]
    df_exon = df_exon.loc[shared]
    df_intron = df_intron.loc[shared]
    return df_exon, df_intron


def write_inputs(stbldir, df_exon, df_intron):
    """Write the per-sample count files and the REMBRANDTS input table.

    For every column (sample) of `df_exon`, two headerless tab-separated files
    are written into `stbldir`: `<sample>_exons.ol.txt` and
    `<sample>_introns.ol.txt`, each holding the row label and the count.
    `input_table.txt` is then written with one exonic and one intronic row per
    sample (columns `Label`, `File`, `ReadType`, `Batch`; `Batch` is always 1).

    Parameters
    ----------
    stbldir : str
        Existing directory that receives all files.
    df_exon, df_intron : pandas.DataFrame
        Count tables with one column per sample; `df_intron` must contain
        every column of `df_exon`.
    """
    # Take the sample list from the data itself rather than from a notebook
    # global, so the function works regardless of cell execution order.
    sample_names = df_exon.columns.tolist()

    # write input counts
    for sample in sample_names:
        df_exon[sample].to_csv(f'{stbldir}/{sample}_exons.ol.txt', sep='\t', header=False)
        df_intron[sample].to_csv(f'{stbldir}/{sample}_introns.ol.txt', sep='\t', header=False)

    ### Prepare REMBRANDTS inputs
    # meta data: for each sample an exonic row followed by an intronic row
    records = []
    for sample in sample_names:
        records.append({'Label': sample, 'File': f'{sample}_exons.ol.txt',
                        'ReadType': 'exonic', 'Batch': 1})
        records.append({'Label': sample, 'File': f'{sample}_introns.ol.txt',
                        'ReadType': 'intronic', 'Batch': 1})
    meta = pd.DataFrame(records, columns=['Label', 'File', 'ReadType', 'Batch'])
    meta.to_csv(f'{stbldir}/input_table.txt', index=False, sep='\t')

In [5]:
# Sample names: the file name of every BAM under `bam/`, with `.bam` removed
samples = [
    bam_path.rsplit('/', 1)[-1].replace('.bam', '')
    for bam_path in glob.glob('bam/*.bam')
]

In [6]:
# Build the exon and intron count tables from the count files in `stbl-counts`
df_exon, df_intron = make_interon_exon_df('stbl-counts',samples)

# Write the per-sample `.ol.txt` count files and `input_table.txt` into `stbl-counts`
write_inputs('stbl-counts', df_exon, df_intron)

The following cells are run in the `stbl` environment.

Run REMBRANDTS

In [10]:
# Absolute paths of the count inputs and of the results folder, both anchored
# at the directory the notebook is running in
workdir = os.getcwd()
countdir, outdir = (f'{workdir}/{name}' for name in ('stbl-counts', 'stbl-output'))

In [11]:
# Run REMBRANDTS
# Change into the REMBRANDTS directory: the script is invoked by relative name,
# and the next cell collects results from `./out/Decitabine` relative to here.
os.chdir('/data_gilbert/home/aarab/REMBRANDTS')
# Arguments: `Decitabine` (reused as the output sub-folder name in the next cell),
# the input table, and the directory holding the count files.
# NOTE(review): `0.99` and `linear` are REMBRANDTS options — confirm their
# meaning against the REMBRANDTS README.
cmd = f'bash REMBRANDTS.sh Decitabine {countdir}/input_table.txt {countdir}/ 0.99 linear'
cmd
# The call below is commented out, so this cell only displays the command;
# uncomment it to actually run REMBRANDTS.
# subprocess.call(cmd , shell=True)

'bash REMBRANDTS.sh Decitabine /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/input_table.txt /data_gilbert/home/aarab/Projects/Decitabine-treatment/RNA-seq/stbl-counts/ 0.99 linear'

In [12]:
# # remove results from previous run
# subprocess.call(f'rm -rv {outdir}', shell=True)

# Move the REMBRANDTS results (`./out/Decitabine/*`, relative to the current
# directory) into `outdir`, then return to the project directory.
# Failures raise instead of being ignored, so "Finished" is only printed when
# the results were actually moved; the working directory is restored either way.
try:
    os.makedirs(outdir, exist_ok=True)
    result_paths = sorted(glob.glob('./out/Decitabine/*'))
    if not result_paths:
        raise FileNotFoundError(
            f'no REMBRANDTS results found in {os.getcwd()}/out/Decitabine'
        )
    # argument list instead of a shell string: safe for paths containing spaces
    subprocess.run(['mv', '-v', *result_paths, outdir], check=True)
finally:
    os.chdir(workdir)

print('\n********************************** Finished **********************************\n')


********************************** Finished **********************************



In [17]:
# Load `stability.filtered.mx.txt` from the results folder; the first column becomes the row index
out = pd.read_table(f'{outdir}/stability.filtered.mx.txt', index_col=0)

In [23]:
# Pair each output column with the sample name at the same position, for a visual check
list(zip(out.columns, samples))

[('hl60_120h_u_1.x', 'hl60_120h_u_1'),
 ('hl60_120h_u_2.x', 'hl60_120h_u_2'),
 ('hl60_120h_t_1.x', 'hl60_120h_t_1'),
 ('hl60_120h_t_2.x', 'hl60_120h_t_2'),
 ('hl60_6h_u_1.x', 'hl60_6h_u_1'),
 ('hl60_6h_u_2.x', 'hl60_6h_u_2'),
 ('hl60_6h_t_1.x', 'hl60_6h_t_1'),
 ('hl60_6h_t_2.x', 'hl60_6h_t_2'),
 ('hl60_72h_u_1.x', 'hl60_72h_u_1'),
 ('hl60_72h_u_2.x', 'hl60_72h_u_2'),
 ('hl60_72h_t_1.x', 'hl60_72h_t_1'),
 ('hl60_72h_t_2.x', 'hl60_72h_t_2'),
 ('kg1_t_1.x', 'kg1_t_1'),
 ('kg1_t_2.x', 'kg1_t_2'),
 ('kg1_t_3.x', 'kg1_t_3'),
 ('kg1_u_1.x', 'kg1_u_1'),
 ('kg1_u_2.x', 'kg1_u_2'),
 ('kg1_u_3.x', 'kg1_u_3'),
 ('molm14_t_1.x', 'molm14_t_1'),
 ('molm14_t_2.x', 'molm14_t_2'),
 ('molm14_t_3.x', 'molm14_t_3'),
 ('molm14_u_1.x', 'molm14_u_1'),
 ('molm14_u_2.x', 'molm14_u_2'),
 ('molm14_u_3.x', 'molm14_u_3'),
 ('ociaml2_t_1.x', 'ociaml2_t_1'),
 ('ociaml2_t_2.x', 'ociaml2_t_2'),
 ('ociaml2_t_3.x', 'ociaml2_t_3'),
 ('ociaml2_u_1.x', 'ociaml2_u_1'),
 ('ociaml2_u_2.x', 'ociaml2_u_2'),
 ('ociaml2_u_3.x', 'o

In [None]:
# The columns are relabelled by position, so first confirm that every output
# column really belongs to the sample it is about to be named after (the
# output columns shown above are the sample names plus a `.x` suffix).
if len(out.columns) != len(samples):
    raise ValueError(
        f'{len(out.columns)} output columns but {len(samples)} samples'
    )
mismatched = [
    (col, sample) for col, sample in zip(out.columns, samples)
    if not str(col).startswith(sample)
]
if mismatched:
    raise ValueError(f'column/sample order mismatch, e.g. {mismatched[:5]}')

out.columns = samples
# to_csv does not create missing directories
os.makedirs('stbl', exist_ok=True)
out.to_csv('stbl/REMBRANDTS.txt', sep='\t')

In [15]:
out.shape

(2776, 42)

In [16]:
!date

Wed Dec 15 00:31:52 PST 2021


All done!