# CRIES
### Counting Reads for Intronic and Exonic Segments
https://github.com/csglab/CRIES


**Step 1. Creating GTF annotation files**

Intron and exon annotations for `gencode.v34` were generated in the notebook `build-genome.ipynb`.

**Step 2. Mapping reads**

`STAR` 

**Step 3. Counting reads that map to intronic or exonic segments of each gene**

`featureCounts`
http://bioinf.wehi.edu.au/featureCounts/

In [2]:
!cat ~/GitHub/Abe/my_scripts/intron_exon_counts.sh

#!/usr/bin/env bash
# Loop featureCounts over every BAM file, producing intron and exon counts.
#
# Usage: intron_exon_counts.sh PDIR bamDIR countDIR logDIR JOBS
#   PDIR      directory to cd into before anything else (relative paths below
#             are resolved from here)
#   bamDIR    directory holding the *.bam files
#   countDIR  directory receiving <sample>_introns.txt / <sample>_exons.txt
#   logDIR    directory receiving the featureCounts logs
#   JOBS      value passed to featureCounts -T

if [ "$#" -lt 5 ]; then
    echo "usage: $0 PDIR bamDIR countDIR logDIR JOBS" >&2
    exit 1
fi

PDIR=$1
bamDIR=$2
countDIR=$3
logDIR=$4
JOBS=$5

# Stop here rather than writing counts/logs into whatever directory we happen to be in.
cd "$PDIR" || exit 1

GTF_index='/rumi/shams/genomes/hg38/hg38_ensemble_'

mkdir -p "$countDIR"
mkdir -p "$logDIR"

for f in "${bamDIR}"/*.bam; do
    # When nothing matches, the pattern is left as a literal string; skip it.
    [ -e "$f" ] || continue
    base=$(basename "$f")
    # Strip only the trailing .bam extension.
    sample=${base%.bam}
    echo -e '----------------------- ' "$sample"  ' -----------------------'
    echo $(date)
    featureCounts -M -T "$JOBS" -t intron -g gene_id -a "${GTF_index}introns.gtf" -o "${countDIR}/${sample}_introns.txt" "$f" &> "${logDIR}/${sample}_introns.log"
    featureCounts -M -T "$JOBS" -t exon -g gene_id -a "${GTF_index}consExons.gtf" -o "${countDIR}/${sample}_exons.txt" "$f" &> "${logDIR}/${sample}_exons.log"
    echo $(date) Done!
done


Note that `$countDIR` is `./stbl-counts`

### Run MultiQC

**Step 4. Normalization** - REMBRANDTS

#### REMoving Bias from Rna-seq ANalysis of Differential Transcript Stability
[REMBRANDTS](https://github.com/csglab/REMBRANDTS) is a package for analysis of RNA-seq data across multiple samples in order to obtain unbiased estimates of differential mRNA stability. It uses DESeq to obtain estimates of differential pre-mRNA and mature mRNA abundance across samples, and then estimates a gene-specific bias function that is then subtracted from Δexon–Δintron to provide unbiased differential mRNA stability measures.



In [2]:
import os
import subprocess
import glob
import warnings
import pandas as pd
import numpy as np

from rpy2.rinterface import RRuntimeWarning
# Blanket filter: every Python warning raised after this point is suppressed.
warnings.filterwarnings("ignore")
# Explicit filter for the rpy2 warning category (already covered by the
# blanket filter above; kept so the intent stays visible).
warnings.filterwarnings("ignore", category=RRuntimeWarning)

Prepare inputs:

In [29]:
def make_interon_exon_df(stbldir,samples):
    """Build gene-by-sample exon and intron count tables.

    Every ``*_exons.txt`` and ``*_introns.txt`` file in ``stbldir`` is read
    (first line skipped, ``Geneid`` column used as the row key, last column
    used as the counts). Counts are aligned on gene id, so a file whose rows
    are ordered differently from the others is still placed correctly.

    Parameters
    ----------
    stbldir : str
        Directory containing ``<sample>_exons.txt`` and
        ``<sample>_introns.txt`` count files.
    samples : list of str
        Sample names; selects and orders the columns of both tables.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_exon, df_intron)`` restricted to gene ids present in both
        tables. Both share the same row order (that of the alphabetically
        first exon file) and have their columns ordered as ``samples``.

    Raises
    ------
    FileNotFoundError
        If no exon or no intron count file is found in ``stbldir``.
    ValueError
        If count files of the same read type list different gene ids.
    KeyError
        If a name in ``samples`` has no matching count file.
    """
    def _load_counts(suffix):
        # One table per read type: rows are gene ids, columns are sample names.
        # Sorted so the reference file does not depend on directory listing order.
        paths = sorted(glob.glob(f'{stbldir}/*{suffix}'))
        if not paths:
            raise FileNotFoundError(f'no *{suffix} count files found in {stbldir}')

        reference = None
        columns = {}
        for path in paths:
            counts = pd.read_table(path, skiprows=1, index_col='Geneid').iloc[:, -1]
            if reference is None:
                reference = counts.index
            elif not counts.index.equals(reference):
                if set(counts.index) != set(reference):
                    raise ValueError(
                        f'{path} does not list the same gene ids as {paths[0]}'
                    )
                # Same genes, different row order: align on gene id.
                counts = counts.reindex(reference)
            sample_name = os.path.basename(path)[:-len(suffix)]
            columns[sample_name] = counts
        # Drop the 'Geneid' axis label so the result has an unnamed index.
        return pd.DataFrame(columns, index=reference).rename_axis(None)

    # reorder into the same column order as the meta table rows
    df_exon = _load_counts('_exons.txt')[samples]
    df_intron = _load_counts('_introns.txt')[samples]

    # keep genes present in both tables, in exon-table order (deterministic)
    shared = df_exon.index[df_exon.index.isin(df_intron.index)]
    return df_exon.loc[shared], df_intron.loc[shared]


def write_inputs(stbldir, df_exon, df_intron):
    """Write per-sample count files and the REMBRANDTS input table.

    For every column of ``df_exon`` two tab-separated, header-less files are
    written to ``stbldir``: ``<sample>_exons.ol.txt`` and
    ``<sample>_introns.ol.txt`` (row label, then count). A table
    ``input_table.txt`` with columns ``Label``, ``File``, ``ReadType`` and
    ``Batch`` then lists those files, exonic row first, for each sample.

    Parameters
    ----------
    stbldir : str
        Existing directory that receives all output files.
    df_exon, df_intron : pandas.DataFrame
        Gene-by-sample count tables. The samples are taken from the columns
        of ``df_exon``; ``df_intron`` must contain the same column names.

    Raises
    ------
    KeyError
        If a column of ``df_exon`` is missing from ``df_intron``.
    """
    # Take the samples from the tables themselves, not from a notebook-level
    # global, so the function works wherever it is called from.
    sample_names = df_exon.columns.tolist()

    # write input counts
    meta_rows = []
    for name in sample_names:
        exon_file = f'{name}_exons.ol.txt'
        intron_file = f'{name}_introns.ol.txt'
        df_exon.loc[:, name].to_csv(f'{stbldir}/{exon_file}', sep='\t', header=False)
        df_intron.loc[:, name].to_csv(f'{stbldir}/{intron_file}', sep='\t', header=False)
        meta_rows.append((name, exon_file, 'exonic'))
        meta_rows.append((name, intron_file, 'intronic'))

    # make meta data for running REMBRANDTS (all samples in batch 1)
    meta = pd.DataFrame(meta_rows, columns=['Label', 'File', 'ReadType'])
    meta['Batch'] = 1
    meta.to_csv(f'{stbldir}/input_table.txt', index=False, sep='\t')

In [8]:
# Sample names are the BAM file names, minus the `.bam` extension, found in any
# `*-bam/` directory. They are sorted because glob returns files in
# filesystem-dependent order and this list fixes the column order downstream.
samples = sorted(os.path.basename(bam)[:-len('.bam')] for bam in glob.glob('*-bam/*.bam'))

In [7]:
# Build the overlapping exon/intron count tables, then write the per-sample
# count files and the REMBRANDTS input table into the same directory.
df_exon, df_intron = make_interon_exon_df(stbldir='stbl-counts', samples=samples)
write_inputs(stbldir='stbl-counts', df_exon=df_exon, df_intron=df_intron)

Run REMBRANDTS

In [4]:
# Absolute paths, anchored at the current working directory, for the count
# inputs and the REMBRANDTS results.
workdir = os.getcwd()
countdir = '/'.join((workdir, 'stbl-counts'))
outdir = '/'.join((workdir, 'stbl-output'))

In [43]:
# Run REMBRANDTS
# The script is launched with its own directory as the child's working
# directory, so the kernel's cwd is never changed and cannot be left pointing
# at the wrong place if the run fails. check=True stops the cell on a non-zero
# exit instead of collecting whatever is left in the output directory.
rembrandts_dir = '/rumi/shams/abe/Workflows/REMBRANDTS/'
subprocess.run(
    ['bash', 'REMBRANDTS.sh', 'Decitabine',
     f'{countdir}/input_table.txt', f'{countdir}/', '0.99', 'linear'],
    cwd=rembrandts_dir,
    check=True,
)

# Collect the results. Files from a previous run are not removed first;
# delete `outdir` manually if a clean result directory is needed.
os.makedirs(outdir, exist_ok=True)
results = sorted(glob.glob(os.path.join(rembrandts_dir, 'out', 'Decitabine', '*')))
if not results:
    raise FileNotFoundError(f'REMBRANDTS produced no files under {rembrandts_dir}out/Decitabine')
subprocess.run(['mv', '-v', *results, outdir], check=True)

print('\n********************************** Finished **********************************\n')


********************************** Finished **********************************



In [11]:
# List the working directory (the original statement was cut off mid-string).
subprocess.call('ls', shell=True)

cell-line-consistency.ipynb   hl60-multiqc.html    REMBRANDTS.txt
checksum-fastq.txt            hl60-stbl/           run-QCs.ipynb
counts-exons-multiqc_data/    hl60-tinat/          salmon.sh
counts-exons-multiqc.html     other-bam/           salmon_tinat.sh
counts-introns-multiqc_data/  other-exp/           scallop-genome/
counts-introns-multiqc.html   other-fastq/         stbl-counts/
hl60-bam/                     other-logs/          stbl-output/
hl60-exp/                     other-multiqc_data/  stbl-run_REMBRANDTS.ipynb
hl60-fastq/                   other-multiqc.html   tinat.sh
hl60-logs/                    other-stbl/
hl60-multiqc_data/            other-tinat/


In [9]:
# Load the filtered stability matrix written by REMBRANDTS; the first column
# of the file becomes the row index.
out = pd.read_csv(f'{outdir}/stability.filtered.mx.txt', sep='\t', index_col=0)
# NOTE(review): columns are relabelled by position, which assumes the matrix
# columns come back in the same order as `samples` — confirm.
out.columns = samples
out.to_csv('REMBRANDTS.txt', sep='\t')

In [10]:
# (rows, columns) of the stability matrix; the columns are the samples.
out.shape

(2776, 42)

All done!