# 20 vs 20 signal processing

## 2 ways to process signal
* Based on exact tags count using fragment size
    UNIQUE_BAM -> PILEUP_BED -> TAGS -> intersect with given regions bed and compute intersection
* Based on bigwigs
    UNIQUE_BAM -> BIGWIG -> bigWigAverageOverBed
    
### Unique BAM -> TAGS
```
./gradlew integration:shadowJar && java -cp integration/build/libs/integration-dev.jar org.jetbrains.bio.experiments.histones.UniqueBamsExperiment Y20O20

cd /mnt/stripe/bio/experiments/configs/Y20O20/unique
for D in $(ls . | grep -v yaml); do 
    echo $(pwd)/$D; 
    bash /mnt/stripe/washu/parallel/tags_bigwig.sh /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes 150 $(pwd)/$D; 
done
```

## Interesting LOCI
* Manual generated `/mnt/stripe/bio/raw-data/aging/loci_of_interest/`
* Auto generated `/mnt/stripe/bio/experiments/loci_of_interest/`
* Consensus peaks `/mnt/stripe/bio/raw-data/aging/loci_of_interest/weak_consensus/`

# Prepare BigWigs
```
# Prepare data
for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do 
    echo $M; 
    mkdir $M; 
    ls /mnt/stripe/bio/experiments/configs/Y20O20/unique_tags_bw/$M/*.bw | xargs -I {} ln -s {} $M/; 
    # In case we have input separated processed
    # ls /mnt/stripe/bio/experiments/configs/Y20O20/unique_tags_bw/input/*.bw xargs -I {} ln -s {} $M/;
done
```

# Process signals and build PCA
```
export PYTHONPATH="/mnt/stripe/washu:$PYTHONPATH"
DIR=/mnt/stripe/bio/experiments/signal
cd $DIR
DIR=$(pwd)
export WASHU_PARALLELISM=16
for M in $(find . -maxdepth 1  -type d | grep '/' | sed 's#./##g'); do
    echo "Processing $M"; 
    cd $DIR/$M
#    PEAKS=$(find /mnt/stripe/bio/experiments/aging/peak_calling/ -name "${M}_golden_weak_consensus.bed")
#    echo "Peaks: $PEAKS";
    for F in $(find /mnt/stripe/bio/raw-data/aging/loci_of_interest/ -name "*.bed" | grep -v -E "pathway|repeat"); do 
        echo "Processing regions $F"; 
        N=${F%%.bed}; 
        N=${N##*/}; 
        if [ ! -d $DIR/$M/$N ]; then
            time bash /mnt/stripe/washu/parallel/signals_bw.sh $DIR/$M $F $N /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes $PEAKS;
        fi;
    done;
done | tee log.txt

# Create report
bash ~/work/washu/reports/signals_report.sh ${DIR} ${DIR}/report.tsv
```


# Tests over consensus peaks to find differential regions

* Process signal based on consensus peaks 
* Launch statistics tests on 

## Compute signal over weak consensus
```
# Compute signal on weak consensus
cd /mnt/stripe/bio/experiments/signal_weak_consensus_peaks

export PYTHONPATH="/mnt/stripe/washu:$PYTHONPATH"
DIR=$(pwd)
export WASHU_PARALLELISM=20
for M in $(find . -maxdepth 1  -type d | grep '/' | sed 's#./##g'); do
    echo "Processing $M"; 
    cd $DIR/$M
    for F in $(find /mnt/stripe/bio/raw-data/aging/loci_of_interest/weak_consensus/ -name "*$M*.bed" | grep -vE '[YO]D'); do 
        echo "Processing $M regions $F"; 
        N=${F%%.bed}; 
        N=${N##*/}; 
        if [ ! -d $DIR/$M/$N ]; then
            time bash /mnt/stripe/washu/parallel/signals_bw.sh $DIR/$M $F $N /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes;
        fi;
    done;
done | tee log.txt

# Create report
bash ~/work/washu/reports/signals_report.sh ${DIR} ${DIR}/report.tsv
```

In [1]:
%matplotlib inline
import glob
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from scripts.util import *

def test_pvals(x, y):
    try:
        # Mann-Whitney U test     
        return mannwhitneyu(x, y).pvalue
    except ValueError:
        return 1.0

def stat_test(folder, m, alpha):
    folder = os.path.join(folder, m)
    for signal_type in ['raw', 'rpm', 'rpkm', 'scores', 'scores_tmm']:
        for f in glob.glob('{}/*/{}*{}.tsv'.format(folder, m, signal_type)):
            print('Processing', m, re.sub('[^/]*/', '', f))
            df = pd.read_csv(f, sep='\t')
            ods = [c for c in df.columns.values if is_od(c)]
            yds = [c for c in df.columns.values if is_yd(c)]
            pvals = np.array([test_pvals(row[ods], row[yds]) for _,row in df.iterrows()])
            res = multipletests(pvals, alpha, "fdr_bh")
            h0_rejects = res[0]
            pvals_adj = res[1]
            print("FDR={}: {}".format(alpha, sum(h0_rejects)))
            if sum(h0_rejects) > 0:
                plt.figure(figsize=(10, 5))
                ax = plt.subplot(1,2,1)
                ax.hist(pvals, bins=30)
                ax.set_title("{} P-values (no correction)".format(f))
                ax = plt.subplot(1,2,2)
                ax.hist(pvals_adj, bins=30)
                ax.set_title("{} P-values (adjusted)".format(f))
                plt.show()


# Actual diff

In [5]:
for m in ['H3K27ac', 'H3K27me3', 'H3K4me1', 'H3K4me3', 'H3K36me3']:
    stat_test('/mnt/stripe/bio/experiments/signal_weak_consensus_peaks', m, 0.05)

Processing H3K27ac H3K27ac_zinbra_weak_consensus_raw.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_macs2_weak_consensus_raw.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_zinbra_weak_consensus_rpm.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_macs2_weak_consensus_rpm.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_zinbra_weak_consensus_rpkm.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_macs2_weak_consensus_rpkm.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_zinbra_weak_consensus_scores.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_macs2_weak_consensus_scores.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_zinbra_weak_consensus_scores_tmm.tsv
FDR=0.05: 0
Processing H3K27ac H3K27ac_macs2_weak_consensus_scores_tmm.tsv
FDR=0.05: 0
Processing H3K27me3 H3K27me3_zinbra_weak_consensus_raw.tsv
FDR=0.05: 0
Processing H3K27me3 H3K27me3_macs2_weak_consensus_raw.tsv
FDR=0.05: 0
Processing H3K27me3 H3K27me3_sicer_weak_consensus_raw.tsv
FDR=0.05: 0
Processing H3K27me3 H3K27me3_zinbra_weak_consensus_rpm.tsv
FDR=0.05: 0
Processing 

# K27ac testing with logFC

In [73]:
import re
# Let's investigate one case for H3K27ac manually.
df = pd.read_csv('/mnt/stripe/bio/experiments/signal_weak_consensus_peaks/\
H3K27ac/H3K27ac_macs2_weak_consensus/H3K27ac_macs2_weak_consensus_scores.tsv', sep='\t')
# Drop contigs
df = df.loc[[bool(re.match('chr[0-9XYM]+$', c)) for c in df['chr']]]

ods = [c for c in df.columns.values if is_od(c)]
yds = [c for c in df.columns.values if is_yd(c)]
pvals = np.array([test_pvals(row[ods], row[yds]) for _,row in df.iterrows()])
alpha=0.05
res = multipletests(pvals, alpha, "fdr_bh")
h0_rejects = res[0]
pvals_adj = res[1]
print("FDR={}: {}".format(alpha, sum(h0_rejects)))

# Top 10 smallest P-value

In [87]:
df['pval']=pvals
df['pval_adj']=pvals_adj
df['od_mean']=df[ods].mean(axis=1).to_frame('od_mean')['od_mean']
df['yd_mean']=df[yds].mean(axis=1).to_frame('yd_mean')['yd_mean']
df['logfc']=np.log(df['od_mean'] / df['yd_mean'])
COLUMNS=['chr', 'start', 'end', 'yd_mean', 'od_mean', 'logfc', 'pval', 'pval_adj']
print(df.loc[pvals.argsort()[:10]][COLUMNS])

         chr      start        end     yd_mean  od_mean     logfc      pval  \
811     chr1   32644179   32644211    1.000000     1.70  0.530628  0.000056   
2819    chr1  167442010  167442163    2.833333     1.30 -0.779090  0.000324   
549     chr1   26597462   26597542    2.166667     1.00 -0.773190  0.000552   
10783  chr13   50726553   50726729    6.444444    12.40  0.654478  0.000775   
35469   chr7   94138962   94139931   59.666667    48.80 -0.201043  0.319940   
2762    chr1  161932110  161932474    6.388889    12.65  0.683097  0.001804   
38317   chr9   79145315   79145978   15.722222    19.30  0.205030  0.040681   
3616    chr1  224010975  224011129    3.500000     6.00  0.538997  0.002109   
9329   chr12   89661377   89661684    5.333333    10.05  0.633596  0.002195   
26612   chr3   46036307   46038317  219.388889   231.30  0.052870  0.284267   

       pval_adj  
811    0.501615  
2819   0.501615  
549    0.501615  
10783  0.501615  
35469  0.501615  
2762   0.501615  
3831

# Top 10 largest absolute log FC
**NOTE**: number 1 is the only DiffBind result for K27ac!

In [88]:
print(df.loc[np.abs(df['logfc']).argsort()[::-1][:10]][COLUMNS])

         chr      start        end     yd_mean  od_mean     logfc      pval  \
8073   chr12    6240074    6241307    1.833333    49.65  3.298863  0.018814   
7398   chr11   86306575   86307063   12.166667     1.30 -2.236336  0.051997   
7616   chr11  115127019  115127515   13.444444     2.55 -1.662473  0.077028   
8290   chr12   10707143   10708041   24.611111     5.85 -1.436756  0.064534   
39304   chr9  134500615  134500631    1.222222     1.10 -0.105361  0.159899   
22143   chr2  153031410  153033105  325.333333   344.95  0.058549  0.294305   
8280   chr12   10460552   10460838    6.222222     1.65 -1.327352  0.437663   
39438   chr9  139538135  139540152  203.055556   218.85  0.074907  0.309566   
14020  chr16    1381973    1382229    6.500000     1.80 -1.284016  0.016030   
7358   chr11   85461390   85461698    8.222222     2.30 -1.273931  0.339140   

       pval_adj  
8073   0.501615  
7398   0.501615  
7616   0.501615  
8290   0.501615  
39304  0.501615  
22143  0.501615  
8280