# 20 vs 20 signal processing

### There are 2 ways to process the signal
* Based on exact tags count using fragment size
    UNIQUE_BAM -> PILEUP_BED -> TAGS -> intersect with given regions bed and compute intersection
* Based on bigwigs
    UNIQUE_BAM -> BIGWIG -> bigWigAverageOverBed

### Interesting LOCI
/mnt/stripe/bio/raw-data/aging/loci_of_interest/

# Prepare BigWigs
```
# Directory that holds all the prepared bigWig tracks.
DATA_BROWSER=/mnt/stripe/bio/experiments/configs/Y20O20/browsers/data_browser

# link_tracks DEST FILE...
# Symlink every existing FILE into the directory DEST.
# Arguments that do not exist (e.g. a glob pattern that matched nothing and
# was passed through literally) are skipped, so no dangling links are created.
link_tracks() {
    local dest=$1 f
    shift
    for f in "$@"; do
        [ -e "$f" ] || continue
        ln -s -- "$f" "$dest/"
    done
}

# Histone modifications: the mark's own tracks plus the input tracks.
for mark in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do
    mkdir -p "$mark"
    # Track file names use the lower-case form without "H3", e.g. H3K27ac -> k27ac.
    link_tracks "$mark" "$DATA_BROWSER"/*"k${mark#H3K}"*_unique.bw
    link_tracks "$mark" "$DATA_BROWSER"/*input*_unique.bw
done

mkdir -p meth
link_tracks meth "$DATA_BROWSER"/*meth*.bw
# Remove outliers (this only deletes the symlinks created above, not the data)
rm -f -- meth/*od5* meth/*od17* meth/*yd9*

mkdir -p rnaseq
link_tracks rnaseq "$DATA_BROWSER"/*transcription*.bw
mkdir -p mirna
link_tracks mirna "$DATA_BROWSER"/*mirna*.bw
```

# Process signals and build PCA/graphics/diffbind scores
```
# Prepend the washu checkout to PYTHONPATH for the tools launched below.
export PYTHONPATH="/mnt/stripe/washu:$PYTHONPATH"
DIR=/mnt/stripe/bio/experiments/signal
# Presumably read by the washu scripts; it is not used directly in this snippet.
export WASHU_PARALLELISM=16
# Only walk the sub-directories when we really are inside $DIR; otherwise the
# loop would process whatever directory the shell happens to be in.
if cd "$DIR"; then
    DIR=$(pwd)
    # One sub-directory per data type prepared in the previous step.
    for M in $(find . -mindepth 1 -maxdepth 1 -type d | sed 's#^\./##'); do
        echo "Processing $M"
        cd "$DIR/$M" || continue
        # Take the first match only, so several hits cannot expand into extra
        # arguments of signals_bw.sh. May be empty when no consensus exists.
        PEAKS=$(find /mnt/stripe/bio/experiments/aging/peak_calling/ -name "${M}_golden_weak_consensus.bed" | head -1)
        echo "Peaks: $PEAKS"
        for F in $(find /mnt/stripe/bio/raw-data/aging/loci_of_interest/ -name "*.bed" | grep -v -E "pathway|repeat"); do
            echo "Processing regions $F"
            # Region set name = file name without directory and .bed suffix.
            N=${F%%.bed}
            N=${N##*/}
            # Skip region sets that already have an output directory.
            if [ ! -d "$DIR/$M/$N" ]; then
                # The peaks argument is passed only when a peaks file was found.
                time bash /mnt/stripe/washu/parallel/signals_bw.sh "$DIR/$M" "$F" "$N" /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes ${PEAKS:+"$PEAKS"}
            fi
        done
    done | tee log.txt
else
    echo "Cannot cd to $DIR" >&2
fi
```


### Create summary fit error table
```
# fit_error_summary [ROOT] [OUT]
# Collect every *_fit_error.csv found below ROOT (default: ".", give it without
# a trailing slash) into the tab-separated table OUT (default: "result.tsv").
# Each row holds: first path component below ROOT, csv file name, the csv
# values, and the minimum of the four error values (columns 3-6).
# Prints the modification and file name of every csv as progress output.
fit_error_summary() {
    local root=${1:-.} out=${2:-result.tsv}
    local T=$'\t' tmp F N M R L
    tmp="$out.tmp"
    # Start from an empty scratch file: rows left behind by an interrupted run
    # must not be duplicated, and awk needs an input even when no csv is found.
    : > "$tmp"
    while IFS= read -r F; do
        N=${F#"$root"/}
        M=${N%%/*}
        R=${N##*/}
        echo "$M"; echo "$R"
        L=$(tr ',' '\t' < "$F")
        echo "$M$T$R$T$L" >> "$tmp"
    done < <(find "$root" -name "*_fit_error.csv")
    echo "modification${T}file${T}e${T}e_scaled${T}e_log${T}e_scaled_log${T}e_min" > "$out"
    awk -v OFS='\t' '{min=$3; for(j=4;j<=6;j++){min=($j<min)?$j:min}; print($1,$2,$3,$4,$5,$6,min)}' "$tmp" >> "$out"
    # Cleanup
    rm -- "$tmp"
}

fit_error_summary
```

# Tests over weak consensus to find differential regions

* Prepare all the bw files beforehand
* Process signal based on golden standard weak consensus peaks 
* Launch statistical tests on the computed signal

## Compute signal
```
# Prepend the washu checkout to PYTHONPATH for the tools launched below.
export PYTHONPATH="/mnt/stripe/washu:$PYTHONPATH"
DIR=/mnt/stripe/bio/experiments/signal_diff
# Only walk the sub-directories when we really are inside $DIR; otherwise the
# loop would process whatever directory the shell happens to be in.
if cd "$DIR"; then
    DIR=$(pwd)
    for M in $(find . -mindepth 1 -maxdepth 1 -type d | sed 's#^\./##'); do
        echo "Processing $M"
        cd "$DIR/$M" || continue
        PEAKS=$(find /mnt/stripe/bio/experiments/aging/peak_calling/ -name "${M}_golden_weak_consensus.bed" | head -1)
        # The consensus peaks serve as both the regions and the peaks argument;
        # without them the positional arguments of signals_bw.sh would shift.
        if [ -z "$PEAKS" ]; then
            echo "Skipping $M: ${M}_golden_weak_consensus.bed not found"
            continue
        fi
        # Output name = peaks file name without directory and .bed suffix.
        N=${PEAKS%%.bed}
        N=${N##*/}
        bash /mnt/stripe/washu/parallel/signals_bw.sh "$DIR/$M" "$PEAKS" "$N" /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes "$PEAKS"
    done | tee log.txt
else
    echo "Cannot cd to $DIR" >&2
fi
```

# Stat tests

In [3]:
%matplotlib inline
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scripts.util import *

def test(X,Y):
        try:
            # Use only when the number of observation in each sample is > 20 
            # and you have 2 independent samples of ranks. Mann-Whitney U is
            # significant if the u-obtained is LESS THAN or equal to the critical value of U
            return mannwhitneyu(X, Y).pvalue 
        except:
            return 1.0
def stat_test(folder, id, alpha):
    """Test every row of the signal tables of `id` for an od/yd difference.

    For each signal type the table `<folder>/<id>/<id>_<signal_type>.tsv` is
    read, a p-value is computed per row with `test()` between the columns
    accepted by `is_od` and those accepted by `is_yd`, and the p-values are
    adjusted with `multipletests(..., "fdr_bh")` at level `alpha`.
    The number of rejected hypotheses is printed; when there is at least one,
    histograms of the raw and the adjusted p-values are shown side by side.
    A missing table is reported and skipped.
    """
    for signal_type in ('raw', 'rpm', 'rpkm', 'rpm_peaks', 'scores'):
        f = os.path.join(folder, '{0}/{0}_{1}.tsv'.format(id, signal_type))
        print('Processing:', f)
        try:
            table = pd.read_csv(f, sep='\t')
            column_names = table.columns.values
            od_columns = [name for name in column_names if is_od(name)]
            yd_columns = [name for name in column_names if is_yd(name)]

            raw_pvalues = np.array([test(record[od_columns], record[yd_columns])
                                    for _, record in table.iterrows()])
            # multipletests returns (reject flags, corrected p-values, ...).
            rejected, adjusted_pvalues = multipletests(raw_pvalues, alpha, "fdr_bh")[:2]

            rejected_count = sum(rejected)
            print("FDR={}: {}".format(alpha, rejected_count))
            if rejected_count > 0:
                plt.figure(figsize=(10, 5))
                panels = (
                    (raw_pvalues, "[{}] P-values (no correction)"),
                    (adjusted_pvalues, "[{}] P-values (adjusted)"),
                )
                for position, (values, title) in enumerate(panels, start=1):
                    ax = plt.subplot(1, 2, position)
                    ax.hist(values, bins=30)
                    ax.set_title(title.format(signal_type))
                plt.show()

        except FileNotFoundError:
            print('File not found:', f)

In [5]:
# Run the tests for each histone modification over its golden weak consensus
# peaks at level 0.05.
modifications = ['H3K27ac', 'H3K27me3', 'H3K4me1', 'H3K4me3', 'H3K36me3']
for m in modifications:
    signal_folder = '/mnt/stripe/bio/experiments/signal_diff/{}'.format(m)
    consensus_id = '{}_golden_weak_consensus'.format(m)
    stat_test(signal_folder, consensus_id, 0.05)

Processing: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_raw.tsv
File not found: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_raw.tsv
Processing: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_rpm.tsv
File not found: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_rpm.tsv
Processing: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_rpkm.tsv
File not found: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_rpkm.tsv
Processing: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_consensus/H3K27ac_golden_weak_consensus_rpm_peaks.tsv
File not found: /mnt/stripe/bio/experiments/signal_diff/H3K27ac/H3K27ac_golden_weak_