# 20 vs 20 signal processing

## 2 ways to process signal
* Based on exact tag counts computed using the fragment size
    UNIQUE_BAM -> PILEUP_BED -> TAGS -> intersect with the given regions BED file and compute the intersection
* Based on bigwigs
    UNIQUE_BAM -> BIGWIG -> bigWigAverageOverBed
    
### Unique BAM -> TAGS
```
# Build the integration jar and run UniqueBamsExperiment for the Y20O20 config
./gradlew integration:shadowJar && java -cp integration/build/libs/integration-dev.jar org.jetbrains.bio.experiments.histones.UniqueBamsExperiment Y20O20

cd /mnt/stripe/bio/experiments/configs/Y20O20/unique
# Glob instead of parsing `ls`, so names with whitespace survive intact.
# Entries whose name contains "yaml" are skipped, as before.
for D in *; do
    # The -e test also drops the literal '*' left when the directory is empty
    [[ -e "$D" && "$D" != *yaml* ]] || continue
    echo "$(pwd)/$D"
    bash /mnt/stripe/washu/parallel/tags_bigwig.sh /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes 150 "$(pwd)/$D"
done
```

## Interesting LOCI
* Auto-generated `/mnt/stripe/bio/experiments/loci_of_interest/`
* All merged `/mnt/stripe/bio/raw-data/aging/loci_of_interest/`

# Prepare BigWigs
```
# Prepare data: one directory per histone modification, filled with symlinks
# to the corresponding bigWig files
for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do
    echo "$M"
    # -p keeps the snippet re-runnable when the directory already exists
    mkdir -p "$M"
    for BW in /mnt/stripe/bio/experiments/configs/Y20O20/unique_tags_bw/"$M"/*.bw; do
        # Skip the literal pattern that is left behind when nothing matches
        [[ -e "$BW" ]] || continue
        ln -s "$BW" "$M"/
    done
    # In case the input was processed separately, link it as well:
    # ls /mnt/stripe/bio/experiments/configs/Y20O20/unique_tags_bw/input/*.bw | xargs -I {} ln -s {} "$M"/
done
```

# Process signals and build PCA
```
# Prepend the washu checkout without leaving a dangling ':' when PYTHONPATH is unset
export PYTHONPATH="/mnt/stripe/washu${PYTHONPATH:+:$PYTHONPATH}"
export WASHU_PARALLELISM=16
DIR=$(pwd)
# NOTE(review): PEAKS is never assigned in this snippet; it is forwarded to
# signals_bw.sh only when the caller has set it - confirm whether it is required.

# Every first-level sub-directory of the current directory is treated as a modification
mapfile -t MODIFICATIONS < <(find . -mindepth 1 -maxdepth 1 -type d | sed 's#^\./##')
for M in "${MODIFICATIONS[@]}"; do
    echo "Processing $M"
    # Do not keep going from the wrong directory if the modification folder vanished
    cd "$DIR/$M" || continue
    mapfile -t REGIONS < <(find /mnt/stripe/bio/raw-data/aging/loci_of_interest/ -name "*.bed" | grep -vE 'repeats|other_pathway')
    for F in "${REGIONS[@]}"; do
        echo "$M regions $F"
        # Region name = file name without directory and without the .bed suffix
        N=${F##*/}
        N=${N%.bed}
        # An existing output folder means this region set was already processed
        if [[ ! -d "$DIR/$M/$N" ]]; then
            bash /mnt/stripe/washu/parallel/signals_bw.sh "$DIR/$M" "$F" "$N" /mnt/stripe/bio/genomes/hg19/hg19.chrom.sizes ${PEAKS:+"$PEAKS"}
        fi
    done
done | tee log.txt

# Create report
bash /mnt/stripe/washu/reports/signals_report.sh "$(pwd)" "$(pwd)/report.tsv"
```

# Cleanup possible errors
```
# Remove result folders that contain no PNG at all, so that the processing
# step recreates them on the next run.
DIR=$(pwd)
for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do
    # The loop below deletes folders: never run it from the wrong directory
    cd "${DIR}/$M" || continue
    echo "$M"
    mapfile -t FOLDERS < <(find . -mindepth 1 -maxdepth 1 -type d | sed 's#^\./##')
    for F in "${FOLDERS[@]}"; do
        # The ./ prefix protects against folder names starting with '-'
        PNGS=$(find "./$F" -name "*.png")
        if [[ -z "$PNGS" ]]; then
            echo "$M $F"
            rm -r -- "./$F"
        fi
    done
done
```

In [None]:
import glob
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
import os
import pandas as pd
import numpy as np
import re
from scripts.util import *

def stat_test(folder, m, test_name, test, alpha, min_pval):
    """Run a per-region OD vs YD test on every weak-consensus signal table.

    For each signal type ('raw', 'rpm', 'rpkm', 'scores', 'scores_tmm') the
    files matching ``<folder>/<m>/*/<m>*weak_consensus*<signal_type>.tsv`` are
    loaded, rows whose ``chr`` is not of the form ``chr[0-9XYM]+`` are dropped,
    and ``test`` is applied to every remaining row. P-values are adjusted with
    ``multipletests(..., "fdr_bh")``; when at least one hypothesis is rejected
    the rejected rows are written next to the input as ``*_<test_name>.tsv``.

    :param folder: root folder that contains one sub-folder per modification
    :param m: modification name; used as sub-folder and as file-name prefix
    :param test_name: suffix for the output file name
    :param test: callable ``test(od_values, yd_values) -> p-value``; receives
        two numeric row Series (columns selected by ``is_od`` / ``is_yd``)
    :param alpha: level passed to ``multipletests``
    :param min_pval: raw p-value threshold for the rows shown in the log
    :return: None; results are printed and written to disk
    """
    folder = os.path.join(folder, m)
    report_columns = ['chr', 'start', 'end', 'yd_mean', 'od_mean', 'logfc', 'pval']
    for signal_type in ['raw', 'rpm', 'rpkm', 'scores', 'scores_tmm']:
        for f in glob.glob('{}/*/{}*weak_consensus*{}.tsv'.format(folder, m, signal_type)):
            # Ignore ODS vs YDS here
            if 'DS' in f:
                continue
            print('Processing', m, '@', os.path.basename(f))
            df = pd.read_csv(f, sep='\t')
            # Drop contigs; the index is rebuilt so that labels and positions
            # agree for everything that follows
            is_chromosome = [bool(re.match(r'chr[0-9XYM]+$', c)) for c in df['chr']]
            df = df.loc[is_chromosome].reset_index(drop=True)
            ods = [c for c in df.columns.values if is_od(c)]
            yds = [c for c in df.columns.values if is_yd(c)]
            # Iterate over the numeric sub-frames: rows taken from the whole
            # (mixed-dtype) frame would reach the test as object dtype
            pvals = np.array([test(od_row, yd_row)
                              for (_, od_row), (_, yd_row)
                              in zip(df[ods].iterrows(), df[yds].iterrows())])
            res = multipletests(pvals, alpha, "fdr_bh")
            h0_rejects = res[0]
            pvals_adj = res[1]
            rejected = int(np.sum(h0_rejects))
            df['pval'] = pvals
            df['pval_adj'] = pvals_adj
            df['od_mean'] = df[ods].mean(axis=1)
            df['yd_mean'] = df[yds].mean(axis=1)
            df['logfc'] = np.log(df['od_mean'] / df['yd_mean'])
            # Rows whose raw p-value passes the threshold
            dfp = df.loc[df['pval'] < min_pval]
            print("Locations: {}; FDR={}: {}; pvals < {}: {}".format(
                    len(df), alpha, rejected, min_pval, len(dfp)))
            # Display top 5 smallest pvalues
            if len(dfp) > 0:
                print(dfp.sort_values('pval').head(5)[report_columns])
            # Save result to file
            if rejected > 0:
                # f always ends with '.tsv' because of the glob pattern above
                testf = '{}_{}.tsv'.format(f[:-len('.tsv')], test_name)
                df.loc[h0_rejects][report_columns + ['pval_adj']]\
                    .to_csv(testf, sep='\t', index=False, header=True)
                print('Saved {} regions to {}'.format(rejected, testf))


# Mann-Whitney U test

In [None]:
def mann_whitney(x, y):
    """Return the Mann-Whitney U test p-value for samples ``x`` and ``y``.

    When ``mannwhitneyu`` raises ``ValueError`` the comparison is reported
    as non-significant, i.e. 1.0 is returned.
    """
    try:
        outcome = mannwhitneyu(x, y)
    except ValueError:
        return 1.0
    return outcome.pvalue

# Apply the Mann-Whitney test to each histone modification in turn
# (alpha = 0.05, raw p-value display threshold = 1e-4)
for m in ('H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3'):
    stat_test(
        '/mnt/stripe/bio/experiments/signal_loci_of_interest_auto',
        m,
        'mann_whitney',
        mann_whitney,
        0.05,
        1e-4,
    )

# T-Test

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded.
import scipy.stats


def ttest(x, y):
    """Return the p-value of ``scipy.stats.ttest_ind`` for samples ``x`` and ``y``.

    When ``ttest_ind`` raises ``ValueError`` the comparison is reported as
    non-significant, i.e. 1.0 is returned.
    """
    try:
        return scipy.stats.ttest_ind(x, y).pvalue
    except ValueError:
        return 1.0

# Apply the t-test to each histone modification in turn
# (alpha = 0.05, raw p-value display threshold = 1e-4)
for m in ('H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3'):
    stat_test(
        '/mnt/stripe/bio/experiments/signal_loci_of_interest_auto',
        m,
        'ttest',
        ttest,
        0.05,
        1e-4,
    )