# SPAN autocorrelation analysis

Please use the `--ext` parameter to save scores.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
import glob
import os.path

import numpy as np
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure

# Configure Bokeh so that figures passed to `show(...)` in later cells are rendered in the notebook output.
output_notebook()

In [None]:
# Root folder of the analysed data; later cells look for cached coverage files
# under `cache/` and fitted model files under `fit/` inside it.
FOLDER = '/Users/Oleg.Shpynov/data/2022_GSE26320/span'
# Sample names; each one is matched against file names found in FOLDER.
NAMES = ['GSM646316_GM12878_H3K27ac_rep1', 'GSM646320_GM12878_H3K36me3_rep1', 'GSM646326_GM12878_H3K4me3_rep1',
         'GSM646318_GM12878_H3K27me3_rep1', 'GSM646322_GM12878_H3K4me1_rep1']

# Chromosome names without the 'chr' prefix: autosomes 1..22 followed by X, Y and M.
# `range` excludes its upper bound, so it has to be 23; `range(1, 22)` silently
# dropped chromosome 22 from every analysis below.
CHROMOSOMES = [str(i) for i in range(1, 23)] + ['X', 'Y', 'M']

In [None]:
# For every sample take the first cached coverage file (*.npz) whose name contains the sample name.
# A sample without any matching file makes the `[0]` lookup raise IndexError.
coverages = {
    sample: glob.glob(f'{FOLDER}/cache/*{sample}*.npz')[0]
    for sample in NAMES
}
print('Coverages', coverages)

# For every sample take the first fitted model (*.span) whose path contains '_200'
# and every underscore-separated token of the sample name. Samples without a match are left out.
span_files = glob.glob(f'{FOLDER}/fit/*.span')
models = {}
for sample in NAMES:
    tokens = sample.split('_')
    match = next(
        (path for path in span_files
         if '_200' in path and all(token in path for token in tokens)),
        None,
    )
    if match is not None:
        models[sample] = match
print('Models', models)

In [None]:
import tempfile
from scipy.stats import nbinom as nb
import os
from collections import Counter
import json


def plot_coverage(name, bin=200, n=20):
    """Plot the distribution of per-bin read counts of a sample against its fitted model.

    The coverage file registered for `name` in `coverages` is binned with a fixed
    bin width, and the histogram of non-empty bin counts (restricted to counts
    `<= n`) is drawn together with the probability mass functions of the first
    two negative binomial emission schemes stored in the model archive registered
    for `name` in `models` (drawn as "Low" and "High").

    Read counts are accumulated per chromosome: bins with the same index on
    different chromosomes are distinct bins and are never summed together.

    :param name: sample name, a key of both `coverages` and `models`.
    :param bin: bin width, in the same units as the positions stored in the coverage file.
    :param n: largest per-bin count shown; also the number of histogram bins.
    :return: the Bokeh figure (not shown; pass it to `show` or `gridplot`).
    :raises KeyError: if `name` is unknown or a chromosome array is missing from the coverage file.
    """
    # Local import keeps this cell self-contained; tarfile replaces the former `! tar -xf` shell
    # escape, which was not valid Python and broke on paths containing spaces.
    import tarfile

    coverage_path = coverages[name]
    model_path = models[name]
    print('Loading coverage', coverage_path)
    npz = np.load(coverage_path)

    # Non-empty bin counts, one array per chromosome.
    # NOTE(review): assumes stored positions are non-negative numbers, so that floor
    # division by `bin` yields the bin index - confirm against the coverage file writer.
    counts_per_chromosome = []
    for chrom in CHROMOSOMES:
        positions = np.concatenate([npz[f'chr{chrom}/+'], npz[f'chr{chrom}/-']])
        _, counts = np.unique(positions // bin, return_counts=True)
        counts_per_chromosome.append(counts)
    # Every entry is >= 1 by construction: np.unique only reports bins that contain reads.
    notnull = np.concatenate(counts_per_chromosome)
    print('Summary coverage', int(notnull.sum()))
    print('Summary notnull bins', len(notnull))
    print('Max coverage', int(notnull.max()))

    print('Loading model', model_path)
    # The model file is a tar archive; only model.json is read from it.
    with tempfile.TemporaryDirectory() as tmp:
        with tarfile.open(model_path) as archive:
            archive.extractall(tmp)
        with open(os.path.join(tmp, 'model.json')) as fm:
            model = json.load(fm)

    hist, edges = np.histogram(notnull[notnull <= n], density=True, bins=n)

    schemes = model['neg_bin_emission_schemes']
    meanLow, fsLow = schemes[0]['mean'], schemes[0]['failures']
    meanHigh, fsHigh = schemes[1]['mean'], schemes[1]['failures']

    # scipy's nbinom(r, p) has mean r * (1 - p) / p, so p = r / (mean + r)
    # reproduces the mean stored in the model.
    nbLow = nb(fsLow, fsLow / (meanLow + fsLow))
    nbHigh = nb(fsHigh, fsHigh / (meanHigh + fsHigh))

    xs = np.linspace(1, n, n)
    low = nbLow.pmf(xs)
    high = nbHigh.pmf(xs)

    p = figure(title=name + ' bin' + str(bin) + ' >0', tools='', background_fill_color="#fafafa")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)

    p.line(xs, low, line_color="orange", line_width=5, alpha=0.7, legend_label="Low")
    p.line(xs, high, line_color="green", line_width=5, alpha=0.7, legend_label="High")

    p.x_range.start = 1
    p.x_range.end = n
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'Pr(x)'
    p.grid.grid_line_color = "white"

    return p

# Preview the coverage plot for a single sample before plotting all of them.
single_coverage_figure = plot_coverage('GSM646320_GM12878_H3K36me3_rep1')
show(single_coverage_figure)

In [None]:
# Build one coverage figure per sample and show them stacked in a single column.
coverage_figures = [plot_coverage(sample) for sample in NAMES]
coverage_grid = gridplot(coverage_figures, ncols=1,
                         width=500, height=300, toolbar_location=None)
show(coverage_grid)

# Autocorrelation

In [None]:
from math import ceil
from scipy.stats import pearsonr
import matplotlib.pyplot as plt


def autocorr_full(
        name,
        bin=200,
        min_dist=1, max_dist=20,
        chr_filter=None,  # collection of names from CHROMOSOMES, e.g. ['1', 'X']
        ignore_zero=True,
        ylim=None
):
    """Plot the autocorrelation of the binned read coverage of a sample.

    The coverage file registered for `name` in `coverages` is binned per
    chromosome, the chromosomes are laid out one after another in the order of
    `CHROMOSOMES`, and the Pearson correlation between this signal and a copy of
    itself shifted by `d` bins is plotted for every `d` in
    `range(min_dist, max_dist)`.

    :param name: sample name, a key of `coverages`.
    :param bin: bin width, in the same units as the positions stored in the coverage file.
    :param min_dist: smallest shift, in bins.
    :param max_dist: upper bound of the shift, in bins (exclusive).
    :param chr_filter: if not None, only chromosomes contained in it are used.
    :param ignore_zero: drop empty bins before correlating; shifts then count
        non-empty bins rather than genomic bins.
    :param ylim: optional y-axis limits passed to `plt.ylim`.
    :raises ValueError: if no chromosome is selected.
    """
    coverage_path = coverages[name]
    print('Loading coverage', coverage_path)
    npz = np.load(coverage_path)

    # NOTE(review): assumes stored positions are non-negative numbers, so that floor
    # division by `bin` yields the bin index - confirm against the coverage file writer.
    per_chromosome = []
    for chrom in CHROMOSOMES:
        if chr_filter is not None and chrom not in chr_filter:
            continue
        positions = np.concatenate([npz[f'chr{chrom}/+'], npz[f'chr{chrom}/-']])
        bin_indices = (positions // bin).astype(np.int64)
        # bincount yields max_index + 1 bins. The former ceil(max_pos / bin) sizing was one
        # bin short whenever the largest position was an exact multiple of `bin`, so that
        # read was counted in the next chromosome (or raised IndexError on the last one).
        per_chromosome.append(np.bincount(bin_indices))
    if not per_chromosome:
        raise ValueError(f'No chromosomes selected by chr_filter={chr_filter!r}')

    # Floats keep the printed summary identical to the np.zeros-based accumulation.
    coverage = np.concatenate(per_chromosome).astype(float)
    if ignore_zero:
        coverage = coverage[coverage > 0]
    print('Total coverage size', len(coverage))
    print('Max coverage', coverage.max())

    # np.roll is circular: the last `d` values wrap around to the start of the shifted copy.
    lags = range(min_dist, max_dist)
    correlations = [pearsonr(coverage, np.roll(coverage, d))[0] for d in lags]

    plt.plot(lags, correlations)
    if ylim is not None:
        plt.ylim(ylim)
    # The x values are shifts measured in bins, not in base pairs.
    plt.xlabel("Delta dist (bins)")
    plt.ylabel("Correlation")
    plt.title(name)
    plt.show()

# Preview the autocorrelation plot for a single sample with default settings.
autocorr_full(name='GSM646320_GM12878_H3K36me3_rep1')

In [None]:
# Plot the autocorrelation of every sample on the same (0, 1) correlation axis
# so that the plots are directly comparable.
shared_ylim = (0, 1)
for sample in NAMES:
    autocorr_full(sample, ylim=shared_ylim)