# SPAN models analysis


This notebook investigates convergence problems of SPAN models.\
It analyzes the coverage and the logs produced with the `--debug` option.


In [None]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
import pandas as pd
import numpy as np
import glob

# Configure bokeh so that subsequent show() calls render inline in this notebook.
output_notebook()

In [None]:
# Root folder of the analysis; replace the placeholder with a real path.
folder = '<coverages_folder>'
# Every cached coverage archive (*.npz) found directly under <folder>/cache.
coverages = glob.glob('{}/cache/*.npz'.format(folder))
print(coverages)

In [None]:
import re
import mmap
from scipy.stats import nbinom as nb
from collections import Counter
from bokeh.models.scales import LogScale
    

def _binned_coverage(path, chromosomes, bin_size):
    """Count tags per (chromosome, bin), pooling both strands of each chromosome."""
    coverage = Counter()
    # NpzFile keeps the archive open until closed, so use it as a context manager.
    with np.load(path) as npz:
        for chrom in chromosomes:
            for strand in '+-':
                positions = np.asarray(npz['chr{}/{}'.format(chrom, strand)])
                bins, counts = np.unique(positions // bin_size, return_counts=True)
                for b, c in zip(bins.tolist(), counts.tolist()):
                    # The key includes the chromosome: keying by bin index alone
                    # would merge unrelated bins of different chromosomes.
                    coverage[(chrom, int(b))] += c
    return coverage


def _last_emission_params(log_path):
    """Return the first four numbers of the last log line that contains 'means=['."""
    with open(log_path, 'rb') as f:
        # Length 0 maps the whole file; access=ACCESS_READ is portable (prot= is *nix only).
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            start = m.rfind(b'means=[')  # last occurrence
            if start < 0:
                raise ValueError("No 'means=[' record found in log {}".format(log_path))
            m.seek(start)
            line = m.readline().decode('utf-8', errors='replace').rstrip()
    print(line)
    # The optional exponent keeps values such as 1.0E-5 as a single number.
    numbers = [float(n) for n in
               re.findall(r'[-+]?(?:\d*\.\d+|\d+)(?:[eE][-+]?\d+)?', line)]
    if len(numbers) < 4:
        raise ValueError('Expected at least 4 numbers in log record: {!r}'.format(line))
    return tuple(numbers[:4])


def plot_coverage(path, chromosomes=range(1, 6), bin_size=100, max_count=10):
    """Plot the binned coverage histogram against the fitted LOW/HIGH distributions.

    Note kept from the original code: compute coverage with
    ``--keep-dup --fragment 0``.

    The coverage archive is read from the keys ``chr<N>/+`` and ``chr<N>/-``.
    Tags are counted per bin of ``bin_size`` within each chromosome, with both
    strands pooled. The model parameters come from the last ``means=[`` line of
    the first ``*.log`` file under the global ``folder`` whose name contains
    the track name. The first two numbers on that line are used as the
    LOW/HIGH means and the next two as the first (``n``) parameter of the
    negative binomial distributions.

    The histogram covers only bins with a count in ``1..max_count`` and is
    normalised over those bins, while the PMFs are unconditional.

    :param path: path to a cached coverage ``.npz`` file.
    :param chromosomes: chromosome numbers to read (default ``1..5``).
    :param bin_size: bin width in the units of the stored positions (default 100).
    :param max_count: largest per-bin count shown on the x axis (default 10).
    :return: a bokeh figure.
    :raises FileNotFoundError: if no matching log file is found.
    :raises ValueError: if the log has no parsable ``means=[`` record.
    """
    print('Coverage', path)
    name = re.sub(r'.*/coverage_|_raw\#.*', '', path)
    print('Processing', name)

    coverage = _binned_coverage(path, chromosomes, bin_size)
    cvalues = list(coverage.values())
    print('Summary coverage', sum(cvalues))
    notnull = [x for x in cvalues if x > 0]
    print('Summary notnull bins', len(notnull))
    print('Max coverage', max(notnull, default=0))

    # '**' only descends into nested directories when recursive=True.
    logs = glob.glob(folder + '/**/*' + name + '*.log', recursive=True)
    if not logs:
        raise FileNotFoundError(
            'No log file matching *{}*.log under {}'.format(name, folder))
    log = logs[0]
    print('Log', log)

    meanLow, meanHigh, fsLow, fsHigh = _last_emission_params(log)
    nbLow = nb(fsLow, fsLow / (meanLow + fsLow))
    nbHigh = nb(fsHigh, fsHigh / (meanHigh + fsHigh))
    print('\n')

    # Unit-width bins centred on the integers 1..max_count, so that each bar
    # lines up with the PMF value of the same count.
    edges = np.arange(max_count + 1) + 0.5
    counts, edges = np.histogram([x for x in notnull if x <= max_count], bins=edges)
    total = counts.sum()
    hist = counts / total if total > 0 else np.zeros(max_count)

    xs = np.arange(1, max_count + 1)
    low = nbLow.pmf(xs)
    high = nbHigh.pmf(xs)

    p = figure(title=name + ' bin' + str(bin_size) + ' >0', tools='',
               background_fill_color="#fafafa")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)

    # NOTE(review): `legend=` is kept for compatibility with the bokeh version
    # this notebook appears to target; newer releases use `legend_label=`.
    p.line(xs, low, line_color="orange", line_width=5, alpha=0.7, legend="Low")
    p.line(xs, high, line_color="green", line_width=5, alpha=0.7, legend="High")

    p.y_range.start = 0
    p.y_range.end = 1
    p.x_range.start = 0.5
    p.x_range.end = max_count + 0.5
    p.legend.location = "center_right"
    p.legend.background_fill_color = "#fefefe"
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'Pr(x)'
    p.grid.grid_line_color = "white"

    return p

In [None]:
# One coverage plot per cached coverage file, stacked in a single column.
show(
    gridplot(
        list(map(plot_coverage, coverages)),
        ncols=1,
        plot_width=800,
        plot_height=400,
        toolbar_location=None,
    )
)

# Analysis of track convergence

When track quality is poor, the mean of the LOW state tends to 0 (LOW state mean -> 0).
This is best illustrated by the multistart learning run below.

In [None]:
# NOTE(review): absolute, machine-specific path; point it at your own multistart log.
log = '/mnt/stripe/shpynov/GSE65360/fit_20xstart_100iteration_0.1dl_nozeroes_multistart_determined/logs/BJ_100_0.1_5.log'

# One frozen negative binomial distribution per 'NegBinEmissionScheme' log line,
# in file order.
states = []
with open(log, 'r') as f:
    for line in f:  # stream the log instead of loading it whole
        if 'NegBinEmissionScheme' not in line:
            continue
        tail = line.split('NegBinEmissionScheme', 1)[1]
        # Decimal numbers with an optional exponent. Without the exponent part
        # a value such as 1.2E-5 is read as 1.2, which hides a mean close to 0.
        numbers = [float(n) for n in re.findall(r"[-+]?\d*\.\d+(?:[eE][-+]?\d+)?", tail)]
        # The last two numbers on the line are taken as (mean, failures).
        mean, failures = numbers[-2:]
        states.append(nb(failures, failures / (mean + failures)))

# Largest count x at which plot_nbs evaluates the PMFs; also the right end of its x axis.
N = 10
def plot_nbs(i, nbLow, nbHigh):
    """Draw the PMFs of a LOW/HIGH distribution pair on one figure.

    Both distributions are evaluated at the points 1..N, where N is the
    module-level constant. ``i`` is only used as the figure title.

    :param i: label of the pair, shown as the title.
    :param nbLow: distribution drawn in orange and labelled "Low".
    :param nbHigh: distribution drawn in green and labelled "High".
    :return: a bokeh figure.
    """
    counts = np.linspace(1, N, N)
    pmf_low = list(map(nbLow.pmf, counts))
    pmf_high = list(map(nbHigh.pmf, counts))

    fig = figure(title=str(i), tools='', background_fill_color="#fafafa")
    curves = (
        (pmf_low, "orange", "Low"),
        (pmf_high, "green", "High"),
    )
    for pmf, color, label in curves:
        fig.line(counts, pmf, line_color=color, line_width=5, alpha=0.7, legend=label)

    # Axes: probabilities on [0, 1], counts on [1, N].
    fig.x_range.start = 1
    fig.x_range.end = N
    fig.y_range.start = 0
    fig.y_range.end = 1
    fig.xaxis.axis_label = 'x'
    fig.yaxis.axis_label = 'Pr(x)'

    # Cosmetics.
    fig.grid.grid_line_color = "white"
    fig.legend.location = "center_right"
    fig.legend.background_fill_color = "#fefefe"
    return fig

# Consecutive entries of `states` are plotted as (Low, High) pairs, one figure per pair;
# a trailing unpaired entry is ignored.
show(
    gridplot(
        [
            plot_nbs(i, low_state, high_state)
            for i, (low_state, high_state) in enumerate(zip(states[0::2], states[1::2]))
        ],
        ncols=1,
        plot_width=400,
        plot_height=300,
        toolbar_location=None,
    )
)