In [None]:
from __future__ import division
import bokeh as bk
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import pandas as pd
import msgpack
import toolz
import scipy.stats as st
import matplotlib.pylab as plt
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm
import blosc
import cPickle as pkl
import random

In [None]:
def get_genes():
    """Return the msgpack-decoded contents of ``genes.msg`` as a set.

    The file is opened in binary mode using a path relative to the current
    working directory.
    """
    with open('genes.msg', 'rb') as handle:
        packed = handle.read()
    return set(msgpack.unpackb(packed))

def get_gene_map_intron():
    """Return the msgpack-decoded contents of ``gene_map_intron.msg`` as a dict.

    The file is opened in binary mode using a path relative to the current
    working directory.
    """
    with open('gene_map_intron.msg', 'rb') as handle:
        packed = handle.read()
    decoded = msgpack.unpackb(packed)
    return dict(decoded)

def nice_df(df):
    """Replace the ``nread0``/``nread1`` columns with ``nsuc``/``ntri``.

    ``nsuc`` is a copy of ``nread0`` and ``ntri`` is ``nread0 + nread1``.
    The frame is modified in place (the two source columns are removed) and
    the same object is returned for convenience.
    """
    first_reads = df['nread0']
    read_total = first_reads + df['nread1']

    df['nsuc'] = first_reads
    df['ntri'] = read_total

    # Remove the raw columns now that the derived ones are in place.
    df.drop(['nread0', 'nread1'], axis=1, inplace=True)
    return df

def load_entire_file():
    """Load the object stored in ``intron_events_filter0.pkl.blp``.

    The file content is read in binary mode, decompressed with blosc and
    then unpickled; the unpickled object is returned as-is.
    """
    with open('intron_events_filter0.pkl.blp', 'rb') as handle:
        compressed = handle.read()
    pickled = blosc.decompress(compressed)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    return pkl.loads(pickled)

def plot_frequencies(df, title):
    """Plot how often each ``ntri`` value occurs on a 2x2 grid of panels.

    The number of rows per distinct ``df['ntri']`` value is drawn on every
    panel; the panels differ only in their axis limits. Both axes are split
    at the 99.5th percentile (of the ``ntri`` values for x, of the per-value
    counts for y): the left column shows x from the minimum to the split and
    the right column from the split to the maximum, while the top row shows
    y from the split to the maximum and the bottom row from the minimum to
    the split.

    Parameters
    ----------
    df : object supporting ``groupby('ntri')`` and ``df['ntri']``
        Data whose ``ntri`` column is summarised.
    title : str
        Text used as the figure-level title.

    Returns
    -------
    The value returned by ``Figure.suptitle`` for the created figure.
    """
    def get_extremes(values):
        # Flatten, discard NaNs, then report (min, 99.5th percentile, max).
        flat = np.asarray(values).ravel()
        valid = flat[~np.isnan(flat)]
        return np.min(valid), np.percentile(valid, 99.5), np.max(valid)

    fig, axes = plt.subplots(2, 2, facecolor='w',
                             sharex='col', sharey='row',
                             figsize=(10, 5))

    # Alternative (one curve per intron):
    #     df.groupby(['intron', 'ntri']).size().unstack(level=0)
    counts = df.groupby('ntri').size()

    # Same curve on every panel, row by row; only the limits differ below.
    for panel in axes.ravel():
        counts.plot(ax=panel, legend=False, title=None)

    x_min, x_split, x_max = get_extremes(df['ntri'])
    y_min, y_split, y_max = get_extremes(counts)

    y_windows = [(y_split, y_max), (y_min, y_split)]  # top row, bottom row
    x_windows = [(x_min, x_split), (x_split, x_max)]  # left col, right col
    for row, y_window in enumerate(y_windows):
        for col, x_window in enumerate(x_windows):
            axes[row, col].set_ylim(*y_window)
            axes[row, col].set_xlim(*x_window)

    # Only the left-hand panels carry a y-axis label.
    for left_panel in axes[:, 0]:
        left_panel.set_ylabel('Frequency')

    return fig.suptitle(title, fontsize=16)

In [None]:
# Load the lookup structures stored in the msgpack files.
genes = get_genes()
gene_map_intron = get_gene_map_intron()
gi = gene_map_intron  # short alias for the mapping

# Flatten the mapping into (key, element) pairs: one pair for every element
# of every key's value (presumably gene -> introns, judging by the names).
gene_intron = []
for k in gi.keys():
    for v in gi[k]:
        gene_intron.append((k, v))

# Load the pickled, blosc-compressed table that the plots are drawn from.
df = load_entire_file()

In [None]:
# Draw random (gene, intron) pairs until one can be looked up in ``df``,
# then plot the 'ntri' frequencies for that pair exactly once.
while True:
    gi0 = list(random.choice(gene_intron))
    try:
        df0 = df.loc[gi0, ['ntri']]
    except KeyError:
        # The lookup failed for this pair; try another random one.
        continue
    plot_frequencies(df0, '%s/%d' % tuple(gi0))
    # The original loop had no exit, so the cell never completed and kept
    # creating new figures; stop after the first successful plot.
    break