In [1]:
from __future__ import division
import bokeh as bk
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import pandas as pd
import msgpack
import toolz
import scipy.stats as st
import matplotlib.pylab as plt
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm
import blosc
import cPickle as pkl

In [2]:
def get_genes():
    with open('genes.msg', 'rb') as f:
        return set(msgpack.unpackb(f.read()))

def get_gene_map_intron():
    with open('gene_map_intron.msg', 'rb') as f:
        return dict(msgpack.unpackb(f.read()))

def read_gene(gene):
    df = pd.read_hdf('intron_events.h5', 'df', where="(gene == '%s')" % (gene,))
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    df = df.reset_index()
    del df['gene']
    del df['assay']
    df.name = gene
    return df

def read_gene_intron(gene_intron):
    gene, intron = gene_intron
    df = pd.read_hdf('intron_events.h5', 'df',
                     where="(gene == '%s') & (intron == %d)" % (gene, intron))
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    df = df.reset_index()
    del df['gene']
    del df['assay']
    df.name = gene
    return df

def load_entire_file():
    with open('intron_events.pkl.blp', 'rb') as f:
        return pkl.loads(blosc.decompress(f.read()))

def save_pickle():
    with open('intron_events.pkl.blp', 'wb') as f:
        f.write(blosc.compress(pkl.dumps(df, -1)))

In [3]:
%%time
genes = get_genes()
gene_map_intron = get_gene_map_intron()

CPU times: user 267 ms, sys: 79 ms, total: 346 ms
Wall time: 357 ms


Get all pair of gene-intron in a generator.

In [4]:
%%time
gi = gene_map_intron
gene_intron = ((k, v) for k in gi.keys() for v in gi[k])

CPU times: user 286 µs, sys: 0 ns, total: 286 µs
Wall time: 372 µs


In [5]:
%%time
df = load_entire_file()

CPU times: user 1.22 s, sys: 952 ms, total: 2.17 s
Wall time: 1.47 s


In [15]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nread0,nread1
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,1,HG00096.1.M_111124_6,0,2
ENSG00000000003,1,HG00097.7.M_120219_2,0,2
ENSG00000000003,1,HG00099.1.M_120209_6,0,0
ENSG00000000003,1,HG00099.5.M_120131_3,0,2
ENSG00000000003,1,HG00100.2.M_111215_8,0,0


Load the entire DataFrame into memory

In [10]:
%%time
df = load_entire_file()

error: Error 211049427 : not a Blosc buffer or header info is corrupted

In [None]:
blosc.compress

In [8]:
gene0 = toolz.first(genes)

In [14]:
%%time
dfv = df.loc[gene0]

CPU times: user 877 ms, sys: 1.71 s, total: 2.58 s
Wall time: 2.58 s


In [22]:
%%time
dfv = df.loc[gene0, 30]

CPU times: user 2.13 s, sys: 2.53 s, total: 4.66 s
Wall time: 4.66 s


In [23]:
dfv.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nread0,nread1
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000110514,30,HG00096.1.M_111124_6,6,107
ENSG00000110514,30,HG00097.7.M_120219_2,8,188
ENSG00000110514,30,HG00099.1.M_120209_6,3,142
ENSG00000110514,30,HG00099.5.M_120131_3,0,141
ENSG00000110514,30,HG00100.2.M_111215_8,4,118


Acceptance criterion

In [6]:
def accept(df):
    return np.percentile(df['ntri'].values, 90) < 1000

In [56]:
p = Pool(20)
ok = p.map(toolz.compose(accept, read_gene_intron), gene_intron)

IndexError: cannot do a non-empty take from an empty axes.

In [7]:
def does_it_return(gene_intron):
    try:
        gene, intron = gene_intron
        df = pd.read_hdf('intron_events.h5', 'df',
                         where="(gene == '%s') & (intron == %d)" % (gene, intron))
        df['nsuc'] = df['nread0']
        df['ntri'] = df['nread0'] + df['nread1']
        del df['nread0']
        del df['nread1']
        df = df.reset_index()
        del df['gene']
        del df['assay']
        df.name = gene
    except:
        return (gene, intron, False)
    return (gene, intron, True)

In [None]:
pool = Pool(20)
results = []
for result in tqdm(pool.imap_unordered(does_it_return, gene_intron, chunksize=1000), mininterval=60, maxinterval=300):
    results.append(result)

0it [00:00, ?it/s]9001it [01:00, 148.49it/s]29001it [02:00, 178.06it/s]48001it [03:00, 204.94it/s]68001it [04:02, 230.59it/s]88001it [05:06, 249.71it/s]109001it [06:07, 272.46it/s]128001it [07:12, 277.56it/s]149001it [08:17, 290.93it/s]169001it [09:19, 299.11it/s]189001it [10:21, 305.54it/s]210001it [11:25, 312.70it/s]230001it [12:29, 312.81it/s]250001it [13:30, 316.32it/s]271001it [14:30, 325.57it/s]290641it [15:31, 325.44it/s]310242it [16:35, 319.40it/s]331001it [17:36, 325.40it/s]352001it [18:36, 331.69it/s]374001it [19:36, 341.39it/s]395001it [20:39, 338.47it/s]415374it [21:41, 335.42it/s]436001it [22:45, 332.51it/s]457001it [23:49, 330.64it/s]478001it [25:00, 319.44it/s]500001it [26:02, 328.68it/s]519811it [27:11, 315.14it/s]539001it [28:12, 316.14it/s]562001it [29:12, 332.61it/s]582161it [30:15, 329.77it/s]604001it [31:15, 338.14it/s]624423it [32:17, 335.72it/s]644662it [33:19, 334.09it/s]666001it [34:21, 336.49it/s]686241it [35:23, 332.68it/s]

In [62]:
cagado = [i for i in gene_intron if len(i) == 0 or len(i) == 1]

In [63]:
len(cagado)

0

In [10]:
df = read_gene(gene0)
df.head()

Unnamed: 0,intron,nsuc,ntri
0,1,1,3
1,1,0,3
2,1,0,4
3,1,0,3
4,1,0,2


In [27]:
p = Pool(20)
ok = p.map(toolz.compose(accept, read_gene), genes)

In [30]:
print("We have genes that have passed the %d" % sum(ok))

We have genes that have passed the 26721
