# Super fast Pandas DataFrame IO/Usage

In [1]:
from __future__ import division
import bokeh as bk
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import pandas as pd
import msgpack
import toolz
import scipy.stats as st
import matplotlib.pylab as plt
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm
import blosc
import cPickle as pkl

In [2]:
def get_genes():
    """Return the contents of ``genes.msg`` as a set.

    The file is read in binary mode from the current working directory,
    decoded with ``msgpack.unpackb`` and the decoded items are collected
    into a ``set`` (so duplicates collapse and order is not kept).
    """
    with open('genes.msg', 'rb') as handle:
        packed = handle.read()
    # Decoding happens after the file has been closed; the result is the same.
    return set(msgpack.unpackb(packed))

def get_gene_map_intron():
    """Return the contents of ``gene_map_intron.msg`` as a dict.

    The file is read in binary mode from the current working directory and
    decoded with ``msgpack.unpackb``; the decoded object is passed through
    ``dict`` before being returned.
    """
    with open('gene_map_intron.msg', 'rb') as handle:
        packed = handle.read()
    # Decoding happens after the file has been closed; the result is the same.
    return dict(msgpack.unpackb(packed))

def _finalize_intron_events(df, gene):
    """Reshape a raw ``intron_events.h5`` selection into the layout both readers return.

    Shared by ``read_gene`` and ``read_gene_intron`` so the two readers cannot
    drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame returned by ``pd.read_hdf``.  It must have ``nread0`` and
        ``nread1`` columns, and ``gene`` and ``assay`` must exist as columns
        once the index has been reset.  The frame is modified in place before
        the index reset.
    gene : str
        Gene identifier stored on the result as ``df.name``.

    Returns
    -------
    pandas.DataFrame
        Frame with ``nsuc`` / ``ntri`` in place of ``nread0`` / ``nread1`` and
        without the ``gene`` and ``assay`` columns.
    """
    # nsuc is a plain copy of nread0; ntri is the row-wise sum of both counts
    # (presumably "successes" and "trials" -- TODO confirm the naming).
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    # reset_index() moves the index into ordinary columns; the deletes below
    # rely on 'gene' and 'assay' being columns afterwards.
    df = df.reset_index()
    del df['gene']
    del df['assay']
    # Ad-hoc attribute used as a label.  NOTE(review): it is not a column and
    # pandas may not carry it over to frames derived from this one.
    df.name = gene
    return df


def read_gene(gene):
    """Load every intron event of one gene from ``intron_events.h5``.

    Parameters
    ----------
    gene : str
        Gene identifier used in the HDF5 ``where`` query.

    Returns
    -------
    pandas.DataFrame
        See ``_finalize_intron_events``; ``df.name`` is set to ``gene``.
    """
    # NOTE(review): the identifier is interpolated verbatim into the query
    # string, so it must not contain a single quote.
    selection = "(gene == '%s')" % (gene,)
    df = pd.read_hdf('intron_events.h5', 'df', where=selection)
    return _finalize_intron_events(df, gene)


def read_gene_intron(gene_intron):
    """Load the events of a single (gene, intron) pair from ``intron_events.h5``.

    Parameters
    ----------
    gene_intron : tuple
        ``(gene, intron)``; ``intron`` is formatted with ``%d`` and therefore
        has to be an integer.

    Returns
    -------
    pandas.DataFrame
        See ``_finalize_intron_events``; ``df.name`` is set to the gene.
    """
    gene, intron = gene_intron
    # NOTE(review): the gene identifier is interpolated verbatim into the
    # query string, so it must not contain a single quote.
    selection = "(gene == '%s') & (intron == %d)" % (gene, intron)
    df = pd.read_hdf('intron_events.h5', 'df', where=selection)
    return _finalize_intron_events(df, gene)

def load_entire_file(path='intron_events.pkl.blp'):
    """Load a blosc-compressed pickle and return the unpickled object.

    Parameters
    ----------
    path : str, optional
        File to read.  Defaults to ``'intron_events.pkl.blp'``, the file
        written by ``save_pickle``, so existing calls without arguments keep
        working.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    with open(path, 'rb') as f:
        compressed = f.read()
    # The file handle is released before the (potentially large) payload is
    # decompressed and unpickled.
    # NOTE(review): unpickling can execute arbitrary code -- only use this on
    # files you produced yourself.
    return pkl.loads(blosc.decompress(compressed))

def save_pickle(frame=None, path='intron_events.pkl.blp'):
    """Pickle an object, compress it with blosc and write it to ``path``.

    Parameters
    ----------
    frame : object, optional
        Object to store.  When omitted, the notebook-level variable ``df`` is
        used, which keeps the original zero-argument call working; a
        ``NameError`` is raised if ``df`` has not been defined yet.
    path : str, optional
        Destination file, ``'intron_events.pkl.blp'`` by default.
    """
    if frame is None:
        # Backward-compatible fallback to the global the original relied on.
        frame = df
    # Serialise *before* opening the destination: opening with 'wb' truncates
    # the file, so a failure above or here must not wipe an existing cache.
    payload = blosc.compress(pkl.dumps(frame, -1))
    with open(path, 'wb') as f:
        f.write(payload)

In [3]:
%%time
# Load the set read from genes.msg and the dict read from gene_map_intron.msg.
genes = get_genes()
gene_map_intron = get_gene_map_intron()

CPU times: user 265 ms, sys: 82.3 ms, total: 347 ms
Wall time: 355 ms


Build a lazy generator over every (gene, intron) pair. Being a generator, it can be consumed only once.

In [4]:
%%time
# Short alias for the mapping loaded above.
gi = gene_map_intron
# Lazy stream of (key, value) pairs: one pair per element of each mapping value.
# Nothing is iterated here, so the cell timing only measures creating the generator.
gene_intron = (
    (gene, intron)
    for gene in gi.keys()
    for intron in gi[gene]
)

CPU times: user 1.03 ms, sys: 0 ns, total: 1.03 ms
Wall time: 1.03 ms


In [5]:
%%time
# Read the complete blosc-compressed pickle into memory in one go.
df = load_entire_file()

CPU times: user 1.4 s, sys: 938 ms, total: 2.34 s
Wall time: 1.57 s


In [6]:
# Take one gene to use as the example; `genes` is a set, so which element
# comes out first is arbitrary.
gene0 = next(iter(genes))

In [7]:
%%time
# Look up the rows keyed by (gene0, 30) and preview the first few.
# NOTE(review): 30 is hard-coded (presumably an intron number of the sample gene);
# since gene0 is an arbitrary element of a set, confirm the pair exists for other picks.
df.loc[gene0, 30].head()

CPU times: user 2.81 s, sys: 3.14 s, total: 5.95 s
Wall time: 5.96 s


Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nread0,nread1
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000110514,30,HG00096.1.M_111124_6,6,107
ENSG00000110514,30,HG00097.7.M_120219_2,8,188
ENSG00000110514,30,HG00099.1.M_120209_6,3,142
ENSG00000110514,30,HG00099.5.M_120131_3,0,141
ENSG00000110514,30,HG00100.2.M_111215_8,4,118
