# Filter out irrelevant gene-introns for use

We don't want number of trials larger than 1,000 because LMM can be applied to them equally well.

In [1]:
from __future__ import division
import bokeh as bk
%matplotlib inline
import matplotlib
matplotlib.style.use('ggplot')
import pandas as pd
import msgpack
import toolz
import scipy.stats as st
import matplotlib.pylab as plt
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm
import blosc
import cPickle as pkl

In [10]:
def get_genes():
    with open('genes.msg', 'rb') as f:
        return set(msgpack.unpackb(f.read()))

def get_gene_map_intron():
    with open('gene_map_intron.msg', 'rb') as f:
        return dict(msgpack.unpackb(f.read()))

def read_gene(gene):
    df = pd.read_hdf('intron_events.h5', 'df', where="(gene == '%s')" % (gene,))
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    df = df.reset_index()
    del df['gene']
    del df['assay']
    df.name = gene
    return df

def nice_df(df):
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    return df

def read_gene_intron(gene_intron):
    gene, intron = gene_intron
    df = pd.read_hdf('intron_events.h5', 'df',
                     where="(gene == '%s') & (intron == %d)" % (gene, intron))
    df['nsuc'] = df['nread0']
    df['ntri'] = df['nread0'] + df['nread1']
    del df['nread0']
    del df['nread1']
    df = df.reset_index()
    del df['gene']
    del df['assay']
    df.name = gene
    return df

def load_entire_file():
    with open('intron_events.pkl.blp', 'rb') as f:
        return pkl.loads(blosc.decompress(f.read()))

def save_pickle():
    with open('intron_events.pkl.blp', 'wb') as f:
        f.write(blosc.compress(pkl.dumps(df, -1)))

In [3]:
genes = get_genes()
gene_map_intron = get_gene_map_intron()
gi = gene_map_intron
gene_intron = ((k, v) for k in gi.keys() for v in gi[k])
df = load_entire_file()

# Preview of what I am going to do

In [37]:
gene0 = toolz.first(genes)
gene1 = toolz.second(genes)
print(gene0, gene1)

('ENSG00000110514', 'ENSG00000086015')


In [38]:
dfv = nice_df(df.loc[[gene0, gene1]].copy())
dfv.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000086015,1,HG00096.1.M_111124_6,18,61
ENSG00000086015,1,HG00097.7.M_120219_2,17,121
ENSG00000086015,1,HG00099.1.M_120209_6,8,79
ENSG00000086015,1,HG00099.5.M_120131_3,4,43
ENSG00000086015,1,HG00100.2.M_111215_8,2,29


In [39]:
dfv['ntri'].groupby(level=[0, 1]).quantile(0.99).to_frame().T

gene,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,ENSG00000086015,...,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514,ENSG00000110514
intron,1,2,3,4,5,6,7,8,9,10,...,28,29,30,31,32,33,34,35,36,37
ntri,203.0,188.2,213.6,241.6,217.2,214.4,183.5,178.0,303.3,281.0,...,163.6,288.6,290.6,308.6,435.9,290.2,224.0,241.1,308.4,1273.9


In [53]:
print("Number of rows before filtering: %d" % dfv.shape[0])
dfv0 = dfv.groupby(level=[0, 1]).filter(lambda g: g['ntri'].quantile(0.99) < 1000)
print("Number of rows before filtering: %d" % dfv0.shape[0])

Number of rows before filtering: 37115
Number of rows before filtering: 35973


In [54]:
dfv0.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,nsuc,ntri
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000086015,1,HG00096.1.M_111124_6,18,61
ENSG00000086015,1,HG00097.7.M_120219_2,17,121
ENSG00000086015,1,HG00099.1.M_120209_6,8,79
ENSG00000086015,1,HG00099.5.M_120131_3,4,43
ENSG00000086015,1,HG00100.2.M_111215_8,2,29


# Filtering out

In [55]:
genes = get_genes()
gene_map_intron = get_gene_map_intron()
gi = gene_map_intron
gene_intron = ((k, v) for k in gi.keys() for v in gi[k])
df = load_entire_file()
df = nice_df(df.copy())

In [56]:
df0 = df.groupby(level=[0, 1]).filter(lambda g: g['ntri'].quantile(0.99) < 1000)

In [57]:
print("Remaining traits: %d" % df0.shape[0])

Remaining traits: 112172950


# Storing the resulting dataframe

In [58]:
with open('intron_events_filter0.pkl.blp', 'wb') as f:
    f.write(blosc.compress(pkl.dumps(df0, -1)))

Acceptance criterion

In [6]:
def accept(df):
    return np.percentile(df['ntri'].values, 90) < 1000

In [56]:
p = Pool(20)
ok = p.map(toolz.compose(accept, read_gene_intron), gene_intron)

IndexError: cannot do a non-empty take from an empty axes.

In [7]:
def does_it_return(gene_intron):
    try:
        gene, intron = gene_intron
        df = pd.read_hdf('intron_events.h5', 'df',
                         where="(gene == '%s') & (intron == %d)" % (gene, intron))
        df['nsuc'] = df['nread0']
        df['ntri'] = df['nread0'] + df['nread1']
        del df['nread0']
        del df['nread1']
        df = df.reset_index()
        del df['gene']
        del df['assay']
        df.name = gene
    except:
        return (gene, intron, False)
    return (gene, intron, True)

In [27]:
p = Pool(20)
ok = p.map(toolz.compose(accept, read_gene), genes)

In [30]:
print("We have genes that have passed the %d" % sum(ok))

We have genes that have passed the 26721
