In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
from glob import glob
import os
import numpy as np

In [2]:
# Collect the path of every entry directly under intron_events/ and preview
# the first few. glob does not promise any particular ordering, so the order
# seen here (and the row order of anything built from it) may vary by machine.
intron_event_paths = glob('intron_events/*')
files = pd.Series(intron_event_paths)
files.head()

0    intron_events/HG00273.3.M_120202_7.intron_even...
1    intron_events/HG00112.6.M_120119_2.intron_even...
2    intron_events/HG00262.2.M_111215_8.intron_even...
3    intron_events/NA12004.4.M_120208_5.intron_even...
4    intron_events/NA20528.2.M_120131_1.intron_even...
dtype: object

In [3]:
def explore_filename(f):
    """Break an intron-events file path into its dot-separated name fields.

    The directory part of ``f`` is dropped and every occurrence of the text
    '.intron_events.csv' is removed; what remains is the assay name. The
    assay name is then split on '.' and its first three pieces are reported
    as the person, lab and complement ids.

    Parameters
    ----------
    f : str
        Path (or bare file name) of an intron-events file.

    Returns
    -------
    dict
        Keys ``assay_name``, ``person_id``, ``lab_id``, ``complement_id``;
        all values are strings.

    Raises
    ------
    IndexError
        If the assay name has fewer than three '.'-separated pieces.
    """
    stem = os.path.basename(f).replace('.intron_events.csv', '')
    fields = stem.split('.')
    return {
        'assay_name': stem,
        'person_id': fields[0],
        'lab_id': fields[1],
        'complement_id': fields[2],
    }

def read_csv(filepath):
    """Load one intron-events file and reshape it into per-intron counts.

    The file is read as tab-separated text with no header row. Its first four
    columns are named gene, intron, nread0 and nread1. The intron column is
    replaced by the integer found in the third '_'-separated token of each
    original value. Fields parsed from the file name by ``explore_filename``
    are attached as constant columns, and the two read-count columns are
    replaced by ``nsuccesses`` (= nread0) and ``ntrials`` (= nread0 + nread1).

    Parameters
    ----------
    filepath : str
        Path of the file to read; also the source of the assay/person/lab ids.

    Returns
    -------
    pandas.DataFrame
        Columns gene, intron, assay, person_id, lab_id (numeric), nsuccesses
        and ntrials, in that order.
    """
    def _intron_number(label):
        # The third '_'-separated token of the label is parsed as an int.
        return int(label.split('_')[2])

    raw = pd.read_csv(filepath, sep='\t', header=None, low_memory=False, engine='c')
    table = raw.rename(columns={0: 'gene', 1: 'intron', 2: 'nread0', 3: 'nread1'})
    table['intron'] = table['intron'].map(_intron_number)

    # Identifiers come from the file name, not from the file contents.
    name_fields = explore_filename(filepath)
    table['assay'] = name_fields['assay_name']
    table['person_id'] = name_fields['person_id']
    table['lab_id'] = name_fields['lab_id']
    table['lab_id'] = pd.to_numeric(table['lab_id'])

    table['nsuccesses'] = table['nread0']
    table['ntrials'] = table['nread0'] + table['nread1']
    # The raw read counts are dropped once the derived columns exist.
    return table.drop(['nread0', 'nread1'], axis=1)

In [4]:
import multiprocessing

# Parse every listed file in parallel (one read_csv call per path) and stack
# the per-file frames into a single long DataFrame.
pool = multiprocessing.Pool(20)
try:
    df = pd.concat(pool.map(read_csv, files))
finally:
    # Shut the 20 worker processes down once the map is done (or has failed);
    # otherwise they stay alive for as long as the kernel does. `pool` remains
    # defined but cannot accept further work after this point.
    pool.close()
    pool.join()

# Indexed preview only: set_index returns a new frame, so `df` itself still
# has assay/gene/intron as ordinary columns after this cell.
df.set_index(['assay', 'gene', 'intron']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,person_id,lab_id,nsuccesses,ntrials
assay,gene,intron,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HG00273.3.M_120202_7,ENSG00000000003,1,HG00273,3,0,0
HG00273.3.M_120202_7,ENSG00000000003,2,HG00273,3,0,0
HG00273.3.M_120202_7,ENSG00000000003,3,HG00273,3,0,0
HG00273.3.M_120202_7,ENSG00000000003,4,HG00273,3,0,0
HG00273.3.M_120202_7,ENSG00000000003,5,HG00273,3,0,0


In [5]:
# Summary statistics over all rows. assay/gene/intron are moved into the index
# first, so describe() only summarises the remaining numeric columns. The
# indexed frame is temporary; `df` is not modified.
# NOTE(review): building a three-level index over the full frame just to
# exclude columns may be costly — selecting the columns directly would
# presumably give the same table; confirm before changing.
df.set_index(['assay', 'gene', 'intron']).describe()

Unnamed: 0,lab_id,nsuccesses,ntrials
count,128811890.0,128811890.0,128811890.0
mean,2.86,23.43,296.99
std,1.98,566.55,2798.22
min,1.0,0.0,0.0
25%,1.0,0.0,0.0
50%,2.0,0.0,18.0
75%,4.0,4.0,154.0
max,7.0,1065842.0,1418836.0


In [6]:
# Per-person summary statistics: rows are grouped by person_id and describe()
# is computed within each group for the numeric columns.
df.groupby('person_id').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,intron,lab_id,nsuccesses,ntrials
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HG00096,count,225590.00,225590.00,225590.00,225590.00
HG00096,mean,10.14,1.00,27.96,292.35
HG00096,std,14.30,0.00,417.78,2536.30
HG00096,min,1.00,1.00,0.00,0.00
HG00096,25%,3.00,1.00,0.00,0.00
HG00096,50%,6.00,1.00,0.00,24.00
HG00096,75%,13.00,1.00,8.00,171.00
HG00096,max,363.00,1.00,94737.00,533529.00
HG00097,count,225590.00,225590.00,225590.00,225590.00
HG00097,mean,10.14,7.00,40.71,467.09


In [9]:
# Preview the first rows of `df`.
# NOTE(review): the saved output of this cell shows `df` indexed by
# (gene, intron), but no visible cell sets that index, and the execution
# counter jumps from 6 to 9. Presumably the cells that re-indexed `df` were
# deleted — confirm and restore them so the notebook survives
# Restart Kernel -> Run All.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,assay,person_id,lab_id,nsuccesses,ntrials
gene,intron,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000000003,1,HG00273.3.M_120202_7,HG00273,3,0,0
ENSG00000000003,2,HG00273.3.M_120202_7,HG00273,3,0,0
ENSG00000000003,3,HG00273.3.M_120202_7,HG00273,3,0,0
ENSG00000000003,4,HG00273.3.M_120202_7,HG00273,3,0,0
ENSG00000000003,5,HG00273.3.M_120202_7,HG00273,3,0,0


In [None]:
# Inspect the index object of `df`. This cell appears not to have been run
# (no execution count or saved output).
df.index