In [1]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to the project's source files are picked up without a restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Font type 42 embeds TrueType fonts in PDF output (commonly chosen so that
# text remains editable in vector-graphics software).
mpl.rcParams['pdf.fonttype'] = 42
# Default font family used for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [3]:
import os
import random
import sys

# Directory added to the import path; presumably where the project packages
# imported below (aging_tools, access_biology_data, ...) live -- TODO confirm.
src_dir = './../src/'
# NOTE(review): this overwrites the first sys.path entry rather than
# prepending to it, so whatever was at sys.path[0] is no longer searched.
sys.path[0] = src_dir

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from copy import deepcopy
from scipy.stats import spearmanr


from access_biology_data import meta
from aging_tools import inout

In [4]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` is imported from `IPython.display`, its public location; importing
# it from `IPython.core.display` is deprecated and fails on recent IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
# Make the project source directory importable. The same relative path is
# also written to sys.path[0] in an earlier cell, so this append is redundant
# when the notebook is run top to bottom.
sys.path.append('./../src/')
# NOTE(review): `inout` is already imported in an earlier cell.
from aging_tools import inout, export
from access_aging_data import chaperome, earlier_studies, companions, sequencing
from access_biology_data import annotation, relations

# Construct sanity check: get median expression for every gene

In [6]:
# Get aging expression data, filter, and map to ncbi
# The call returns three objects. As used further down in this notebook:
#   df_counts -- expression table whose columns are selected with boolean
#                masks built from df_meta (presumably genes x samples)
#   df_meta   -- per-sample table with 'pfu', 'tissue' and 'age' columns
#   df_genes  -- table with 'gene_ensembl' and 'gene_ncbi' columns
# NOTE(review): the exact effect of `unambiguous_to_entrez` / `as_entrez` is
# defined in the project helper and is not visible here -- confirm there.
df_counts, df_meta, df_genes = sequencing.load_cached_aging_map(
    dataset_name='aging_map_tmm_180105',
    unambiguous_to_entrez=True,
    as_entrez=True
)

In [7]:
%%time
# Median expression of every gene within each (pfu, tissue, age) group of
# samples, stacked into one long table with columns
# [<df_counts index>, 'median', 'tissue', 'pfu', 'age'].
#
# The per-level boolean masks are built once up front instead of being
# recomputed inside the inner loops, and emptiness is tested with the
# vectorised Series.any() rather than Python's element-by-element any().
pfu_masks = [(pfu, df_meta['pfu'] == pfu) for pfu in df_meta['pfu'].unique()]
tissue_masks = [(tissue, df_meta['tissue'] == tissue) for tissue in df_meta['tissue'].unique()]
age_masks = [(age, df_meta['age'] == age) for age in df_meta['age'].unique()]

agg = []
for pfu, f_pfu in pfu_masks:
    for tissue, f_tissue in tissue_masks:
        f_pfu_tissue = f_pfu & f_tissue
        if not f_pfu_tissue.any():
            # No sample for this pfu/tissue pair, hence none for any age.
            continue

        for age, f_age in age_masks:
            f = f_pfu_tissue & f_age
            if not f.any():
                continue

            # Select the group's sample columns and take the per-gene median.
            d = (
                df_counts.loc[:, f]
                .median(axis=1)
                .to_frame('median')
                .assign(tissue=tissue, pfu=pfu, age=age)
                .reset_index()
            )
            agg.append(d)

df_median_counts = pd.concat(agg)

CPU times: user 2.09 s, sys: 264 ms, total: 2.35 s
Wall time: 3.15 s


In [8]:
import glob

In [9]:
# Locate the per-comparison differential-expression tables and parse the
# comparison metadata (tissue, pfu, the two ages) out of each file name.
p = inout.get_internal_path(
    'dynamic/tstoeger/200504_legacy_sample_filtering/DE/Flu/*.csv')
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the pooled table (and of the exported file) reproducible.
d = sorted(glob.glob(p))
if not d:
    # Fail here with a clear message instead of a bare
    # "No objects to concatenate" when the tables are pooled later.
    raise FileNotFoundError(
        'No differential expression tables found for pattern: {}'.format(p))

files_to_process = pd.DataFrame(columns=['path'], data=d)
# File name without directory and extension. The dot is escaped and the
# pattern anchored so that only a literal trailing ".csv" is stripped.
files_to_process['base_name'] = files_to_process['path'].str.extract(
    r'.*/(.*)\.csv$', expand=False)
# Expected name layout: <tissue>_pfu_<pfu>_ages_<dividend> <divisor>_DE
files_to_process[['tissue', 'pfu', 'dividend', 'divisor']] = files_to_process['base_name'].str.extract(
    r'^(.*)_pfu_(.*)_ages_(.*) (.*)_DE', expand=False)
# verify_integrity raises if two files share the same base name.
files_to_process = files_to_process.set_index('base_name', verify_integrity=True)

tags = ['tissue', 'pfu', 'dividend', 'divisor']
agg = []
for j, v in files_to_process.iterrows():
    df = pd.read_csv(v['path'], usecols=['Symbol', 'log2FoldChange', 'pvalue', 'padj'])
    # Label every row with the comparison it came from.
    for tag in tags:
        df[tag] = v[tag]

    agg.append(df)

In [10]:
# Pool all comparisons into one long table.
df = pd.concat(agg, axis=0)
df = df.rename(columns={'Symbol': 'gene_ensembl'})

# Convert pfu and the two ages to float by replacing the columns via astype.
# Assigning through `df.loc[:, x] = ...` writes into the existing column on
# pandas >= 2.0, so an object-dtype column would silently stay object dtype
# and the later numeric comparisons / merge keys would not be numeric.
df = df.astype({x: float for x in ['pfu', 'dividend', 'divisor']})

In [11]:
# Attach the NCBI gene ID to every row (left join keeps rows without a match),
# then move gene_ncbi to the first column.
ensembl_to_ncbi = df_genes[['gene_ensembl', 'gene_ncbi']]
df = df.merge(ensembl_to_ncbi, how='left')
df = df[['gene_ncbi'] + [c for c in df.columns if c != 'gene_ncbi']]

# Add some sanity checks and manually check discrepancies

In [12]:
# Row-wise larger and smaller of the two ages in each comparison.
age_pair = df[['dividend', 'divisor']]
df['oldest'] = age_pair.max(axis=1)
df['youngest'] = age_pair.min(axis=1)

In [13]:
# For each of the four age columns, look up the group median of the gene at
# that age (same tissue and pfu) and store it as median_<column>.
for e in ['dividend', 'divisor', 'oldest', 'youngest']:
    df = pd.merge(
        df,
        df_median_counts.rename(columns={'median': 'median_{}'.format(e)}),
        left_on=['gene_ncbi', 'tissue', 'pfu', e],
        right_on=['gene_ncbi', 'tissue', 'pfu', 'age'],
        how='left'
    ).drop(columns='age')  # keyword form: positional `axis` is rejected by pandas >= 2.0

In [14]:
# True where the dividend age is larger than the divisor age.
f = df['dividend'] > df['divisor']

In [15]:
# Orient every fold change the same way: keep log2FoldChange where the mask f
# is True, and flip its sign everywhere else.
lfc = df['log2FoldChange']
df['o_over_y'] = lfc.where(f, -lfc)

In [16]:
# Keep comparisons with padj < 0.05 whose two group medians differ, so that
# the direction of the median difference is defined.
is_significant = df['padj'] < 0.05
medians_differ = df['median_oldest'] != df['median_youngest']
test_dummy = df[is_significant & medians_differ]

In [17]:
# Subset in which the oldest group's median exceeds the youngest group's.
f = test_dummy['median_oldest'].gt(test_dummy['median_youngest'])
h = test_dummy.loc[f]

In [18]:
# Count rows of h with a negative o_over_y, per tissue and pfu.
# NOTE(review): in the output recorded with this notebook, Heart contributes
# most of these rows (67) -- worth checking manually.
h[h['o_over_y']<0].groupby(['tissue', 'pfu']).size()

tissue  pfu  
AM      0.0      14
        10.0      1
        150.0     4
Heart   0.0      67
WAT     0.0       2
dtype: int64

In [19]:
# Count rows of h with a positive o_over_y, per tissue and pfu.
h[h['o_over_y']>0].groupby(['tissue', 'pfu']).size()

tissue      pfu  
AM          0.0       533
            10.0       16
            150.0     269
AT2         0.0       382
            10.0       55
            150.0      38
Adrenal     0.0      2077
BAT         0.0       511
Blood       0.0       113
            10.0      414
            150.0     173
Brain       0.0        21
Cerebellum  0.0        60
Esophagus   0.0       202
GutEP       0.0       139
Heart       0.0        46
Kidney      0.0      3745
LI          0.0       100
Liver       0.0        37
Lung        0.0       350
            10.0        6
            150.0      14
MoDC        0.0        33
            10.0       16
            150.0      67
MuscSat     0.0       255
SI          0.0       563
Skin        0.0       110
Stomach     0.0       934
WAT         0.0       828
dtype: int64

In [20]:
# Show the individual rows of h with a negative o_over_y for manual inspection.
h[h['o_over_y']<0]

Unnamed: 0,gene_ncbi,gene_ensembl,log2FoldChange,pvalue,padj,tissue,pfu,dividend,divisor,oldest,youngest,median_dividend,median_divisor,median_oldest,median_youngest,o_over_y
19780,67488.0,ENSMUSG00000023055,-0.741014,0.000048,0.013235,AM,0.0,24.0,9.0,24.0,9.0,63.584425,53.989913,63.584425,53.989913,-0.741014
19794,12642.0,ENSMUSG00000050370,-1.006915,0.000264,0.046563,AM,0.0,24.0,9.0,24.0,9.0,34.138781,27.031120,34.138781,27.031120,-1.006915
973430,231506.0,ENSMUSG00000035310,-0.881801,0.001008,0.049303,AM,0.0,18.0,4.0,18.0,4.0,27.857952,23.530825,27.857952,23.530825,-0.881801
973432,107823.0,ENSMUSG00000057406,-0.570564,0.000998,0.049303,AM,0.0,18.0,4.0,18.0,4.0,61.099314,55.346720,61.099314,55.346720,-0.570564
1407455,244667.0,ENSMUSG00000043051,-3.860493,0.000318,0.034084,AM,150.0,24.0,4.0,24.0,4.0,6.080569,5.654156,6.080569,5.654156,-3.860493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3811286,22344.0,ENSMUSG00000018377,-0.697088,0.001308,0.044168,AM,0.0,24.0,4.0,24.0,4.0,45.364241,43.472756,45.364241,43.472756,-0.697088
3811289,74343.0,ENSMUSG00000027936,-0.640096,0.001403,0.046233,AM,0.0,24.0,4.0,24.0,4.0,52.476784,51.691882,52.476784,51.691882,-0.640096
4712907,57740.0,ENSMUSG00000015981,-5.467417,0.000013,0.042147,AM,10.0,18.0,9.0,18.0,9.0,5.154313,4.582718,5.154313,4.582718,-5.467417
5562351,216350.0,ENSMUSG00000034127,-6.350430,0.000005,0.002217,WAT,0.0,18.0,4.0,18.0,4.0,1.612216,1.218997,1.612216,1.218997,-6.350430


In [21]:
# Subset in which the oldest group's median is below the youngest group's.
f = test_dummy['median_oldest'].lt(test_dummy['median_youngest'])
h = test_dummy.loc[f]

In [22]:
# Count rows of h with a negative o_over_y, per tissue and pfu.
h[h['o_over_y']<0].groupby(['tissue', 'pfu']).size()

tissue      pfu  
AM          0.0       251
            10.0        5
            150.0     154
AT2         0.0       340
            10.0       82
            150.0      16
Adrenal     0.0      1773
BAT         0.0       193
Blood       0.0        72
            10.0      103
            150.0     114
Brain       0.0         8
Cerebellum  0.0         3
Esophagus   0.0       179
GutEP       0.0       237
Heart       0.0        11
Kidney      0.0      3934
LI          0.0       142
Liver       0.0        12
Lung        0.0       469
            150.0       8
MoDC        0.0        23
            10.0       10
            150.0      52
MuscSat     0.0       355
SI          0.0       250
Skin        0.0        70
Stomach     0.0       597
WAT         0.0       757
dtype: int64

In [23]:
# Count rows of h with a positive o_over_y, per tissue and pfu.
# NOTE(review): in the output recorded with this notebook, Heart again
# contributes most of these rows (130).
h[h['o_over_y']>0].groupby(['tissue', 'pfu']).size()

tissue  pfu  
AM      0.0       17
        150.0      2
Heart   0.0      130
dtype: int64

# Finally: export

In [24]:
# Write the pooled table (all comparisons plus the sanity-check columns added
# above) as a gzip-compressed CSV without the row index.
p = inout.get_internal_path('datasets/tstoeger/200505_pooled_differential_expression_legacy/age_groups.csv.gz')
# Presumably creates the parent directory of p if it is missing -- confirm in aging_tools.inout.
inout.ensure_presence_of_directory(p)
df.to_csv(p, compression='gzip', index=False)