In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'

In [3]:
import os
import random
import sys

src_dir = './../src/'
sys.path[0] = src_dir

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from copy import deepcopy
from scipy.stats import spearmanr


from access_biology_data import meta
from aging_tools import inout

In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [5]:
sys.path.append('./../src/')
from aging_tools import inout, export
from access_aging_data import chaperome, earlier_studies, companions, sequencing
from access_biology_data import annotation, relations

# Construct sanity check: get median expression for every gene

In [6]:
# Get aging expression data, filter, and map to ncbi
df_counts, df_meta, df_genes = sequencing.load_cached_aging_map(
    dataset_name='aging_map_tmm_180105',
    unambiguous_to_entrez=True,
    as_entrez=True
)

In [7]:
%%time
agg = []
for pfu in df_meta['pfu'].unique():
    f_pfu = df_meta['pfu']==pfu
    for tissue in df_meta['tissue'].unique():
        f_tissue = df_meta['tissue']==tissue
        for age in df_meta['age'].unique():
            f_age = df_meta['age']==age
            f = f_pfu & f_tissue & f_age
            
            if any(f):
                d = df_counts.loc[:, f]
                d = d.median(1).to_frame('median')
                d.loc[:, 'tissue'] = tissue
                d.loc[:, 'pfu'] = pfu
                d.loc[:, 'age'] = age
                d = d.reset_index()

                agg.append(d)
                
df_median_counts = pd.concat(agg)                

CPU times: user 1.05 s, sys: 128 ms, total: 1.17 s
Wall time: 1.18 s


In [8]:
import glob

In [9]:
p = inout.get_internal_path(
    'dynamic/tstoeger/181022_inclusive_any_detected/DE/Flu/*.csv')
d = glob.glob(p)

files_to_process = pd.DataFrame(columns=['path'], data = d)
files_to_process['base_name'] = files_to_process['path'].str.extract('.*/(.*).csv', expand=False)
files_to_process[['tissue', 'pfu', 'dividend', 'divisor']] = files_to_process['base_name'].str.extract(
    '^(.*)_pfu_(.*)_ages_(.*) (.*)_DE', expand=False)
files_to_process = files_to_process.set_index('base_name', verify_integrity=True)

agg = []
for j, v in files_to_process.iterrows():
    
    df = pd.read_csv(v['path'], usecols=['Symbol', 'log2FoldChange', 'pvalue', 'padj'])
    tags = ['tissue', 'pfu', 'dividend', 'divisor']
    for tag in tags:
        df.loc[:, tag] = v[tag]

    agg.append(df)

In [10]:
df = pd.concat(agg, axis=0)
df = df.rename(columns={'Symbol': 'gene_ensembl'})

for x in ['pfu', 'dividend', 'divisor']:
    df.loc[:, x] = df.loc[:, x].apply(float)

In [11]:
# Add ncbi gene IDs
df = pd.merge(
    df,
    df_genes[['gene_ensembl', 'gene_ncbi']], how='left').set_index('gene_ncbi').reset_index()

# Add some sanity checks and manually check discrepancies

In [12]:
df.loc[:, 'oldest'] = df.loc[:, ['dividend', 'divisor']].max(1)
df.loc[:, 'youngest'] = df.loc[:, ['dividend', 'divisor']].min(1)

In [13]:
for e in ['dividend', 'divisor', 'oldest', 'youngest']:
    df = pd.merge(
        df, 
        df_median_counts.rename(columns={'median': 'median_{}'.format(e)}), 
        left_on=['gene_ncbi', 'tissue', 'pfu', e],
        right_on=['gene_ncbi', 'tissue', 'pfu', 'age'],
        how='left'
    ).drop('age', 1)

In [14]:
f = df['dividend'] > df['divisor']

In [15]:
df.loc[f, 'o_over_y'] = df.loc[f, 'log2FoldChange']
df.loc[~f, 'o_over_y'] = -df.loc[~f, 'log2FoldChange']

In [16]:
test_dummy = df[(df['padj']<0.05) & ~(df['median_oldest'] == df['median_youngest'])]

In [17]:
f = test_dummy['median_oldest'] > test_dummy['median_youngest']
h = test_dummy[f]

In [18]:
h[h['o_over_y']<0].groupby(['tissue', 'pfu']).size()

tissue  pfu  
AM      0.0      14
        10.0      1
        150.0     4
AT2     10.0      2
Heart   0.0      67
WAT     0.0       2
dtype: int64

In [19]:
h[h['o_over_y']>0].groupby(['tissue', 'pfu']).size()

tissue      pfu  
AM          0.0       533
            10.0       16
            150.0     236
AT2         0.0       241
            10.0      502
            150.0      38
Adrenal     0.0      1986
BAT         0.0       392
Blood       0.0       180
            10.0      414
            150.0      79
Brain       0.0         3
Cerebellum  0.0        60
Esophagus   0.0       202
GutEP       0.0       139
Heart       0.0         4
Kidney      0.0      5717
LI          0.0       122
Liver       0.0        40
Lung        0.0       189
            10.0       45
            150.0      14
MoDC        0.0        26
            10.0       16
            150.0     104
MuscSat     0.0        38
SI          0.0       563
Skin        0.0        21
Stomach     0.0       946
WAT         0.0       828
dtype: int64

In [20]:
h[h['o_over_y']<0]

Unnamed: 0,gene_ncbi,gene_ensembl,log2FoldChange,pvalue,padj,tissue,pfu,dividend,divisor,oldest,youngest,median_dividend,median_divisor,median_oldest,median_youngest,o_over_y
19787,67488.0,ENSMUSG00000023055,-0.741014,4.788168e-05,0.013241,AM,0.0,24.0,9.0,24.0,9.0,63.584425,53.989913,63.584425,53.989913,-0.741014
19801,12642.0,ENSMUSG00000050370,-1.006915,2.637548e-04,0.046586,AM,0.0,24.0,9.0,24.0,9.0,34.138781,27.031120,34.138781,27.031120,-1.006915
979204,231506.0,ENSMUSG00000035310,-0.881801,1.008081e-03,0.049322,AM,0.0,18.0,4.0,18.0,4.0,27.857952,23.530825,27.857952,23.530825,-0.881801
979206,107823.0,ENSMUSG00000057406,-0.570564,9.978300e-04,0.049322,AM,0.0,18.0,4.0,18.0,4.0,61.099314,55.346720,61.099314,55.346720,-0.570564
1415169,244667.0,ENSMUSG00000043051,-3.860493,3.184160e-04,0.034098,AM,150.0,24.0,4.0,24.0,4.0,6.080569,5.654156,6.080569,5.654156,-3.860493
1774150,235048.0,ENSMUSG00000062794,-5.227847,3.241771e-05,0.006362,AM,150.0,24.0,12.0,24.0,12.0,4.623886,3.320520,4.623886,3.320520,-5.227847
1774173,100986.0,ENSMUSG00000040407,-0.844179,4.114675e-04,0.040353,AM,150.0,24.0,12.0,24.0,12.0,38.166417,37.464896,38.166417,37.464896,-0.844179
2374820,71886.0,ENSMUSG00000028396,1.322954,2.802583e-08,0.000034,Heart,0.0,4.0,18.0,18.0,4.0,148.386597,378.983288,378.983288,148.386597,-1.322954
2374821,17105.0,ENSMUSG00000069516,0.945248,2.138327e-05,0.002900,Heart,0.0,4.0,18.0,18.0,4.0,219.816158,432.715459,432.715459,219.816158,-0.945248
2374822,18158.0,ENSMUSG00000029019,1.067486,7.580457e-05,0.007117,Heart,0.0,4.0,18.0,18.0,4.0,663.414752,1323.297606,1323.297606,663.414752,-1.067486


In [21]:
f = test_dummy['median_oldest'] < test_dummy['median_youngest']
h = test_dummy[f]

In [22]:
h[h['o_over_y']<0].groupby(['tissue', 'pfu']).size()

tissue      pfu  
AM          0.0       251
            10.0        5
            150.0     117
AT2         0.0       215
            10.0      402
            150.0      16
Adrenal     0.0      1403
BAT         0.0       224
Blood       0.0        49
            10.0      103
            150.0     108
Brain       0.0         8
Cerebellum  0.0         3
Esophagus   0.0       178
GutEP       0.0       246
Heart       0.0         2
Kidney      0.0      5275
LI          0.0       154
Liver       0.0        13
Lung        0.0       228
            10.0       50
            150.0       8
MoDC        0.0         7
            10.0        9
            150.0      88
MuscSat     0.0        18
SI          0.0       250
Skin        0.0        34
Stomach     0.0       601
WAT         0.0       752
dtype: int64

In [23]:
h[h['o_over_y']>0].groupby(['tissue', 'pfu']).size()

tissue  pfu  
AM      0.0       17
        150.0      2
Heart   0.0      130
Kidney  0.0        2
dtype: int64

# Finally: export

In [24]:
p = inout.get_internal_path('datasets/tstoeger/181024_pooled_differential_expression_inclusive_de_any_detected/age_groups.csv.gz')
inout.ensure_presence_of_directory(p)
df.to_csv(p, compression='gzip', index=False)