In [1]:
import os
import sys

src_dir = './../src/'
sys.path[0] = src_dir

import matplotlib.pyplot as plt

import glob

import numpy as np
import pandas as pd
import seaborn as sns

from access_biology_data import meta, relations, properties
from access_literature_data import medline
from access_science_shared import standardizer 

In [2]:
sys.path.append('./../src/')
from aging_tools import inout, export
from access_aging_data import companions

In [3]:
outfolder = '211007_chache_tstoeger_190427_gtex_with_tilt_gene'

In [4]:
save_images = True

In [5]:
ref_genes = standardizer.reference_genes(9606, 'orp')

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [6]:
all_de = companions.tstoeger_190427_gtex()

In [7]:
%%time
if save_images:
    export.export_full_frame(
        '{}/all_de.csv'.format(outfolder), 
        all_de, 
        insert_date_time=False,
        save_index=False)

CPU times: user 2min 46s, sys: 2.85 s, total: 2min 49s
Wall time: 2min 52s


In [8]:
all_de.loc[:, 'reduced_tissue'] = all_de['tissue'].str.replace(
    ' - ', '_').str.replace(' ', '_').str.replace(
    '-', '_').str.replace('(','').str.replace(')', '')

all_de.loc[:, 'condition'] = all_de.loc[:, 'gender'] + 's_' + all_de['reduced_tissue'] + '_' + \
    all_de['older'].astype(int).astype(str) + '_vs_' + all_de['younger'].astype(int).astype(str)

In [9]:
from natsort import natsorted

In [10]:
conditions = natsorted(all_de['condition'].unique())

In [11]:
out = pd.DataFrame(index=conditions, columns=[
    'tilt', 'pval', 'tilt_orp', 'pval_orp'
]).rename_axis('condition')

In [12]:
pr = properties.genbank_gene(9606)[
    ['gene_ncbi', 'Genbank__gene: SumACGT']].rename(columns={
    'Genbank__gene: SumACGT': 'length'
})

pr.loc[:, 'log_length'] = np.log10(pr['length'])

In [13]:
from scipy.stats import spearmanr

In [14]:
%%time

for condition in conditions:
    d = all_de[
        all_de['condition']==condition
    ]

    d = d[['gene_ncbi', 'o_over_y']].dropna()
    d = pd.merge(d, pr)

    r, p = spearmanr(d['log_length'], d['o_over_y'])
    out.loc[condition, 'tilt'] = r
    out.loc[condition, 'pval'] = p
    
    d = d[d['gene_ncbi'].isin(ref_genes)]
    r, p = spearmanr(d['log_length'], d['o_over_y'])
    out.loc[condition, 'tilt_orp'] = r
    out.loc[condition, 'pval_orp'] = p

CPU times: user 23min 8s, sys: 2min 11s, total: 25min 19s
Wall time: 28min 19s


In [15]:
out = out.reset_index()

In [16]:
mega = pd.merge(out, all_de[['condition', 'gender', 'tissue', 'younger', 'older']].drop_duplicates())

In [21]:
if save_images:
    export.export_full_frame(
        '{}/stats_on_imbalance.csv'.format(outfolder), 
        mega, 
        insert_date_time=False,
        save_index=False)
    