In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# (Re)load the autoreload extension and use mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Figure export defaults: font type 42 embeds TrueType fonts in PDFs (text stays
# editable in vector-graphics tools); Arial is used as the figure font.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [3]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr

In [4]:
from access_biology_data import meta, relations, annotation, properties
from access_literature_data import medline
from access_science_shared import standardizer

import sys
# Make the project-local ``src`` directory importable for the imports that follow.
# The path is relative, so it resolves against the kernel's current working
# directory: the notebook must be started from its own folder.
sys.path.append('./../src/')
from aging_tools import inout, export, calculators
from access_aging_data import chaperome, earlier_studies, sequencing


In [5]:
import random

In [6]:
import matplotlib
# Base font size for all figures produced by this notebook.
matplotlib.rcParams['font.size'] = 10

In [7]:
# Folder (relative to the project's material store) holding the per-condition
# feature-importance tables that this notebook reads.
in_folder = '190406_predict_fold_change_mouse'

# Folder (relative to the project's export location) receiving the figures and
# tables written by this notebook.
out_folder = '211018_significance_of_feature_importances'

In [8]:
# Master switch for writing results to disk. Despite its name it guards both the
# figure export and the Excel table exports further down in this notebook.
save_images = False

In [9]:
from narrative import nar181026_heat_confidence_genes

In [10]:
# Reference gene set; later used to restrict the differential-expression table
# via ``gene_ncbi.isin(ref_genes)``.
# NOTE(review): 10090 is presumably the NCBI taxonomy id of mouse and 'orp' a
# standardizer-specific selector — confirm against access_science_shared.
ref_genes = standardizer.reference_genes(10090, 'orp')  # only consider supported protein coding-genes

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [11]:
# Load the differential-expression results and companion tables, using the
# adjusted p-value column ('padj') as the detection column.
(
    all_de,
    detection,
    mice_in_comparisons,
    triplicate_series,
) = nar181026_heat_confidence_genes.load_detection(detection_column='padj')

# Tissue codes (as used in the 'tissue' column) that are kept for the analysis.
tissues_to_consider = [
    'BAT', 'Blood', 'GutEP', 'MuscSat', 'Adrenal', 'Kidney',
    'Heart', 'Liver', 'Skin', 'Stomach', 'Brain', 'SI',
    'LI', 'Cerebellum', 'WAT', 'Esophagus', 'Lung',
]

# Values of the 'pfu' column that are kept.
pfus = [0]

In [12]:
# Restrict the differential-expression table to the comparisons analysed here:
# younger age 4, detected genes, de_unit 'both', the selected tissues and pfu
# values, and genes contained in the reference gene set.
#
# .copy() yields an independent frame. Without it, adding the 'significant'
# column below writes into a boolean-indexed slice, which raises
# SettingWithCopyWarning and depends on pandas' view-versus-copy behaviour.
all_de = all_de[
    (all_de['younger'] == 4) &
    (all_de['is_detected'] == True) &  # explicit comparison: anything that is not True is excluded
    (all_de['de_unit'] == 'both') &
    (all_de['tissue'].isin(tissues_to_consider)) &
    (all_de['pfu'].isin(pfus)) &
    (all_de['gene_ncbi'].isin(ref_genes))
].copy()

# Flag genes whose adjusted p-value is below 0.05.
all_de['significant'] = all_de['padj'] < 0.05

In [13]:
# Collect the feature importances of every condition in long format. Each frame
# appended to ``agg`` has the columns
# feature, iteration, importance, ranked_importance, condition
# and holds the rows of exactly one (condition, iteration) pair.
agg = []

for condition in all_de['condition'].unique():

    p = export.get_material_path(
        '{}/features_{}.csv'.format(in_folder, condition))

    # Conditions without an importance table on disk are skipped silently.
    if not os.path.exists(p):
        continue

    # Wide table (one row per feature, every other column treated as one
    # iteration) -> long table with one row per (feature, iteration).
    h = (
        pd.read_csv(p)
        .set_index('feature')
        .stack()
        .rename_axis(['feature', 'iteration'])
        .to_frame('importance')
        .reset_index()
    )

    # A single groupby pass replaces re-filtering ``h`` once per iteration;
    # sort=False keeps the iterations in their order of appearance.
    for _, per_iteration in h.groupby('iteration', sort=False):
        ranked = per_iteration.assign(
            # Rank 1 = most important feature within this iteration.
            ranked_importance=per_iteration['importance'].rank(ascending=False),
            condition=condition,
        )
        # sort_values returns a new frame rather than sorting in place, so its
        # result has to be kept for the ordering to take effect.
        agg.append(ranked.sort_values('ranked_importance'))

In [14]:
# Stack the per-iteration frames of all conditions into one long table.
# Index labels are carried over from the individual frames, so they may repeat
# across conditions. pd.concat raises ValueError when ``agg`` is empty, i.e.
# when no importance table was found for any condition.
d = pd.concat(agg)

In [15]:
# Features highlighted in the box plot below (also used as its hue order).
# NOTE(review): the names suggest nucleotide totals, i.e. sequence lengths, of
# the full transcript, the gene and the coding sequence — confirm.
interest = [
    'rna_full_SumACGT',
    'gene_SumACGT',
    'rna_cds_SumACGT',
]

In [16]:
# One row per distinct combination of condition and its comparison metadata;
# used to annotate the importance table.
helper = (
    all_de
    .loc[:, ['condition', 'tissue', 'younger', 'older', 'pfu']]
    .drop_duplicates()
)

In [17]:
# Sanity check: number of distinct comparisons per pfu value.
helper['pfu'].value_counts()

0.0    68
Name: pfu, dtype: int64

In [18]:
# Annotate every importance row with the metadata of its condition.
# The join key is spelled out instead of relying on pandas' implicit
# "all shared columns" default. validate= raises MergeError if one condition
# maps to several metadata rows, which would otherwise silently duplicate
# importance values.
g = pd.merge(d, helper, on='condition', how='inner', validate='many_to_one')

In [19]:
# Tissue code -> display name used in figures and exported tables
# (listed alphabetically by code).
renamer = {
    'Adrenal': 'Adrenal Gland',
    'BAT': 'Brown adipose tissue',
    'Blood': 'Blood',
    'Brain': 'Frontal cortex',
    'Cerebellum': 'Cerebellum',
    'Esophagus': 'Esophagus',
    'GutEP': 'Gut epithelium',
    'Heart': 'Heart',
    'Kidney': 'Kidney',
    'LI': 'Large intestine',
    'Liver': 'Liver',
    'Lung': 'Lung',
    'MuscSat': 'Muscle',
    'SI': 'Small intestine',
    'Skin': 'Skin',
    'Stomach': 'Stomach',
    'WAT': 'White adipose tissue',
}

In [20]:
# Replace tissue codes by their display names; values without an entry in
# ``renamer`` are left unchanged.
g['tissue'] = g['tissue'].replace(to_replace=renamer)

In [None]:
# Spread of the importance of the features of interest across iterations,
# per tissue, for the comparison of age 4 against age 24.
boxplot_data = g[
    g['feature'].isin(interest) &
    (g['older'] == 24) &
    (g['younger'] == 4)
]

fig, ax = plt.subplots(figsize=(10, 20))
sns.boxplot(
    data=boxplot_data,
    x='importance',
    y='tissue',
    hue='feature',
    # Tissue order comes from the full table, so every tissue gets a row.
    order=sorted(g['tissue'].unique()),
    hue_order=interest,
    notch=True,
    ax=ax,
)

if save_images:
    export.export_image(
        '{}/effect_sizes_variabilty.pdf'.format(out_folder))



In [None]:
# Annotate every importance row with the metadata of its condition.
#
# Metadata columns left over from a previous run of this cell are dropped
# first, and the join key is explicit. With the implicit "all shared columns"
# join, a second run would also join on the already renamed 'tissue' column and
# silently lose every row whose tissue name had changed.
# validate= raises MergeError if one condition maps to several metadata rows.
d = pd.merge(
    d.drop(columns=helper.columns.drop('condition'), errors='ignore'),
    helper,
    on='condition',
    how='inner',
    validate='many_to_one',
)

In [None]:
# Replace tissue codes by their display names; values without an entry in
# ``renamer`` are left unchanged.
d['tissue'] = d['tissue'].replace(to_replace=renamer)

In [None]:
# Readable condition label "<tissue display name>_<older age>"; the age is cast
# to int first so it carries no decimal part.
d['condition_n'] = d['tissue'] + (
    '_' + d['older'].astype(int).astype(str)
)

In [None]:
# Mean importance over iterations for every (condition, feature) pair, reshaped
# into a features x conditions matrix.
e = (
    d
    .groupby(['condition_n', 'feature'])['importance']
    .mean()
    .reset_index()
    .pivot(index='feature', columns='condition_n', values='importance')
)

In [None]:
# Order the condition columns by tissue name and then by age as a number.
# Labels have the form "<tissue>_<integer age>". A plain string sort would
# misplace ages with a different number of digits (e.g. "9" after "24"), so the
# age suffix is compared numerically. The result is unchanged when all ages
# have the same number of digits.
e = e.reindex(
    columns=sorted(
        e.columns,
        key=lambda label: (label.rsplit('_', 1)[0], int(label.rsplit('_', 1)[1])),
    )
)

In [None]:
# Median, across conditions, of each feature's rank within a condition
# (rank 1 = highest mean importance), sorted from best to worst.
u = (
    e
    .rank(ascending=False)
    .median(axis=1)
    .sort_values()
)

In [None]:
# Order the rows (features) by their median rank across conditions.
e = e.reindex(u.index, axis=0)

In [None]:
# Write the mean importances and their within-condition ranks to Excel.
# Note that the ``save_images`` switch also governs these table exports.
if save_images:
    frames_to_export = [
        ('{}/importances.xlsx', e),
        ('{}/importances_ranked.xlsx', e.rank(ascending=False)),
    ]
    for path_template, frame in frames_to_export:
        export.export_full_frame(path_template.format(out_folder), frame)