In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# (Re)load the autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Figure export settings: write fonts into PDFs as Type 42 (TrueType) and use
# Arial as the font family for all text.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [3]:
import os
import sys

# Relative location of the `src` directory that the imports below are resolved from.
src_dir = './../../src/'
# The first sys.path entry is overwritten (not prepended to), so src_dir is
# searched before every other location.
# NOTE(review): this discards whatever sys.path[0] held before — confirm that
# is intended rather than sys.path.insert(0, src_dir).
sys.path[0] = src_dir

import matplotlib.pyplot as plt

import glob

import numpy as np
import pandas as pd
import seaborn as sns

from collections import OrderedDict

from access_biology_data import meta, relations, properties, annotation
from access_literature_data import medline
from access_science_shared import standardizer, mapper

In [4]:
# Additionally make modules under ./../src/ importable; appended, so it is
# searched after all existing sys.path entries.
sys.path.append('./../src/')
from aging_tools import inout, export
from access_aging_data import earlier_studies

In [5]:
from narrative import nar181026_heat_confidence_genes, nar190402_features

In [6]:
from random import shuffle
from scipy.stats import spearmanr
from sklearn.ensemble import GradientBoostingRegressor

In [7]:
# Number of cross-validation folds the genes of each condition are split into.
bins = 4
# Value required in the `older` column of the differential-expression table,
# i.e. the older age of the comparison that is analysed.
target_age = 24

In [8]:
# NCBI Taxonomy identifier of the organism analysed (10090 = Mus musculus, mouse).
taxon_id = 10090

In [9]:
# Reference gene set for this taxon; used below to restrict both the feature
# table and the differential-expression table to the same genes.
# NOTE(review): the meaning of the 'orp' selector is defined in
# `standardizer.reference_genes` — confirm which gene set it picks.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

In [10]:
# Feature table for this taxon: the index is matched against gene identifiers
# below and every column is used as one predictor.
# NOTE(review): presumably includes miRNA-derived features, judging by the
# helper's name — confirm against `nar190402_features`.
all_features = nar190402_features.agg_features_190406_with_mirnas(taxon_id=taxon_id)

  df = pd.read_table(p)
  df = pd.read_table(p, names=['mirna', 'rna_ncbi', 'score'])


In [11]:
def z_score(x):
    """Standardize ``x``: subtract its mean and divide by its standard deviation.

    Both statistics are computed with ``np.mean`` and ``np.std`` using their
    default arguments. The result has the same shape as ``x``.
    """
    centre = np.mean(x)
    spread = np.std(x)
    return (x - centre) / spread

In [12]:
# Standardize every feature column across ALL genes first, and only afterwards
# keep the rows that belong to the reference gene set.
all_features = (
    all_features
    .apply(z_score, axis=0)
    .loc[lambda frame: frame.index.isin(ref_genes)]
)

In [13]:
# Load the differential-expression table and its companion objects, using
# 'padj' as the detection column.
all_de, detection, mice_in_comparisons, triplicate_series = (
    nar181026_heat_confidence_genes.load_detection(detection_column='padj')
)

# Tissues that are included in the analysis.
tissues_to_consider = [
    'BAT', 'Blood', 'GutEP', 'MuscSat', 'Adrenal', 'Kidney',
    'Heart', 'Liver', 'Skin', 'Stomach', 'Brain', 'SI',
    'LI', 'Cerebellum', 'WAT', 'Esophagus', 'Lung',
]

# Keep only the rows for the selected tissues and reference genes, with
# pfu == 0, de_unit == 'both', and the comparison of age 4 against target_age.
# NOTE(review): pfu == 0 presumably selects the uninfected animals — confirm.
all_de = all_de.loc[
    lambda de: (
        de['tissue'].isin(tissues_to_consider)
        & de['pfu'].eq(0)
        & de['de_unit'].eq('both')
        & de['gene_ncbi'].isin(ref_genes)
        & de['younger'].eq(4)
        & de['older'].eq(target_age)
    )
]

In [14]:
# Order the rows by ascending `younger` and, within that, descending `older`.
all_de = all_de.sort_values(by=['younger', 'older'], ascending=[True, False])

In [15]:
%%time

# `bins`-fold cross-validated prediction of the `o_over_y` column from the
# gene features, run separately for every condition in `all_de`.
#
# For each condition the genes are shuffled and cut into `bins` consecutive
# slices. Each slice is held out once while a gradient-boosting regressor is
# trained on the remaining genes. Per fold the notebook exports:
#   * the held-out predictions next to the observed values,
#   * the feature importances of the fitted model,
#   * the Spearman correlation between observed and predicted values.
# NOTE(review): `o_over_y` is presumably the older-over-younger fold change —
# confirm against `load_detection`.

# Fixed seed so that the fold assignment and the fitted models are the same
# after a kernel restart.
random_seed = 0
shuffle_rng = np.random.RandomState(random_seed)

# NOTE(review): the per-gene values are written to a "..._mouse_24" folder,
# while importances and fold scores go to "..._mouse". Both paths are kept
# exactly as they were — confirm the difference is intended.
values_dir = '191023_four_fold_cross_predict_fold_change_mouse_24'
summary_dir = '191023_four_fold_cross_predict_fold_change_mouse'

for condition in all_de['condition'].unique():
    print(condition)

    # One row per gene for this condition; a duplicated gene raises here.
    d = all_de[all_de['condition'] == condition].copy().set_index(
        'gene_ncbi',
        verify_integrity=True)

    # Keep only genes that also have features, then pick their feature rows.
    d = d[d.index.isin(all_features.index)]
    g = all_features.loc[d.index, :]

    # Shuffle the genes so that consecutive slices form random folds, and
    # bring the measurements into the same order as the features.
    g = g.sample(frac=1, replace=False, random_state=shuffle_rng)
    d = d.reindex(g.index)

    # Work on positional arrays from here on, but remember the gene
    # identifiers so that the exported rows can be traced back to genes.
    gene_ids = g.index.values
    feature_matrix = g.values
    target = d['o_over_y'].values
    positions = np.arange(len(g))

    # Fold j covers the positions bin_borders[j] <= p < bin_borders[j + 1].
    bin_borders = [int(np.ceil(x)) for x in np.linspace(0, len(g), bins + 1)]

    importances = pd.DataFrame(index=g.columns, columns=range(bins))
    predictions = pd.DataFrame(index=[condition], columns=range(bins))

    for j in range(bins):

        held_out = (positions >= bin_borders[j]) & (positions < bin_borders[j + 1])
        training = ~held_out

        rf = GradientBoostingRegressor(loss='huber', random_state=random_seed)

        # Fit and predict on plain arrays, so that both calls see the same
        # kind of input.
        rf.fit(feature_matrix[training], target[training])

        t = rf.predict(feature_matrix[held_out])
        importances.loc[:, j] = rf.feature_importances_

        observed = target[held_out]
        r = spearmanr(observed, t)[0]
        predictions.loc[condition, j] = r

        # `gene_ncbi` carries the real gene identifiers of the held-out rows;
        # `boot` is the number of the fold they were held out in.
        v = pd.DataFrame(
            {
                'gene_ncbi': gene_ids[held_out],
                'predicted': t,
                'boot': j,
                'observed': observed,
            },
            columns=['gene_ncbi', 'predicted', 'boot', 'observed'],
        )

        export.export_full_frame(
            '{}/values_{}_{}.csv'.format(values_dir, condition, j),
            v,
            insert_date_time=False,
            save_index=False
        )

    importances = importances.rename_axis('feature')

    export.export_full_frame(
        '{}/features_{}.csv'.format(summary_dir, condition),
        importances,
        insert_date_time=False
    )
    export.export_full_frame(
        '{}/predictions_{}.csv'.format(summary_dir, condition),
        predictions,
        insert_date_time=False
    )

BAT_0_4_24
Stomach_0_4_24
GutEP_0_4_24
Blood_0_4_24
WAT_0_4_24
Lung_0_4_24
Skin_0_4_24
Esophagus_0_4_24
LI_0_4_24
Brain_0_4_24
Heart_0_4_24
Kidney_0_4_24
Liver_0_4_24
MuscSat_0_4_24
Adrenal_0_4_24
SI_0_4_24
Cerebellum_0_4_24
CPU times: user 34min 41s, sys: 14.4 s, total: 34min 56s
Wall time: 34min 19s
