In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Session-wide matplotlib settings: PDF font type and default font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [3]:
import gc
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from copy import deepcopy
from random import shuffle
from scipy.stats import spearmanr

from access_biology_data import lincs, properties
from access_science_shared import standardizer

sys.path.append('./../src/')
from aging_tools import inout, export

from access_biology_data import annotation
from access_biology_data import meta

In [4]:
from random import shuffle
from scipy.stats import spearmanr
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
from narrative import nar181026_heat_confidence_genes, nar190402_features

In [6]:
# Folder component prepended to every file name handed to
# export.export_full_frame further down in this notebook.
outfolder = '190428_predict_gtex_with_mirnas'

In [7]:
# age_filter = '_4_vs_[23]'
# gender = 'males'

# Number of random train/test repetitions fitted for each condition.
num_boot = 5
# Number given to the first repetition. Repetitions are numbered
# start_boot .. start_boot + num_boot - 1, and that number is written into
# the 'boot' column and the file name of each values_*.csv export.
start_boot = 0

In [8]:
# Taxon passed to the reference-gene and feature loaders.
# NOTE(review): 9606 is presumably the NCBI taxonomy ID for human - confirm.
taxon_id = 9606
# NOTE(review): not referenced anywhere in the visible part of this notebook.
save_images = True

# Which meta batch (0 .. meta_batches - 1) of conditions this run processes;
# all other conditions are dropped before modelling.
current_meta_batch = 6

In [9]:
# Reference gene collection for this taxon. It is used below to restrict both
# the feature table and the expression table to the same gene universe.
# NOTE(review): the meaning of the 'orp' argument is defined in
# access_science_shared.standardizer - confirm there.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

In [10]:
# Load the aggregated feature table (indexed by gene) and keep only the rows
# whose index value belongs to the reference gene set.
all_features = nar190402_features.agg_features_190406_with_mirnas(
    taxon_id=taxon_id
)
all_features = all_features.loc[all_features.index.isin(ref_genes), :]

In [11]:
def z_score(x):
    """Standardize ``x`` to zero mean and unit standard deviation.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Numeric values. Only ``np.mean``, ``np.std`` and element-wise
        arithmetic are applied, so both container types work.

    Returns
    -------
    Same container type as ``x``
        ``(x - mean) / std`` with ``std`` computed by ``np.std`` (population
        standard deviation, ``ddof=0``). If the standard deviation is exactly
        zero, i.e. all values are identical, the centered values (all zeros)
        are returned instead of dividing by zero.
    """
    centered = x - np.mean(x)
    spread = np.std(x)
    # A constant input would otherwise become 0 / 0 = NaN in every position.
    if spread == 0:
        return centered
    return centered / spread

In [12]:
# Standardize every feature column on its own (axis=0 passes z_score one
# column at a time); the standardized table replaces the raw one.
all_features = all_features.apply(z_score, axis=0)

In [13]:
import glob

In [14]:
from access_aging_data import companions

In [15]:
# Load the table returned by companions.tstoeger_190427_gtex() and keep only
# rows whose 'younger' value is 2 or 3 and whose 'older' value is 4, 5, 6 or 7.
master = companions.tstoeger_190427_gtex()
master = master.loc[
    master['younger'].isin([2, 3]) & master['older'].isin([4, 5, 6, 7])
]

In [16]:
# One row per distinct (tissue, younger, older, gender) combination, with a
# fresh 0..n-1 index.
cond = (
    master.loc[:, ['tissue', 'younger', 'older', 'gender']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [17]:
# Number of groups the distinct conditions are split into; each run of this
# notebook processes only the group selected by current_meta_batch.
meta_batches = 8

In [18]:
# Deal the conditions out round-robin: the rows at positions b,
# b + meta_batches, b + 2 * meta_batches, ... of `cond` get batch number b.
for batch_id in range(meta_batches):
    rows_in_batch = cond.index.values[batch_id::meta_batches]
    cond.loc[rows_in_batch, 'meta_batch'] = batch_id

In [19]:
# Attach each row's batch number. No key is given, so pandas joins on the
# columns the two frames have in common.
master = master.merge(cond)

In [20]:
# Keep only the conditions assigned to this run's batch, then remove the
# helper column again. `columns=` is used because passing the axis
# positionally (`.drop('meta_batch', 1)`) is rejected by pandas >= 2.0.
master = master[master['meta_batch'] == current_meta_batch].drop(columns='meta_batch')

In [21]:
# File-name-safe tissue label: ' - ', ' ' and '-' become '_', and parentheses
# are removed. Every pattern is meant literally, so regex=False is passed
# explicitly. The default for `regex` differs between pandas versions, and
# '(' / ')' are not valid regular expressions on their own.
master.loc[:, 'reduced_tissue'] = (
    master['tissue']
    .str.replace(' - ', '_', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [22]:
# Condition label of the form '<gender>s_<reduced_tissue>_<older>_vs_<younger>';
# it identifies each condition in the loop and in the output file names.
master.loc[:, 'short_name'] = (
    master['gender'] + 's_' + master['reduced_tissue'] + '_'
    + master['older'].astype(int).astype(str)
    + '_vs_'
    + master['younger'].astype(int).astype(str)
)

In [23]:
%%time

# For every condition (one value of 'short_name'), fit gradient-boosted
# regressors that predict the per-gene 'o_over_y' value from the standardized
# gene features, once per repetition on a fresh random 90 % / 10 % split of
# the genes. Three kinds of CSV are exported under `outfolder`:
#   values_<condition>_<repetition>.csv  held-out predictions vs. observations
#   features_<condition>.csv             feature importances per repetition
#   predictions_<condition>.csv          Spearman correlation per repetition
#
# NOTE(review): random.shuffle is not seeded in the visible cells, so the
# splits (and therefore all exported numbers) differ from run to run.

# Repetition numbers used both as loop values and as result-column labels, so
# the result frames stay consistent when start_boot is not 0.
boot_ids = range(start_boot, start_boot + num_boot)

for short_name in master['short_name'].unique():

    print(short_name)

    # Rows of this condition that have an adjusted p-value.
    d = master[master['short_name'] == short_name].dropna(subset=['padj'])

    d = d[d['gene_ncbi'].isin(ref_genes)]
    # keep=False discards every gene that appears more than once, so the
    # index built next is guaranteed to be unique.
    d = d.drop_duplicates(subset=['gene_ncbi'], keep=False)
    d = d.set_index('gene_ncbi', verify_integrity=True)

    # Align targets and features on the genes present in both tables.
    # `.loc` with a list of labels already returns a new frame, so the full
    # feature table does not need to be copied first.
    d = d[d.index.isin(all_features.index)]
    g = all_features.loc[d.index, :]

    importances = pd.DataFrame(index=g.columns, columns=boot_ids)
    predictions = pd.DataFrame(index=[short_name], columns=boot_ids)

    for j in boot_ids:

        rf = GradientBoostingRegressor(loss='huber')

        # Random split of the genes: first 90 % train, remaining 10 % test.
        randix = g.index.values.copy()
        thr = int(np.floor(len(randix) * 0.9))
        shuffle(randix)
        train_genes = randix[:thr]
        test_genes = randix[thr:]

        rf.fit(
            g.loc[train_genes, :].values,
            d.loc[train_genes, 'o_over_y'].values
        )

        # Predict from a bare array as well, matching what fit() received.
        t = rf.predict(g.loc[test_genes, :].values)
        importances.loc[:, j] = rf.feature_importances_

        observed = d.loc[test_genes, 'o_over_y']
        r = spearmanr(observed, t)[0]
        predictions.loc[short_name, j] = r

        # Held-out genes with their predicted and observed values.
        v = pd.Series(index=test_genes, data=t).to_frame().reset_index().rename(
            columns={'index': 'gene_ncbi', 0: 'predicted'}
        )
        v.loc[:, 'boot'] = j
        v.loc[:, 'observed'] = observed.values

        export.export_full_frame(
            '{}/values_{}_{}.csv'.format(outfolder, short_name, j),
            v,
            insert_date_time=False,
            save_index=False
        )

    importances = importances.rename_axis('feature')

    export.export_full_frame(
        '{}/features_{}.csv'.format(outfolder, short_name),
        importances,
        insert_date_time=False
    )
    export.export_full_frame(
        '{}/predictions_{}.csv'.format(outfolder, short_name),
        predictions,
        insert_date_time=False
    )

males_Adipose_Subcutaneous_6_vs_3
males_Adipose_Visceral_Omentum_6_vs_3
males_Artery_Aorta_4_vs_2
males_Artery_Coronary_4_vs_2
males_Artery_Tibial_4_vs_2
males_Bladder_4_vs_2
males_Brain_Caudate_basal_ganglia_7_vs_2
males_Brain_Cerebellar_Hemisphere_7_vs_3
males_Brain_Cerebellum_7_vs_3
males_Brain_Cortex_7_vs_3
males_Brain_Frontal_Cortex_BA9_7_vs_3
males_Brain_Hypothalamus_7_vs_2
males_Brain_Nucleus_accumbens_basal_ganglia_7_vs_2
males_Brain_Spinal_cord_cervical_c_1_7_vs_2
males_Breast_Mammary_Tissue_7_vs_2
males_Colon_Sigmoid_7_vs_2
males_Colon_Transverse_7_vs_2
males_Esophagus_Gastroesophageal_Junction_7_vs_2
males_Esophagus_Mucosa_7_vs_2
males_Esophagus_Muscularis_7_vs_2
males_Heart_Atrial_Appendage_7_vs_2
males_Heart_Left_Ventricle_7_vs_2
males_Liver_4_vs_2
males_Lung_4_vs_2
males_Minor_Salivary_Gland_4_vs_2
males_Muscle_Skeletal_6_vs_2
males_Nerve_Tibial_6_vs_2
males_Pancreas_6_vs_2
males_Pituitary_4_vs_3
males_Prostate_5_vs_3
males_Skin_Not_Sun_Exposed_Suprapubic_6_vs_3
males_Ski