In [1]:
import os
import sys

# Put the project's source directory (given relative to the working directory)
# on the import search path. Note that this *overwrites* the first entry of
# sys.path rather than inserting a new one, so whatever was in slot 0 is lost.
# NOTE(review): presumably needed so the access_* packages imported below can
# be found — confirm.
src_dir = './../src/'
sys.path[0] = src_dir

import matplotlib.pyplot as plt

import glob

import numpy as np
import pandas as pd
import seaborn as sns

from access_biology_data import meta, relations, properties
from access_literature_data import medline
from access_science_shared import standardizer 

In [2]:
# Make sure the project source directory is importable for the aging_tools
# import that follows. Only append when the entry is missing, so re-running
# this cell does not keep growing sys.path with duplicate entries.
if './../src/' not in sys.path:
    sys.path.append('./../src/')
from aging_tools import inout, export

In [3]:
from os import listdir
from os.path import isfile, join

from scipy.stats import spearmanr

In [4]:
from access_science_shared import mapper

In [5]:
# Name of the output folder that exported result files are written under.
outfolder = '211007_cache_gtex_self_controls_cds'
# Switch that gates writing results to disk; set to False for a dry run.
save_images = True

In [6]:
# Reference set of genes; used further down to restrict the gene-length table
# via an `isin` filter on its 'gene_ncbi' column.
# NOTE(review): 9606 is presumably the NCBI taxonomy ID for human and 'orp' a
# named reference-set selector — confirm against standardizer.reference_genes.
ref_genes = standardizer.reference_genes(9606, 'orp')

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [7]:

# Gene-length table: keep the gene identifier and the CDS nucleotide-count
# column, rename the latter to 'length', add its log10 as 'log_length', and
# finally keep only genes that are part of the reference gene set.
pr = (
    properties.genbank_validated_rna(9606)
    .loc[:, ['gene_ncbi', 'Genbank_validated_RNA: cds_SumACGT']]
    .rename(columns={'Genbank_validated_RNA: cds_SumACGT': 'length'})
    .assign(log_length=lambda frame: np.log10(frame['length']))
    .loc[lambda frame: frame['gene_ncbi'].isin(ref_genes)]
)

In [8]:
# Map each gender label to the resolved folder holding its result tables.
# The relative locations are resolved through inout.get_internal_path, in the
# same order as listed (male first, then female).
input_folders = {
    gender: inout.get_internal_path(relative_path)
    for gender, relative_path in (
        ('male', 'dynamic/tstoeger/200609_gtex_m/DE/Flu'),
        ('female', 'dynamic/tstoeger/200609_gtex_f/DE/Flu'),
    )
}

In [9]:
# For every CSV in each gender's input folder: read the per-gene fold changes,
# join them with the gene-length table `pr`, and record the Spearman
# correlation between log10 length and log2 fold change together with the
# metadata parsed from the filename. One dict per file is appended to `agg`.
agg = []

for gender, folder in input_folders.items():

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the final table is reproducible across runs and machines.
    onlyfiles = sorted(
        f for f in listdir(folder)
        if f.endswith('.csv') and isfile(join(folder, f))
    )

    # Build the frame from a dict so the column is object-typed even when the
    # folder holds no CSV files (an empty pd.Series would default to a
    # non-string dtype and break the .str accessor below).
    manager = pd.DataFrame({'filename': onlyfiles}, dtype=object)

    # Parse metadata out of the filename. Raw strings keep the regex
    # characters identical while avoiding the invalid "\-" escape sequence of
    # a plain string literal; expand=False yields a Series per capture group.
    manager['tissue'] = manager['filename'].str.extract(
        r'^(.*?)_', expand=False)
    manager['decade'] = manager['filename'].str.extract(
        r'pfu_0_age_([0-9])_first', expand=False).astype(float)
    manager['animals'] = manager['filename'].str.extract(
        r'_first_([0-9\-]+)_DE', expand=False)

    file_records = manager[
        ['filename', 'tissue', 'decade', 'animals']
    ].itertuples(index=False, name=None)

    for filename, tissue, decade, animals in file_records:

        file = os.path.join(folder, filename)

        # The 'Symbol' column is treated as an Ensembl gene identifier.
        df = pd.read_csv(file, usecols=['Symbol', 'log2FoldChange', 'padj']).rename(
            columns={'Symbol': 'gene_ensembl'}
        )

        # NOTE(review): presumably maps Ensembl IDs to NCBI gene IDs and keeps
        # only unambiguous mappings — confirm against the mapper module.
        df = mapper.gene_ensembl_2_gene_ncbi_unambiguously(df, 9606).reset_index()

        # Keep only genes for which an adjusted p-value was reported.
        df = df[df['padj'].notnull()]

        # Inner join with the length table; the key is spelled out instead of
        # relying on pandas inferring the shared column(s).
        df = pd.merge(df, pr, on='gene_ncbi')
        rho, pval = spearmanr(df['log_length'], df['log2FoldChange'])

        # Key order here determines the column order of the resulting table.
        results = {
            'tilt': rho,
            'pval': pval,
            'filename': filename,
            'tissue': tissue,
            'decade': decade,
            'animals': animals,
            'gender': gender,
        }

        agg.append(results)

In [10]:
# Turn the list of per-file result dicts collected in `agg` into one table,
# one row per dict.
out = pd.DataFrame(agg)

In [11]:
# Write the summary table to a spreadsheet under the configured output
# folder, but only when saving is switched on. The frame's index is not
# written and no date/time is inserted by the export helper.
if save_images:
    export.export_full_frame(
        '{}/stats_on_imbalance_self_controls.xlsx'.format(outfolder),
        out,
        save_index=False,
        insert_date_time=False,
    )
    