In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Automatically reload imported modules before each cell is executed, so
# edits to local source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Font type 42 makes the PDF backend embed TrueType fonts.
mpl.rcParams['pdf.fonttype'] = 42
# Use Arial for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [3]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr

In [4]:
from access_biology_data import meta, relations, annotation, properties
from access_literature_data import medline
from access_science_shared import standardizer

import sys
# Extend the module search path before importing the project-local packages
# below; presumably aging_tools and access_aging_data live under ../src
# (relative to the notebook's working directory) — TODO confirm.
sys.path.append('./../src/')
from aging_tools import inout, export, calculators
from access_aging_data import chaperome, earlier_studies, sequencing


In [5]:
import matplotlib
# Default font size for all text in figures.
matplotlib.rcParams.update({'font.size': 10})

In [6]:
# Fraction of genes at each extreme of the transcript-length percentile rank
# that is treated as "short" (rank < quanta) or "long" (rank > 1 - quanta).
quanta = 0.05

In [7]:
# When True, the enrichment tables and the node/edge tables computed below
# are exported as .xlsx files; when False nothing is written.
save_images = True

In [8]:
def get_edges(nodes, mini_painter, all_annotations=None):
    """
    Build a network that links every annotation to its most specific
    larger (or equally sized) annotation sharing genes with it.

    For every ordered pair of distinct annotations (source, target) that
    share at least one gene in `mini_painter` and where the source has no
    more genes than the target, the fraction of source genes that are also
    in the target is computed. Per source, only the targets are kept that
    (1) have the highest such fraction, then (2) have the fewest genes in
    `mini_painter`, then (3) have the fewest genes in `all_annotations`.
    Ties that survive all three criteria are all returned.

    Parameters
    ----------
    nodes : any
        Not used by this function; kept so that existing calls keep working.
    mini_painter : pandas.DataFrame
        Columns 'gene_ncbi' and 'annotation'; the gene-annotation pairs
        from which shared genes and annotation sizes are counted.
    all_annotations : pandas.DataFrame, optional
        Frame with an 'annotation' column whose value counts give the
        'all_in_target' size of each target. If omitted, the module-level
        variable `painter` is used (the behaviour callers relied on before
        this parameter existed), which must then be defined before the call.

    Returns
    -------
    pandas.DataFrame
        Columns: 'source', 'target', 'shared', 'genes_in_source',
        'genes_in_target', 'fraction_of_source', 'all_in_target'.
    """

    if all_annotations is None:
        # Backward-compatible fallback on the notebook-level global.
        all_annotations = painter

    def _keep_best(frame, column, how):
        # Per source annotation, keep only rows whose `column` equals the
        # group-wise `how` ('min' or 'max') of that column.
        best = frame.groupby('annotation_x')[column].agg(how).reset_index()
        return pd.merge(frame, best, on=['annotation_x', column])

    # All ordered annotation pairs that co-occur on at least one gene,
    # together with the number of genes they share.
    pairs = pd.merge(mini_painter, mini_painter, on='gene_ncbi')
    h = pairs.groupby(
        ['annotation_x', 'annotation_y']).size().to_frame('shared').reset_index()

    genes_in_annotation = mini_painter['annotation'].value_counts(
        ).rename_axis('annotation').to_frame('genes').reset_index()

    # The first merge adds 'genes'; the second one collides on that name and
    # therefore yields 'genes_x' (source size) and 'genes_y' (target size).
    h = pd.merge(
        h,
        genes_in_annotation.rename(columns={'annotation': 'annotation_x'}),
        on='annotation_x',
        how='left')
    h = pd.merge(
        h,
        genes_in_annotation.rename(columns={'annotation': 'annotation_y'}),
        on='annotation_y',
        how='left')

    # Drop self-pairs and orient every edge from the smaller annotation
    # towards the larger one; copy so that adding a column below does not
    # write into a slice of the unfiltered frame.
    is_edge = (h['annotation_x'] != h['annotation_y']) & (h['genes_x'] <= h['genes_y'])
    h = h[is_edge].copy()
    h['fraction_of_smaller'] = h['shared'] / h['genes_x']

    h = _keep_best(h, 'fraction_of_smaller', 'max')
    h = _keep_best(h, 'genes_y', 'min')

    all_in_cat = all_annotations['annotation'].value_counts(
        ).to_frame('all_in_cat').rename_axis('annotation_y').reset_index()
    h = pd.merge(h, all_in_cat, on='annotation_y')
    h = _keep_best(h, 'all_in_cat', 'min')

    edges = h.rename(columns={
        'annotation_x': 'source',
        'annotation_y': 'target',
        'genes_x': 'genes_in_source',
        'genes_y': 'genes_in_target',
        'fraction_of_smaller': 'fraction_of_source',
        'all_in_cat': 'all_in_target'
    })

    return edges

In [9]:
# For human (9606) and mouse (10090): find annotations that are enriched
# among the shortest transcripts and depleted among the longest (and vice
# versa), derive a network between those annotations with get_edges, and
# export all tables as .xlsx when `save_images` is set.


def _as_painter(frame, label_column):
    """Reduce an annotation table to unique (gene_ncbi, annotation) pairs,
    taking the annotation label from `label_column`."""
    pairs = frame[[label_column, 'gene_ncbi']].drop_duplicates()
    pairs = pairs.rename(columns={label_column: 'annotation'})
    return pairs[['gene_ncbi', 'annotation']].drop_duplicates()


def _go_painter(taxon_id, categories):
    """Load GO annotations of the given categories as a painter."""
    go = annotation.go(
        taxon_id=taxon_id,
        category=categories,
        negating_support=[False],
        any_negating_support=[False],
        temporary_evidence=[True, False],
        unmapped_evidence=[False]
    )
    return _as_painter(go, 'GO_term')


def _export_frame(frame, taxon_id, paint, suffix):
    """Export `frame` to '<taxon_id>_<paint>_<suffix>.xlsx' in the output
    folder; does nothing unless `save_images` is set."""
    if not save_images:
        return
    export.export_full_frame(
        '190409_network_for_short_and_long/{}_{}_{}.xlsx'.format(
            taxon_id, paint, suffix),
        frame,
        insert_date_time=False,
        save_index=False)


def _tail_network(
        enriched, tail_genes, annotations, column_suffix, flavor,
        taxon_id, paint):
    """Build, export and return (nodes, edges) for one tail of the length
    distribution.

    `column_suffix` ('_s' or '_l') selects the enrichment columns of
    `enriched`; `flavor` ('short' or 'long') is used for the file names and
    is stored in the 'flavor' column of the returned nodes (added after the
    export, so the exported node table does not contain it).
    """
    mini_painter = annotations[
        (annotations['gene_ncbi'].isin(tail_genes)) &
        (annotations['annotation'].isin(enriched['annotation']))
    ]
    # get_edges additionally reads the module-level `painter`, which the
    # loop below sets before calling this helper.
    edges = get_edges(enriched, mini_painter)

    nodes = enriched[[
        'annotation',
        'fold_enrichment' + column_suffix,
        'in_significant' + column_suffix
    ]].copy().rename(columns={
        'fold_enrichment' + column_suffix: 'fold_enrichment',
        'in_significant' + column_suffix: 'in_significant',
    })
    nodes['sqrt'] = np.sqrt(nodes['in_significant'])

    _export_frame(edges, taxon_id, paint, flavor + '_edges')
    _export_frame(nodes, taxon_id, paint, flavor + '_nodes')

    nodes['flavor'] = flavor
    return nodes, edges


for taxon_id in [9606, 10090]:

    ref_genes = standardizer.reference_genes(taxon_id, 'orp')  # only consider supported protein coding-genes

    pr = properties.genbank_validated_rna(taxon_id)[[
        'gene_ncbi', 'Genbank_validated_RNA: full_SumACGT']].rename(columns={
        'Genbank_validated_RNA: full_SumACGT': 'length'
    })
    pr = pr.assign(log_length=np.log10(pr['length']))
    # Copy after filtering so that adding 'rank' does not write into a
    # slice of the unfiltered frame.
    pr = pr[pr['gene_ncbi'].isin(ref_genes)].copy()
    pr['rank'] = pr['length'].rank(pct=True)

    # Gene sets depend only on the length ranking, not on the annotation
    # source, so they are computed once per taxon.
    short_genes = pr.loc[pr['rank'] < quanta, 'gene_ncbi']
    long_genes = pr.loc[pr['rank'] > (1 - quanta), 'gene_ncbi']
    background = pr['gene_ncbi']

    painters = dict()
    painters['kegg'] = _as_painter(
        annotation.biosystems(taxon_id=taxon_id, databases='KEGG'),
        'biosystem_name')
    painters['go_process'] = _go_painter(taxon_id, ['Process'])
    painters['go_all_in'] = _go_painter(
        taxon_id, ['Process', 'Compartment', 'Function'])

    if taxon_id == 9606:
        # Disease and phenotype annotations are only loaded for human.
        # ('add_absenece' is the keyword as spelled in the calls below.)
        painters['disease'] = _as_painter(
            annotation.disease_genealacart(9606, add_absenece=False),
            'unified_disease')
        painters['phenotype'] = _as_painter(
            annotation.human_phenotype_genealacart(9606, add_absenece=False),
            'human_phenotype_genealacart: human_phenotype_name')

    # `painter` must stay a module-level name: get_edges reads it.
    for paint, painter in painters.items():

        in_short = calculators.functional_ratio(
            short_genes,
            background,
            painter,
            test='difference'
        )

        in_long = calculators.functional_ratio(
            long_genes,
            background,
            painter,
            test='difference'
        )

        h = pd.merge(
            in_short.reset_index(),
            in_long.reset_index(),
            on='annotation',
            suffixes=('_s', '_l')
        )

        enriched_in_short_depleted_in_long = h[
            (h['benjamini_hochberg_s'] < 0.05) &
            (h['fold_enrichment_s'] > 0) &
            (h['fold_enrichment_l'] < 0)
        ].sort_values(
            'fold_enrichment_s',
            ascending=False)

        enriched_in_long_depleted_in_short = h[
            (h['benjamini_hochberg_l'] < 0.05) &
            (h['fold_enrichment_s'] < 0) &
            (h['fold_enrichment_l'] > 0)
        ].sort_values(
            'fold_enrichment_l',
            ascending=False)

        _export_frame(
            enriched_in_short_depleted_in_long, taxon_id, paint, 'short')
        _export_frame(
            enriched_in_long_depleted_in_short, taxon_id, paint, 'long')

        short_nodes, short_edges = _tail_network(
            enriched_in_short_depleted_in_long, short_genes, painter,
            '_s', 'short', taxon_id, paint)

        long_nodes, long_edges = _tail_network(
            enriched_in_long_depleted_in_short, long_genes, painter,
            '_l', 'long', taxon_id, paint)
        # In the pooled table the sign of fold_enrichment is flipped for the
        # long flavor; the separately exported long node table keeps the
        # original sign.
        long_nodes['fold_enrichment'] = -long_nodes['fold_enrichment']

        nodes = pd.concat([short_nodes, long_nodes])
        edges = pd.concat([short_edges, long_edges])

        _export_frame(edges, taxon_id, paint, 'pooled_edges')
        _export_frame(nodes, taxon_id, paint, 'pooled_nodes')

  fract_of_significant_w_annot / fract_of_background_w_annot)
