In [1]:
%matplotlib inline 
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib as mpl
# Figure export settings: fonttype 42 makes matplotlib write Type 42 (TrueType)
# fonts into PDF output, and Arial is selected as the figure font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [3]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr

In [4]:
from access_biology_data import meta, relations, annotation, properties
from access_literature_data import medline
from access_science_shared import standardizer

import sys
sys.path.append('./../src/')
from aging_tools import inout, export, calculators
from access_aging_data import chaperome, earlier_studies, sequencing


In [5]:
import matplotlib
# Default font size for all figures produced by this notebook.
matplotlib.rcParams.update({'font.size': 10})

In [6]:
# Label of the output folder for this analysis. It is not read by any of the
# cells visible here -- presumably consumed by the export helpers; TODO confirm.
out_folder = '200819_number_of_annotations_opposingly_enriched'

In [7]:
# Fraction of the percentile length ranking that defines each tail:
# rank < quanta sizes the "short" gene set, rank > 1 - quanta the "long" one.
quanta = 0.05

In [8]:
# Presumably toggles writing figures to disk; it is not read by any of the
# cells visible here -- TODO confirm it is still needed.
save_images = True

In [9]:
# NCBI taxonomy identifier handed to every data loader below
# (9606 is Homo sapiens in the NCBI taxonomy).
taxon_id = 9606

In [10]:
# Gene universe for the analysis.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')  # only consider supported protein coding-genes

# Per-gene length, read from the 'Genbank_validated_RNA: full_SumACGT' column
# and renamed to 'length' for brevity.
pr = properties.genbank_validated_rna(taxon_id)[[
    'gene_ncbi', 'Genbank_validated_RNA: full_SumACGT']].rename(columns={
    'Genbank_validated_RNA: full_SumACGT': 'length'
})

# Restrict to the reference genes first and take an explicit copy, so the
# derived columns below are written to an independent frame rather than to a
# boolean-filtered slice (which is what triggers SettingWithCopyWarning).
pr = pr[pr['gene_ncbi'].isin(ref_genes)].copy()

pr['log_length'] = np.log10(pr['length'])

# Percentile rank of length among the retained genes (0 < rank <= 1);
# this is the quantity later compared against `quanta`.
pr['rank'] = pr['length'].rank(pct=True)

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [11]:
# Registry of gene -> annotation tables, keyed by a short label.
painters = {}

# Gene Ontology table across all three categories, reduced to unique
# (gene_ncbi, annotation) pairs where 'annotation' carries the GO term.
# The filter arguments are passed through to annotation.go unchanged.
go = (
    annotation.go(
        taxon_id=taxon_id,
        category=['Process', 'Compartment', 'Function'],
        negating_support=[False],
        any_negating_support=[False],
        temporary_evidence=[True, False],
        unmapped_evidence=[False],
    )[['gene_ncbi', 'GO_term']]
    .rename(columns={'GO_term': 'annotation'})
    .drop_duplicates()
)
painters['go_all_in'] = go



In [12]:
# Key selecting which annotation table from `painters` is used downstream.
paint = 'go_all_in'

In [13]:
# Annotation table (columns: gene_ncbi, annotation) used for the enrichment tests.
painter=painters[paint]

In [21]:
# Number of distinct annotations attached to at least one gene in `pr`.
# NOTE(review): this cell carries execution count 21, i.e. it was run out of
# sequence; re-run the notebook top to bottom to confirm the displayed value.
painter.loc[painter['gene_ncbi'].isin(pr['gene_ncbi']), 'annotation'].nunique()

15371

In [14]:
# Number of random gene-set draws; each index in range(boots) also serves as
# the random_state seed for that draw.
boots = 100

In [15]:
# One row per random draw; each column will hold the number of annotations
# found enriched in that gene set and oppositely signed in the other one.
results = pd.DataFrame(
    columns=['short_enriched', 'long_enriched'],
    index=range(boots),
)

In [16]:
# Permutation control for the short-vs-long enrichment comparison.
#
# Rather than using the true tails of the length ranking (rank < quanta for
# short genes, rank > 1 - quanta for long genes), every iteration draws two
# random gene sets of exactly those sizes and counts how many annotations are
# significantly enriched in one set while having the opposite sign of
# 'fold_enrichment' in the other. The counts show how often such "opposing"
# annotations arise by chance.

# Tail sizes and the background depend only on `pr` and `quanta`, so they are
# computed once instead of on every iteration.
number_of_short_genes = pr[pr['rank'] < quanta].shape[0]
number_of_long_genes = pr[pr['rank'] > (1 - quanta)].shape[0]
background = pr['gene_ncbi']

# Cut-off applied to the 'benjamini_hochberg' column returned by
# calculators.functional_ratio (presumably a BH-adjusted p-value -- confirm).
significance_threshold = 0.05

for boot in range(boots):

    # A single seeded shuffle per iteration. The "short" set is taken from the
    # end and the "long" set from the start of the same permutation, so the two
    # sets are disjoint as long as their sizes sum to at most len(pr).
    shuffled_genes = pr.sample(frac=1, random_state=boot)['gene_ncbi']
    short_genes = shuffled_genes.iloc[pr.shape[0] - number_of_short_genes:]
    long_genes = shuffled_genes.iloc[:number_of_long_genes]

    in_short = calculators.functional_ratio(
        short_genes,
        background,
        painter,
        test='difference'
    )

    in_long = calculators.functional_ratio(
        long_genes,
        background,
        painter,
        test='difference'
    )

    # Pair both result tables per annotation; '_s' marks columns from the
    # short-set test and '_l' columns from the long-set test.
    h = pd.merge(
        in_short.reset_index(),
        in_long.reset_index(),
        on='annotation',
        suffixes=('_s', '_l')
    )

    # Significant in the short set, positive there and negative in the long set.
    enriched_in_short_depleted_in_long = h[
        (h['benjamini_hochberg_s'] < significance_threshold) &
        (h['fold_enrichment_s'] > 0) &
        (h['fold_enrichment_l'] < 0)
    ].sort_values(
        'fold_enrichment_s',
        ascending=False)

    # Mirror image: significant in the long set, positive there and negative
    # in the short set.
    enriched_in_long_depleted_in_short = h[
        (h['benjamini_hochberg_l'] < significance_threshold) &
        (h['fold_enrichment_s'] < 0) &
        (h['fold_enrichment_l'] > 0)
    ].sort_values(
        'fold_enrichment_l',
        ascending=False)

    # Only the number of qualifying annotations is recorded for each draw.
    results.loc[boot, 'short_enriched'] = enriched_in_short_depleted_in_long.shape[0]
    results.loc[boot, 'long_enriched'] = enriched_in_long_depleted_in_short.shape[0]


  fract_of_significant_w_annot / fract_of_background_w_annot)


In [18]:
# How many random draws produced 0, 1, ... annotations enriched in the random
# "short" set and oppositely signed in the random "long" set.
results['short_enriched'].value_counts()

0    99
1     1
Name: short_enriched, dtype: int64

In [19]:
# How many random draws produced 0, 1, ... annotations enriched in the random
# "long" set and oppositely signed in the random "short" set.
results['long_enriched'].value_counts()

0    100
Name: long_enriched, dtype: int64