In [None]:
# load general tools
import sys
# Add the directory two levels up to the import search path; presumably this
# lets the in-repo `magine` package be imported without installing it --
# TODO confirm.
sys.path.append("../..")
from IPython.display import display
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
# Display floats with 2 digits of precision and cap displayed column text at
# 50 characters so wide tables stay readable.
pd.set_option('display.precision', 2)
pd.set_option('display.max_colwidth', 50)

In [None]:
# load magine specific tools
import magine.networks.ontology_network as nt
import magine.networks.visualization.notebooks.view as view
from magine.enrichment import load_enrichment_csv
from magine.plotting.heatmaps import heatmap_from_array
from magine.plotting.venn_diagram_maker import create_venn2
from magine.plotting.wordcloud_tools import create_wordcloud


# Exploring enrichment output

Here we load in the results from enrichR

In [None]:
# Load the enrichment results, using the first CSV column as the index.
enrichment_array = load_enrichment_csv('Data/cisplatin_enrichment.csv.gz', index_col=0)
# Flag each row as significant when its adjusted p-value is at or below 0.05.
enrichment_array['significant_flag'] = enrichment_array['adj_p_value'] <= 0.05

In [None]:
# Preview the first five rows (the default for `head`).
display(enrichment_array.head())

In [None]:
# Columns shown in the previews below, to keep the printed tables narrow.
cols = [
    'term_name',
    'rank',
    'combined_score',
    'n_genes',
    'genes',
    'sample_id',
]

In [None]:
# Same five-row preview, restricted to the selected columns.
display(enrichment_array.head()[cols])

## Data stats

In [None]:
# Print the dtype of every column in the enrichment table.
column_types = enrichment_array.dtypes
print(column_types)

In [None]:
# List the distinct values of the 'db' column in sorted order.
db_names = enrichment_array['db'].unique()
print(sorted(db_names))

In [None]:
# Keep only the rows flagged as significant; copy so later edits to `sig`
# do not touch `enrichment_array`.
sig = enrichment_array.loc[enrichment_array['significant_flag']].copy()

## Filtering and searching the dataframe

In [None]:
# Select significant terms matching the word 'death', then show the five
# with the highest combined score.
d = sig.filter_based_on_words(['death'])
display(d.sort_values(by='combined_score', ascending=False).head())

In [None]:
# Restrict the enrichment table to the two drug-perturbation databases.
drug_dbs = ['DrugMatrix', 'Drug_Perturbations_from_GEO_2014']
# Filter settings passed to `filter_multi`; the exact comparison each
# threshold applies is defined by that method.
drug_filters = dict(p_value=0.05, combined_score=0.0, db=drug_dbs, rank=100)
# drug_filters['category'] = 'rna_up'  # optional: restrict to one category
drug = enrichment_array.filter_multi(**drug_filters)

In [None]:
# Build a word cloud from the drug-database hits and plot it, passing
# 'wc_drug_dbs_example.png' as the first positional argument of `plot`.
# NOTE(review): later cells call `plot(save_name=...)` without a file
# extension -- confirm whether `plot` appends one itself.
word_cloud = create_wordcloud(drug)
word_cloud.plot('wc_drug_dbs_example.png');

In [None]:
# Show the first 10 entries of the data held by the word cloud object.
word_cloud.data.head(10)

# Single database exploration

Here we will focus on the Reactome enrichment.

In [None]:

# Filter settings for the Reactome-only view.
reactome_filters = {
    'p_value': 0.05,              # only sig pvalues
    'combined_score': 0.0,        # score threshold of positive values
    'db': 'Reactome_2016',        # only the Reactome db
    'category': 'proteomics_up',  # from this category
}
reactome_only = enrichment_array.filter_multi(**reactome_filters)

In [None]:
# Clean up the term names: keep only the text before the first underscore.
# The column is displayed before and after so the effect is visible.
raw_names = reactome_only['term_name']
display(raw_names.head())
reactome_only['term_name'] = raw_names.str.split('_').str[0]
display(reactome_only['term_name'].head())

In [None]:
# We can use a word cloud to view which terms are enriched.
# NOTE(review): the file name is passed positionally and includes '.png',
# while later cells use `save_name=` without an extension -- confirm the
# expected form.
word_cloud = create_wordcloud(reactome_only)
word_cloud.plot('wc_example.png');

In [None]:
# Show the first 15 rows of the Reactome-only table.
display(reactome_only.iloc[:15])

In [None]:
# Distribution of the number of genes per term (20-bin histogram), followed
# by summary statistics of the table.
gene_counts = reactome_only['n_genes']
plt.hist(gene_counts, bins=20)
summary_stats = reactome_only.describe()
display(summary_stats)

In [None]:
# The 30 terms with the most genes: first as a table, then just their names.
largest_terms = reactome_only.sort_values('n_genes', ascending=False)[cols].head(30)
display(largest_terms)
display(largest_terms['term_name'].values)

In [None]:
# Manually select terms with a large number of genes that are not useful.
# This is based on personal intuition. "Gene expression" and "metabolism"
# might be useful for some, but here I will eliminate them as they are not
# descriptive enough to explain what the drug is doing.

# One term per line, each followed by a comma. The original list had a stray
# '' literal after 'disease', which Python silently concatenated with the
# next string; it is removed so that a later edit cannot turn it into an
# empty-string entry.
not_useful = [
    'gene expression',
    'metabolism',
    'metabolism of proteins',
    'immune system',
    'disease',
    'processing of capped intron-containing pre-mrna',
    'developmental biology',
    'infectious disease',
    'metabolism of amino acids and derivatives',
    'major pathway of rrna processing in the nucleolus',
    'influenza life cycle',
]
# Drop every row whose term name is in the `not_useful` list.
is_generic = reactome_only['term_name'].isin(not_useful)
reactome_only = reactome_only[~is_generic]

# Re-check the largest remaining terms.
display(reactome_only.sort_values(by='n_genes', ascending=False)[cols].head(15))

In [None]:
# Now let's look at the first time point (sample_id '01hr').
test_df = reactome_only.filter_multi(sample_id='01hr')

# First ten rows in the reduced column set, then only their term names.
first_ten = test_df.head(10)[cols]
display(first_ten)
display(first_ten['term_name'].values)

There are 76 enriched terms. If we look at the top-ranked terms, we see that some of them have similar descriptions ("Interleukin..."). If we look at the gene lists, we can also see that some of the genes are shared. To see whether redundant terms are enriched, we can calculate their similarity with the Jaccard index (intersection over union).
![width=50](https://wikimedia.org/api/rest_v1/media/math/render/svg/eaef5aa86949f49e7dc6b9c8c3dd8b233332c9e7)
<img src="https://upload.wikimedia.org/wikipedia/commons/c/c7/Intersection_over_Union_-_visual_equation.png" alt="Drawing" style="width: 300px;"/>

In [None]:
# Plot the distance matrix of `test_df` with a 12x12 figure size; the
# trailing semicolon suppresses the echoed return value.
test_df.dist_matrix(fig_size=(12,12));

In [None]:
# Calculate the Jaccard index and return a ranked dataframe of terms and scores.
# Higher scores mean more similar terms.
d = test_df.find_similar_terms('cell cycle')
display(d.head(10))

In [None]:
# Same similarity lookup, now relative to 'interleukin-2 signaling';
# show the first 20 entries.
d = test_df.find_similar_terms('interleukin-2 signaling')
display(d.head(20))

In [None]:
# Same similarity lookup, now relative to 'mapk family signaling cascades';
# show the first 10 entries.
d = test_df.find_similar_terms('mapk family signaling cascades')
display(d.head(10))

We can do this for all terms and view the results in a distance matrix.

The dark red represents terms that are highly similar. We can filter all highly similar terms from the matrix.

In [None]:
# Remove redundant terms from `test_df` in place, then re-plot the distance
# matrix on what remains.
# NOTE(review): the threshold of 0.7 is presumably a Jaccard similarity
# cutoff -- confirm against `remove_redundant`.
# NOTE(review): `inplace=True` mutates `test_df`, so earlier cells that
# display it will show the reduced table if re-run without recreating it.
test_df.remove_redundant(threshold=.7, level='sample', inplace=True)
test_df.dist_matrix();

In [None]:
# First 25 remaining terms, restricted to the reduced column set.
test_df.head(25)[cols]

In [None]:
# Heatmap of the terms left in `test_df`.
# NOTE(review): `convert_to_log=True` presumably log-transforms the plotted
# values -- confirm against `heatmap_from_array`.
heatmap_from_array(test_df, convert_to_log=True, fig_size=(4, 6), linewidths=.05);

## Across all time points

In [None]:
# Remove redundant terms from the full Reactome table (threshold 0.6,
# level='sample') and bind the returned frame to a new name.
reactome_all_time = reactome_only.remove_redundant(threshold=.6, level='sample')

# Plot the distance matrix of the reduced table.
reactome_all_time.dist_matrix();

In [None]:
# First 20 rows of the reduced table, in the reduced column set.
display(reactome_all_time.head(20)[cols])

In [None]:
# Terms most similar to 'mrna splicing - major pathway' (first 10 entries).
reactome_all_time.find_similar_terms('mrna splicing - major pathway').head(10)

In [None]:
# Terms most similar to 'formation of incision complex in gg-ner' (first 10 entries).
reactome_all_time.find_similar_terms('formation of incision complex in gg-ner').head(10)

In [None]:
# Show every row whose term name is exactly 'dna repair'.
reactome_all_time.loc[reactome_all_time['term_name'].eq('dna repair')]

In [None]:
# Filter `reactome_all_time` in place with min_terms=2, using term names as
# the index and sample ids as the columns.
reactome_all_time.filter_by_minimum_sig_columns(
    index='term_name',
    columns='sample_id',
    min_terms=2,
    inplace=True,
)

# Heatmap of what remains, without row clustering and with significance
# annotations.
heatmap_options = dict(
    convert_to_log=True,
    fig_size=(6, 14),
    cluster_row=False,
    annotate_sig=True,
)
heatmap_from_array(reactome_all_time, **heatmap_options);

In [None]:
# Load the pickled molecular network that accompanies this tutorial.
# `nx.read_gpickle` was deprecated in NetworkX 2.6 and removed in 3.0; the
# documented replacement is `pickle.load`, which works on old and new
# releases alike.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
import pickle

with open('Data/cisplatin_based_network.p', 'rb') as network_file:
    network = pickle.load(network_file)

In [None]:
# Reactome terms selected for the network views below; the commented-out
# entries are optional extras that can be switched back on.
hits = [
    'cell cycle',
    'dna repair',
    'apoptosis',
    'interleukin-2 signaling',
    # 'apoptotic cleavage of cellular proteins',
    # 'transcriptional regulation by tp53',
    # 'vxpx cargo-targeting to cilium',
]

# Rows of the Reactome table belonging to the selected terms (copied so the
# subset can be changed independently).
in_hits = reactome_only['term_name'].isin(hits)
subset = reactome_only[in_hits].copy()
display(subset.iloc[:10])
# Confirm which of the requested terms are actually present.
print(subset['term_name'].unique())

In [None]:
# Build two graphs from the selected terms and the molecular network:
# `term_net` and `mol_net`.
# NOTE(review): 'entire_network' looks like an output/save name and
# `create_only=True` presumably skips rendering or writing files -- confirm
# against `ontology_network.create_subnetwork`.
term_net, mol_net = nt.create_subnetwork( 
    subset, network, subset['term_name'],  'entire_network', create_only=True
)

In [None]:
# Render the term-level network in the notebook.
view.render_graph(term_net)

In [None]:
# Display the molecule-level network.
# NOTE(review): `add_parent=True` presumably groups nodes under their parent
# term -- confirm against the `view` module.
view.display_graph(mol_net, add_parent=True)

In [None]:
# Render the same molecule-level network with `render_graph` instead of
# `display_graph`.
view.render_graph(mol_net, add_parent=True)

In [None]:
from exp_data import exp_data

In [None]:
from magine.networks.subgraphs import Subgraph
# Wrap the full molecular network; the cells below use this object for
# neighbour expansion and path queries.
net_sub = Subgraph(network)

In [None]:
# Size of the molecular sub-network before expansion.
print(len(mol_net))
# Sanity check: is the compound id collection a list or a set?
print(isinstance(exp_data.compounds.id_list, (list, set)))

# Expand `mol_net` with the upstream and downstream neighbours of its nodes.
# NOTE(review): `include_only` presumably restricts the added nodes to the
# measured compounds -- confirm against `Subgraph.expand_neighbors`.
new_g = net_sub.expand_neighbors(mol_net, 
                                 nodes=list(mol_net.nodes), upstream=True, downstream=True,
                                 include_only=exp_data.compounds.id_list)
# Size after expansion.
print(len(new_g))

In [None]:
# Genes annotated to the 'vxpx cargo-targeting to cilium' term.
vpx = reactome_only.term_to_genes('vxpx cargo-targeting to cilium',)
# Network of paths between those genes.
vpx_network = net_sub.paths_between_list(vpx)
# Expand that network with upstream and downstream neighbours. Unlike the
# previous cell, no graph is passed as the first argument here.
# NOTE(review): confirm which network `expand_neighbors` uses when the first
# argument is omitted.
new_g = net_sub.expand_neighbors(nodes=list(vpx_network.nodes), upstream=True, downstream=True,
                                 include_only=exp_data.compounds.id_list)
# Node and edge counts of the expanded graph.
print(len(new_g.nodes()))
print(len(new_g.edges()))

In [None]:
# Render the expanded graph in the notebook.
view.render_graph(new_g)

In [None]:
from magine.networks.visualization.igraph_tools import paint_network_overtime

In [None]:
# Paint the expanded graph using the experimental data, with 'metabolites'
# as the save name and 'red' as the colour list.
# NOTE(review): presumably one image is written per time point -- confirm
# against `paint_network_overtime`.
paint_network_overtime(new_g, exp_data=exp_data, save_name='metabolites', color_list='red')

In [None]:
# Table size before filtering.
print(reactome_only.shape)

# NOTE(review): the variable is called `at_least_2` but `min_terms` is 3 --
# confirm which threshold is intended.
at_least_2 = reactome_only.filter_by_minimum_sig_columns(columns='sample_id', min_terms=3)

# Remove redundant terms in place at the 'dataframe' level, then plot the
# distance matrix at the same level.
at_least_2.remove_redundant(level='dataframe', inplace=True)
at_least_2.dist_matrix(level='dataframe', fig_size=(8, 8))

# Row-clustered heatmap with significance annotations.
heatmap_kwargs = dict(cluster_row=True, annotate_sig=True, fig_size=(8, 14))
heatmap_from_array(at_least_2, **heatmap_kwargs);

In [None]:
# The three Gene Ontology databases to keep.
go_dbs = [
    'GO_Biological_Process_2017b',
    'GO_Molecular_Function_2017b',
    'GO_Cellular_Component_2017b',
]
go_only = enrichment_array.filter_multi(
    p_value=0.05, combined_score=0.0, db=go_dbs, category='proteomics_up',
)
print(go_only.shape)

# NOTE(review): the variable is called `at_least_2` but `min_terms` is 3 --
# confirm which threshold is intended.
at_least_2 = go_only.filter_by_minimum_sig_columns(
    index='term_name', columns='sample_id', min_terms=3,
)
print(at_least_2.shape)

# Remove redundant terms in place and report the size again.
at_least_2.remove_redundant(inplace=True)
print(at_least_2.shape)

heatmap_from_array(at_least_2, convert_to_log=True, annotate_sig=True, fig_size=(6, 16));

In [None]:
# Heatmap of the Reactome terms, saved as a 300-dpi PNG with tight bounds.
fig = heatmap_from_array(reactome_only, convert_to_log=True, fig_size=(8, 8))
save_options = {'dpi': 300, 'bbox_inches': 'tight'}
fig.savefig('enrichment.png', **save_options)

In [None]:

# Same heatmap, this time with row clustering on and column clustering off.
cluster_options = dict(
    convert_to_log=True,
    cluster_col=False,
    cluster_row=True,
    fig_size=(8, 8),
)
fig = heatmap_from_array(reactome_only, **cluster_options)
fig.savefig('cluster_enrichment.png', bbox_inches='tight', dpi=300)

In [None]:
# Drug-database hits filtered with rank=25, drawn as a word cloud.
top_drug_filters = dict(
    p_value=0.05,
    combined_score=0.0,
    rank=25,
    db=['DrugMatrix', 'Drug_Perturbations_from_GEO_2014'],
)
drug_df = enrichment_array.filter_multi(**top_drug_filters)
drug_cloud = create_wordcloud(drug_df)
drug_cloud.plot(save_name='word_cloud_rna_drug_dbs');

In [None]:
# GO Biological Process terms from the 'proteomics_up' category.
go_bp_filters = dict(
    p_value=0.05,
    combined_score=5.0,
    rank=25,
    db='GO_Biological_Process_2017b',
    category='proteomics_up',
)
sig_array = enrichment_array.filter_multi(**go_bp_filters)

# Word cloud of the filtered terms, drawn before redundancy removal.
go_cloud = create_wordcloud(sig_array)
go_cloud.plot(save_name='wordcloud_array_go');

# Remove redundant terms in place (threshold 0.9, 'dataframe' level) and
# inspect the distance matrix.
sig_array.remove_redundant(level='dataframe', threshold=0.9, inplace=True)
sig_array.dist_matrix();

# Row-clustered, annotated heatmap saved as a 300-dpi PNG.
fig = heatmap_from_array(
    sig_array,
    convert_to_log=True,
    cluster_col=False,
    cluster_row=True,
    annotate_sig=True,
    fig_size=(8, 14),
)
fig.savefig('go_bp_cluster_enrichment.png', bbox_inches='tight', dpi=300)

In [None]:
# KEGG terms filtered with the same thresholds as the other word-cloud cells.
sig_array = enrichment_array.filter_multi(
    p_value=0.05,
    combined_score=5.0,
    rank=25,
    db='KEGG_2016'
)
# Remove redundant terms in place before drawing the word cloud.
sig_array.remove_redundant(inplace=True, threshold=0.7)
wordcloud = create_wordcloud(sig_array)
# Trailing semicolon added so the return value of `plot` is not echoed as
# the cell output, matching the other word-cloud cells in this notebook.
wordcloud.plot(save_name='wordcloud_array_kegg');

In [None]:
# WikiPathways terms.
wiki_filters = dict(p_value=0.05, combined_score=5.0, rank=25, db='WikiPathways_2016')
sig_array = enrichment_array.filter_multi(**wiki_filters)
# Remove redundant terms in place (threshold 0.7).
sig_array.remove_redundant(threshold=0.7, inplace=True)

# Word cloud of the remaining terms.
wiki_cloud = create_wordcloud(sig_array)
wiki_cloud.plot(save_name='wordcloud_array_wiki');

# Row-clustered heatmap saved as a 300-dpi PNG.
fig = heatmap_from_array(
    sig_array,
    convert_to_log=True,
    cluster_col=False,
    cluster_row=True,
    fig_size=(8, 8),
)
fig.savefig('enrichment_example3.png', bbox_inches='tight', dpi=300)