In [10]:
import networkx as nx
import numpy as np
import pandas as pd

from network_evaluation_tools import data_import_tools as dit
from network_evaluation_tools import network_evaluation_functions as nef
from network_evaluation_tools import network_propagation as prop

In [11]:
# Read the edge list into a network object (a smaller network is chosen here for the example's sake)
network = dit.load_network_file(
    '../Data/string_edge_list_common_names.tsv',
    delimiter='\t',
    verbose=True,
)
# Report how many nodes were loaded
print(len(network.nodes))

Network File Loaded: ../Data/string_edge_list_common_names.tsv
19344


In [12]:
# Read the gene sets that will be used for the analysis
genesets = dit.load_node_sets(
    '../Data/DisGeNET_genesets.txt'
)

In [13]:
# Compute the sub-sample rate of the gene sets, given the network
genesets_p = nef.calculate_p(
    network,
    genesets,
)

In [14]:
# Optimal alpha for this network; the kernel-construction step
# that follows can also determine it automatically.
alpha = prop.calculate_alpha(
    network
)
print(alpha)

0.55


In [None]:
# Sanity check: how many nodes does the network have, and how is it split into components?
# nx.connected_component_subgraphs was deprecated in networkx 2.1 and removed in 2.4,
# so the component subgraphs are built from nx.connected_components instead.
print(len(network.nodes))
subs = [network.subgraph(component).copy() for component in nx.connected_components(network)]
# Summarize the components rather than printing the raw list of graph objects
print(len(subs), 'connected component(s); largest sizes:',
      sorted((len(sub) for sub in subs), reverse=True)[:10])

In [15]:
# Build the propagation kernel of the network with the alpha value printed earlier
kernel = nef.construct_prop_kernel(
    network,
    alpha=alpha,
    verbose=True,
)

Alpha: 0.55
Network Propagation Complete: 317.83964490890503 seconds
Propagated network kernel constructed


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  network_Fn = network_Fn.ix[network_Fn.columns]


In [None]:
# Peek at the kernel's index and the gene sets without flooding the cell output:
# printing the whole gene-set collection emits every member of every set.
# assumes genesets is a sized, iterable collection (e.g. a dict keyed by gene-set name) — TODO confirm
print(kernel.index)
print(len(genesets), 'gene sets loaded; first few:', list(genesets)[:5])

In [16]:
# Compute the AUPRC value of every gene set on the real network's kernel
AUPRC_values = nef.small_network_AUPRC_wrapper(
    kernel,
    genesets,
    genesets_p,
    n=30,
    cores=4,
    verbose=True,
)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  bg_sample_sum = kernel.ix[sample][bg_non_sample].sum().sort_values(ascending=False)				# summed prop value for all nodes in background
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  bg_sample_sum = kernel.ix[sample][bg_non_sample].sum().sort_values(ascending=False)				# summed prop value for all nodes in background
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  bg_sample_sum = kernel.ix[sample][bg_non_sample].sum().sort_values(ascending

AUPRC Analysis for given node set (117 nodes in network) complete: 2.9 seconds.
AUPRC Analysis for given node set (121 nodes in network) complete: 2.95 seconds.
AUPRC Analysis for given node set (118 nodes in network) complete: 3.02 seconds.
AUPRC Analysis for given node set (180 nodes in network) complete: 3.44 seconds.
AUPRC Analysis for given node set (90 nodes in network) complete: 1.85 seconds.
AUPRC Analysis for given node set (103 nodes in network) complete: 1.97 seconds.
AUPRC Analysis for given node set (211 nodes in network) complete: 2.75 seconds.
AUPRC Analysis for given node set (210 nodes in network) complete: 2.79 seconds.
AUPRC Analysis for given node set (70 nodes in network) complete: 1.73 seconds.
AUPRC Analysis for given node set (79 nodes in network) complete: 1.73 seconds.
AUPRC Analysis for given node set (121 nodes in network) complete: 2.03 seconds.
AUPRC Analysis for given node set (238 nodes in network) complete: 2.75 seconds.
AUPRC Analysis for given node se

**Note about the above cell:** There are several options for this particular step depending on the computational resources available and the network size. If the network is sufficiently small (<250k edges), it is recommended to use the 'small_network_AUPRC_wrapper' function as it can be much faster, especially when run in parallel (at least 8G per core is recommended). If you would like to parallelize the AUPRC calculation with a larger network (between 250K and 2.5M edges), at least 16G per core is recommended, and 32G per core if the network contains more than 2.5M edges. For larger networks, it is recommended to use the 'large_network_AUPRC_wrapper', which may be a slightly slower function but is better equipped to handle the larger memory footprint required. To change the parallelization status of the function, change the 'cores' option to the number of threads you would like to utilize.

In [17]:
# Build null (shuffled) networks and compute the AUPRC of the gene sets on each one.
# The same AUPRC wrapper function is reused for the null networks.
null_AUPRCs = []
for i in range(10):
    # Shuffle the network, then construct its kernel with the same alpha
    shuffNet = nef.shuffle_network(
        network, max_tries_n=10, verbose=True
    )
    shuffNet_kernel = nef.construct_prop_kernel(
        shuffNet, alpha=alpha, verbose=False
    )
    # AUPRCs of the gene sets on this shuffled network
    shuffNet_AUPRCs = nef.small_network_AUPRC_wrapper(
        shuffNet_kernel, genesets, genesets_p,
        n=30, cores=4, verbose=False,
    )
    null_AUPRCs.append(shuffNet_AUPRCs)
    print('shuffNet', repr(i+1), 'AUPRCs calculated')

Network shuffled: 755.9287929534912 seconds. Edge similarity: 0.19687618840196253
shuffNet 1 AUPRCs calculated
Network shuffled: 773.5425930023193 seconds. Edge similarity: 0.19672764613249868
shuffNet 2 AUPRCs calculated
Network shuffled: 753.2581508159637 seconds. Edge similarity: 0.19681765628203635
shuffNet 3 AUPRCs calculated
Network shuffled: 741.5315568447113 seconds. Edge similarity: 0.1969445893095507


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  network_Fn = network_Fn.ix[network_Fn.columns]
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  bg_sample_sum = kernel.ix[sample][bg_non_sample].sum().sort_values(ascending=False)				# summed prop value for all nodes in background
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional in

KeyboardInterrupt: 

**Note about the above cell:** We use a small number of shuffled networks to calculate the null AUPRC values for this example, but a larger number of shuffled networks may give a better representation of the true null AUPRC value. Larger numbers can be used, especially if the resulting distribution of null AUPRCs has a high variance relative to the actual AUPRC values; however, we have found that the variance remains relatively small even with a small number of shuffled networks.

In [None]:
# Assemble the null AUPRCs side by side and label each column by its shuffled-network number
null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1)
null_AUPRCs_table.columns = [
    'shuffNet'+repr(number)
    for number, _ in enumerate(null_AUPRCs, start=1)
]

In [None]:
# Performance metric of the gene sets, computed from the real AUPRCs and the null AUPRC table
network_performance = nef.calculate_network_performance_score(
    AUPRC_values,
    null_AUPRCs_table,
    verbose=True,
)
# The name becomes this network's column label in the comparison tables
network_performance.name = 'Test Network'

In [None]:
# Performance gain of the network over the median null AUPRC
network_perf_gain = nef.calculate_network_performance_gain(
    AUPRC_values,
    null_AUPRCs_table,
    verbose=True,
)
# The name becomes this network's column label in the comparison tables
network_perf_gain.name = 'Test Network'

In [None]:
# Rank network on average performance across gene sets vs performance on same gene sets in previous network set
# NOTE(review): this reads from ~/Data while the earlier inputs are read from ../Data — confirm the location.
all_network_performance = pd.read_csv('~/Data/Network_Performance.csv', index_col=0)
# .ix was removed in pandas 1.0. .reindex aligns the reference table to this network's gene sets
# and leaves NaN rows for gene sets that the reference table does not contain.
all_network_performance_filt = pd.concat(
    [network_performance, all_network_performance.reindex(network_performance.index)],
    axis=1,
)
# Rank across the columns (networks) within each row (gene set); the highest score gets rank 1
network_performance_rank_table = all_network_performance_filt.rank(axis=1, ascending=False)
network_performance_rankings = network_performance_rank_table['Test Network']

In [None]:
# Rank network on average performance gain across gene sets vs performance gain on same gene sets in previous network set
# NOTE(review): this reads from ~/Data while the earlier inputs are read from ../Data — confirm the location.
all_network_perf_gain = pd.read_csv('~/Data/Network_Performance_Gain.csv', index_col=0)
# .ix was removed in pandas 1.0. .reindex aligns the reference table to this network's gene sets
# and leaves NaN rows for gene sets that the reference table does not contain.
all_network_perf_gain_filt = pd.concat(
    [network_perf_gain, all_network_perf_gain.reindex(network_perf_gain.index)],
    axis=1,
)
# Rank the performance-gain table itself; ranking the performance-score table here would
# make the gain rankings a duplicate of the performance rankings.
network_perf_gain_rank_table = all_network_perf_gain_filt.rank(axis=1, ascending=False)
network_perf_gain_rankings = network_perf_gain_rank_table['Test Network']

In [None]:
# Network performance: scores, gains, and their ranks gathered into one table
network_performance_metric_ranks = pd.concat(
    [
        network_performance,
        network_performance_rankings,
        network_perf_gain,
        network_perf_gain_rankings,
    ],
    axis=1,
)
network_performance_metric_ranks.columns = [
    'Network Performance',
    'Network Performance Rank',
    'Network Performance Gain',
    'Network Performance Gain Rank',
]
# Displayed sorted: rank columns ascending, score and gain columns descending
network_performance_metric_ranks.sort_values(
    by=['Network Performance Rank', 'Network Performance', 'Network Performance Gain Rank', 'Network Performance Gain'],
    ascending=[True, False, True, False],
)

In [None]:
# Construct network summary table
network_summary = {}
network_summary['Nodes'] = int(len(network.nodes()))
network_summary['Edges'] = int(len(network.edges()))
# dict(...) accepts both the plain dict returned by networkx 1.x degree() and the
# DegreeView returned by networkx 2.x (which has no .values()); list(...) hands
# np.mean a real sequence instead of a dict view.
network_summary['Avg Node Degree'] = np.mean(list(dict(network.degree()).values()))
# Fraction of possible undirected node pairs that are connected: 2E / (N * (N - 1))
network_summary['Edge Density'] = 2*network_summary['Edges'] / float((network_summary['Nodes']*(network_summary['Nodes']-1)))
network_summary['Avg Network Performance Rank'] = network_performance_rankings.mean()
# .ix was removed in pandas 1.0; .loc is the label-based equivalent for a single label
network_summary['Avg Network Performance Rank, Rank'] = int(network_performance_rank_table.mean().rank().loc['Test Network'])
network_summary['Avg Network Performance Gain Rank'] = network_perf_gain_rankings.mean()
network_summary['Avg Network Performance Gain Rank, Rank'] = int(network_perf_gain_rank_table.mean().rank().loc['Test Network'])
# Print the summary in a fixed, readable order
for item in ['Nodes', 'Edges' ,'Avg Node Degree', 'Edge Density', 'Avg Network Performance Rank', 'Avg Network Performance Rank, Rank',
             'Avg Network Performance Gain Rank', 'Avg Network Performance Gain Rank, Rank']:
    print(item+':\t'+repr(network_summary[item]))