In [14]:
from network_evaluation_tools import data_import_tools as dit
from network_evaluation_tools import network_evaluation_functions as nef
from network_evaluation_tools import network_propagation as prop
import pandas as pd
import numpy as np

In [2]:
# Load network (We choose a smaller network here for the example's sake)
network = dit.load_network_file('~/Data/InBioMap_subnetwork.txt', verbose=True)

Network File Loaded: /cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/InBioMap75_Symbol.sif


In [3]:
# Load gene sets for analysis
genesets = dit.load_node_sets('~/Data/DisGeNET_genesets.txt')

In [4]:
# Calculate geneset sub-sample rate
genesets_p = nef.calculate_p(network, genesets)

In [5]:
# Determine optimal alpha for network (can also be done automatically by next step)
alpha = prop.calculate_alpha(network)
print alpha

0.596


In [6]:
# Calculate network kernel for propagation
kernel = nef.construct_prop_kernel(network, alpha=alpha, verbose=True)

Alpha: 0.596
Network Propagation Complete: 76.2080309391 seconds
Propagated network kernel constructed


In [7]:
# Calculate the AUPRC values for each gene set
AUPRC_values = nef.small_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=30, cores=4, verbose=True)

AUPRC Analysis for given node set (69 nodes in network) complete: 0.76 seconds.
AUPRC Analysis for given node set (90 nodes in network) complete: 0.81 seconds.
AUPRC Analysis for given node set (164 nodes in network) complete: 1.29 seconds.
AUPRC Analysis for given node set (139 nodes in network) complete: 1.46 seconds.
AUPRC Analysis for given node set (165 nodes in network) complete: 0.98 seconds.
AUPRC Analysis for given node set (129 nodes in network) complete: 1.07 seconds.
AUPRC Analysis for given node set (238 nodes in network) complete: 1.83 seconds.
AUPRC Analysis for given node set (97 nodes in network) complete: 0.89 seconds.
AUPRC Analysis for given node set (213 nodes in network) complete: 1.45 seconds.
AUPRC Analysis for given node set (162 nodes in network) complete: 0.95 seconds.
AUPRC Analysis for given node set (221 nodes in network) complete: 1.14 seconds.
AUPRC Analysis for given node set (102 nodes in network) complete: 1.27 seconds.
AUPRC Analysis for given node s

**Note about the above cell:** There are a several options for this particular step depending on the computational resources available and network size. If the network is sufficiently small (<250k edges), it is recommended to use the 'small_network_AUPRC_wrapper' function as it can be much faster, especially when run in parallel (at least 8G per core is recommended). If you would like to parallelize the AUPRC calculation with a larger network (between 250K and 2.5M edges), at least 16G per core is recommended, 32G per core if the network contains more than 2.5M edges. For larger networks, it is recommended to use the 'large_network_AUPRC_wrapper', which may be a slightly slower function, but more equipped to handle the larger memory footprint required. To change the parllelization status of the function, change the 'cores' option to the number of threads you would like to utilize.

In [13]:
# Construct null networks and calculate the AUPRC of the gene sets of the null networks
# We can use the AUPRC wrapper function for this
null_AUPRCs = []
for i in range(10):
    shuffNet = nef.shuffle_network(network, max_tries_n=10, verbose=True)
    shuffNet_kernel = nef.construct_prop_kernel(shuffNet, alpha=alpha, verbose=False)
    shuffNet_AUPRCs = nef.small_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=30, cores=4, verbose=False)
    null_AUPRCs.append(shuffNet_AUPRCs)
    print 'shuffNet', repr(i+1), 'AUPRCs calculated'

Network shuffled: 7.99008989334 seconds. Edge similarity: 0.168051958349
shuffNet 1 AUPRCs calculated
Network shuffled: 7.47656393051 seconds. Edge similarity: 0.166261430308
shuffNet 2 AUPRCs calculated
Network shuffled: 7.29569387436 seconds. Edge similarity: 0.168415349649
shuffNet 3 AUPRCs calculated
Network shuffled: 7.99349284172 seconds. Edge similarity: 0.167344997093
shuffNet 4 AUPRCs calculated
Network shuffled: 7.6033039093 seconds. Edge similarity: 0.167146783657
shuffNet 5 AUPRCs calculated
Network shuffled: 7.44416499138 seconds. Edge similarity: 0.167344997093
shuffNet 6 AUPRCs calculated
Network shuffled: 7.43572402 seconds. Edge similarity: 0.166803213701
shuffNet 7 AUPRCs calculated
Network shuffled: 7.36451888084 seconds. Edge similarity: 0.16702124848
shuffNet 8 AUPRCs calculated
Network shuffled: 7.58913898468 seconds. Edge similarity: 0.1675299963
shuffNet 9 AUPRCs calculated
Network shuffled: 7.63543987274 seconds. Edge similarity: 0.166928748877
shuffNet 10 AUPR

**Note about the above cell:** We use a small number to calculate the null AUPRC values, but a larger number of shuffled networks may give a better representation of the true null AUPRC value.  smaller number of networks here for this example, but larger numbers can be used, especially if the resulting distribution of null AUPRCs has a high variance relative to the actual AUPRC values, but we have found that the variance remains relatively small even with a small number of shuffled networks.

In [15]:
# Construct table of null AUPRCs
null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1)
null_AUPRCs_table.columns = ['shuffNet'+repr(i+1) for i in range(len(null_AUPRCs))]

In [16]:
# Calculate performance metric of gene sets
network_performance = nef.calculate_network_performance_score(AUPRC_values, null_AUPRCs_table, verbose=True)
network_performance.name = 'Test Network'

AUPRC values z-normalized


In [17]:
# Calculate network performance gain over median null AUPRC
network_perf_gain = nef.calculate_network_performance_gain(AUPRC_values, null_AUPRCs_table, verbose=True)
network_perf_gain.name = 'Test Network'

AUPRC relative performance gain calculated


In [28]:
# Rank network on average performance across gene sets vs performance on same gene sets in previous network set
all_network_performance = pd.read_csv('~/Data/Network_Performance.csv', index_col=0)
all_network_performance_filt = pd.concat([network_performance, all_network_performance.ix[network_performance.index]], axis=1)
network_performance_rank_table = all_network_performance_filt.rank(axis=1, ascending=False)
network_performance_rankings = network_performance_rank_table['Test Network']

In [30]:
# Rank network on average performance gain across gene sets vs performance gain on same gene sets in previous network set
all_network_perf_gain = pd.read_csv('~/Data/Network_Performance_Gain.csv', index_col=0)
all_network_perf_gain_filt = pd.concat([network_perf_gain, all_network_perf_gain.ix[network_perf_gain.index]], axis=1)
network_perf_gain_rank_table = all_network_performance_filt.rank(axis=1, ascending=False)
network_perf_gain_rankings = network_perf_gain_rank_table['Test Network']

In [31]:
# Network Performance
network_performance_metric_ranks = pd.concat([network_performance, network_performance_rankings, network_perf_gain, network_perf_gain_rankings], axis=1)
network_performance_metric_ranks.columns = ['Network Performance', 'Network Performance Rank', 'Network Performance Gain', 'Network Performance Gain Rank']
network_performance_metric_ranks.sort_values(by=['Network Performance Rank', 'Network Performance', 'Network Performance Gain Rank', 'Network Performance Gain'],
                                             ascending=[True, False, True, False])

Unnamed: 0,Network Performance,Network Performance Rank,Network Performance Gain,Network Performance Gain Rank
Measles,250.901503,1.0,2.506668,1.0
Dermatomyositis,84.107679,1.0,2.436325,1.0
Hereditary Nonpolyposis Colorectal Cancer,76.032861,1.0,1.989239,1.0
Primary immune deficiency disorder,65.549563,1.0,2.35445,1.0
Bloom Syndrome,56.268981,1.0,1.882597,1.0
Sporadic Breast Carcinoma,56.068983,1.0,1.83706,1.0
Hepatoblastoma,48.344917,1.0,1.397277,1.0
Adenocarcinoma of colon,47.950858,1.0,1.195556,1.0
Aplasia Cutis Congenita,41.454155,1.0,1.798434,1.0
Dyslipidemias,79.86717,2.0,2.376294,2.0


In [40]:
# Construct network summary table
network_summary = {}
network_summary['Nodes'] = int(len(network.nodes()))
network_summary['Edges'] = int(len(network.edges()))
network_summary['Avg Node Degree'] = np.mean(network.degree().values())
network_summary['Edge Density'] = 2*network_summary['Edges'] / float((network_summary['Nodes']*(network_summary['Nodes']-1)))
network_summary['Avg Network Performance Rank'] = network_performance_rankings.mean()
network_summary['Avg Network Performance Rank, Rank'] = int(network_performance_rank_table.mean().rank().ix['Test Network'])
network_summary['Avg Network Performance Gain Rank'] = network_perf_gain_rankings.mean()
network_summary['Avg Network Performance Gain Rank, Rank'] = int(network_perf_gain_rank_table.mean().rank().ix['Test Network'])
for item in ['Nodes', 'Edges' ,'Avg Node Degree', 'Edge Density', 'Avg Network Performance Rank', 'Avg Network Performance Rank, Rank',
             'Avg Network Performance Gain Rank', 'Avg Network Performance Gain Rank, Rank']:
    print item+':\t'+repr(network_summary[item])

Nodes:	9432
Edges:	151352
Avg Node Degree:	32.093299406276508
Edge Density:	0.0034029582659608213
Avg Network Performance Rank:	6.53125
Avg Network Performance Rank, Rank:	7
Avg Network Performance Gain Rank:	6.53125
Avg Network Performance Gain Rank, Rank:	7
