In [1]:
import pandas as pd
import networkx as nx
import markov_clustering as mc
import numpy as np
import os

# Task 2.1

# a)

Calculate the following global (i.e. concerning the whole network and not the single nodes)
measures of SGI, U and I (only if no. of nodes >20):

• No. of nodes and no. of links

• No. of connected components

• No. of isolated nodes

• Average path length

• Average degree

• Average clustering coefficient

• Network diameter & radius

• Centralization

In [143]:
# read union dataset
union_df = pd.read_csv('results/union.csv', index_col=0)


In [3]:
intersection_df = pd.read_csv('results/intersection.csv', index_col=0)

In [4]:
sgi_df = pd.read_csv('results/sgi.csv', index_col=0)

In [5]:
def task_2_1_a(df, type_net):
    # undirected graph object
    graph = nx.from_pandas_edgelist(df, source = 'interactor A gene symbol', target='interactor B gene symbol')
    # check number of nodes
    if graph.number_of_nodes() >20:
        n_nodes = graph.number_of_nodes()
        # not consider the duplicates (union can have same edges with different sources: they will be considered just one time)
        n_edges = graph.number_of_edges()
        # number of connected components
        conn_components = nx.number_connected_components(graph)
        # number of isolates
        n_isolates = nx.number_of_isolates(graph)
        # average degree
        avg_degree = n_edges/n_nodes
        # avg clustering coefficient
        avg_cluster_coeff = nx.average_clustering(graph)
        
        print('%s'%type_net,'network has: \n')
        print(n_nodes, 'nodes')
        print(n_edges, 'edges')
        print(conn_components,'connected components')
        print(n_isolates, 'isolated nodes')
        print('average degree = ', avg_degree)
        print('average clustering coefficient = ', avg_cluster_coeff)
        
        # if graph is connected
        if conn_components ==1:
            
            # average shprtest path length
            avg_path = nx.average_shortest_path_length(graph)
            # diameter 
            diameter = nx.diameter(graph)
            # radius 
            radius = nx.radius(graph)
        
            print('shortest path length = ', avg_path)
            print('diameter = ', diameter)
            print('radius = ', radius)
            
        # if graph not connected  
        else:
            ll=[]
            #for each connected component computes the properties
            c = 1
            for g in nx.connected_component_subgraphs(graph): 
                print('Connected Component',c)
                print('average Shortest Path: ', nx.average_shortest_path_length(g))
                print('diameter', nx.diameter(g))
                print('radius', nx.radius(g))
                c +=1
    else:
        print('%s'%type_net,'network do not have a number of edges bigger than 20')

In [6]:
task_2_1_a(intersection_df, 'Intersection')

Intersection network has: 

57 nodes
89 edges
10 connected components
0 isolated nodes
average degree =  1.5614035087719298
average clustering coefficient =  0.09359509885825675
Connected Component 1
average Shortest Path:  3.6659619450317127
diameter 9
radius 5
Connected Component 2
average Shortest Path:  0
diameter 0
radius 0
Connected Component 3
average Shortest Path:  0
diameter 0
radius 0
Connected Component 4
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 5
average Shortest Path:  1.3333333333333333
diameter 2
radius 1
Connected Component 6
average Shortest Path:  0
diameter 0
radius 0
Connected Component 7
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 8
average Shortest Path:  0
diameter 0
radius 0
Connected Component 9
average Shortest Path:  0
diameter 0
radius 0
Connected Component 10
average Shortest Path:  0
diameter 0
radius 0


In [7]:
task_2_1_a(union_df, 'Union')

Union network has: 

5513 nodes
10329 edges
2 connected components
0 isolated nodes
average degree =  1.8735715581353165
average clustering coefficient =  0.07321978067098309
Connected Component 1
average Shortest Path:  3.4924265343579193
diameter 7
radius 4
Connected Component 2
average Shortest Path:  1.5
diameter 2
radius 1


In [8]:
task_2_1_a(sgi_df, 'SGI')

SGI network has: 

74 nodes
176 edges
13 connected components
0 isolated nodes
average degree =  2.3783783783783785
average clustering coefficient =  0.15667418167418168
Connected Component 1
average Shortest Path:  3.23879781420765
diameter 7
radius 4
Connected Component 2
average Shortest Path:  0
diameter 0
radius 0
Connected Component 3
average Shortest Path:  0
diameter 0
radius 0
Connected Component 4
average Shortest Path:  0
diameter 0
radius 0
Connected Component 5
average Shortest Path:  0
diameter 0
radius 0
Connected Component 6
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 7
average Shortest Path:  0
diameter 0
radius 0
Connected Component 8
average Shortest Path:  0
diameter 0
radius 0
Connected Component 9
average Shortest Path:  0
diameter 0
radius 0
Connected Component 10
average Shortest Path:  0
diameter 0
radius 0
Connected Component 11
average Shortest Path:  0
diameter 0
radius 0
Connected Component 12
average Shortest Path:  0
diameter 0
rad

# b) 

Isolate the largest connected component (LCC) of I and U and calculate the following global and local (i.e. for each node) measures:

### Global:

• N. of nodes and no. of links

• Average path length

• Average degree

• Average clustering coefficient

• Network diameter & radius

• Centralization

### Local:

• Node degree

• Betweenness centrality

• Eigenvector centrality

• Closeness centrality

• ratio Betweenness/Node degree


Store the results in a suitable matrix format of your choice.

### NB


Beacuse of the better computational time, we computed some global measures (avg shortest path, radius and diameter) and the local ones through the R igraph package.

In [33]:
# save the lcc adjacency matrix to csv file in order to use it on R
def lcc_a_matrix_to_csv(df, data_type):
    
    # create graph from df
    graph = nx.from_pandas_edgelist(df, source = 'interactor A gene symbol', target='interactor B gene symbol')
    # return set of nodes of the largest connected component
    lcc_set = max(nx.connected_components(graph), key=len)
    # lcc subgraph
    lcc_graph = graph.subgraph(lcc_set)
    # numpy adj matrix
    a= nx.to_numpy_matrix(lcc_graph)
    try:
        os.remove('data/%s'%data_type +'_lcc_matrix.csv')
    except:
        pass
    # save to csv
    #np.savetxt('data/%s'%data_type+'_lcc_matrix.csv', a, delimiter=",", header='')
    pd.DataFrame(a).to_csv('data/%s'%data_type +'_lcc_matrix.csv')
    
# save union matrix to csv
lcc_a_matrix_to_csv(union_df, 'union')
# save intersection matrix to csv
lcc_a_matrix_to_csv(intersection_df, 'intersection')

In [9]:
def task_2_1_b_global(df, type_net):
    # create graph from df
    graph = nx.from_pandas_edgelist(df, source = 'interactor A gene symbol', target='interactor B gene symbol')
    # return set of nodes of the largest connected component
    lcc_set = max(nx.connected_components(graph), key=len)
    # lcc subgraph
    lcc_graph = graph.subgraph(lcc_set)
    # number of nodes
    n_nodes = lcc_graph.number_of_nodes()
    # number of edges
    # not consider the duplicates (union can have same edges with different sources: they will be considered just one time)
    n_edges = lcc_graph.number_of_edges()
    # average degree
    avg_degree = n_edges/n_nodes
    # avg clustering coefficient
    avg_cluster_coeff = nx.average_clustering(lcc_graph)
    
    
    print('%s'%type_net,'lcc has: \n')
    print(n_nodes, 'nodes')
    print(n_edges, 'edges')
    print('average degree = ', avg_degree)
    print('average clustering coefficient = ', avg_cluster_coeff)
    
      

In [10]:
task_2_1_b_global(union_df, 'Union')

Union lcc has: 

5509 nodes
10326 edges
average degree =  1.8743873661281538
average clustering coefficient =  0.0732729444253276


In [11]:
task_2_1_b_global(intersection_df, 'Intersection')

Intersection lcc has: 

44 nodes
77 edges
average degree =  1.75
average clustering coefficient =  0.12124819624819624


In [45]:
# read adj matrix of lcc intersection
#pd.read_csv('data/intersection_lcc_matrix.csv').drop(['Unnamed: 0'],axis=1)

# Task 2.2

Cluster I-LCC and U-LCC using the MCL algorithm to get the modules.
Once you have clustered the networks, find modules with no. of nodes >= 10 in which seed genes are statistically overrepresented (p<0.05) by applying a hypergeometric test: such modules will be the “putative disease modules”.
Store the results for both U-LCC and I-LCC in tables including in each row: clustering algorithm used, module ID, no. of seed genes in the module, total no. of genes in each module, seed gene IDs, all gene IDs in the module, p-value.