In [1]:
import pandas as pd
import networkx as nx
import markov_clustering as mc
import numpy as np
import os
from scipy.stats import hypergeom

In [2]:
# read datasets
union_df = pd.read_csv('results/union.csv', index_col=0)
intersection_df = pd.read_csv('results/intersection.csv', index_col=0)
sgi_df = pd.read_csv('results/sgi.csv', index_col=0)

In [27]:
def task_2_1_a(df, type_net):
    # undirected graph object
    graph = nx.from_pandas_edgelist(df, source = 'interactor A gene symbol', target='interactor B gene symbol')
    # check number of nodes
    if graph.number_of_nodes() >20:
        n_nodes = graph.number_of_nodes()
        # not consider the duplicates (union can have same edges with different sources: they will be considered just one time)
        n_edges = graph.number_of_edges()
        # number of connected components
        conn_components = nx.number_connected_components(graph)
        # number of isolates
        n_isolates = nx.number_of_isolates(graph)
        # average degree
        avg_degree = n_edges/n_nodes
        # avg clustering coefficient
        avg_cluster_coeff = nx.average_clustering(graph)
        
        print('%s'%type_net,'network has: \n')
        print(n_nodes, 'nodes')
        print(n_edges, 'edges')
        print(conn_components,'connected components')
        print(n_isolates, 'isolated nodes')
        print('average degree = ', avg_degree)
        print('average clustering coefficient = ', avg_cluster_coeff)
        
        # if graph is connected
        if conn_components ==1:
            
            # average shprtest path length
            avg_path = nx.average_shortest_path_length(graph)
            # diameter 
            diameter = nx.diameter(graph)
            # radius 
            radius = nx.radius(graph)
        
            print('shortest path length = ', avg_path)
            print('diameter = ', diameter)
            print('radius = ', radius)
            
        # if graph not connected  
        else:
            ll=[]
            #for each connected component computes the properties
            c = 1
            for g in nx.connected_component_subgraphs(graph): 
            
                print('Connected Component',c)
                print('Number of nodes: ', nx.number_of_nodes(g))
                print('Number of edges: ', nx.number_of_edges(g))
                print('average Shortest Path: ', nx.average_shortest_path_length(g))
                print('diameter', nx.diameter(g))
                print('radius', nx.radius(g))
                c +=1
    else:
        print('%s'%type_net,'network do not have a number of edges bigger than 20')

In [28]:
task_2_1_a(intersection_df, 'Intersection')

Intersection network has: 

57 nodes
89 edges
10 connected components
0 isolated nodes
average degree =  1.5614035087719298
average clustering coefficient =  0.09359509885825675
Connected Component 1
Number of nodes:  44
Number of edges:  77
average Shortest Path:  3.6659619450317127
diameter 9
radius 5
Connected Component 2
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 3
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 4
Number of nodes:  2
Number of edges:  2
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 5
Number of nodes:  3
Number of edges:  3
average Shortest Path:  1.3333333333333333
diameter 2
radius 1
Connected Component 6
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 7
Number of nodes:  2
Number of edges:  1
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 8
Numb

In [29]:
task_2_1_a(union_df, 'Union')

Union network has: 

5513 nodes
10329 edges
2 connected components
0 isolated nodes
average degree =  1.8735715581353165
average clustering coefficient =  0.07321978067098309
Connected Component 1
Number of nodes:  5509
Number of edges:  10326
average Shortest Path:  3.4924265343579193
diameter 7
radius 4
Connected Component 2
Number of nodes:  4
Number of edges:  3
average Shortest Path:  1.5
diameter 2
radius 1


In [30]:
task_2_1_a(sgi_df, 'SGI')

SGI network has: 

74 nodes
176 edges
13 connected components
0 isolated nodes
average degree =  2.3783783783783785
average clustering coefficient =  0.15667418167418168
Connected Component 1
Number of nodes:  61
Number of edges:  164
average Shortest Path:  3.23879781420765
diameter 7
radius 4
Connected Component 2
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 3
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 4
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 5
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 6
Number of nodes:  2
Number of edges:  1
average Shortest Path:  1.0
diameter 1
radius 1
Connected Component 7
Number of nodes:  1
Number of edges:  1
average Shortest Path:  0
diameter 0
radius 0
Connected Component 8
Number of nodes:  1
Number of ed

In [81]:
'''
class LCC

Save to csv the adjacency matrix in order to use it on R.

Compute global and local measures of the largest connected component 
    of the graph corresponding to the given df.
'''

class LCC(object):
    def __init__(self, dataframe, data_type):
        '''
        - dataframe: dataset corresponding to the union or the intersection
        
        - data_type: string to indicate type of df ('union' or 'intersection')
        '''
        
        self.df = dataframe
        self.data_type = data_type
    
    def lcc_graph(self):
        '''
        Returns:
        
            - largest connected component graph
        '''
        # create graph from df
        graph = nx.from_pandas_edgelist(self.df, source = 'interactor A gene symbol', target='interactor B gene symbol')
        # return set of nodes of the largest connected component
        lcc_set = max(nx.connected_components(graph), key=len)
        # lcc subgraph
        lcc_graph = graph.subgraph(lcc_set)
        return lcc_graph
    
    def lcc_a_matrix_to_csv(self):
        '''
        Returns:
        
            - file .csv with the adjacency matrix of the graph corresponding 
                to the df given in input to the class. 
        
        It is necessary saving the matrix on an external file in order to use it for more 
            computations in R, where specific functions are faster.
        '''
    
        # lcc subgraph
        lcc_graph = self.lcc_graph()
        # numpy adj matrix
        a= nx.to_numpy_matrix(lcc_graph)
        try:
            os.remove('data/%s'%self.data_type +'_lcc_matrix.csv')
        except:
            pass
        # save to csv
        #np.savetxt('data/%s'%data_type+'_lcc_matrix.csv', a, delimiter=",", header='')
        pd.DataFrame(a).to_csv('data/%s'%self.data_type +'_lcc_matrix.csv')

    def task_2_1_b_global(self):
        '''
        Returns:
        
            - df: dataframe of global measures for the graph which corresponds to the 
                input dataframe (union or inters).
            
            - nodes: list of nodes names of the graph.
        '''
        # lcc subgraph
        lcc_graph = self.lcc_graph()
        # number of nodes
        n_nodes = lcc_graph.number_of_nodes()
        # number of edges
        # not consider the duplicates (union can have same edges with different sources: they will be considered just one time)
        n_edges = lcc_graph.number_of_edges()
        # average degree
        avg_degree = n_edges/n_nodes
        # avg clustering coefficient
        avg_cluster_coeff = nx.average_clustering(lcc_graph)
        # create df
        df = pd.DataFrame([n_nodes, n_edges, avg_degree, avg_cluster_coeff])
        df.rename({0:'Nodes', 1:'Edges', 2:'Avg Degree', 3:'Avg Clustering Coeff'}, inplace = True)
        # lcc nodes
        nodes = list(lcc_graph.nodes())
        return df, nodes

    def merge_global_measures(self):
        '''
        Return:
        
            - dataframe containing the global measures of the graph computed on Python and R.
                In particular there are: number of edges and nodes, avg degree, avg clustering coefficient, 
                    avg. path length, diameter and radius. 
        
        '''
        # compute the above function to obtain global measures
        lcc_global1, lcc_nodes  = self.task_2_1_b_global()
        # upload the df containg avg shortest path, diameter and radius
        # it was computed on R because of the better computational time of the functions
        lcc_global2 = pd.read_csv('data/%s'%self.data_type+'_lcc_global_results.csv')
        # concatenate the 2 dataframe in one
        lcc_global = pd.concat([lcc_global1.T, lcc_global2], axis = 1).drop(['Unnamed: 0'], axis=1)
        lcc_global = lcc_global.rename({0:'lcc_%s'%self.data_type}) 
        
        return lcc_global
    
    def save_local_results(self):
        '''
        Returns:
        
            - file .csv with the local measures related to the graph of the input df (union or intersection)
        '''
        # lcc nodes names
        _,lcc_nodes = self.task_2_1_b_global()
        # load local dfs computed on R because of the better computational time
        lcc_local = pd.read_csv('data/%s'%self.data_type +'_lcc_local_results.csv').drop('Unnamed: 0', axis=1)
        # rename rows with gene names
        lcc_local = lcc_local.rename(dict(zip([x for x in range(len(lcc_local))],lcc_nodes)))
        # save local df results
        try:
            os.remove('results/%s'%self.data_type +'_lcc_local.csv')
        except:
            pass

        lcc_local.to_csv('results/%s'%self.data_type +'_lcc_local.csv')
        
    def mcl_algo(self):
        '''
        Returns:
            
            - Scipy sparse adjacency matrix.
            - List of clusters: each cluster is a sublist containing the nodes which belongs to it.
            
        '''
        graph = self.lcc_graph()
        adj_matrix = nx.to_scipy_sparse_matrix(graph)
        ### !!! attention: it works only with scipy version 1.2.0 !!!
        result = mc.run_mcl(adj_matrix)           # run MCL with default parameters
        clusters = mc.get_clusters(result)  # get clusters
        return adj_matrix,clusters
    
    def mcl_plot():
        adj_matrix, clusters = self.mcl_algo()
        mc.draw_graph(adj_matrix, clusters, node_size=50, with_labels=False, edge_color="silver")
        
        
    def hypergeometric_test(self):
        '''
        Returns:
        
            - Dictionary where for each cluster id gives the corresponding p-value.
        '''
        _ , lcc_clusters = self.mcl_algo()
        # list of lcc nodes names
        _, lcc_nodes = self.task_2_1_b_global()
        # list of seeds genes
        seed_genes = list(pd.read_csv('seed_genes.txt', header=None)[0])
        # seeds in lcc
        seeds_lcc = [x for x in seed_genes if x in lcc_nodes]
        # population M = number of genes in lcc
        pop = len(lcc_nodes)
        #initialize p-values dictionary
        p_values = {}
        for cluster in lcc_clusters:
            # n nodes in cluster
            cluster_dimension = len(cluster)
            # check number of nodes
            if cluster_dimension >= 10:
                # list of seed genes in cluster
                seeds_in_cluster = [x for x in cluster if lcc_nodes[x] in seeds_lcc]
                # number of seeds in cluster
                n_seeds_in_cluster = len(seeds_in_cluster)
                M,n,N,x =  pop, cluster_dimension,len(seeds_lcc),n_seeds_in_cluster
                pval = hypergeom.sf(x-1, M, n, N)
                p_values[c] = pval
        # putatitive diseases modules dictionary
        pdm_dic = {}
        for i in p_values.keys():

            if p_values[i] < 0.05:
                pdm_dic[i] = p_values[i]

        return pdm_dic
        

In [82]:
# call the class for the 2 dataframes
LCC_union = LCC(union_df, 'union')
LCC_inters = LCC(intersection_df, 'intersection')


In [32]:
# save adjacency matrix for the 2 graphs in order to use it on R
LCC_union.lcc_a_matrix_to_csv()
LCC_inters.lcc_a_matrix_to_csv()

### TASK 2.1 B - GLOBAL
# union global measures df 
lcc_union_global = LCC_union.merge_global_measures()
# intersection global measures df 
lcc_inters_global = LCC_inters.merge_global_measures()
# concatenate the 2 final dataframes
lcc_global = pd.concat([lcc_union_global, lcc_inters_global], axis = 0)
# save resulting df on csv
# remove if already existing adn then create the new one
try:
    os.remove('results/lcc_global.csv')
except: 
    pass
lcc_global.to_csv('results/lcc_global.csv')


### TASK 2.1 B - LOCAL

# save lcc union local results
LCC_union.save_local_results()
# save lcc intersection local results
LCC_inters.save_local_results()



In [83]:
### TASK 2.2
pdm_lcc_union = LCC_union.hypergeometric_test()
pdm_lcc_inters = LCC_inters.hypergeometric_test()

In [84]:
pdm_lcc_union


    


{}

In [85]:
pdm_lcc_inters

{}