In [1]:
import time

import markov_clustering as mc
import networkx as nx
import numpy as np
import pandas as pd
from cdlib import algorithms
from networkx.algorithms import community
from scipy import sparse

In [2]:
# Load the interaction table and the protein metadata.
# Both files are read as gzip-compressed, tab-separated UTF-8 text.
# NOTE(review): the paths below are absolute and machine-specific.
read_options = dict(encoding = 'utf-8', compression = 'gzip', low_memory = False, sep = '\t')

ppi = pd.read_csv(
    r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PPI.csv',
    **read_options,
)

# Every metadata column is cast to str right after loading
uniprot_ids = pd.read_csv(
    r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\Metadata\\Filtered metadata\\Metadata.csv',
    **read_options,
).astype(str)

# Working copy of the interaction table; `ppi` keeps the frame as loaded
ppi2 = ppi.copy()

# Create a NetworkX graph

In [3]:
# Every accession that appears on either side of an interaction, de-duplicated
nodes = list(
    set(ppi2.UniprotAccession_A.unique()).union(ppi2.UniprotAccession_B.unique())
)

# One (A, B) tuple per interaction row. The two columns are paired positionally,
# which avoids a label lookup per row and does not rely on ppi2 having a
# default 0..n-1 index.
edges = list(zip(ppi2.UniprotAccession_A, ppi2.UniprotAccession_B))

In [4]:
# Build the graph: start empty, register the nodes, then the interactions
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Report the graph size: node count on the first line, edge count on the second
print(G.number_of_nodes(), G.number_of_edges(), sep = '\n')

18683
543266


# Markov Clustering Algorithm (MCL)

In [5]:
# Get the adjacency matrix of G in sparse form.
# NetworkX 3.0 removed `to_scipy_sparse_matrix`; when it is missing, use
# `to_scipy_sparse_array` and wrap the result in a csr_matrix so that `matrix`
# is still a scipy sparse *matrix*, as it was with the old call.
if hasattr(nx, 'to_scipy_sparse_matrix'):
    matrix = nx.to_scipy_sparse_matrix(G)
else:
    matrix = sparse.csr_matrix(nx.to_scipy_sparse_array(G))

# Run the MCL algorithm and extract the clusters, timing both steps together
time_start = time.perf_counter()
result = mc.run_mcl(matrix)
clusters = mc.get_clusters(result)
time_elapsed = time.perf_counter() - time_start
print(f'The computational time to cluster the network is: {time_elapsed:.2f} seconds')

print(f'The amount of clusters detected: {len(clusters)}')

The computational time to cluster the network is: 38.70 seconds
The amount of clusters detected: 3703


In [6]:
# Translate the integer indices in each MCL cluster back into node labels.
# The node list is materialised once up front; rebuilding it for every cluster
# member made this step quadratic in the number of nodes.
node_labels = list(G.nodes())

# One set of node labels per cluster, in the same order as `clusters`
clusters2 = [{node_labels[idx] for idx in cluster} for cluster in clusters]

In [7]:
# Cluster index -> number of proteins in that cluster
mcl_dict_len = {idx: len(members) for idx, members in enumerate(clusters2)}

# Cluster index -> the proteins belonging to that cluster
mcl_dict = dict(enumerate(clusters2))

In [8]:
# Size of the largest cluster
max(mcl_dict_len.values())

1178

In [9]:
# Size of the smallest cluster
min(mcl_dict_len.values())

1

## Breast Carcinoma

In [10]:
# Select the metadata rows whose disease name contains 'Breast Carcinoma'
# (literal substring match) and collect their accessions
bc = uniprot_ids[uniprot_ids['DisGeNet_disease_name'].str.contains('Breast Carcinoma', regex = False)]
bc_proteins = bc.UniprotAccession.tolist()

# Set copy used only for constant-time membership tests; bc_proteins stays a list
bc_protein_set = set(bc_proteins)

# All three dictionaries are keyed by cluster index and only get an entry for
# clusters that contain at least one disease-associated protein:
#   communities  -> the disease-associated proteins found in the cluster
#   communities2 -> the total number of proteins in the cluster
#   communities3 -> the number of disease-associated proteins in the cluster
communities = dict()
communities2 = dict()
communities3 = dict()
for i in range(len(mcl_dict)):
    members = mcl_dict[i]
    hits = [protein for protein in members if protein in bc_protein_set]
    if hits:
        communities[i] = hits
        communities2[i] = len(members)
        communities3[i] = len(hits)

In [11]:
# Report how many accessions were collected for the disease
n_bc_proteins = len(bc_proteins)
print(f'The amount of disease-associated proteins is {n_bc_proteins}')

The amount of disease-associated proteins is 308


In [16]:
# Per-cluster counts of disease-associated proteins (one value per disease cluster)
bc_counts = communities3.values()

print(f'The amount of disease-associated clusters is {len(bc_counts)}')
print(f'The maximum amount of disease-associated proteins per cluster is {max(bc_counts)}')
print(f'The minimum amount of disease-associated proteins per cluster is {min(bc_counts)}')

The amount of disease-associated clusters is 155
The maximum amount of disease-associated proteins per cluster is 18
The minimum amount of disease-associated proteins per cluster is 1


In [15]:
# Total number of proteins summed over the cluster sizes stored in communities2
x = sum(communities2.values())
x

6168