In [1]:
import pandas as pd
import numpy as np
import markov_clustering as mc
import networkx as nx
import time
from networkx.algorithms import community
from cdlib import algorithms
import community as community_louvain

In [2]:
# Load the protein-protein interaction (PPI) table and the protein metadata table.
# NOTE(review): both locations are absolute, machine-specific paths; consider moving
# them to a configurable data directory.
PPI_PATH = r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\PPI data 20_08_2020\\PPI with Uniprot IDs\\PPI.csv'
METADATA_PATH = r'D:\\Jana De Coster\\Documents\\Ugent\\2de master\\Master thesis\\Network\\Data interaction network\\Metadata\\Filtered metadata\\Metadata.csv'

# Both files are read with the same settings: gzip-compressed, tab-separated, UTF-8.
READ_CSV_OPTIONS = dict(encoding = 'utf-8', compression = 'gzip', low_memory = False, sep = '\t')

ppi = pd.read_csv(PPI_PATH, **READ_CSV_OPTIONS)
# Every metadata column is cast to str so that string methods can be applied to any column.
uniprot_ids = pd.read_csv(METADATA_PATH, **READ_CSV_OPTIONS).astype(str)

# Work on a copy so the table as loaded stays available in `ppi`.
ppi2 = ppi.copy()

## Create a NetworkX graph

In [3]:
# Collect every protein accession that occurs in either interaction column.
# Series.unique() returns values in order of first appearance, so the node list is the
# same on every run; the order of list(set(...)) depends on Python's per-process string
# hashing and could therefore differ between kernel restarts.
nodes = pd.concat(
    [ppi2.UniprotAccession_A, ppi2.UniprotAccession_B], ignore_index = True
).unique().tolist()

# One (A, B) tuple per interaction row, in row order. Pairing the two columns
# positionally avoids a label lookup per row and does not rely on ppi2 having a
# 0..n-1 index.
edges = list(zip(ppi2.UniprotAccession_A, ppi2.UniprotAccession_B))

In [4]:
# Build the undirected interaction graph: register every protein as a node first,
# then connect the interacting pairs.
# nx.Graph keeps at most one edge per node pair, so repeated pairs in `edges`
# collapse into a single edge.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Report the graph size: node count on the first line, edge count on the second.
print(G.number_of_nodes(), G.number_of_edges(), sep = '\n')

18683
543266


# Louvain clustering algorithm

In [5]:
# Run Louvain community detection and time it.
# best_partition returns a dictionary mapping each node (protein) to the id of its
# community; the ids are numbered from 0 upwards.
# NOTE(review): newer python-louvain releases may deprecate `randomize` in favour of
# `random_state` - confirm against the installed version.
time_start = time.perf_counter()
louvain_dict = community_louvain.best_partition(G, resolution = 1, randomize = False)
time_elapsed = time.perf_counter() - time_start
print(f'The computational time to cluster the network is: {time_elapsed:.2f} seconds')

# Invert the node -> community mapping into community id -> list of member proteins.
clusters = {}
for protein, cluster_id in louvain_dict.items():
    clusters.setdefault(cluster_id, []).append(protein)

# Number of proteins in each cluster (community id -> cluster size).
clusters2 = {cluster_id: len(members) for cluster_id, members in clusters.items()}
clusters2

The computational time to cluster the network is: 36.02 seconds


{0: 863,
 1: 3916,
 2: 2417,
 3: 1298,
 4: 2142,
 5: 1276,
 6: 1914,
 7: 2519,
 8: 122,
 9: 845,
 10: 696,
 11: 654,
 12: 6,
 13: 2,
 14: 6,
 15: 2,
 16: 3,
 17: 2}

## Breast carcinoma

In [6]:
# Select the metadata rows whose DisGeNet disease name contains 'Breast Carcinoma'
# (case-sensitive substring match) and collect their UniProt accessions.
bc = uniprot_ids[uniprot_ids['DisGeNet_disease_name'].str.contains('Breast Carcinoma')]
bc_proteins = bc.UniprotAccession.tolist()

# Set copy used only for fast membership tests; bc_proteins itself stays a list.
bc_protein_set = set(bc_proteins)

# Three summaries, filled only for clusters that contain at least one disease-associated protein:
#   communities  : cluster id -> list of the disease-associated proteins in that cluster
#   communities2 : cluster id -> total number of proteins in that cluster
#   communities3 : cluster id -> number of disease-associated proteins in that cluster
communities = dict()
communities2 = dict()
communities3 = dict()

# A single pass over the clusters in ascending id order. Iterating over the actual keys
# (instead of range(len(clusters))) also works when the ids are not exactly 0..n-1.
for cluster_id in sorted(clusters):
    members = clusters[cluster_id]
    disease_members = [protein for protein in members if protein in bc_protein_set]
    if disease_members:
        communities[cluster_id] = disease_members
        communities2[cluster_id] = len(members)
        communities3[cluster_id] = len(disease_members)

In [7]:
# Report how many breast carcinoma accessions were selected from the metadata table.
# This is the length of the bc_proteins list, which is built from the metadata alone,
# so it is not restricted to proteins that are present in the graph.
# NOTE(review): an accession occurring in several matching metadata rows would be
# counted more than once - confirm the metadata has one row per protein.
print(f'The amount of disease-associated proteins is {len(bc_proteins)}')

The amount of disease-associated proteins is 308


In [12]:
# Summarise the disease-associated clusters. communities3 maps a cluster id to the
# number of disease-associated proteins in that cluster, so its length is the number
# of clusters holding at least one such protein.
# max()/min() raise ValueError when communities3 is empty, i.e. when no cluster
# contains a disease-associated protein.
print(f'The amount of disease-associated clusters is {len(communities3)}')
print(f'The maximum amount of disease-associated proteins per cluster is {max(communities3.values())}')
print(f'The minimum amount of disease-associated proteins per cluster is {min(communities3.values())}')

The amount of disease-associated clusters is 11
The maximum amount of disease-associated proteins per cluster is 150
The minimum amount of disease-associated proteins per cluster is 1


In [10]:
# Total number of proteins that remain when only the clusters containing at least one
# disease-associated protein are kept: the sum of those clusters' sizes.
x = sum(communities2.values())
x

18540