In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import community
from networkx.algorithms.community import girvan_newman
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import itertools
import time

In [10]:
# Load the interaction table and the protein metadata table.
# Both files are read with the same options: gzip-compressed, tab-separated, UTF-8 text.
csv_options = dict(encoding='utf-8', compression='gzip', low_memory=False, sep='\t')
ppi = pd.read_csv(r'/home/compomics/Jana/PPI.csv', **csv_options)
uniprot_ids = pd.read_csv(r'/home/compomics/Jana/Metadata.csv', **csv_options)

# Keep the loaded interaction table untouched; the graph is built from this copy
ppi2 = ppi.copy()

## Create a NetworkX graph

In [3]:
# Build the node and edge lists for the graph from the two accession columns of `ppi2`.

# Unique accessions across both columns, in order of first appearance (column A first).
# Unlike a round-trip through set(), this order is identical on every run, so the
# graph is constructed with the same node order each time the notebook is executed.
nodes = list(
    pd.concat([ppi2.UniprotAccession_A, ppi2.UniprotAccession_B], ignore_index=True).unique()
)

# One (A, B) tuple per row, in row order. The columns are paired positionally instead of
# looking up `column[i]` for every row: that label-based lookup is very slow on a large
# table and only works while the index is the default 0..n-1 range.
edges = list(zip(ppi2.UniprotAccession_A, ppi2.UniprotAccession_B))

In [4]:
# Start from an empty undirected graph
G = nx.Graph()

# Register every protein as a node first, then connect the interacting pairs
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Report the graph size: node count on the first line, edge count on the second
print(G.number_of_nodes(), G.number_of_edges(), sep='\n')

18683
543266


# Girvan-Newman algorithm (edge-betweenness community detection)

In [5]:
# Cluster the graph with girvan_newman and report the wall-clock time it takes
k = 1  # number of items to take from the girvan_newman iterator
time_start = time.perf_counter()
gn = girvan_newman(G)
for comm in itertools.islice(gn, k):
    # Keep the latest item taken, storing each community as a sorted list of its members
    communities = tuple(map(sorted, comm))
time_elapsed = time.perf_counter() - time_start
print(f'The computational time to cluster the network is: {time_elapsed:.2f} seconds')

The computational time to cluster the network is: 391806.76 seconds


In [8]:
# Map each cluster's position in `communities` to the number of proteins it contains
x = {idx: len(members) for idx, members in enumerate(communities)}
x

{0: 18670, 1: 4, 2: 2, 3: 2, 4: 2, 5: 3}

## Breast Carcinoma

In [12]:
# Find which detected clusters contain proteins annotated with Breast Carcinoma.

# Cast every metadata column to str so that missing values become ordinary text and
# str.contains returns a plain boolean mask that can be used to filter the rows.
uniprot_ids = uniprot_ids.astype(str)
# regex=False: the disease name is matched as a literal substring, not as a pattern
bc = uniprot_ids[uniprot_ids['DisGeNet_disease_name'].str.contains('Breast Carcinoma', regex=False)]
bc_proteins = bc.UniprotAccession.tolist()

# Set copy of the accessions for constant-time membership tests; checking against the
# list would rescan it once for every protein of every cluster.
bc_protein_set = set(bc_proteins)

# Cluster index -> number of disease-associated proteins in that cluster.
# Clusters without any such protein are left out of the dict.
communities3 = dict()
for cluster_idx, members in enumerate(communities):
    n_hits = sum(1 for protein in members if protein in bc_protein_set)
    if n_hits > 0:
        communities3[cluster_idx] = n_hits

In [13]:
# Show the number of disease-associated proteins found in each detected cluster
# (keys are cluster indices; clusters without any such protein are absent)
communities3

{0: 307}