Notebook to analyse the structural properties of the PPI and GDA networks

In [16]:
import os
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from pyvis.network import Network

### RAW PPI

1. Order: The order of a graph is the total number of vertices it contains.

2. Size: The size of a graph is the total number of edges it has.

3. Degree: The degree of a vertex is the number of edges incident to it. In directed graphs, there can be both an incoming degree and an outgoing degree for each vertex.

4. Regularity: A graph is said to be regular if all of its vertices have the same degree.

5. Connectivity: A graph is either connected or disconnected:

   - Connected graph: A graph is connected if there is a path between every pair of vertices.

   - Disconnected graph: A graph is disconnected if there are at least two vertices that do not have a path connecting them.

6. Components: In a disconnected graph, the maximal connected subgraphs are called components. Each component is itself a connected graph.

7. Cycles: A cycle is a closed path in a graph that starts and ends at the same vertex, passing through different vertices.

8. Trees: A tree is an acyclic, connected graph. It does not contain any cycles.

9. Planarity: A graph is planar if it can be drawn on a plane without any edges crossing each other.

10. Diameter: The diameter of a graph is the maximum distance between any pair of vertices. It represents the longest shortest path in the graph.

11. Clustering coefficient: The clustering coefficient measures the tendency of vertices in a graph to form clusters or communities.

12. Centrality measures: eigenvector centrality, betweenness centrality, and closeness centrality

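13. Average shortest path length: The average shortest path length is the mean of the shortest-path distances over all pairs of vertices.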

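14. Density: The density of a graph is the ratio of the number of edges it has to the maximum number of edges possible among its vertices.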

In [6]:
path_to_PPI = 'Datasets/BIOGRID-ORGANISM-Homo_sapiens-4.4.206.tab3.txt'

print('[+] Reading PPI...', end='')
biogrid = pd.read_csv(path_to_PPI, sep='\t', low_memory=False)

# Drop every interaction in which either partner is not human:
# both organism ID columns must equal 9606.
both_human = (
    biogrid['Organism ID Interactor A'].eq(9606)
    & biogrid['Organism ID Interactor B'].eq(9606)
)
biogrid = biogrid.loc[both_human]
print('ok')

[+] Reading PPI...ok


In [7]:
print('[+] Creating the graph...', end='')
G = nx.Graph()

# Normalise the gene symbols ('-' and '.' both become '_') column-wise and add
# all edges in one call; iterating the table row by row with iterrows() is
# far slower and yields the same graph.
symbol_columns = ['Official Symbol Interactor A', 'Official Symbol Interactor B']
clean_symbols = biogrid[symbol_columns].apply(
	lambda col: col.str.replace('-', '_', regex=False).str.replace('.', '_', regex=False)
)
G.add_edges_from(clean_symbols.itertuples(index=False, name=None))
print('ok')

print('\t[+] Added', G.number_of_nodes(), 'nodes')
print('\t[+] Added', G.number_of_edges(), 'edges')

# Remove self loops
print('[+] Removing self loops...', end='')
G.remove_edges_from(nx.selfloop_edges(G))
print('ok')

print('\t[+]', G.number_of_nodes(), 'nodes')
print('\t[+]', G.number_of_edges(), 'edges')

# Let's take only the largest connected component.
# The components are materialised once and reused for both the count and the LCC.
connected_sets = list(nx.connected_components(G))
ncc = len(connected_sets)
print(ncc, 'connected_components')
print('[+] Taking the LCC...', end='')
lcc = max(connected_sets, key=len)
# .copy() detaches the result from the original graph (subgraph() alone returns a view)
G = G.subgraph(lcc).copy()
print('ok')

print('\t[+]', G.number_of_nodes(), 'nodes')
print('\t[+]', G.number_of_edges(), 'edges')

[+] Creating the graph...ok
	[+] Added 19764 nodes
	[+] Added 682198 edges
[+] Removing self loops...ok
	[+] 19764 nodes
	[+] 678932 edges
4 connected_components
[+] Taking the LCC...ok
	[+] 19761 nodes
	[+] 678932 edges


#### Structural properties

In [8]:
# 1. Order (number of vertices)
order = G.number_of_nodes()
print("Order:", order)

# 2. Size (number of edges)
size = G.number_of_edges()
print("Size:", size)

# 3. Degree: one entry per node, in node-iteration order
degree_sequence = [deg for _, deg in G.degree()]
avg_degree = round(sum(degree_sequence) / len(degree_sequence), 3)
print("Average degree:", avg_degree)

# 4. Regularity: regular means the degree sequence holds a single distinct value
is_regular = len(set(degree_sequence)) <= 1
print("Is regular:", is_regular)

# 5. Connectivity
is_connected = nx.is_connected(G)
print("Is connected:", is_connected)

# 6. Components: count them lazily, without building a list
components = nx.connected_components(G)
print("Components:", sum(1 for _ in components))

# 7. Cycles: the number reported is the size of a cycle basis of G
cycles = len(nx.cycle_basis(G))
print("Cycles:", cycles)

# 8. Trees
is_tree = nx.is_tree(G)
print("Is tree:", is_tree)

# 9. Planarity: check_planarity returns (flag, embedding-or-counterexample); only the flag is kept
is_planar, _ = nx.check_planarity(G)
print("Is planar:", is_planar)

# 10. Diameter
diameter = nx.diameter(G)
print("Diameter:", diameter)

# 11. Clustering coefficient (averaged over all nodes)
clustering_coefficient = nx.average_clustering(G)
print("Clustering coefficient:", clustering_coefficient)

# 12. Centrality measures (example: degree centrality) -- left disabled
# degree_centrality = nx.degree_centrality(G)
# print("Degree centrality:", degree_centrality)

Order: 19761
Size: 678932
Average degree: 68.714
Is regular: False
Is connected: True
Components: 1
Cycles: 659172
Is tree: False
Is planar: False
Diameter: 7
Clustering coefficient: 0.11514935421331564


In [9]:
# 14. Density of G, as computed by networkx (G is the undirected nx.Graph built earlier)
density = nx.density(G)
print('Density:', density)

Density: 0.0034774461699904956


In [11]:
# 13. Average shortest path length of G.
# NOTE(review): kept in its own cell, presumably because it is the slowest
# metric on a graph of this size -- confirm before merging with other cells.
avg_spl = nx.average_shortest_path_length(G)
print('Average shortest path length:', avg_spl)

Average shortest path length: 2.8044735244061267


#### Visualization v1

In [None]:
# Generate positions for nodes using a layout algorithm.
# The layout starts from random positions; fixing the seed makes the figure
# reproducible across reruns.
pos = nx.spring_layout(G, seed=42)

# Create a list of edge coordinates for Plotly.
# All edges are drawn as one line trace; the None after each pair of
# endpoints breaks the line so consecutive edges are not joined.
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Create a Scatter trace for edges
edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Degree of every node, in the same order as G.nodes(); drives hover text and colour
node_degree = dict(G.degree())

# Create a Scatter trace for nodes (text and colour are set directly here)
node_trace = go.Scatter(
    x=[pos[node][0] for node in G.nodes()],
    y=[pos[node][1] for node in G.nodes()],
    mode='markers',
    hoverinfo='text',
    text=[f'Node: {node}<br>Degree: {degree}' for node, degree in node_degree.items()],
    marker=dict(
        showscale=False,
        colorscale='YlGnBu',
        reversescale=True,
        color=list(node_degree.values()),
        size=10,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the legacy `titleside` attribute
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
        line_width=2
    )
)

# Create the figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    # Nested title dict replaces the legacy `titlefont_size` attribute
                    title=dict(text='PPI', font=dict(size=16)),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    annotations=[dict(
                        text='',
                        showarrow=False,
                        xref="paper",
                        yref="paper",
                        x=0.005,
                        y=-0.002
                    )],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                )
            )

# Display the graph
fig.show()

#### Visualization v2

In [10]:
# Create a pyvis Network instance.
# notebook=True prepares the HTML template that show() renders inside Jupyter.
nt = Network(notebook=True, cdn_resources='in_line')

# Degree of every node; used below for node size, colour and hover text
node_degrees = dict(G.degree)
min_degree = min(node_degrees.values(), default=0)
# `or 1` avoids a division by zero when every node has the same degree
degree_span = (max(node_degrees.values(), default=0) - min_degree) or 1
cmap = plt.get_cmap('YlGnBu')

# Add the nodes. Size and colour are passed per node to add_node, since the
# Network class offers no bulk setters for them: `value` scales the node size,
# `color` takes a hex string.
for node, degree in node_degrees.items():
    red, green, blue, _ = cmap((degree - min_degree) / degree_span)
    nt.add_node(
        node,
        title=f'Degree: {degree}',
        value=degree,
        color='#{:02x}{:02x}{:02x}'.format(int(red * 255), int(green * 255), int(blue * 255)),
    )

# Add the edges.
# NOTE(review): on the full PPI graph (hundreds of thousands of edges) this
# loop is very slow -- the last recorded run ended in a KeyboardInterrupt.
# Consider visualising a subgraph instead.
for edge in G.edges():
    nt.add_edge(edge[0], edge[1])

# Visualize the graph
nt.show("pyvis_graph.html")

KeyboardInterrupt: 

### PPI + GDA

In [59]:
# For every selected graph file in Graphs/, build the disease module (seed genes
# plus their direct neighbours) and print its structural properties.
# NOTE: this loop rebinds the notebook-level names G, order, size, ... on every
# iteration, so after it runs G is the last disease graph, not the raw PPI.
for file in tqdm(os.listdir('Graphs/')):
	# Only files whose name contains 'nedbit' and not 'diamond' are analysed
	if 'nedbit' not in file or 'diamond' in file:
		continue

	# The disease ID is the third '_'-separated token of the file name, without extension
	disease_name = file.strip().split('_')[2].split('.')[0]
	gda_path = 'Datasets/'+disease_name+'_seed_genes.txt'

	# Seed genes: first space-separated field of each line; blank lines are ignored
	gda_d = {}
	with open(gda_path, 'r') as gdas:
		for line in gdas:
			gene = line.strip().split(' ')[0]
			if gene:
				gda_d.setdefault(gene, 0)

	G = nx.read_gml(os.path.join('Graphs/', file))

	# Flag each node as disease-associated or not. The membership test must use
	# the node itself; testing the leftover `gene` variable from the loop above
	# would mark every node as associated.
	for node in G.nodes:
		G.nodes[node]['associated'] = node in gda_d
	print('Disease:', disease_name)
	print('\tNumber of associated genese:', len(gda_d))

	# Disease module: every seed gene present in the graph, its neighbours,
	# and the edges between a seed and its neighbours.
	disease_module = nx.Graph()
	missing_seeds = []
	for seed in gda_d:
		if seed not in G:
			# G.neighbors() raises for unknown nodes, so absent seeds are skipped
			missing_seeds.append(seed)
			continue
		neighbors = list(G.neighbors(seed))
		disease_module.add_node(seed)
		disease_module.add_nodes_from(neighbors)
		disease_module.add_edges_from((seed, neighbor) for neighbor in neighbors)
	if missing_seeds:
		print('\tSeed genes missing from the graph (skipped):', len(missing_seeds))

	# Nothing to measure if no seed gene was found in the graph
	if disease_module.number_of_nodes() == 0:
		print('\tEmpty disease module, skipping statistics')
		continue

	# Components (materialised once, reused for the LCC below)
	components = list(nx.connected_components(disease_module))
	n_components = len(components)
	print("\tComponents:", n_components)

	# If n_components > 1 take LCC
	if n_components > 1:
		print('\tThe following statistics regard only the LCC')
		lcc = max(components, key=len)
		disease_module = disease_module.subgraph(lcc).copy()

	# Order
	order = disease_module.order()
	print("\tOrder:", order)
	# Size
	size = disease_module.size()
	print("\tSize:", size)
	# Degree
	degree_sequence = [degree for _, degree in disease_module.degree()]
	avg_degree = round(sum(degree_sequence)/len(degree_sequence), 3)
	print("\tAverage degree:", avg_degree)
	# Regularity
	is_regular = all(degree == degree_sequence[0] for degree in degree_sequence)
	print("\tIs regular:", is_regular)
	# Connectivity
	is_connected = nx.is_connected(disease_module)
	print("\tIs connected:", is_connected)
	# Cycles (size of a cycle basis)
	cycles = len(nx.cycle_basis(disease_module))
	print("\tCycles:", cycles)
	# Trees
	is_tree = nx.is_tree(disease_module)
	print("\tIs tree:", is_tree)
	# Planarity
	is_planar = nx.check_planarity(disease_module)[0]
	print("\tIs planar:", is_planar)
	# Diameter
	diameter = nx.diameter(disease_module)
	print("\tDiameter:", diameter)
	# Clustering coefficient
	clustering_coefficient = nx.average_clustering(disease_module)
	print("\tClustering coefficient:", clustering_coefficient)
	# Density
	density = nx.density(disease_module)
	print('\tDensity:', density)
	# Avg shortest path length
	avg_spl = nx.average_shortest_path_length(disease_module)
	print('\tAverage shortest path length:', avg_spl)


  0%|          | 0/14 [00:00<?, ?it/s]

Disease: C0860207
	Number of associated genese: 320
	Components: 1
	Order: 9907
	Size: 34772
	Average degree: 7.02
	Is regular: False
	Is connected: True
	Cycles: 24866
	Is tree: False
	Is planar: False
	Diameter: 7
	Clustering coefficient: 0.11675607158546127
	Density: 0.0007086294218036747
	Average shortest path length: 3.483242407977881
Disease: C0005586
	Number of associated genese: 451
	Components: 2
	The following statistics regard only the LCC
	Order: 12160
	Size: 42841
	Average degree: 7.046
	Is regular: False
	Is connected: True
	Cycles: 30682
	Is tree: False
	Is planar: False
	Diameter: 8
	Clustering coefficient: 0.12715295692136572
	Density: 0.000579506300293047
	Average shortest path length: 3.332333248384346
Disease: C3714756
	Number of associated genese: 431
	Components: 2
	The following statistics regard only the LCC
	Order: 11079
	Size: 46761
	Average degree: 8.441
	Is regular: False
	Is connected: True
	Cycles: 35683
	Is tree: False
	Is planar: False
	Diameter: 7
	Clus