In [1]:
import heapq
import random

import networkx as nx
import pandas as pd

In [2]:
# Read the processed edge list; each row is one edge between the node in the
# 'source' column and the node in the 'target' column.
bipartite_df = pd.read_csv(
    '../data/processed/bipartite-network-edges.csv',
)

# Build an undirected NetworkX graph from those two columns.
bipartite_graph = nx.from_pandas_edgelist(
    bipartite_df,
    source='source',
    target='target',
)

In [3]:
# Structural overview of the bipartite graph: size, connectivity, degrees.
print(f"Number of nodes: {bipartite_graph.number_of_nodes()}")
print(f"Number of edges: {bipartite_graph.number_of_edges()}")

# Materialise the components once; later cells reuse `connected_components`.
connected_components = [
    component for component in nx.connected_components(bipartite_graph)
]
print(f"Number of connected components: {len(connected_components)}")

# Node degrees, largest first, plus summary statistics via pandas.
degrees = [degree for _, degree in bipartite_graph.degree()]
degrees.sort(reverse=True)
degree_series = pd.Series(degrees)
print("Degree distribution statistics:")
print(degree_series.describe())

# Note: this prints the five largest degree values, not the node labels.
print(f"Top 5 nodes by degree: {degrees[:5]}")

Number of nodes: 231660
Number of edges: 1506044
Number of connected components: 10
Degree distribution statistics:
count    231660.000000
mean         13.002193
std         562.660740
min           1.000000
25%           4.000000
50%           6.000000
75%           8.000000
max      145675.000000
dtype: float64
Top 5 nodes by degree: [145675, 128702, 93113, 92904, 68533]


In [None]:
# Advanced metrics
#
# The exact versions of these metrics are not practical on a graph of this
# size: exact diameter and exact betweenness each need a shortest-path search
# from every node. The estimators below trade exactness for a cell that can
# actually finish; raise the sample sizes for tighter estimates.

METRICS_SEED = 42             # fixed seed so re-running the cell reproduces the estimates
CLUSTERING_SAMPLE_SIZE = 500  # nodes averaged for the bipartite clustering estimate
BETWEENNESS_PIVOTS = 500      # source nodes sampled by betweenness_centrality(k=...)

# Clustering: a bipartite graph contains no triangles, so the triangle-based
# nx.average_clustering is 0 by construction. Use the bipartite (Latapy et al.)
# coefficient instead, averaged over a seeded random sample of nodes.
# NOTE(review): assumes the edge list really is two-mode — nx.is_bipartite can confirm.
all_nodes = list(bipartite_graph)
clustering_sample = random.Random(METRICS_SEED).sample(
    all_nodes, min(CLUSTERING_SAMPLE_SIZE, len(all_nodes))
)
avg_bipartite_clustering = nx.bipartite.average_clustering(
    bipartite_graph, nodes=clustering_sample
)
print(
    f"Average bipartite clustering coefficient "
    f"(sample of {len(clustering_sample)} nodes): {avg_bipartite_clustering}"
)

# Diameter: nx.approximation.diameter returns a lower bound on the true
# diameter using a small number of searches instead of one per node.
# `connected_components` comes from the basic-statistics cell.
largest_component = bipartite_graph.subgraph(max(connected_components, key=len))
diameter_lower_bound = nx.approximation.diameter(largest_component, seed=METRICS_SEED)
print(
    f"Diameter of the largest connected component (lower bound): "
    f"{diameter_lower_bound}"
)

# Betweenness centrality, estimated from k sampled source nodes.
# k must not exceed the number of nodes, hence the min().
pivot_count = min(BETWEENNESS_PIVOTS, bipartite_graph.number_of_nodes())
betweenness = nx.betweenness_centrality(
    bipartite_graph, k=pivot_count, seed=METRICS_SEED
)
top_betweenness = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]
print(f"Top 5 nodes by betweenness centrality (estimated, k={pivot_count}):")
for node, centrality in top_betweenness:
    print(f"Node: {node}, Betweenness Centrality: {centrality:.4f}")


In [None]:
import nx_parallel as nxp

# Wrap the NetworkX graph so nx_parallel algorithms can operate on it.
parallel_bipartite_graph = nxp.ParallelGraph(bipartite_graph)

# Parallel counterparts of the advanced metrics.
#
# Currently disabled (kept for reference):
#   average clustering
#     nx.average_clustering(parallel_bipartite_graph, backend="parallel")
#   diameter of the largest connected component
#     largest_cc = max(nx.connected_components(parallel_bipartite_graph), key=len)
#     nxp.diameter(parallel_bipartite_graph.subgraph(largest_cc))
#   (serial equivalent)
#     nx.diameter(bipartite_graph.subgraph(max(connected_components, key=len)))

# Betweenness centrality: compute the scores, then rank nodes by score,
# highest first. Sorting is stable, so ties keep the mapping's own order.
betweenness_parallel = nxp.betweenness_centrality(parallel_bipartite_graph)
ranked_nodes = sorted(
    betweenness_parallel, key=betweenness_parallel.get, reverse=True
)
top_betweenness_parallel = [
    (ranked, betweenness_parallel[ranked]) for ranked in ranked_nodes[:5]
]
print("Top 5 nodes by betweenness centrality (parallel):")
for node, centrality in top_betweenness_parallel:
    print(f"Node: {node}, Betweenness Centrality: {centrality:.4f}")

In [4]:
import nx_cugraph as nxcg

# Create a cuGraph bipartite graph from the original bipartite graph:
# nxcg.from_networkx converts the in-memory NetworkX graph into the
# nx_cugraph representation.
# NOTE(review): presumably needs a CUDA-capable GPU and the RAPIDS cuGraph
# stack installed — confirm the target environment.
gpu_bipartite_graph = nxcg.from_networkx(bipartite_graph)

# Betweenness centrality using GPU acceleration
# NOTE(review): the computation below is commented out, so
# `gpu_bipartite_graph` is not consumed anywhere in the visible notebook.
# Either re-enable it or drop the conversion above.
# betweenness_gpu = nxcg.betweenness_centrality(gpu_bipartite_graph, normalized=True)
# top_betweenness_gpu = sorted(betweenness_gpu.items(), key=lambda x: x[1], reverse=True)[:5]
# print("Top 5 nodes by betweenness centrality (GPU):")
# for node, centrality in top_betweenness_gpu:
#     print(f"Node: {node}, Betweenness Centrality: {centrality:.4f}")

In [None]:

import nx_parallel as nxp
# Create a parallel bipartite graph from the original bipartite graph.
# NOTE(review): this repeats the wrapper construction from the earlier
# nx_parallel cell, and the Jaccard code in this cell runs on
# `bipartite_graph`, not on this wrapper — looks redundant here; confirm
# nothing later relies on it before removing.
parallel_bipartite_graph = nxp.ParallelGraph(bipartite_graph)

# Jaccard coefficient using NetworkX
#
# With no `ebunch`, nx.jaccard_coefficient lazily yields (u, v, score) for
# every non-adjacent node pair. Wrapping that in list() and sorting it would
# hold one tuple per pair in memory, which grows quadratically with the number
# of nodes. Keep the generator lazy and let heapq.nlargest track only the
# current top 5; it returns the same items, in the same order, as
# sorted(..., reverse=True)[:5].
#
# The scan itself still visits every non-adjacent pair; pass an explicit
# `ebunch` of candidate pairs to nx.jaccard_coefficient to bound the runtime.
jaccard_coeffs = nx.jaccard_coefficient(bipartite_graph)  # lazy; consumed below
top_jaccard = heapq.nlargest(5, jaccard_coeffs, key=lambda x: x[2])
print("Top 5 node pairs by Jaccard coefficient:")
for u, v, coeff in top_jaccard:
    print(f"Nodes: ({u}, {v}), Jaccard Coefficient: {coeff:.4f}")