In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import dask.dataframe as dd
import dask.array as da
import igraph as ig
import matplotlib.pyplot as plt
import time
from fun.fun import *

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# START
edges_csv = "../data/edges.csv"
edges_prq = "../data/edges.parquet"
edges_total = 684_732_453 # hardcoded

In [3]:
# -> IN : PARQUET Read edges from parquet to dataframe
print("reading edges ... ", end='')
start = time.time()
df = dd.read_parquet(edges_prq)
df[['source', 'target']] = df[['source', 'target']].astype('int32')
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(len(df), (end-start)))
print(df.dtypes)
df.head()

reading edges ... read 684_732_453 lines (took 0.1s)
source    int32
target    int32
dtype: object


Unnamed: 0,source,target
0,13,103151
1,13,214293
2,103151,214293
3,13,138731
4,13,42023


In [4]:
# -> IN : Read list of nodes
nodes = pd.read_csv('../data/node_ids.csv', index_col='index')
print("Number of nodes: {:_}".format(len(nodes)))

Number of nodes: 410_885


In [5]:
# (1) GRAPH (ALL EDGES) : Loading edges into iGraph
print("Creating iGraph ...")
g = ig.Graph()
g.add_vertices(len(nodes))
print("Processing {} partitions ...".format(df.npartitions))
for i, partition in enumerate(df.to_delayed()):
    p_df = partition.compute()
    g.add_edges(p_df.values.tolist())
    _, perc = track_progress(df.npartitions, i, text="partitions processed:")
    #if perc > 1: break
    #if input("...") == 'b': break
    #if i+1 >= 10: break
print("\nDone.")
print("\nGRAPH CREATED:")
print("  nodes: {:_}".format(len(g.vs)))
print("  edges: {:_}".format(len(g.es)))

Creating iGraph ...
Processing 157 partitions ...
 partitions processed: 75/157 (47.77070%)

In [None]:
# (2) GRAPH (N EDGES) : Loading edges into iGraph
nrows = 4_000_000 # max: 4557647
#nrows = 100 # max: 4557647
g = ig.Graph.TupleList(df.head(nrows).values.tolist())
print("\nGRAPH CREATED:")
print("  nodes: {:_}".format(len(g.vs)))
print("  edges: {:_}".format(len(g.es)))

In [None]:
# Find average path length
# 6 partitions (3.8%) 16m 40s
# 
apl = g.average_path_length()
print("Average path length: {:.3f}".format(apl))

In [None]:
# Get graph diameter
diam = g.diameter()
print("Diameter of graph:", diam)

In [None]:
# Get and save adjacency matrix (as sparse CSC matrix)
adj_matrix_sparse_fn = '../data/adjacency_matrix_sparce.npz'
adj_matrix_sparse = g.get_adjacency_sparse()
sp.sparse.save_npz(adj_matrix_sparse_fn, adj_matrix_sparse) # save CSC matrix
#sparse_matrix = sp.sparse.load_npz(adj_matrix_sparse_fn) # load CSC matrix

In [None]:
# Get degree centrality of graph
dc_fn = '../data/degree_centralities.csv'
dc = g.degree()
print("DEGREE CENTRALITY")
print("len: {:_}".format(len(dc)))
print("max: {:_}".format(max(dc)))
print("min: {}".format(min(dc)))
print("mean: {:.1f}".format(np.mean(dc)))
dc_df = pd.DataFrame(dc)
dc_df.to_csv(dc_fn, index=False, header=False)
plt.hist(dc, bins=50)
plt.show()