In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import igraph as ig
import matplotlib.pyplot as plt
import time
from fun.fun import *

In [2]:
# START — notebook configuration: where the edge list lives and how large it is.
# edges_total is hardcoded; it equals the row count printed when the parquet
# dataset is loaded.
edges_total = 684_732_453
edges_csv, edges_prq = "../data/edges.csv", "../data/edges.parquet"

In [3]:
# -> IN : PARQUET Read edges from parquet to dataframe
# dd.read_parquet only builds a lazy task graph; the first real work happens
# when len(df) is evaluated. The row count is therefore taken inside the timed
# region so the reported duration covers that work and not just graph
# construction.
print("reading edges ... ", end='')
start = time.perf_counter()  # monotonic clock, suited to measuring intervals
df = dd.read_parquet(edges_prq)
n_rows = len(df)
end = time.perf_counter()
print("read {:_} lines (took {:.1f}s)".format(n_rows, (end-start)))
df.head()

reading edges ... read 684_732_453 lines (took 0.1s)


Unnamed: 0,source,target
0,13,103151
1,13,214293
2,103151,214293
3,13,138731
4,13,42023


In [4]:
# -> IN : Read list of nodes
# Load the node table, using its 'index' column as the DataFrame index, and
# report how many rows (nodes) it contains.
nodes = pd.read_csv('../data/node_ids.csv', index_col='index')
print("Number of nodes: {:_}".format(nodes.shape[0]))

Number of nodes: 410_885


In [None]:
# -> IN : CSV Read edges to dataframe
# dd.read_csv is lazy; counting the rows with len(df) is what actually scans
# the file. The count is taken inside the timed region so the printed duration
# reflects the real read rather than only task-graph construction.
print("reading edges ... ", end='')
start = time.perf_counter()  # monotonic clock, suited to measuring intervals
df = dd.read_csv(edges_csv)
n_rows = len(df)
end = time.perf_counter()
print("read {:_} lines (took {:.1f}s)".format(n_rows, (end-start)))
df.head()

In [5]:
# TESTING
# Build the graph incrementally: create one vertex per row of `nodes`, then
# append the edges partition by partition so that only a single partition's
# edge list is materialised as Python objects at any time.
# NOTE(review): add_edges interprets the values as vertex indices, so this
# assumes every source/target value lies in [0, len(nodes)) — TODO confirm
# against node_ids.csv.
print("Creating iGraph ...")
g = ig.Graph()
g.add_vertices(len(nodes))
print("Processing {} partitions ...".format(df.npartitions))
for i, partition in enumerate(df.to_delayed()):
    p_df = partition.compute()
    # Pick the endpoint columns explicitly so that any additional column in
    # the frame cannot end up in the edge list.
    g.add_edges(p_df[["source", "target"]].values.tolist())
    track_progress(df.npartitions, i, text="partitions processed:")
print("\nDone.")
print("\nGRAPH CREATED:")
print("  nodes: {:_}".format(g.vcount()))
print("  edges: {:_}".format(g.ecount()))

Creating iGraph ...
Processing 157 partitions ...
 partitions processed: 112/157 (71.33758%)

In [None]:
# create igraph
# `df` is a lazy dask DataFrame, so `df.values` is a dask array whose chunk
# lengths are unknown and which cannot be iterated row by row. Stream the
# edges one partition at a time instead.
# NOTE: TupleList treats the values as vertex *names*; vertex indices are
# assigned in order of first appearance and need not equal the node ids.
def _iter_edge_tuples(ddf):
    """Yield (source, target) tuples from a dask DataFrame, one partition at a time."""
    for part in ddf.to_delayed():
        part_df = part.compute()
        yield from part_df[["source", "target"]].itertuples(index=False, name=None)


start = time.perf_counter()
g = ig.Graph.TupleList(_iter_edge_tuples(df))
end = time.perf_counter()
print("created graph with {:_} nodes and {:_} edges (took {:.1f}s)".format(len(g.vs), len(g.es), (end-start)))

In [None]:
# Plot the degree distribution of the graph: a 50-bin histogram of the
# per-vertex degrees returned by g.degree().
fig, ax = plt.subplots()
ax.hist(g.degree(), bins=50)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title="Degree distribution", xlabel="degree", ylabel="number of vertices")
plt.show()

In [None]:
# Smoke test: build a graph from the first 100 edges only.
# NOTE: this rebinds `g`, replacing any graph built in an earlier cell.
# dask's head() returns an in-memory pandas DataFrame, which has no
# to_dask_array(); feed its rows to TupleList directly instead.
dff = df.head(100)
start = time.perf_counter()
g = ig.Graph.TupleList(dff[["source", "target"]].itertuples(index=False, name=None))
end = time.perf_counter()
print("created graph with {:_} nodes and {:_} edges (took {:.1f}s)".format(len(g.vs), len(g.es), (end-start)))

In [None]:
# -> OUT : PARQUET Write the edge list as a parquet dataset at `edges_prq`
# (the same location the parquet-loading cell reads from), without the index.
# NOTE(review): intended for a `df` loaded from the CSV. dask is expected to
# reject overwrite=True when the frame's own task graph reads from the target
# dataset, so do not run this on a `df` that came from read_parquet(edges_prq)
# — confirm against the installed dask version.
df.to_parquet(edges_prq, write_index=False, overwrite=True)