In [1]:
import dask.dataframe as dd
import dask.array as da
import numpy as np
import os
import time
from numcodecs import Blosc
from fun.fun import *

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [18]:
# -> IN : Read Edge List
edges_fn = "../data/edges.parquet"
print("reading edges ... ", end='')
start = time.time()
df = dd.read_parquet("../data/edges.parquet").head(10000)
df[['source', 'target']] = df[['source', 'target']].astype('int32')
end = time.time()
print("read {:_} lines (took {:.1f}s)".format(len(df), (end-start)))
df

reading edges ... read 10_000 lines (took 0.1s)


Unnamed: 0,source,target
0,13,103151
1,13,214293
2,103151,214293
3,13,138731
4,13,42023
...,...,...
9995,3158,12082
9996,3158,30087
9997,3158,67961
9998,3158,71615


In [19]:
# extract unique nodes
nodes = dd.concat([df['source'], df['target']]).unique().compute()
num_nodes = len(nodes)
print("found {:_} unique nodes".format(num_nodes))

found 426 unique nodes


In [20]:
# RUN ONCE (DONT NEED WITH WHOLE DATASET) map node values to index values
node_index = {node: index for index, node in enumerate(nodes)}
df['source'] = df['source'].apply(lambda x: node_index[x])
df['target'] = df['target'].apply(lambda x: node_index[x])

In [40]:
# Write edges to adjacency matrix
am = da.zeros((num_nodes, num_nodes), chunks=(100,100), dtype='int8')
total, i = len(df), 0
for _, (s,t) in df.iterrows():
    am[s,t] = 1
    am[t,s] = 1
    i, perc = track_progress(total, i, text='edges written to matrix:', inc=25)
    if perc > 2: break
print("\nDone.")

 edges written to matrix: 201/10_000 (2.01000%)
Done.


In [44]:
print(am.shape)
p, s = 6, 10
am[p:p+s, p:p+s].compute()

(426, 426)


array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [None]:
# Save adjacency matrix
save_fn = '../data/test/mat.zarr'
#if not os.path.exists(save_fn): os.mkdir(save_fn)
#da.to_npy_stack(save_fn, mat)
#mat.to_zarr(save_fn, compressor={'clevel': 5, 'shuffle': True}, overwrite=True)
compressor = Blosc(cname='zstd', clevel=9)
da.to_zarr(am, save_fn, storage_options={"compressor": compressor}, overwrite=True)