In [1]:
import json
import os
import itertools
import glob

import multiprocessing
import rootpath
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib
# matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import geopandas as gpd
import pycountry as pc
import pycountry_convert as pcc

from lhledge import cfgLoader
from lhledge import lhlFilters
from lhledge import loadGeographicData

In [2]:
# Distance threshold; no cell visible in this part of the notebook reads it.
# NOTE(review): units are not stated here -- presumably kilometres; confirm.
DIST_THREHOLD = 4700

# Alternative (cycle, date) pairs kept for reference; currently disabled.
# CYCLE = 8820
# DATE = 20201002
# CYCLE = 4578
# DATE = 20160302
# Downsampling value; it is embedded as "1-<DOWNSAMPLING>" in the input
# directory and file names built by geneate_intput_filename.
DOWNSAMPLING = 1

In [3]:
# Number of slice files read for each traceroute file by the loading loop
# (slice ids 0 .. N_PROCS-1).
# NOTE(review): the value is derived from the CPU count of the machine that
# runs this notebook. It has to match the number of slices that actually
# exist on disk; otherwise slices are skipped or pd.read_csv fails on a
# missing file. Consider pinning it or reading it from the config.
N_PROCS = multiprocessing.cpu_count() * 2

In [4]:
# Change directory to run from the root dir of the project
# (rootpath.detect is asked to locate it using the ".git" pattern).
# NOTE(review): presumably detect() yields None when nothing matches, in which
# case os.chdir would raise -- confirm against the rootpath documentation.
path = rootpath.detect(pattern=".git")
os.chdir(path)

# load config file
# The relative path is resolved against the working directory set just above.
cfg = cfgLoader.cfgLoader("config.yml")

In [5]:
def __extract_from_warts_filename(warts_filename):
    """
    Parse the monitor name, cycle number and date out of a traceroute file name.

    Two file name formats
        1. 2018-present: aep2-ar.team-probing.c008820.20201002.json.gz
        2. 2008-2018:    daily.l7.t2.c004578.20160304.cmn-ma.json.gz

    Only the last path component is inspected, so a directory whose name
    contains "daily" does not influence which format is selected.

    Parameters
    ----------
    warts_filename : str
        Path (or bare name) of the file.

    Returns
    -------
    tuple
        ``(mon, cycle, date)``: the monitor field as a str, the cycle field
        as an int with its first character dropped, and the date field as
        an int.

    Raises
    ------
    ValueError
        If the name does not split into the expected number of
        dot-separated fields, or the cycle/date fields are not numeric.
    """
    basename = os.path.basename(warts_filename)
    fields = basename.split(".")

    # Decide on the file name alone; testing the full path would misclassify
    # a format-1 file stored under a ".../daily/..." directory.
    if "daily" in basename:
        _, _, _, _cycle, date, mon, _, _ = fields
    else:
        mon, _, _cycle, date, _, _ = fields

    # The cycle field looks like "c008820": strip the first character.
    return mon, int(_cycle[1:]), int(date)

def geneate_intput_filename(warts_filename, s, downsampling):
    """
    Build the path of the gzipped CSV holding one slice of one traceroute file.

    The directory comes from the "all-links-detection" template in the
    config, filled with (monitor, date, cycle, "1-<downsampling>"); the file
    name is "<mon>_<cycle>_<date>_1-<downsampling>_<s>.csv.gz".

    Parameters
    ----------
    warts_filename : str
        Traceroute file whose name encodes monitor, cycle and date.
    s : int or str
        Slice identifier placed at the end of the file name.
    downsampling : int or str
        Value placed after "1-" in both the directory and the file name.

    Returns
    -------
    str
        The composed path. Nothing is created or checked on disk.
    """
    mon, cycle, date = __extract_from_warts_filename(warts_filename)

    links_dir = cfg["paths"]["ark"]["all-links-detection"].format(
        mon, date, cycle, f"1-{downsampling}"
    )
    return f"{links_dir}/{mon}_{cycle}_{date}_1-{downsampling}_{s}.csv.gz"

In [6]:
# Start from an empty frame; the loading loop concatenates the per-slice
# link tables onto it.
df = pd.DataFrame()

In [7]:
# Read every per-slice link table of the selected snapshots and merge them
# into `df`, keeping a single copy of each distinct row.
#
# The frames are gathered in a list and concatenated once. Calling pd.concat
# and drop_duplicates inside the innermost loop copies and re-hashes the whole
# accumulated table for every slice, which is quadratic in the data read.
# One concat followed by one drop_duplicates keeps the same rows in the same
# order (the first occurrence of each row wins either way).
edge_frames = []

# Only the snapshots at positions [-6:-3] of the configured list are loaded.
for i in range(len(cfg["snapshots"]))[-6:-3]:

    snapshot = cfg["snapshots"][i]

    # yyyymmdd string of the snapshot; the loading below does not use it.
    DATE = f'{snapshot["date"]["yyyy"]:04d}{snapshot["date"]["mm"]:02d}{snapshot["date"]["dd"]:02d}'

    traceroutes_files = glob.glob(f'{cfg["paths"]["ark"]["traceroute-measurements"]["json"].format(snapshot["traceroute-measurements"]["cycle"])}/*.json.gz')

    for traceroutes_file in traceroutes_files:
        # One input CSV is expected per slice id in 0 .. N_PROCS-1.
        for n_slice in range(N_PROCS):
            input_filename = geneate_intput_filename(traceroutes_file, n_slice, DOWNSAMPLING)
            tmp = pd.read_csv(input_filename, compression="gzip")

            edge_frames.append(tmp)

# `df` is part of the concat so rows it already holds are preserved, and so
# the call still succeeds when no input file was found.
df = pd.concat([df, *edge_frames]).drop_duplicates()

In [8]:
# Preview the first rows of the combined link table.
df.head()

Unnamed: 0,near_side_node_id,far_side_node_id
0,5331758,5331790
1,5331790,823371
2,823371,432518
3,432518,465634
4,465634,313524


In [9]:
# (rows, columns) of the combined table, after exact duplicate rows were dropped.
df.shape

(3187868, 2)

In [10]:
# Shape of the table restricted to rows whose two node ids are both
# strictly positive, with exact duplicate rows removed.
(
    df[df["near_side_node_id"].gt(0) & df["far_side_node_id"].gt(0)]
    .drop_duplicates()
    .shape
)

(2709113, 2)

In [11]:
# Build the link graph from the rows whose two node ids are both strictly
# positive, one row per (near, far) pair. No graph class is passed, so
# networkx uses its default graph type.
G = nx.from_pandas_edgelist(
    df[(df["near_side_node_id"] > 0) & (df["far_side_node_id"] > 0)]
    .drop_duplicates(subset=["near_side_node_id", "far_side_node_id"]),
    source="near_side_node_id",
    target="far_side_node_id",
)

In [12]:
# Number of edges in the link graph.
G.number_of_edges()

2674577