In [1]:
import json
import os
import itertools

import rootpath
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib
# matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import geopandas as gpd
import pycountry as pc
from collections import Counter

from lhledge import cfgLoader
from lhledge import lhlFilters
from lhledge import superrouters
from lhledge import loadGeographicData

In [2]:

# Notebook-level settings. None of CYCLE, DATE or DOWNSAMPLING is read by the
# cells visible in this part of the notebook (the per-cycle loops use their
# own hard-coded cycle lists).
# NOTE(review): DATE looks like a YYYYMMDD stamp paired with CYCLE — confirm.
CYCLE = 8820
DATE = 20201002
# Alternative CYCLE/DATE pair, left disabled:
# CYCLE = 4578
# DATE = 20160302
DOWNSAMPLING = 1

In [3]:
def ecdf(data, w=None):
    """Compute the (optionally weighted) empirical CDF of ``data``.

    Parameters
    ----------
    data : array-like
        Sample values. Lists, NumPy arrays and pandas Series are all accepted;
        the input is flattened to one dimension.
    w : array-like, optional
        One weight per sample, aligned with ``data`` by position. ``None`` or
        an empty sequence (the former ``[]`` default) gives every sample a
        weight of 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` holds the samples in ascending order (passed
        through ``np.squeeze``) and ``y[i]`` is the cumulative weight up to
        and including ``x[i]`` divided by the total weight.

    Raises
    ------
    ValueError
        If ``w`` is non-empty and its length differs from that of ``data``.
    """
    x = np.asarray(data).ravel()

    if w is None or np.size(w) == 0:
        weights = np.ones(x.size)
    else:
        # Convert to a plain array so the reordering below is positional even
        # when a pandas Series with a non-default index is passed in.
        weights = np.asarray(w).ravel()
        if weights.size != x.size:
            raise ValueError(
                f"w has {weights.size} entries but data has {x.size}"
            )

    order = np.argsort(x, kind="stable")
    x = x[order]
    weights = weights[order]

    cumulative = np.cumsum(weights)
    # The last cumulative value is the total weight; guard the empty case so
    # an empty input yields an empty ECDF instead of an IndexError.
    y = cumulative / cumulative[-1] if cumulative.size else cumulative
    return (np.squeeze(x), y)

In [4]:
def _get_max_subgraph(G):
    """Return the connected component of ``G`` that has the most nodes.

    The result is ``G.subgraph(...)`` over that component's node set. When
    several components share the largest size, the first one yielded by
    ``nx.connected_components`` is kept. If ``G`` has no components at all,
    a new empty ``nx.Graph`` is returned.
    """
    largest_nodes = max(nx.connected_components(G), key=len, default=None)
    if largest_nodes is None:
        return nx.Graph()
    return G.subgraph(largest_nodes)

In [5]:
# Change directory to run from the root dir of the project, so the relative
# paths used below ("config.yml", "data/processed/...") resolve.
path = rootpath.detect(pattern=".git")
if path is None:
    # Without this guard os.chdir(None) fails with an unhelpful TypeError.
    raise RuntimeError(
        "Could not locate the project root: no parent directory of "
        f"{os.getcwd()!r} matches the '.git' pattern"
    )
os.chdir(path)

# load config file
cfg = cfgLoader.cfgLoader("config.yml")

In [6]:
# len(H.nodes())

# Load data

## Geographic data

In [7]:
# One output row per cycle, `&`-separated and terminated by `\\`:
# cycle, components / nodes / edges / distinct ASNs of the full graph G, then
# nodes / edges / distinct ASNs of its largest connected component H.
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    # Only rows whose diff_rtt exceeds 47 become edges.
    edge_rows = df.loc[df["diff_rtt"] > 47]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Attach an "asn" attribute to the nodes. The lookups come from the
    # unfiltered frame; the far-side pass runs second, so its value is the
    # one kept for a node id that appears on both sides.
    for id_col, asn_col in (
        ("near_node_id", "near_node_asn"),
        ("far_node_id", "far_node_asn"),
    ):
        asn_by_node = pd.Series(
            df[asn_col].values.astype(int),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, asn_by_node, "asn")

    H = _get_max_subgraph(G)

    # Distinct ASNs seen in the whole graph and in the largest component.
    G_ases = {attrs["asn"] for _, attrs in G.nodes(data=True)}
    H_ases = {attrs["asn"] for _, attrs in H.nodes(data=True)}

    print(f"{cycle}\t&{nx.number_connected_components(G)}\t&{len(G.nodes())}\t&{len(G.edges())}\t&{len(G_ases)}\t&{len(H.nodes())}\t&{len(H.edges())}\t&{len(H_ases)}\\\\")

    # plot_graph(G, cycle)  # disabled; plot_graph is not defined in the visible cells


4577	&898	&9560	&18026	&1633	&5443	&13723	&1146\\
5423	&1133	&12355	&23751	&1753	&7618	&19871	&1201\\
6447	&1084	&11478	&20615	&1698	&5880	&15478	&1078\\
7616	&1511	&18410	&38243	&1543	&12185	&32309	&687\\
8821	&1298	&18597	&40452	&1446	&13324	&35536	&752\\
9644	&1523	&24425	&57467	&2525	&17232	&48976	&1597\\
10020	&1605	&23267	&52066	&1965	&15078	&40518	&1207\\


In [8]:
# One `&`-separated row per cycle (terminated by `\\`) with, for each of three
# graphs, the component count, node/edge counts and the node/edge counts of
# the largest component: node-id graph (G, H), ASN graph (Gasn, Hasn) and
# "cc" graph (Gcc, Hcc). For G the component count is printed first.
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    # Only rows whose diff_rtt exceeds 47 become edges.
    edge_rows = df.loc[df["diff_rtt"] > 47]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Node attributes, applied in this order. Lookups are built from the
    # unfiltered frame; for each attribute the far-side pass comes after the
    # near-side one, so it wins for node ids present on both sides.
    attribute_passes = (
        ("near_node_id", "near_node_asn", int, "asn"),
        ("far_node_id", "far_node_asn", int, "asn"),
        ("near_node_id", "near_side_cc", str, "cc"),
        ("far_node_id", "far_side_cc", str, "cc"),
    )
    for id_col, value_col, cast, attr_name in attribute_passes:
        value_by_node = pd.Series(
            df[value_col].values.astype(cast),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, value_by_node, attr_name)

    H = _get_max_subgraph(G)

    # Collapse G onto ASNs: one edge per pair of ASNs joined by some edge of G.
    Gasn = nx.Graph()
    Gasn.add_edges_from(
        (G.nodes[u]["asn"], G.nodes[v]["asn"])
        for u in G.nodes()
        for v in G.neighbors(u)
    )
    Hasn = _get_max_subgraph(Gasn)

    # Same collapse, keyed on the "cc" attribute.
    Gcc = nx.Graph()
    Gcc.add_edges_from(
        (G.nodes[u]["cc"], G.nodes[v]["cc"])
        for u in G.nodes()
        for v in G.neighbors(u)
    )
    Hcc = _get_max_subgraph(Gcc)

    print(f"{cycle}\t&{nx.number_connected_components(G)}\t&{len(G.nodes())}\t&{len(G.edges())}\t&{len(H.nodes())}\t&{len(H.edges())}\t&{nx.number_connected_components(Gasn)}\t&{len(Gasn.nodes())}\t&{len(Gasn.edges())}\t&{len(Hasn.nodes())}\t&{len(Hasn.edges())}\t&{nx.number_connected_components(Gcc)}\t&{len(Gcc.nodes())}\t&{len(Gcc.edges())}\t&{len(Hcc.nodes())}\t&{len(Hcc.edges())}\\\\")

    # plot_graph(G, cycle)  # disabled; plot_graph is not defined in the visible cells


4577	&898	&9560	&18026	&5443	&13723	&179	&1633	&2326	&1406	&2123	&1	&122	&374	&122	&374\\
5423	&1133	&12355	&23751	&7618	&19871	&177	&1753	&2613	&1528	&2411	&1	&134	&413	&134	&413\\
6447	&1084	&11478	&20615	&5880	&15478	&164	&1698	&2458	&1492	&2277	&1	&135	&433	&135	&433\\
7616	&1511	&18410	&38243	&12185	&32309	&325	&1543	&2297	&1135	&1925	&1	&167	&775	&167	&775\\
8821	&1298	&18597	&40452	&13324	&35536	&253	&1446	&2235	&1104	&1931	&1	&164	&828	&164	&828\\
9644	&1523	&24425	&57467	&17232	&48976	&250	&2525	&3690	&2202	&3394	&1	&162	&961	&162	&961\\
10020	&1605	&23267	&52066	&15078	&40518	&269	&1965	&2805	&1611	&2488	&1	&158	&863	&158	&863\\


In [9]:
# Prints the same multi-level row as the previous cell and additionally
# collects, per cycle, the largest-component / whole-graph ratios in k as
# (nodes, edges) pairs for G, Gasn and Gcc, in that order.
#
# NOTE(review): this cell filters at diff_rtt > 57, whereas every other cell
# in this notebook uses 47, and the saved output under this cell is identical
# to the table printed by the 47 cell above. The output therefore looks stale
# relative to the code; reconcile the threshold and re-run.
k = []
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    edge_rows = df.loc[df["diff_rtt"] > 57]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Node attributes, applied in this order. Lookups are built from the
    # unfiltered frame; for each attribute the far-side pass comes after the
    # near-side one, so it wins for node ids present on both sides.
    attribute_passes = (
        ("near_node_id", "near_node_asn", int, "asn"),
        ("far_node_id", "far_node_asn", int, "asn"),
        ("near_node_id", "near_side_cc", str, "cc"),
        ("far_node_id", "far_side_cc", str, "cc"),
    )
    for id_col, value_col, cast, attr_name in attribute_passes:
        value_by_node = pd.Series(
            df[value_col].values.astype(cast),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, value_by_node, attr_name)

    H = _get_max_subgraph(G)

    # Collapse G onto ASNs: one edge per pair of ASNs joined by some edge of G.
    Gasn = nx.Graph()
    Gasn.add_edges_from(
        (G.nodes[u]["asn"], G.nodes[v]["asn"])
        for u in G.nodes()
        for v in G.neighbors(u)
    )
    Hasn = _get_max_subgraph(Gasn)

    # Same collapse, keyed on the "cc" attribute.
    Gcc = nx.Graph()
    Gcc.add_edges_from(
        (G.nodes[u]["cc"], G.nodes[v]["cc"])
        for u in G.nodes()
        for v in G.neighbors(u)
    )
    Hcc = _get_max_subgraph(Gcc)

    print(f"{cycle}\t&{nx.number_connected_components(G)}\t&{len(G.nodes())}\t&{len(G.edges())}\t&{len(H.nodes())}\t&{len(H.edges())}\t&{nx.number_connected_components(Gasn)}\t&{len(Gasn.nodes())}\t&{len(Gasn.edges())}\t&{len(Hasn.nodes())}\t&{len(Hasn.edges())}\t&{nx.number_connected_components(Gcc)}\t&{len(Gcc.nodes())}\t&{len(Gcc.edges())}\t&{len(Hcc.nodes())}\t&{len(Hcc.edges())}\\\\")

    # Share of each graph that falls inside its largest component.
    largest_vs_whole = (
        (H.nodes(), G.nodes()),
        (H.edges(), G.edges()),
        (Hasn.nodes(), Gasn.nodes()),
        (Hasn.edges(), Gasn.edges()),
        (Hcc.nodes(), Gcc.nodes()),
        (Hcc.edges(), Gcc.edges()),
    )
    k.append(tuple(len(part) / len(whole) for part, whole in largest_vs_whole))
    # plot_graph(G, cycle)  # disabled; plot_graph is not defined in the visible cells


4577	&898	&9560	&18026	&5443	&13723	&179	&1633	&2326	&1406	&2123	&1	&122	&374	&122	&374\\
5423	&1133	&12355	&23751	&7618	&19871	&177	&1753	&2613	&1528	&2411	&1	&134	&413	&134	&413\\
6447	&1084	&11478	&20615	&5880	&15478	&164	&1698	&2458	&1492	&2277	&1	&135	&433	&135	&433\\
7616	&1511	&18410	&38243	&12185	&32309	&325	&1543	&2297	&1135	&1925	&1	&167	&775	&167	&775\\
8821	&1298	&18597	&40452	&13324	&35536	&253	&1446	&2235	&1104	&1931	&1	&164	&828	&164	&828\\
9644	&1523	&24425	&57467	&17232	&48976	&250	&2525	&3690	&2202	&3394	&1	&162	&961	&162	&961\\
10020	&1605	&23267	&52066	&15078	&40518	&269	&1965	&2805	&1611	&2488	&1	&158	&863	&158	&863\\


In [10]:
# Column-wise maximum over the per-cycle tuples collected in k. Each tuple
# holds largest-component / whole-graph ratios: rn, re = nodes, edges of G;
# an, ae = nodes, edges of Gasn; cn, ce = nodes, edges of Gcc.
pd.DataFrame(k, columns=["rn", "re", "an", "ae", "cn", "ce"]).max(axis=0)

rn    0.716460
re    0.878473
an    0.878681
ae    0.926363
cn    1.000000
ce    1.000000
dtype: float64

In [11]:
# Column-wise minimum over the per-cycle tuples collected in k. Each tuple
# holds largest-component / whole-graph ratios: rn, re = nodes, edges of G;
# an, ae = nodes, edges of Gasn; cn, ce = nodes, edges of Gcc.
pd.DataFrame(k, columns=["rn", "re", "an", "ae", "cn", "ce"]).min(axis=0)

rn    0.512284
re    0.750813
an    0.735580
ae    0.838050
cn    1.000000
ce    1.000000
dtype: float64

In [12]:
# Prints "cycle nodes edges" for the graph built from rows with diff_rtt > 47.
#
# NOTE(review): k is reset to an empty list here and nothing in this cell
# appends to it, so re-running the k-based min/max cells after this one would
# see an empty list. The "asn"/"cc" attributes set below are likewise not
# read anywhere in this cell.
k = []
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    edge_rows = df.loc[df["diff_rtt"] > 47]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Node attributes, applied in this order. Lookups are built from the
    # unfiltered frame; for each attribute the far-side pass comes after the
    # near-side one, so it wins for node ids present on both sides.
    attribute_passes = (
        ("near_node_id", "near_node_asn", int, "asn"),
        ("far_node_id", "far_node_asn", int, "asn"),
        ("near_node_id", "near_side_cc", str, "cc"),
        ("far_node_id", "far_side_cc", str, "cc"),
    )
    for id_col, value_col, cast, attr_name in attribute_passes:
        value_by_node = pd.Series(
            df[value_col].values.astype(cast),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, value_by_node, attr_name)

    node_count = len(G.nodes())
    edge_count = len(G.edges())
    print(cycle, node_count, edge_count)


4577 9560 18026
5423 12355 23751
6447 11478 20615
7616 18410 38243
8821 18597 40452
9644 24425 57467
10020 23267 52066


In [13]:
# For every cycle, print the fraction of nodes and the fraction of edges of G
# that belong to its largest connected component H.
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    # Only rows whose diff_rtt exceeds 47 become edges.
    edge_rows = df.loc[df["diff_rtt"] > 47]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Attach an "asn" attribute to the nodes. The lookups come from the
    # unfiltered frame; the far-side pass runs second, so its value is the
    # one kept for a node id that appears on both sides.
    for id_col, asn_col in (
        ("near_node_id", "near_node_asn"),
        ("far_node_id", "far_node_asn"),
    ):
        asn_by_node = pd.Series(
            df[asn_col].values.astype(int),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, asn_by_node, "asn")

    H = _get_max_subgraph(G)

    # Distinct ASNs per graph; computed here but not part of the printed line.
    G_ases = {attrs["asn"] for _, attrs in G.nodes(data=True)}
    H_ases = {attrs["asn"] for _, attrs in H.nodes(data=True)}

    node_share = len(H.nodes()) / len(G.nodes())
    edge_share = len(H.edges()) / len(G.edges())
    print(node_share, edge_share)
    # plot_graph(G, cycle)  # disabled; plot_graph is not defined in the visible cells


0.5693514644351464 0.7612892488627538
0.6165924726831242 0.8366384573281125
0.5122843700993205 0.7508125151588649
0.6618685497012493 0.8448343487697095
0.7164596440286067 0.8784732522495797
0.7055066530194473 0.8522456366262376
0.6480422916577127 0.7782045864863827


In [14]:
# For every cycle, print the cycle id followed by the fraction of nodes and
# the fraction of edges of G that belong to its largest connected component H.
for cycle in [4577, 5423, 6447, 7616, 8821, 9644, 10020]:

    snapshot_path = f"data/processed/snapshots-longitudinal/{cycle}.csv.gz"
    df = pd.read_csv(snapshot_path, compression="gzip")

    # Only rows whose diff_rtt exceeds 47 become edges.
    edge_rows = df.loc[df["diff_rtt"] > 47]
    G = nx.from_pandas_edgelist(
        edge_rows,
        source="near_node_id",
        target="far_node_id",
        edge_attr="diff_rtt",
    )

    # Attach an "asn" attribute to the nodes. The lookups come from the
    # unfiltered frame; the far-side pass runs second, so its value is the
    # one kept for a node id that appears on both sides.
    for id_col, asn_col in (
        ("near_node_id", "near_node_asn"),
        ("far_node_id", "far_node_asn"),
    ):
        asn_by_node = pd.Series(
            df[asn_col].values.astype(int),
            index=df[id_col],
        ).to_dict()
        nx.set_node_attributes(G, asn_by_node, "asn")

    H = _get_max_subgraph(G)

    # Distinct ASNs per graph; computed here but not part of the printed line.
    G_ases = {attrs["asn"] for _, attrs in G.nodes(data=True)}
    H_ases = {attrs["asn"] for _, attrs in H.nodes(data=True)}

    print(f"{cycle} {len(H.nodes()) / len(G.nodes())} {len(H.edges()) / len(G.edges())}")

    # plot_graph(G, cycle)  # disabled; plot_graph is not defined in the visible cells


4577 0.5693514644351464 0.7612892488627538
5423 0.6165924726831242 0.8366384573281125
6447 0.5122843700993205 0.7508125151588649
7616 0.6618685497012493 0.8448343487697095
8821 0.7164596440286067 0.8784732522495797
9644 0.7055066530194473 0.8522456366262376
10020 0.6480422916577127 0.7782045864863827
