In [1]:
import json
import os

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pyasn
import requests
from aslookup import get_as_data
from plotly.subplots import make_subplots

In [2]:
# SECURITY: API keys must not be committed with the notebook. The RIPE Atlas
# key is read from the RIPE_ATLAS_API_KEY environment variable instead; the key
# that was previously hard-coded here should be considered leaked and revoked.
# When the variable is unset the key is an empty string, and the Atlas API is
# expected to reject the request.
pauls_key = os.environ.get("RIPE_ATLAS_API_KEY", "")

In [3]:
def get_traceroute_data(api_key, timeout=30):
    """
    Downloads the results of all measurements tagged with both 'traceroute' and 'orangecat'.

    The measurement listing is paginated, so every page is followed via its 'next' link
    instead of only reading the first one.
    @params
        api_key: api_key from ripe atlas with 'List your measurements' permission.
        timeout: seconds to wait for each HTTP request before giving up.
    @return
        pandas dataframe with traceroute data
    @raises
        requests.HTTPError if the measurement listing cannot be retrieved (e.g. invalid key).
    """
    traceroute_list = []

    page_url = "https://atlas.ripe.net/api/v2/measurements/my/"
    page_params = {"key": api_key}

    while page_url:
        listing_response = requests.get(page_url, params=page_params, timeout=timeout)
        # Fail with a clear HTTP error instead of a KeyError on a missing 'results' field.
        listing_response.raise_for_status()
        listing = listing_response.json()

        for measurement in listing.get("results", []):
            tags = measurement.get("tags") or []
            if "traceroute" in tags and "orangecat" in tags:
                traceroute_response = requests.get(
                    measurement["result"],
                    params={"start": measurement["start_time"]},
                    timeout=timeout,
                )
                # Best effort: a measurement whose results cannot be fetched is skipped.
                if traceroute_response.status_code == 200:
                    traceroute_list.extend(traceroute_response.json())

        page_url = listing.get("next")
        # The 'next' link normally carries the original query string; only
        # re-send the key if the server dropped it from the link.
        page_params = None if page_url and "key=" in page_url else {"key": api_key}

    return pd.DataFrame(traceroute_list)

In [4]:
# Fetch the results of all 'orangecat' traceroute measurements (performs HTTP requests).
df_traceroutes = get_traceroute_data(pauls_key)

In [5]:
# Load the IP-to-ASN database from a local pyasn data file
# (the file name suggests a 2023-02-14 snapshot — TODO confirm).
asndb = pyasn.pyasn('data/ipasn_20230214.dat')

def lookup_asn(ip):
    """Returns the first element of asndb.lookup(ip), i.e. the AS number for the address."""
    asn, *_ = asndb.lookup(ip)
    return asn

def get_asn_path(traceroute):
    """
    Gets the AS numbers along a traceroute. Only considers the first ip alternative of each hop.

    Consecutive hops with the same ip are looked up once, consecutive duplicate
    AS numbers are collapsed, and hops without a usable reply or without a known
    AS number are skipped.
    @params
        traceroute: list of hop dicts, each expected to carry a 'result' list whose
                    entries have a 'from' address.
    @return
        list of AS numbers in the order they are traversed
    """
    asn_path = []
    previous_ip = None

    for hop in traceroute:
        try:
            ip = hop['result'][0]['from']
        except (KeyError, IndexError, TypeError):
            # Hop has no 'result', no replies, or a first reply without a 'from' address.
            continue

        if ip == previous_ip:
            continue
        previous_ip = ip

        try:
            asn = lookup_asn(ip)
        except Exception:
            # Deliberately best effort: an address the ASN database cannot
            # handle must not abort the whole path.
            continue

        if asn is not None and (not asn_path or asn_path[-1] != asn):
            asn_path.append(asn)

    return asn_path

# Derive the AS-level path of every traceroute from its per-hop 'result' list.
df_traceroutes['asn_path'] = df_traceroutes['result'].apply(get_asn_path)

# Top AS owners 

In [6]:
# Collect every distinct AS number seen on any path. Traceroutes with an empty
# AS path explode to NaN; dropna() removes those and, unlike
# list.remove(np.nan), does not raise when there are none.
unique_asn = df_traceroutes['asn_path'].explode().dropna().unique().tolist()

df_asn_info = pd.DataFrame({"as_number": unique_asn })

In [7]:
def get_any_ip_in_as(asn):
    """
    Returns the network address of an arbitrary prefix belonging to the given AS.
    @params
        asn: AS number to look up in asndb.
    @return
        the part of one prefix before the '/', as a string
    @raises
        ValueError if asndb knows no prefixes for the AS.
    """
    prefixes = asndb.get_as_prefixes(asn)
    if not prefixes:
        raise ValueError(f"No prefixes found for AS{asn}")
    # next(iter(...)) picks an arbitrary prefix without removing it; pop() would
    # mutate the returned collection, which may be shared with asndb's
    # internal cache — TODO confirm.
    prefix = next(iter(prefixes))
    return prefix.split("/")[0]

def get_as_name(asn):
    """Returns the AS name that the 'shadowserver' lookup service reports for an address inside the AS."""
    representative_ip = get_any_ip_in_as(asn)
    as_data = get_as_data(representative_ip, service="shadowserver")
    return as_data.as_name

In [8]:
# Resolving AS names calls an external lookup service once per AS, so the
# result is cached on disk: reuse the CSV when it exists, otherwise resolve the
# names and save them. This replaces toggling commented-out code by hand.
if os.path.exists("data/asn_info.csv"):
    df_asn_info = pd.read_csv("data/asn_info.csv", index_col=0)
else:
    df_asn_info["as_name"] = df_asn_info["as_number"].apply(get_as_name)
    df_asn_info.to_csv("data/asn_info.csv")

In [9]:
# Count distinct AS names; dropna=False keeps NaN as a value, matching len(unique()).
as_providers = df_asn_info["as_name"].nunique(dropna=False)
print(f"Number of AS providers: {as_providers}")

Number of AS providers: 284


In [56]:
# Shorten long AS names so they fit as axis labels in the plots.
# Only exact, whole-value matches are replaced.
df_asn_info["as_name"] = df_asn_info["as_name"].replace({
    "Space Exploration Technologies Corporation": "SpaceX",
    "Amazon.com, Inc.": "Amazon",
    "TATA COMMUNICATIONS (AMERICA) INC": "TATA COMM",
    "Telefonica Germany GmbH & Co.OHG": "Telefonica Ger",
    "Cogent Communications": "Cogent Comm",
    "Deutsche Telekom AG": "Deutsche Telekom",
    "Level 3 Parent, LLC": "Level 3 Parent",
    "Microsoft Corporation": "Microsoft",
    "Vodafone Libertel B.V.": "Vodafone Libertel",
})

In [108]:
# Load the mapping table that is joined to the traceroutes on 'msm_id' and 'ip'.
ip_mapping = pd.read_csv("data/domain_ip_mapping.csv")

In [98]:
# Attach the mapping columns to each traceroute by matching the measurement id
# and the destination address. pd.merge defaults to an inner join, so
# traceroutes without a matching mapping row are dropped.
df_tr_merge = pd.merge(df_traceroutes, ip_mapping, left_on=["msm_id","dst_addr"], right_on=["msm_id", "ip"])

In [99]:
# Number of merged traceroutes per datacenter provider.
df_tr_merge["dc_provider"].value_counts()

Amazon EC2    2030
Microsoft     2030
Google        2029
Name: dc_provider, dtype: int64

In [105]:
# Traceroutes towards Google, one row per traversed AS, joined with the AS names.
df_google = df_tr_merge.loc[df_tr_merge["dc_provider"] == "Google"]
asns_google = df_google["asn_path"].explode()
asns_google = pd.merge(
    left=asns_google,
    right=df_asn_info,
    left_on="asn_path",
    right_on="as_number",
).drop(columns="asn_path")

# The seven AS names that occur most often on these paths.
asn_stats_google = (
    asns_google["as_name"]
    .value_counts()
    .iloc[:7]
    .rename_axis("as_name")
    .reset_index(name="count")
)
print(f"Total number of ASes traversed in traceroutes to Google datacenters: {asns_google.as_name.nunique()}")

Total number of ASes traversed in traceroutes to Google datacenters: 239


In [106]:
# Traceroutes towards Amazon EC2, one row per traversed AS, joined with the AS names.
df_amazon = df_tr_merge.loc[df_tr_merge["dc_provider"] == "Amazon EC2"]
asns_amazon = df_amazon["asn_path"].explode()
asns_amazon = pd.merge(
    left=asns_amazon,
    right=df_asn_info,
    left_on="asn_path",
    right_on="as_number",
).drop(columns="asn_path")

# The seven AS names that occur most often on these paths.
asn_stats_amazon = (
    asns_amazon["as_name"]
    .value_counts()
    .iloc[:7]
    .rename_axis("as_name")
    .reset_index(name="count")
)
print(f"Total number of ASes traversed in traceroutes to Amazon datacenters: {asns_amazon.as_name.nunique()}")

Total number of ASes traversed in traceroutes to Amazon datacenters: 267


In [107]:
# Traceroutes towards Microsoft, one row per traversed AS, joined with the AS names.
df_microsoft = df_tr_merge[df_tr_merge["dc_provider"] == "Microsoft"]
asns_microsoft = df_microsoft["asn_path"].explode()
asns_microsoft = pd.merge(asns_microsoft, df_asn_info, left_on="asn_path", right_on="as_number").drop(columns="asn_path")
# The seven AS names that occur most often on these paths.
asn_stats_microsoft = asns_microsoft["as_name"].value_counts().head(7).rename_axis("as_name").reset_index(name="count")
# The message previously said "Google" here (copy-paste error); it reports the Microsoft figure.
print(f"Total number of ASes traversed in traceroutes to Microsoft datacenters: {asns_microsoft.as_name.nunique()}")

Total number of ASes traversed in traceroutes to Google datacenters: 240


In [94]:
# One bar trace per datacenter provider: most frequent AS names and their counts.
trace_google = go.Bar(name="Google", x=asn_stats_google["as_name"], y=asn_stats_google["count"])
trace_amazon = go.Bar(name="Amazon", x=asn_stats_amazon["as_name"], y=asn_stats_amazon["count"])
trace_microsoft = go.Bar(name="Microsoft", x=asn_stats_microsoft["as_name"], y=asn_stats_microsoft["count"])

# Stack the charts vertically, one subplot row per provider.
fig = make_subplots(rows=3, cols=1)
for row, trace in enumerate([trace_google, trace_amazon, trace_microsoft], start=1):
    fig.add_trace(trace, row=row, col=1)

fig.update_layout(title="Top 7 AS for Datacenter Providers")

fig.show()

# Plot a route for a sample probe by IP addresses

In [62]:
# Keep only the columns needed for plotting; drop() returns a new frame, so
# df_tr_merge itself is left untouched.
df_route = df_tr_merge.drop(
    columns=['address_v4', 'address_v6', 'asn_v4', 'asn_v6', 'prefix_v4', 'prefix_v6', 'is_anchor', 'is_public', 'status', 'status_since', 'first_connected', 'total_uptime', 'tags', 'day', 'probe', 'status_name', 'fw', 'mver', 'lts', 'endtime', 'dst_name', 'dst_addr', 'src_addr', 'proto', 'af', 'size', 'paris_id', 'msm_id', 'prb_id', 'timestamp', 'msm_name', 'from', 'type', 'group_id', 'stored_timestamp']
)
# Select the sample routes: rows whose City contains 'Frankfurt', whose
# continent/country codes contain 'AS'/'HK', and whose id is 54778
# (presumably the id of the sample probe — TODO confirm).
df_route = df_route.loc[
    df_route['City'].str.contains('Frankfurt', na=False)
    & df_route['continent_code'].str.contains('AS', na=False)
    & df_route['country_code'].str.contains('HK', na=False)
    & (df_route['id'] == 54778)
]

In [63]:
def get_ip_path(traceroute):
    """
    Gets the ip addresses along a traceroute. Only considers the first ip alternative of each hop.

    Hops without a usable first reply (no 'result', no replies, or no 'from'
    address) are skipped.
    @params
        traceroute: list of hop dicts, each expected to carry a 'result' list whose
                    entries have a 'from' address.
    @return
        list of ip addresses in hop order (consecutive duplicates are kept)
    """
    ip_path = []

    for hop in traceroute:
        try:
            ip_path.append(hop['result'][0]['from'])
        except (KeyError, IndexError, TypeError):
            # Only malformed or unanswered hops are ignored; a bare except would
            # also have swallowed KeyboardInterrupt.
            continue
    return ip_path

In [64]:
def get_ip_coordinates(ip, timeout=10):
    """
    Looks up the geographic coordinates of an ip address via ipapi.co.
    @params
        ip: ip address to geolocate.
        timeout: seconds to wait for the HTTP request before giving up.
    @return
        tuple (latitude, longitude); (pd.NA, pd.NA) when the request fails or the
        response carries no coordinates.
    """
    try:
        response = requests.get(f"https://ipapi.co/{ip}/json/", timeout=timeout)
        # Parse the body once instead of once per field.
        location = response.json()
        lat = location["latitude"]
        long = location["longitude"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network failure, non-JSON body, or a response without coordinates.
        lat, long = pd.NA, pd.NA
    return (lat, long)

In [65]:
# Draw every selected route on a world map, one line per route.
fig = go.Figure()

colors = ["red", "yellow", "blue"]

for i in range(len(df_route)):
    tr = df_route["result"].iloc[i]
    tr_ips = pd.DataFrame(dict(ip=get_ip_path(tr)))
    # Geolocate each hop (one HTTP request per ip). Building the columns from a
    # list also works for a traceroute without usable hops, where unpacking
    # zip(*...) into two names would fail.
    coordinates = [get_ip_coordinates(ip) for ip in tr_ips["ip"]]
    tr_ips["latitude"] = [lat for lat, _ in coordinates]
    tr_ips["longitude"] = [lon for _, lon in coordinates]

    fig.add_trace(
        go.Scattergeo(
            lon = tr_ips["longitude"],
            lat = tr_ips["latitude"],
            mode = 'markers+lines',
            # Cycle through the palette so more than len(colors) routes do not raise IndexError.
            line = dict(width=1, color=colors[i % len(colors)]),
            marker = dict(size=3, color='black'),
            name = df_route['Provider'].iloc[i]
        )
    )

fig.show()

# Path length of different providers

In [100]:
# Work on a copy and record how many hops each traceroute contains.
df_hops = df_tr_merge.copy()
df_hops["hop_count"] = df_hops["result"].map(len)

In [101]:
# Mean hop count per (datacenter provider, probe type), reduced to one row per pair.
df_hops = df_hops[["dc_provider", "probe_type", "hop_count"]].copy()
df_hops["mean_hops"] = (
    df_hops
    .groupby(["dc_provider", "probe_type"])["hop_count"]
    .transform("mean")
)
df_hops = df_hops.drop(columns="hop_count").drop_duplicates()

In [102]:
# Grouped bar chart: mean hop count per probe type, one bar series per provider.
fig = go.Figure()

for provider in df_hops["dc_provider"].unique():
    provider_rows = df_hops.loc[df_hops["dc_provider"] == provider]
    fig.add_trace(
        go.Bar(
            name=provider,
            x=provider_rows["probe_type"],
            y=provider_rows["mean_hops"],
        )
    )

fig.update_layout(barmode='group')
fig.show()

# AS Path length of Providers

In [68]:
# Work on a copy so the AS path length columns do not end up in df_tr_merge.
df_pl = df_tr_merge.copy()

In [69]:
# Number of distinct consecutive ASes on each traceroute's path.
df_pl["asn_path_length"] = df_pl["asn_path"].map(len)

In [70]:
# Mean AS path length per (datacenter provider, probe type), reduced to one row per pair.
# The explicit copy() makes the column selection an independent frame, so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
df_pl = df_pl[["dc_provider", "probe_type", "asn_path_length"]].copy()
df_pl["mean_asn_path_length"] = df_pl.groupby(["dc_provider", "probe_type"])["asn_path_length"].transform("mean")
df_pl = df_pl.drop("asn_path_length", axis=1).drop_duplicates()

In [71]:
# Grouped bar chart: mean AS path length per probe type, one bar series per provider.
fig = go.Figure()

for provider in df_pl["dc_provider"].unique():
    provider_rows = df_pl.loc[df_pl["dc_provider"] == provider]
    fig.add_trace(
        go.Bar(
            name=provider,
            x=provider_rows["probe_type"],
            y=provider_rows["mean_asn_path_length"],
        )
    )

fig.update_layout(barmode='group')
fig.show()