In [1]:
import json
from pathlib import Path
import pandas as pd

# Location of the PeeringDB JSON dump, relative to this notebook.
filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')

# Read the whole file as UTF-8 text and parse it in one go.
dump = json.loads(filepath.read_text(encoding='utf-8'))

# The network records are expected under dump['net']['data']; stop early
# with a clear message when that structure is missing.
net_section = dump.get('net', {})
net_data = net_section.get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

# Build the frame, cast the ASN column to int and keep only rows whose
# 'info_type' is a non-empty string.
net_df = (
    pd.DataFrame(net_data)
    .assign(asn=lambda frame: frame['asn'].astype(int))
    .loc[lambda frame: frame['info_type'] != '']
)

# Quick preview of the first rows.
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


In [2]:
# CSV whose 'asn' column is compared against the PeeringDB ASNs below
# (presumably per-AS BGP metrics incl. RPKI, judging by the file name — TODO confirm).
bgp_metrics_csv = '../../scripts/as_metrics_with_rpki.csv'
bgp_df = pd.read_csv(bgp_metrics_csv)

In [5]:
# ASNs present in the BGP table (missing values ignored).
bgp_asns = bgp_df['asn'].dropna()

# One boolean mask over net_df: the row has an ASN and that ASN does not
# occur in the BGP table.
has_asn = net_df['asn'].notna()
seen_in_bgp = net_df['asn'].isin(bgp_asns)
inactive_rows = net_df[has_asn & ~seen_in_bgp]

# PeeringDB rows without a BGP match (no de-duplication by ASN is done here):
inactive_rows

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
9,12,912,AT&T US - AS7132,see AT&T US - AS7018,,http://www.att.com,"[{'service': 'website', 'identifier': 'http://...",7132,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2024-03-19T15:08:43Z,ok
15,21,55,Renesys,Oracle/Renesys Internet Intelligence Group,,https://internetintel.oracle.com/,"[{'service': 'website', 'identifier': 'https:/...",34135,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:34:08Z,ok
18,25,1152,MCI - MAE.net,,,http://www.mae.net,"[{'service': 'website', 'identifier': 'http://...",6066,,,...,False,,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:24Z,ok
20,28,70,New Edge Networks,Earthlink Business Solutions,,http://www.newedgenetworks.com,"[{'service': 'website', 'identifier': 'http://...",19029,,,...,False,,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:44Z,ok
24,32,80,Fused Networks,,,,[],6320,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:24Z,ok
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33126,40537,42286,ARTEMIS,ARTEMIS,ARTEMIS Network,https://as207258.net/,"[{'service': 'website', 'identifier': 'https:/...",207258,https://bgp.tools/lg/207258,,...,False,Not Required,True,,ok,2025-10-11T22:55:06Z,,2025-10-09T21:16:36Z,2025-10-14T17:25:56Z,ok
33129,40550,42300,OCTIGNITE,Sergey Proshutinskiy,,https://sm00th.me,"[{'service': 'website', 'identifier': 'https:/...",205163,,,...,False,Not Required,True,,ok,2025-10-12T22:55:08Z,,2025-10-11T16:21:06Z,2025-10-21T10:38:36Z,ok
33136,40586,42335,ABJUST-NET,MSST-Net,,https://msst-net.msst2031.cn/,"[{'service': 'website', 'identifier': 'https:/...",204856,,,...,False,Not Required,True,,ok,2025-10-17T22:55:08Z,,2025-10-15T18:41:12Z,2025-10-21T03:52:33Z,ok
33137,40588,42337,MijaNET,,Mi-Ja Javorník s.r.o. ISP & telco,https://www.mijanet.cz,"[{'service': 'website', 'identifier': 'https:/...",204917,,,...,False,Not Required,True,,ok,2025-10-16T22:55:07Z,,2025-10-15T20:31:36Z,2025-10-20T06:03:40Z,ok


In [4]:
import requests, time, random
import pandas as pd
from datetime import datetime, timezone, timedelta
from tqdm import tqdm

# ---- Parameters (conservative defaults) ----

# Request settings for the RIPEstat lookups.
RIPE_URL_TPL = "https://stat.ripe.net/data/routing-status/data.json?resource=AS{}"
UA           = "asn-activity-check/1.0 (contact: you@example.com)"
TIMEOUT_S    = 20     # passed as `timeout` to every HTTP request

# Retry / throttling behaviour.
RETRIES      = 4      # attempts per ASN
MAX_BACKOFF  = 8.0    # upper bound for the exponential backoff delay
BASE_SLEEP_S = 0.25   # base pause after EVERY request

# An ASN counts as active when it was last seen within this window.
THIRTY_DAYS  = timedelta(days=30)

def _normalize_asns(series):
    return (series.astype(str).str.upper()
            .str.replace(r"^\s*AS", "", regex=True)
            .str.replace(r"\D", "", regex=True)
            .replace("", pd.NA).dropna()
            .astype(int))

def _parse_iso_utc(s):
    if not s:
        return None
    s = s.replace("Z", "")
    try:
        return datetime.fromisoformat(s).replace(tzinfo=timezone.utc)
    except Exception:
        return None

def _newest_last_seen(payload):
    """Return the newest parseable ``last_seen`` timestamp in a decoded response, or None."""
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        return None

    # Per-prefix timestamps: data["prefixes"][i]["last_seen"].
    raw_values = [
        p.get("last_seen") for p in (data.get("prefixes") or []) if isinstance(p, dict)
    ]
    # NOTE(review): the RIPEstat routing-status documentation appears to
    # describe ``last_seen`` as a top-level object with a ``time`` field rather
    # than a ``prefixes`` list — confirm against a live response. Both shapes
    # are read here so that neither is silently ignored.
    top_level = data.get("last_seen")
    if isinstance(top_level, dict):
        raw_values.append(top_level.get("time"))

    newest = None
    for raw in raw_values:
        # Only strings are handed to the parser.
        ts = _parse_iso_utc(raw) if isinstance(raw, str) else None
        if ts and (newest is None or ts > newest):
            newest = ts
    return newest


def _get_last_seen_sync(asn):
    """
    Run one RIPEstat lookup for a single ASN.

    HTTP 429/500/502/503/504 responses, timeouts and connection errors are
    retried (``RETRIES`` attempts in total) with exponential backoff capped at
    ``MAX_BACKOFF`` plus random jitter. Nothing is raised for request or
    decoding failures; they are reported through ``status`` so that one bad
    response cannot abort a long batch run.

    Returns a dict with the keys
      - ``asn``: the ASN as passed in
      - ``last_seen``: newest timestamp found in the response, or None
      - ``status``: the HTTP status code of the final response, or one of the
        strings ``"retry_exhausted"``, ``"invalid_json"``, ``"request_error"``
    """
    url = RIPE_URL_TPL.format(asn)
    headers = {"User-Agent": UA}
    backoff = 1.0

    def _without_timestamp(status):
        return {"asn": asn, "last_seen": None, "status": status}

    for attempt in range(1, RETRIES + 1):
        try:
            r = requests.get(url, headers=headers, timeout=TIMEOUT_S)
        except (requests.Timeout, requests.ConnectionError):
            # Network problem / timeout -> retry below.
            pass
        except requests.RequestException:
            # Any other client-side failure is not expected to heal on retry.
            return _without_timestamp("request_error")
        else:
            status = r.status_code
            # Throttling / server errors fall through to the retry below.
            if status not in (429, 500, 502, 503, 504):
                if status != 200:
                    # Other HTTP errors: no retry.
                    return _without_timestamp(status)
                try:
                    payload = r.json()
                except ValueError:
                    return _without_timestamp("invalid_json")
                return {"asn": asn, "last_seen": _newest_last_seen(payload), "status": 200}

        # Wait only when another attempt follows.
        if attempt < RETRIES:
            time.sleep(backoff + random.uniform(0, 0.5))
            backoff = min(backoff * 2, MAX_BACKOFF)

    # All attempts failed.
    return _without_timestamp("retry_exhausted")

def find_long_inactive_sync(inactive_rows, asn_col="asn"):
    """
    Check every distinct ASN in ``inactive_rows`` against RIPEstat, one
    request at a time, and report the ones not seen recently.

    Parameters
    ----------
    inactive_rows : pandas.DataFrame
        Rows to check; ``asn_col`` holds the ASN (any spelling accepted by
        ``_normalize_asns``). Rows without a usable ASN are ignored.
    asn_col : str
        Name of the ASN column.

    Returns
    -------
    (long_inactive_full, check_df)
      - long_inactive_full: the rows of ``inactive_rows`` whose lookup
        succeeded (status 200) and whose ASN was last seen more than
        ``THIRTY_DAYS`` ago or never, with the check columns appended.
        A check column whose name already exists in ``inactive_rows`` gets the
        suffix ``_ripe``; the original column keeps its name. Sorted by
        ``days_since_seen`` descending, never-seen ASNs last.
      - check_df: one row per queried ASN with ``asn``, ``last_seen``,
        ``status``, ``days_since_seen`` and ``active_30d``. Failed lookups are
        listed here (non-200 ``status``) but are not reported as inactive.
    """
    # Normalise on a positional index: the labels of the result are then the
    # row positions that survived, which lets us drop unusable rows safely.
    asn_norm = _normalize_asns(inactive_rows[asn_col].reset_index(drop=True))
    asn_list = asn_norm.drop_duplicates().tolist()

    results = []
    for asn in tqdm(asn_list, desc="RIPE (sequenziell)", unit="asn"):
        results.append(_get_last_seen_sync(asn))
        # Gentle pause after every request.
        time.sleep(BASE_SLEEP_S + random.uniform(0, 0.1))

    # Explicit columns keep the frame well-formed even without any result.
    check_df = pd.DataFrame(results, columns=["asn", "last_seen", "status"]).astype(
        {"asn": "int64"}
    )
    now = datetime.now(timezone.utc)

    # Missing timestamps become NaT: their age is NaT, which yields NaN days
    # and compares as "not within the window".
    age = now - pd.to_datetime(check_df["last_seen"], utc=True)
    check_df["days_since_seen"] = age.dt.days
    check_df["active_30d"] = age <= THIRTY_DAYS

    # Only a successful lookup is evidence of inactivity; an HTTP error or an
    # exhausted retry budget says nothing about the ASN.
    lookup_ok = check_df["status"].eq(200)
    long_inactive_core = (
        check_df[lookup_ok & ~check_df["active_30d"]]
        .rename(columns={"asn": "asn_norm"})
    )

    # Merge back onto the input rows that have a usable ASN.
    merged = inactive_rows.iloc[asn_norm.index.to_numpy()].copy()
    merged["asn_norm"] = asn_norm.to_numpy()

    long_inactive_full = (
        merged.merge(long_inactive_core, on="asn_norm", how="inner", suffixes=("", "_ripe"))
              .drop(columns=["asn_norm"])
              .sort_values(by=["days_since_seen"], ascending=False, na_position="last")
    )

    return long_inactive_full, check_df
# Run the sequential RIPE check for all rows without a BGP match.
long_inactive_df, check_df = find_long_inactive_sync(inactive_rows, asn_col="asn")

# Report the number of distinct ASNs before and after the RIPE check.
for label, frame in (
    ("ASNs ohne BGP gesamt:", inactive_rows),
    ("Davon >30 Tage inaktiv:", long_inactive_df),
):
    print(label, frame['asn'].nunique())

long_inactive_df.head()


RIPE (sequenziell):   0%|          | 0/2646 [00:00<?, ?asn/s]

RIPE (sequenziell):  47%|████▋     | 1233/2646 [15:33<17:50,  1.32asn/s]  


KeyboardInterrupt: 