In [1]:
import json
from pathlib import Path
import pandas as pd

# locate a likely PeeringDB dump JSON in the workspace
candidates = list(Path('.').glob('*peeringdb*dump*.json')) + list(Path('.').glob('peeringdb*.json'))
if not candidates:
    raise FileNotFoundError("PeeringDB dump JSON not found. Place a file like 'peeringdb_dump.json' in the working directory.")
filepath = candidates[0]

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


In [2]:
# show all columns in net_df (net_df is defined in a previous cell)
print(f"{len(net_df.columns)} columns:")
pd.DataFrame(net_df.columns, columns=["column"])

41 columns:


Unnamed: 0,column
0,id
1,org_id
2,name
3,aka
4,name_long
5,website
6,social_media
7,asn
8,looking_glass
9,route_server


In [3]:
def date_stats(dataframe, column_name):
    numeric_data = pd.to_datetime(dataframe[column_name]).astype(int) / 10**9
    stats = {
        'min': pd.to_datetime(numeric_data.min(), unit='s'),
        'max': pd.to_datetime(numeric_data.max(), unit='s'),
        'median': pd.to_datetime(numeric_data.median(), unit='s'),
        'mean': pd.to_datetime(numeric_data.mean(), unit='s'),
        '10-percentile': pd.to_datetime(numeric_data.quantile(0.1), unit='s'),
        '20-percentile': pd.to_datetime(numeric_data.quantile(0.2), unit='s'),
        '30-percentile': pd.to_datetime(numeric_data.quantile(0.3), unit='s'),
        '40-percentile': pd.to_datetime(numeric_data.quantile(0.4), unit='s'),
        '50-percentile': pd.to_datetime(numeric_data.quantile(0.5), unit='s'),
        '60-percentile': pd.to_datetime(numeric_data.quantile(0.6), unit='s'),
        '70-percentile': pd.to_datetime(numeric_data.quantile(0.7), unit='s'),
        '80-percentile': pd.to_datetime(numeric_data.quantile(0.8), unit='s'),
        '90-percentile': pd.to_datetime(numeric_data.quantile(0.9), unit='s'),
    }
    for key, value in stats.items():
        stats[key] = pd.to_datetime(value, unit='s')
    return stats

In [4]:
date_stats(net_df, 'updated')

{'min': Timestamp('2022-07-27 05:33:15'),
 'max': Timestamp('2025-10-22 03:59:18'),
 'median': Timestamp('2024-03-23 06:38:05.500000'),
 'mean': Timestamp('2024-01-10 22:15:54.764705792'),
 '10-percentile': Timestamp('2022-07-27 05:34:16'),
 '20-percentile': Timestamp('2022-07-27 05:35:47'),
 '30-percentile': Timestamp('2022-07-27 05:37:01.700000'),
 '40-percentile': Timestamp('2023-04-07 12:59:21'),
 '50-percentile': Timestamp('2024-03-23 06:38:05.500000'),
 '60-percentile': Timestamp('2024-09-18 05:03:41.600000'),
 '70-percentile': Timestamp('2025-02-10 14:26:01.600000'),
 '80-percentile': Timestamp('2025-05-26 12:18:01.400000'),
 '90-percentile': Timestamp('2025-08-22 10:22:03.400000')}

In [5]:
date_stats(net_df, 'created')

{'min': Timestamp('2004-07-28 00:00:00'),
 'max': Timestamp('2025-10-21 14:40:30'),
 'median': Timestamp('2019-06-05 06:46:23.500000'),
 'mean': Timestamp('2018-10-16 00:34:18.902496768'),
 '10-percentile': Timestamp('2011-10-27 13:48:24.900000'),
 '20-percentile': Timestamp('2015-09-17 13:19:00.200000'),
 '30-percentile': Timestamp('2017-02-03 04:00:07.300000'),
 '40-percentile': Timestamp('2018-04-03 09:04:01.200000'),
 '50-percentile': Timestamp('2019-06-05 06:46:23.500000'),
 '60-percentile': Timestamp('2020-07-04 02:40:40.200000'),
 '70-percentile': Timestamp('2021-08-19 12:40:45.400000'),
 '80-percentile': Timestamp('2022-12-20 15:34:28.400000'),
 '90-percentile': Timestamp('2024-05-08 20:59:50.900000')}

In [6]:
date_stats(net_df, 'rir_status_updated')

{'min': NaT,
 'max': Timestamp('2025-10-21 22:55:08'),
 'median': Timestamp('2024-06-26 04:47:55'),
 'mean': Timestamp('2024-07-16 02:43:55.537119744'),
 '10-percentile': Timestamp('2024-06-26 04:47:55'),
 '20-percentile': Timestamp('2024-06-26 04:47:55'),
 '30-percentile': Timestamp('2024-06-26 04:47:55'),
 '40-percentile': Timestamp('2024-06-26 04:47:55'),
 '50-percentile': Timestamp('2024-06-26 04:47:55'),
 '60-percentile': Timestamp('2024-06-26 04:47:55'),
 '70-percentile': Timestamp('2024-06-26 04:47:55'),
 '80-percentile': Timestamp('2024-06-26 04:47:55'),
 '90-percentile': Timestamp('2024-07-22 12:47:41')}

In [7]:
import pandas as pd
from pathlib import Path

filepath = '../nro-delegated-stats/nro-delegated-stats'

# load the delegated stats into a DataFrame
# assuming it's a pipe-separated file with no header
nro_df = pd.read_csv(filepath, sep='|', header=None, names=['rir', 'country', 'type', 'asn', 'size', 'ignore1', 'status', 'ignore2', 'ignore3'], on_bad_lines='skip')
nro_df = nro_df[nro_df['type'] == 'asn']
nro_df.dropna(inplace=True)
nro_df = nro_df[nro_df['country'] != 'ZZ']

# expand nro_df to include individual ASNs based on size
expanded_rows = []
for _, row in nro_df.iterrows():
    start_asn = int(row['asn'])
    count = row['size']
    for i in range(count):
        new_row = row.copy()
        new_row['asn'] = str(start_asn + i)
        expanded_rows.append(new_row)
nro_df = pd.DataFrame(expanded_rows)

# show a quick preview
nro_df.head()

  nro_df = pd.read_csv(filepath, sep='|', header=None, names=['rir', 'country', 'type', 'asn', 'size', 'ignore1', 'status', 'ignore2', 'ignore3'], on_bad_lines='skip')


Unnamed: 0,rir,country,type,asn,size,ignore1,status,ignore2,ignore3
5,arin,US,asn,1,1,20010920,assigned,e5e3b9c13678dfc483fb1f819d70883c,e-stats
6,arin,US,asn,2,1,19910110,assigned,279fb28df10add3bd7028865951995a6,e-stats
7,arin,US,asn,3,1,0,assigned,d98c567cda2db06e693f2b574eafe848,e-stats
8,arin,US,asn,4,1,19840222,assigned,8f5d315929a560376b0b58b40a1932fa,e-stats
9,arin,US,asn,5,1,19840202,assigned,481404355c401f2604c57a0fda4ee68f,e-stats


In [8]:
nro_df['rir'].value_counts()

ripencc    38960
arin       33517
apnic      30532
lacnic     13912
afrinic     2603
Name: rir, dtype: int64

In [9]:
nro_df[nro_df['asn'] == '3320']

Unnamed: 0,rir,country,type,asn,size,ignore1,status,ignore2,ignore3
2669,ripencc,DE,asn,3320,1,19950223,assigned,4227b4f5-07d9-4852-b1d5-26559261d043,e-stats


In [10]:
# Merge net_df with nro_df on 'asn' to add RIR and country information
net_df['asn'] = net_df['asn'].astype(str)
merged_df = net_df.merge(nro_df[['asn', 'rir', 'country']], on='asn', how='left')
merged_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status,rir,country
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok,arin,US
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok,ripencc,NL
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok,arin,US
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok,ripencc,CH
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok,arin,US


In [11]:
merged_df['rir'].value_counts()

ripencc    8396
lacnic     5292
apnic      5165
arin       3758
afrinic     928
Name: rir, dtype: int64

In [2]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")


In [13]:
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


In [3]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


In [6]:
peering_df_joined['info_type'].value_counts()

info_type
Cable/DSL/ISP           11787
NSP                      3982
Content                  2486
Enterprise               1721
Educational/Research     1457
Network Services          804
Route Server              623
Non-Profit                613
Government                126
Route Collector            31
Name: count, dtype: int64

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

# Angenommen, dein DataFrame ist 'df' mit Spalten 'asn', 'org_name', 'country', 'source' und 'label' (die Klasse)
# Beispiel-DataFrame (ersetze durch deinen)
data = {
    'asn': ['AS15169', 'AS209', 'AS174', 'AS15169', 'AS67890'],
    'org_name': ['Google LLC', 'Akamai Technologies Inc.', 'Cogent Communications', 'Google LLC', 'Unknown Org'],
    'country': ['US', 'US', 'US', 'US', 'Unknown'],
    'source': ['arin', 'arin', 'arin', 'arin', 'Unknown'],
    'label': ['Content', 'Content', 'Transit', 'Content', 'Enterprise']  # Deine Klassen
}
df = pd.DataFrame(data)

# Preprocessing: Bereinige org_name (z. B. lower, NaN füllen)
df['org_name'] = df['org_name'].fillna('Unknown').str.lower()

# BERT-Tokenizer und Model laden (pre-trained bert-base-uncased)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Für Inference

# Funktion für BERT-Embeddings (CLS-Token als Sentence-Embedding)
def get_bert_embedding(text, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(text), batch_size)):
        batch = text[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS-Token
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Extrahiere Embeddings für org_name
X_embeddings = get_bert_embedding(df['org_name'].tolist())

# Labels
y = df['label']

# Train/Test-Split
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.13, random_state=42, stratify=y)

X_train.shape, X_test.shape

  from .autonotebook import tqdm as notebook_tqdm
Disabling PyTorch because PyTorch >= 2.1 is required but found 2.1.0a0+b5021ba
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
  import pynvml  # type: ignore[import]


ImportError: 
BertModel requires the PyTorch library but it was not found in your environment. Check out the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
