# Manuelles Goldstandard-Labeling für ASN-Klassifikation

Erstelle ein sauberes Goldstandard-Test-Set durch manuelles Labeling mit Recherche.

In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from IPython.display import display
import webbrowser

## 1. Lade existierende Daten

In [2]:
# PeeringDB: read the JSON dump and build the network table.
filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')
dump = json.loads(filepath.read_text(encoding='utf-8'))

net_data = dump.get('net', {}).get('data')
net_df = (
    pd.DataFrame(net_data)
    .assign(asn=lambda frame: frame['asn'].astype(int))
    # Drop networks whose info_type is the empty string (no class to compare against).
    .loc[lambda frame: frame['info_type'] != '']
)

# Additional per-ASN tables: ASRank and the BGP/RPKI metrics.
as_rank_df = pd.read_csv('../../preprocessing/data/asrank/as_rank_df.csv')
bgp_df = pd.read_csv('../../scripts/as_metrics_with_rpki.csv')

# ASRank is joined with how='left' (PeeringDB rows are kept even without a match),
# the BGP metrics with how='inner' (rows without a match are dropped).
merged_df = (
    net_df
    .merge(as_rank_df, on='asn', how='left')
    .merge(bgp_df, on='asn', how='inner')
)

print(f"Total ASNs: {len(merged_df)}")
print("\nKlassenverteilung:")
print(merged_df['info_type'].value_counts())

Total ASNs: 20984

Klassenverteilung:
info_type
Cable/DSL/ISP           10973
NSP                      3658
Content                  2130
Enterprise               1500
Educational/Research     1266
Network Services          701
Non-Profit                524
Government                122
Route Server               99
Route Collector            11
Name: count, dtype: int64


## 2. Stratifiziertes Random Sampling

Wähle zufällig ASNs aus jeder Klasse (stratified sampling):

In [3]:
# Sampling configuration
SAMPLES_PER_CLASS = 25  # upper bound of ASNs drawn per class
RANDOM_SEED = 42

# Stratified sampling: draw from every info_type separately.
target_classes = merged_df['info_type'].unique()
has_rank = merged_df['rank'].notna()  # only ASNs with ASRank data are eligible
sampled_asns = []

for cls in target_classes:
    pool = merged_df[(merged_df['info_type'] == cls) & has_rank]

    # Classes with fewer eligible ASNs than requested are taken in full.
    drawn = (
        pool.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
        if len(pool) >= SAMPLES_PER_CLASS
        else pool
    )

    sampled_asns.append(drawn)
    print(f"{cls}: {len(drawn)} ASNs")

# Concatenate the per-class samples, then shuffle so the labeling order is
# random; the index is reset to 0..n-1 afterwards.
goldstandard_candidates = (
    pd.concat(sampled_asns, ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

print(f"\nTotal Candidates: {len(goldstandard_candidates)}")

NSP: 25 ASNs
Content: 25 ASNs
Non-Profit: 25 ASNs
Cable/DSL/ISP: 25 ASNs
Educational/Research: 25 ASNs
Route Server: 25 ASNs
Route Collector: 11 ASNs
Enterprise: 25 ASNs
Network Services: 25 ASNs
Government: 25 ASNs

Total Candidates: 236


## 3. Interaktives Labeling Tool

Zeige ASN-Informationen an und erlaube manuelles Labeling:

In [4]:
# Load the label file if it exists, otherwise start with an empty mapping.
LABEL_FILE = Path('goldstandard_labels.json')

if LABEL_FILE.exists():
    with open(LABEL_FILE, 'r', encoding='utf-8') as f:
        labels = json.load(f)
    print(f"Loaded {len(labels)} existing labels")
else:
    labels = {}
    print("Starting fresh labeling session")


def save_labels():
    """Write the module-level ``labels`` dict to ``LABEL_FILE`` as indented JSON.

    The dict is serialised to a string first and written to a sibling
    ``*.tmp`` file that is then renamed over ``LABEL_FILE``. A value that
    cannot be serialised (or an interruption while writing) therefore never
    truncates the labels that are already on disk.

    Raises:
        TypeError: if ``labels`` contains a value ``json`` cannot serialise;
            the existing label file is left untouched in that case.
    """
    # Serialise before touching the filesystem so encoding errors surface early.
    payload = json.dumps(labels, indent=2)

    tmp_file = LABEL_FILE.with_name(LABEL_FILE.name + '.tmp')
    tmp_file.write_text(payload, encoding='utf-8')
    # Path.replace overwrites an existing target.
    tmp_file.replace(LABEL_FILE)

    print(f"Saved {len(labels)} labels to {LABEL_FILE}")

Starting fresh labeling session


In [5]:
def show_asn_info(asn_row):
    """Render an HTML info card for one ASN in the notebook output.

    The card shows the textual fields, the ASRank topology metrics and a set
    of research links built from the AS number.

    Text fields come from external registry data, so they are HTML-escaped
    before being embedded, and the website is only rendered as a link when it
    is an ``http://`` or ``https://`` URL. Missing, ``None`` or NaN values are
    shown as ``N/A``.

    Args:
        asn_row: Row-like object supporting ``[...]`` and ``.get`` (the callers
            in this notebook pass a pandas Series). Must contain ``asn``.
    """
    # Function-scope import; the local name `escape` avoids clashing with HTML.
    from html import escape

    def _text(key):
        """Return the escaped value stored under ``key`` or 'N/A' if absent/NaN."""
        value = asn_row.get(key)
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return 'N/A'
        return escape(str(value))

    def _count(key):
        """Return the metric under ``key`` as int, or 'N/A' if absent/NaN."""
        value = asn_row.get(key)
        return int(value) if pd.notna(value) else 'N/A'

    asn = int(asn_row['asn'])

    # Only http(s) URLs become clickable; anything else is shown as plain text
    # so schemes such as javascript: cannot end up in an href.
    website = asn_row.get('website')
    if isinstance(website, str) and website.lower().startswith(('http://', 'https://')):
        safe_url = escape(website, quote=True)
        website_html = f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_url}</a>'
    else:
        website_html = _text('website')

    card_html = f"""
    <div style="border: 2px solid #4CAF50; padding: 20px; border-radius: 10px; background-color: #f9f9f9;">
        <h2 style="color: #4CAF50;">AS{asn}</h2>
        
        <h3>üìù Textuelle Informationen:</h3>
        <ul>
            <li><b>Organization Name:</b> {_text('org_name')}</li>
            <li><b>Network Name:</b> {_text('name')}</li>
            <li><b>Country:</b> {_text('country')}</li>
            <li><b>Website:</b> {website_html}</li>
            <li><b>PeeringDB Label:</b> <span style="color: #FF6347;">{_text('info_type')}</span></li>
        </ul>
        
        <h3>üìä Topologie-Metriken (ASRank):</h3>
        <ul>
            <li><b>Global Rank:</b> {_count('rank')}</li>
            <li><b>Customer Degree:</b> {_count('asnDegree_customer')}</li>
            <li><b>Peer Degree:</b> {_count('asnDegree_peer')}</li>
            <li><b>Provider Degree:</b> {_count('asnDegree_provider')}</li>
            <li><b>Customer Cone (ASNs):</b> {_count('cone_numberAsns')}</li>
            <li><b>Customer Cone (Prefixes):</b> {_count('cone_numberPrefixes')}</li>
        </ul>
        
        <h3>üîó Recherche-Links:</h3>
        <ul>
            <li><a href="https://bgp.he.net/AS{asn}" target="_blank">üåê Hurricane Electric BGP Toolkit</a></li>
            <li><a href="https://www.peeringdb.com/asn/{asn}" target="_blank">üì° PeeringDB</a></li>
            <li><a href="https://stat.ripe.net/AS{asn}" target="_blank">üìà RIPE Stat</a></li>
            <li><a href="https://asrank.caida.org/asns/{asn}" target="_blank">üìä CAIDA ASRank</a></li>
        </ul>
    </div>
    """

    display(HTML(card_html))

import ipywidgets as widgets
from IPython.display import display, HTML

# --- Global state ---
# Holds the label of the most recently clicked button when
# create_label_buttons() is used without an explicit callback.
selected_label = None

def create_label_buttons(on_select=None):
    """Build the button grid for the 10 PeeringDB info types plus Skip/Quit.

    Args:
        on_select: Optional callable that receives the chosen label string
            (one of the info types, ``'SKIP'`` or ``'QUIT'``) on every click.
            When omitted, the label is stored in the module-level
            ``selected_label`` variable, which is the original behaviour.

    Returns:
        A ``widgets.VBox`` with one headed row per button group and a final
        control row (Skip, Quit & Save).
    """
    layout_btn = widgets.Layout(width='200px', height='60px', margin='4px')

    def set_label(label):
        """Default click action: remember the label in ``selected_label``."""
        global selected_label
        selected_label = label

    handler = set_label if on_select is None else on_select

    def make_button(description, button_style, label, tooltip=None):
        """Create one button that reports ``label`` to the handler on click."""
        extra = {} if tooltip is None else {'tooltip': tooltip}
        button = widgets.Button(
            description=description,
            layout=layout_btn,
            button_style=button_style,
            **extra
        )
        # Bind `label` as a default argument so each button keeps its own value.
        button.on_click(lambda _clicked, label=label: handler(label))
        return button

    # (row heading, button style, [(description, tooltip, label), ...])
    groups = [
        ("<b>Traffic Profile:</b>", 'info', [
            ('üè† Cable/DSL/ISP', 'Access Networks, Endkunden', 'Cable/DSL/ISP'),
            ('‚òÅÔ∏è Content', 'Inhalte, CDN, Hosting', 'Content'),
            ('üåê NSP', 'Network Service Provider / Transit', 'NSP'),
        ]),
        ("<b>Organisationstyp:</b>", 'primary', [
            ('üè¢ Enterprise', 'Unternehmen, Banken', 'Enterprise'),
            ('üéì Educational/Research', 'Unis, Forschungsnetze', 'Educational/Research'),
            ('üèõÔ∏è Government', 'Beh√∂rden, Milit√§r', 'Government'),
            ('ü§ù Non-Profit', 'NGOs, Associations', 'Non-Profit'),
        ]),
        ("<b>Infrastruktur & Services:</b>", 'warning', [
            ('üõ†Ô∏è Network Services', 'DNS, Security, VPN, Infrastruktur', 'Network Services'),
            ('‚öôÔ∏è Route Server', 'IXP RS (Verteilt Routen)', 'Route Server'),
            ('üì• Route Collector', 'Analyse (Sammelt Routen)', 'Route Collector'),
        ]),
    ]

    children = []
    for heading, button_style, specs in groups:
        buttons = [
            make_button(description, button_style, label, tooltip)
            for description, tooltip, label in specs
        ]
        children.extend([widgets.HTML(heading), widgets.HBox(buttons)])

    # Control row: these two buttons carry no tooltip.
    control_row = widgets.HBox([
        make_button('‚è≠Ô∏è Skip', '', 'SKIP'),
        make_button('üíæ Quit & Save', 'danger', 'QUIT'),
    ])
    children.extend([widgets.HTML("<hr>"), control_row])

    return widgets.VBox(children, layout=widgets.Layout(align_items='center'))

print("Alle 10 Labels integriert!")
display(create_label_buttons())

Alle 10 Labels integriert!


VBox(children=(HTML(value='<b>Traffic Profile:</b>'), HBox(children=(Button(button_style='info', description='‚Ä¶

## 4. Starte Labeling-Session

**Anleitung:**
1. Führe die Zelle unten aus
2. Schaue dir die ASN-Infos an
3. Klicke auf die Recherche-Links (öffnen in neuem Tab)
4. Klicke auf den Button mit dem korrekten Label (einer der 10 PeeringDB-Typen)
5. Klicke auf 'Skip' zum Überspringen oder auf 'Quit & Save' zum Speichern und Beenden

In [12]:
import time
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- Global state & setup ---
# Position inside `unlabeled_indices` of the candidate currently shown.
current_idx = 0

# Row positions of all candidates that have no entry in `labels` yet.
# Positions (not index labels) are collected because show_next_asn() and
# on_label_selected() read the rows back with `.iloc`; enumerate keeps this
# correct even if the frame's index is not 0..n-1.
unlabeled_indices = [
    position
    for position, asn in enumerate(goldstandard_candidates['asn'])
    if str(int(asn)) not in labels
]

if len(unlabeled_indices) == 0:
    print("‚úÖ Alle ASNs bereits gelabelt!")
else:
    print(f"üìã {len(unlabeled_indices)} ASNs verbleibend in dieser Session")

# --- Logik-Funktionen ---

def show_next_asn():
    """Display the candidate at ``current_idx`` together with the button grid.

    When every entry of ``unlabeled_indices`` has been handled, the output is
    cleared, a completion message is printed and the labels are saved.
    """
    remaining = len(unlabeled_indices)

    # Nothing left in this session: finish and persist.
    if current_idx >= remaining:
        clear_output()
        print("üéâ Labeling abgeschlossen! Alle Kandidaten bearbeitet.")
        save_labels()
        return

    candidate = goldstandard_candidates.iloc[unlabeled_indices[current_idx]]

    # wait=True defers clearing until new output arrives.
    clear_output(wait=True)

    # Progress header: overall labeled count and position within the session.
    done = len(labels)
    total = len(goldstandard_candidates)
    separator = '=' * 60
    print(separator)
    print(f"Fortschritt: {done}/{total} ({done/total*100:.1f}%) gelabelt")
    print(f"Session:   {current_idx+1}/{remaining}")
    print(separator + "\n")

    show_asn_info(candidate)
    display(button_box)

def on_label_selected(label):
    """Handle a button click: store the label, then advance to the next ASN.

    Args:
        label: One of the info-type strings, ``'SKIP'`` (advance without
            storing anything) or ``'QUIT'`` (save and stop without advancing).
    """
    global current_idx

    # Ignore clicks once the session has run past the last candidate.
    if current_idx >= len(unlabeled_indices):
        return

    candidate = goldstandard_candidates.iloc[unlabeled_indices[current_idx]]
    asn = int(candidate['asn'])

    if label == 'QUIT':
        clear_output()
        print("üõë Beende und speichere...")
        save_labels()
        print(f"‚úÖ Gespeichert: {len(labels)} Labels insgesamt.")
        return

    if label == 'SKIP':
        print(f"‚è≠Ô∏è AS{asn} √ºbersprungen")
    else:
        # Keep the PeeringDB value next to the manual label for later comparison.
        labels[str(asn)] = dict(
            label=label,
            peeringdb_original=candidate.get('info_type', 'N/A'),
            org_name=candidate.get('org_name', 'N/A'),
            country=candidate.get('country', 'N/A'),
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
        )
        print(f"‚úÖ AS{asn} ‚Üí {label}")

        # Auto-save whenever the total label count is a multiple of five.
        if not len(labels) % 5:
            save_labels()

    # Advance, pause briefly, then render the next candidate.
    current_idx += 1
    time.sleep(0.2)
    show_next_asn()

# --- Button definitions (the 10 info types plus controls) ---

layout_btn = widgets.Layout(width='200px', height='60px', margin='4px')

def _styled_button(description, button_style, **extra):
    """Create a button with the shared layout; ``extra`` is passed to widgets.Button."""
    return widgets.Button(
        description=description,
        layout=layout_btn,
        button_style=button_style,
        **extra
    )

# 1. Traffic profile (style 'info')
btn_isp = _styled_button('üè† Cable/DSL/ISP', 'info', tooltip='Access/Eyeball')
btn_content = _styled_button('‚òÅÔ∏è Content', 'info', tooltip='CDN/Hosting')
btn_nsp = _styled_button('üåê NSP', 'info', tooltip='Transit/Backbone')

# 2. Organisations (style 'primary')
btn_ent = _styled_button('üè¢ Enterprise', 'primary', tooltip='Firmen')
btn_edu = _styled_button('üéì Edu/Research', 'primary', tooltip='Unis')
btn_gov = _styled_button('üèõÔ∏è Government', 'primary', tooltip='Beh√∂rden')
btn_npo = _styled_button('ü§ù Non-Profit', 'primary', tooltip='NGOs')

# 3. Infrastructure & tech (style 'warning')
btn_net = _styled_button('üõ†Ô∏è Network Services', 'warning', tooltip='DNS/Sec/VPN')
btn_rs = _styled_button('‚öôÔ∏è Route Server', 'warning', tooltip='IXP RS')
btn_rc = _styled_button('üì• Route Collector', 'warning', tooltip='Analysis')

# 4. Controls (no tooltip)
btn_skip = _styled_button('‚è≠Ô∏è Skip', '')
btn_quit = _styled_button('üíæ Quit & Save', 'danger')

# --- Event binding ---
# Each button forwards its label string to on_label_selected; the label is
# bound as a default argument so every callback keeps its own value.
for _button, _choice in (
    (btn_isp, 'Cable/DSL/ISP'),
    (btn_content, 'Content'),
    (btn_nsp, 'NSP'),
    (btn_ent, 'Enterprise'),
    (btn_edu, 'Educational/Research'),
    (btn_gov, 'Government'),
    (btn_npo, 'Non-Profit'),
    (btn_net, 'Network Services'),
    (btn_rs, 'Route Server'),
    (btn_rc, 'Route Collector'),
    (btn_skip, 'SKIP'),
    (btn_quit, 'QUIT'),
):
    _button.on_click(lambda _clicked, _choice=_choice: on_label_selected(_choice))

# --- Layout ---
row1 = widgets.HBox([btn_isp, btn_content, btn_nsp])
row2 = widgets.HBox([btn_ent, btn_edu, btn_gov, btn_npo])
row3 = widgets.HBox([btn_net, btn_rs, btn_rc])
row4 = widgets.HBox([btn_skip, btn_quit])

button_box = widgets.VBox(
    [
        widgets.HTML("<b>Traffic Profile:</b>"), row1,
        widgets.HTML("<b>Organisation:</b>"), row2,
        widgets.HTML("<b>Infrastruktur:</b>"), row3,
        widgets.HTML("<hr>"), row4,
    ],
    layout=widgets.Layout(align_items='center'),
)

# --- Start the session (only if something is left to label) ---
if unlabeled_indices:
    show_next_asn()

VBox(children=(HTML(value='<b>Traffic Profile:</b>'), HBox(children=(Button(button_style='info', description='‚Ä¶

## 5. Analyse der gelabelten Daten

In [10]:
# Reload the labels from disk so the analysis reflects what was saved.
# If no label file has been written yet, the in-memory `labels` are used
# instead of failing with FileNotFoundError.
if LABEL_FILE.exists():
    with open(LABEL_FILE, 'r', encoding='utf-8') as f:
        labels = json.load(f)

# One row per labeled ASN; the stored fields become columns.
label_df = pd.DataFrame([
    {'asn': int(asn), **data}
    for asn, data in labels.items()
])

total_labeled = len(label_df)
print(f"Total gelabelt: {total_labeled}")

if total_labeled == 0:
    # An empty label set yields a frame without columns, so the column
    # lookups and the percentage below would raise; skip the analysis.
    print("No labels found - nothing to analyse yet.")
    differences = label_df
else:
    print(f"\nLabel-Verteilung (Manual):")
    print(label_df['label'].value_counts())

    # Agreement between the manual label and the PeeringDB value stored with it.
    matches = label_df['label'] == label_df['peeringdb_original']
    agreement = matches.sum()
    print(f"\n√úbereinstimmung mit PeeringDB:")
    print(f"√úbereinstimmung: {agreement}/{total_labeled} ({agreement/total_labeled*100:.1f}%)")

    # Rows where the manual label differs from PeeringDB.
    differences = label_df[~matches]
    print(f"\nAnzahl Korrekturen: {len(differences)}")

    if len(differences) > 0:
        print("\nBeispiele f√ºr Korrekturen:")
        for _, row in differences.head(10).iterrows():
            print(f"  AS{row['asn']}: {row['peeringdb_original']} ‚Üí {row['label']} ({row['org_name']})")

Total gelabelt: 123

Label-Verteilung (Manual):
label
Cable/DSL/ISP           18
Educational/Research    18
NSP                     15
Network Services        12
Government              12
Enterprise              12
Content                 11
Route Server            11
Non-Profit               7
Route Collector          7
Name: count, dtype: int64

√úbereinstimmung mit PeeringDB:
√úbereinstimmung: 100/123 (81.3%)

Anzahl Korrekturen: 23

Beispiele f√ºr Korrekturen:
  AS12212: NSP ‚Üí Network Services (N/A)
  AS53040: Route Server ‚Üí Cable/DSL/ISP (N/A)
  AS262533: Enterprise ‚Üí Educational/Research (N/A)
  AS398090: NSP ‚Üí Network Services (N/A)
  AS268122: Network Services ‚Üí Cable/DSL/ISP (N/A)
  AS329527: Route Server ‚Üí Network Services (N/A)
  AS132448: Enterprise ‚Üí Content (N/A)
  AS8674: Non-Profit ‚Üí NSP (N/A)
  AS18592: Non-Profit ‚Üí Educational/Research (N/A)
  AS15606: Non-Profit ‚Üí Network Services (N/A)


## 6. Export für Training/Testing

In [None]:
# Write the ASN/label pairs to CSV for easy import elsewhere.
export_path = 'goldstandard_test_set.csv'
label_df.loc[:, ['asn', 'label']].to_csv(export_path, index=False)
print(f"Goldstandard Test-Set gespeichert: {export_path}")

# Summary of what was exported.
class_counts = label_df['label'].value_counts()
print("\nFinal Statistics:")
print(f"Total ASNs: {len(label_df)}")
print("\nPer Class:")
print(class_counts)

Goldstandard Test-Set gespeichert: goldstandard_test_set.csv

Final Statistics:
Total ASNs: 50

Per Class:
label
Access                12
Content               11
Network Services       9
Education/Research     7
Enterprise             6
Transit                5
Name: count, dtype: int64
