# Manuelles Goldstandard-Labeling für ASN-Klassifikation

Erstelle ein sauberes Goldstandard-Test-Set durch manuelles Labeling mit Recherche.

In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from IPython.display import display
import webbrowser

## 1. Lade existierende Daten

In [None]:
# --- PeeringDB: parse the JSON dump and build the network table ---
filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')
dump = json.loads(filepath.read_text(encoding='utf-8'))

net_data = dump.get('net', {}).get('data')
net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
# Drop networks whose PeeringDB type is an empty string
has_type = net_df['info_type'].ne('')
net_df = net_df[has_type]

# --- ASRank: topology metrics per ASN ---
as_rank_df = pd.read_csv('../../preprocessing/data/asrank/as_rank_df.csv')

# Left join keeps every PeeringDB network, with or without ASRank data
merged_df = net_df.merge(as_rank_df, on='asn', how='left')

# PeeringDB network type -> class name used in this notebook
category_map = {
    "NSP": "Transit",
    "Content": "Content",
    "Cable/DSL/ISP": "Access",
    "Enterprise": "Enterprise",
    "Educational/Research": "Education/Research",
    "Non-Profit": "Enterprise",
    "Government": "Enterprise",
    "Route Server": "Network Services",
    "Route Collector": "Network Services",
    "Network Services": "Network Services",
}
# Types without a mapping entry keep their original text
original_types = merged_df["info_type"]
merged_df["info_type"] = original_types.map(category_map).fillna(original_types)

print(f"Total ASNs: {len(merged_df)}")
print("\nKlassenverteilung:")
print(merged_df['info_type'].value_counts())

Total ASNs: 23630

Klassenverteilung:
info_type
Access                11787
Transit                3982
Content                2486
Enterprise             2460
Network Services       1458
Education/Research     1457
Name: count, dtype: int64


## 2. Stratifiziertes Random Sampling

Wähle zufällig ASNs aus jeder Klasse (stratified sampling):

In [None]:
# Configuration
SAMPLES_PER_CLASS = 50  # number of ASNs drawn per class
RANDOM_SEED = 42

# Classes to stratify over
target_classes = ['Access', 'Content', 'Education/Research', 'Enterprise', 'Network Services', 'Transit']

# Only ASNs with ASRank data are eligible, so more information is available
has_rank = merged_df['rank'].notna()

sampled_asns = []
for cls in target_classes:
    class_df = merged_df[(merged_df['info_type'] == cls) & has_rank]

    # Seeded random draw, or the whole class when it has too few members
    too_small = len(class_df) < SAMPLES_PER_CLASS
    sample = class_df if too_small else class_df.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)

    sampled_asns.append(sample)
    print(f"{cls}: {len(sample)} ASNs")

# Combine the per-class samples, then shuffle so labeling order is random
goldstandard_candidates = (
    pd.concat(sampled_asns, ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

print(f"\nTotal Candidates: {len(goldstandard_candidates)}")

Access: 50 ASNs
Content: 50 ASNs
Education/Research: 50 ASNs
Enterprise: 50 ASNs
Network Services: 50 ASNs
Transit: 50 ASNs

Total Candidates: 300


## 3. Interaktives Labeling Tool

Zeige ASN-Informationen an und erlaube manuelles Labeling:

In [None]:
# Resume from the label file when it exists, otherwise start with no labels
LABEL_FILE = Path('goldstandard_labels.json')

if not LABEL_FILE.exists():
    labels = {}
    print("Starting fresh labeling session")
else:
    labels = json.loads(LABEL_FILE.read_text())
    print(f"Loaded {len(labels)} existing labels")

def save_labels():
    """Write the module-level ``labels`` dict to ``LABEL_FILE`` as indented JSON."""
    serialized = json.dumps(labels, indent=2)
    LABEL_FILE.write_text(serialized)
    print(f"Saved {len(labels)} labels to {LABEL_FILE}")

Loaded 0 existing labels


In [None]:
def show_asn_info(asn_row):
    """Render all available information about one ASN as an HTML card.

    The card lists textual fields, ASRank topology metrics and links to
    external lookup services, and is shown via ``display(HTML(...))``.

    Args:
        asn_row: Row-like object supporting ``[...]`` and ``.get`` that holds
            at least ``asn``; every other field is optional.

    Returns:
        None. The card is displayed as a side effect.
    """
    # Local import: the file's import cell does not provide an HTML escaper.
    from html import escape

    def is_missing(value):
        """Report whether *value* is None or a float NaN."""
        return value is None or (isinstance(value, float) and value != value)

    def text(key):
        """Return the escaped text of field *key*, or 'N/A' when missing."""
        value = asn_row.get(key)
        # Values come from external datasets, so they are escaped before
        # being interpolated into markup.
        return 'N/A' if is_missing(value) else escape(str(value))

    def metric(key):
        """Return field *key* as an int, or 'N/A' when missing."""
        value = asn_row.get(key)
        return int(value) if pd.notna(value) else 'N/A'

    asn = int(asn_row['asn'])

    website = asn_row.get('website')
    website_text = text('website')
    # Only http(s) URLs become clickable; anything else (missing value,
    # other schemes) falls back to an inert '#' target.
    is_web_url = (
        not is_missing(website)
        and str(website).lower().startswith(('http://', 'https://'))
    )
    website_href = website_text if is_web_url else '#'

    card_html = f"""
    <div style="border: 2px solid #4CAF50; padding: 20px; border-radius: 10px; background-color: #f9f9f9;">
        <h2 style="color: #4CAF50;">AS{asn}</h2>
        
        <h3>üìù Textuelle Informationen:</h3>
        <ul>
            <li><b>Organization Name:</b> {text('org_name')}</li>
            <li><b>Network Name:</b> {text('name')}</li>
            <li><b>Country:</b> {text('country')}</li>
            <li><b>Website:</b> <a href="{website_href}" target="_blank">{website_text}</a></li>
            <li><b>PeeringDB Label:</b> <span style="color: #FF6347;">{text('info_type')}</span></li>
        </ul>
        
        <h3>üìä Topologie-Metriken (ASRank):</h3>
        <ul>
            <li><b>Global Rank:</b> {metric('rank')}</li>
            <li><b>Customer Degree:</b> {metric('asnDegree_customer')}</li>
            <li><b>Peer Degree:</b> {metric('asnDegree_peer')}</li>
            <li><b>Provider Degree:</b> {metric('asnDegree_provider')}</li>
            <li><b>Customer Cone (ASNs):</b> {metric('cone_numberAsns')}</li>
            <li><b>Customer Cone (Prefixes):</b> {metric('cone_numberPrefixes')}</li>
        </ul>
        
        <h3>üîó Recherche-Links:</h3>
        <ul>
            <li><a href="https://bgp.he.net/AS{asn}" target="_blank">üåê Hurricane Electric BGP Toolkit</a></li>
            <li><a href="https://www.peeringdb.com/asn/{asn}" target="_blank">üì° PeeringDB</a></li>
            <li><a href="https://stat.ripe.net/AS{asn}" target="_blank">üìà RIPE Stat</a></li>
            <li><a href="https://asrank.caida.org/asns/{asn}" target="_blank">üìä CAIDA ASRank</a></li>
        </ul>
    </div>
    """
    
    display(HTML(card_html))

# Module-level state shared by the interactive labeling widgets; all start unset
current_asn = current_row = selected_label = None

def create_label_buttons():
    """Build the grid of buttons used to pick a label.

    Clicking a button stores that button's value in the module-level
    ``selected_label`` variable.

    Returns:
        A ``widgets.VBox`` holding three ``widgets.HBox`` rows: three class
        buttons, three more class buttons, and the Skip / Quit buttons.
    """
    # One entry per button:
    # (description, button_style, tooltip, value written to selected_label)
    button_specs = [
        ('üè† Access', 'info', 'ISP f√ºr Endkunden', 'Access'),
        ('‚òÅÔ∏è Content', 'info', 'CDN, Hosting, Cloud', 'Content'),
        ('üéì Education/Research', 'info', 'Universit√§ten, Forschung', 'Education/Research'),
        ('üè¢ Enterprise', 'info', 'Firmennetzwerke', 'Enterprise'),
        ('üîß Network Services', 'info', 'IX, DNS, etc.', 'Network Services'),
        ('üåê Transit', 'info', 'Wholesale IP Transit', 'Transit'),
        ('‚è≠Ô∏è Skip', 'warning', 'Unsicher/mehrdeutig', 'SKIP'),
        ('üíæ Quit & Save', 'danger', 'Speichern und beenden', 'QUIT'),
    ]

    def make_button(description, style, tooltip, label):
        """Create one button whose click handler records *label*."""
        button = widgets.Button(
            description=description,
            layout=widgets.Layout(width='200px', height='50px'),
            button_style=style,
            tooltip=tooltip,
        )

        # Defined per button so that each handler closes over its own label.
        def on_clicked(b):
            global selected_label
            selected_label = label

        button.on_click(on_clicked)
        return button

    buttons = [make_button(*spec) for spec in button_specs]

    # Lay the buttons out in rows of 3 / 3 / 2
    row1 = widgets.HBox(buttons[0:3])
    row2 = widgets.HBox(buttons[3:6])
    row3 = widgets.HBox(buttons[6:8])

    return widgets.VBox([row1, row2, row3])

# Announce readiness, then report how many candidates already carry a label
print("Labeling Tool bereit!")
labeled_count = len(labels)
candidate_count = len(goldstandard_candidates)
print(f"Fortschritt: {labeled_count}/{candidate_count} gelabelt")

Labeling Tool bereit!
Fortschritt: 0/300 gelabelt


## 4. Starte Labeling-Session

**Anleitung:**
1. Führe die Zelle unten aus
2. Schaue dir die ASN-Infos an
3. Klicke auf die Recherche-Links (öffnen in neuem Tab)
4. Klicke auf den Button mit dem korrekten Label
5. Klicke auf 'Skip' zum Überspringen oder auf 'Quit' zum Speichern und Beenden

In [None]:
# Interactive labeling loop driven by buttons
import time

# Global state
selected_label = None
current_idx = 0

# Row positions of the candidates that have no label yet. Positions (not index
# labels) are collected because the rows are later fetched with `.iloc`.
unlabeled_indices = [
    position
    for position, asn in enumerate(goldstandard_candidates['asn'])
    if str(int(asn)) not in labels
]

if not unlabeled_indices:
    print("‚úÖ Alle ASNs bereits gelabelt!")
else:
    print(f"üìã {len(unlabeled_indices)} ASNs zu labeln")

def show_next_asn():
    """Display the next unlabeled ASN together with the label buttons.

    Reads the module-level ``current_idx`` and ``unlabeled_indices``. Once
    ``current_idx`` has moved past the last entry, the output is cleared, a
    completion message is printed and the labels are saved.
    """
    if current_idx >= len(unlabeled_indices):
        clear_output()
        print("üéâ Labeling abgeschlossen!")
        save_labels()
        return

    row = goldstandard_candidates.iloc[unlabeled_indices[current_idx]]

    # wait=True postpones the clearing until replacement output is available
    clear_output(wait=True)

    # Progress header: overall labeled count plus position within this session
    progress = len(labels)
    total = len(goldstandard_candidates)
    separator = '=' * 60
    print(separator)
    print(f"Fortschritt: {progress}/{total} ({progress/total*100:.1f}%)")
    print(f"ASN {current_idx+1}/{len(unlabeled_indices)} dieser Session")
    print(f"{separator}\n")

    # ASN details followed by the shared button grid
    show_asn_info(row)
    display(button_box)

def on_label_selected(label):
    """Apply the clicked button's choice to the current ASN and advance.

    Args:
        label: A class name to store for the current ASN, ``'SKIP'`` to move
            on without storing anything, or ``'QUIT'`` to save and stop.
    """
    global current_idx

    # Quitting needs no ASN lookup, so it is handled before any indexing.
    if label == 'QUIT':
        clear_output()
        print("üõë Beende und speichere...")
        save_labels()
        print(f"‚úÖ Gespeichert: {len(labels)} Labels")
        return

    # Nothing is left to label; show_next_asn reports completion in this case.
    if current_idx >= len(unlabeled_indices):
        show_next_asn()
        return

    row = goldstandard_candidates.iloc[unlabeled_indices[current_idx]]
    asn = int(row['asn'])

    def field(key):
        """Return row[key], using 'N/A' for absent keys and NaN cells."""
        value = row.get(key, 'N/A')
        # NaN is the only float unequal to itself; json.dump would write it
        # as a bare NaN token, which is not valid JSON.
        if isinstance(value, float) and value != value:
            return 'N/A'
        return value

    if label == 'SKIP':
        print(f"‚è≠Ô∏è AS{asn} √ºbersprungen")
    else:
        labels[str(asn)] = {
            'label': label,
            'peeringdb_label': field('info_type'),
            'org_name': field('org_name'),
            'country': field('country')
        }
        print(f"‚úÖ AS{asn} ‚Üí {label}")

        # Auto-save whenever the label count reaches a multiple of ten
        if len(labels) % 10 == 0:
            save_labels()
            print("üíæ Auto-saved")

    current_idx += 1
    # Presumably a short pause so the line printed above stays visible before
    # show_next_asn clears the output.
    time.sleep(0.3)
    show_next_asn()

# Build the buttons once; every click is routed to on_label_selected
def _labeled_button(description, style, label):
    """Create a button whose click passes *label* to on_label_selected."""
    button = widgets.Button(description=description, button_style=style)
    button.on_click(lambda _b: on_label_selected(label))
    return button


btn_access = _labeled_button('üè† Access', 'primary', 'Access')
btn_content = _labeled_button('‚òÅÔ∏è Content', 'primary', 'Content')
btn_education = _labeled_button('üéì Education', 'primary', 'Education/Research')
btn_enterprise = _labeled_button('üè¢ Enterprise', 'primary', 'Enterprise')
btn_network = _labeled_button('üîß Network Svc', 'primary', 'Network Services')
btn_transit = _labeled_button('üåê Transit', 'primary', 'Transit')
btn_skip = _labeled_button('‚è≠Ô∏è Skip', 'warning', 'SKIP')
btn_quit = _labeled_button('üíæ Quit', 'danger', 'QUIT')

# Three rows: class buttons (3 + 3), then the Skip / Quit controls
row1 = widgets.HBox([btn_access, btn_content, btn_education])
row2 = widgets.HBox([btn_enterprise, btn_network, btn_transit])
row3 = widgets.HBox([btn_skip, btn_quit])
button_box = widgets.VBox([row1, row2, row3])

# Start the session only when something is left to label
if unlabeled_indices:
    show_next_asn()

Output()

## 5. Analyse der gelabelten Daten

In [None]:
# Load the labels; a missing file simply means nothing has been labeled yet
if LABEL_FILE.exists():
    with open(LABEL_FILE, 'r') as f:
        labels = json.load(f)
else:
    labels = {}

# Build the DataFrame. With zero labels the columns are declared explicitly so
# that the column lookups below still work instead of raising KeyError.
label_records = [
    {'asn': int(asn), **data}
    for asn, data in labels.items()
]
if label_records:
    label_df = pd.DataFrame(label_records)
else:
    label_df = pd.DataFrame(columns=['asn', 'label', 'peeringdb_label', 'org_name', 'country'])

print(f"Total gelabelt: {len(label_df)}")
print("\nLabel-Verteilung (Manual):")
print(label_df['label'].value_counts())

# Agreement between the manual label and the mapped PeeringDB label
matches = label_df['label'] == label_df['peeringdb_label']
agreement = matches.sum()
# Guard the percentage against division by zero when nothing is labeled
agreement_pct = agreement / len(label_df) * 100 if len(label_df) else 0.0
print("\n√úbereinstimmung mit PeeringDB:")
print(f"√úbereinstimmung: {agreement}/{len(label_df)} ({agreement_pct:.1f}%)")

# Rows where the manual label differs from PeeringDB
differences = label_df[~matches]
print(f"\nAnzahl Korrekturen: {len(differences)}")

if len(differences) > 0:
    print("\nBeispiele f√ºr Korrekturen:")
    for _, row in differences.head(10).iterrows():
        print(f"  AS{row['asn']}: {row['peeringdb_label']} ‚Üí {row['label']} ({row['org_name']})")

Total gelabelt: 0

Label-Verteilung (Manual):


KeyError: 'label'

## 6. Export f√ºr Training/Testing

In [None]:
# Save ASN/label pairs as CSV for easy import
if label_df.empty:
    # An empty frame may lack the 'label' column, and exporting it would
    # overwrite any earlier CSV with nothing, so the export is skipped.
    print("Keine Labels vorhanden, nichts zu exportieren.")
else:
    label_df[['asn', 'label']].to_csv('goldstandard_test_set.csv', index=False)
    print("Goldstandard Test-Set gespeichert: goldstandard_test_set.csv")

    # Summary statistics
    print("\nFinal Statistics:")
    print(f"Total ASNs: {len(label_df)}")
    print("\nPer Class:")
    print(label_df['label'].value_counts())