# Load the Data
## Peeringdb

In [1]:
import json
from pathlib import Path
import pandas as pd

filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


# Caida AS Names

In [2]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


## Join both

In [3]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


## Load AS Rank

In [4]:
as_rank_df = pd.read_csv('/workspaces/pytorch-gpu-2/preprocessing/data/asrank/as_rank_df.csv')
as_rank_df.head()

Unnamed: 0,asn,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,3356,1,6613,6545,68,0,53986,873410,3468642119
1,1299,2,2567,2509,58,0,41193,776707,3219679484
2,174,3,6723,6626,97,0,38887,730166,3034352967
3,3257,4,1853,1816,37,0,36040,612491,2791999209
4,2914,5,1541,1483,58,0,25179,576134,2918763154


## Join both

In [5]:
peering_df_joined_with_asrank = pd.merge(
    peering_df_joined,
    as_rank_df,
    left_on='asn',
    right_on='asn',
    how='left'
)
peering_df_joined_with_asrank['rank'] = peering_df_joined_with_asrank['rank'].fillna(peering_df_joined_with_asrank['rank'].median())
peering_df_joined_with_asrank['asnDegree_total'] = peering_df_joined_with_asrank['asnDegree_total'].fillna(peering_df_joined_with_asrank['asnDegree_total'].median())
peering_df_joined_with_asrank['asnDegree_customer'] = peering_df_joined_with_asrank['asnDegree_customer'].fillna(peering_df_joined_with_asrank['asnDegree_customer'].median())
peering_df_joined_with_asrank['asnDegree_peer'] = peering_df_joined_with_asrank['asnDegree_peer'].fillna(peering_df_joined_with_asrank['asnDegree_peer'].median())
peering_df_joined_with_asrank['asnDegree_provider'] = peering_df_joined_with_asrank['asnDegree_provider'].fillna(peering_df_joined_with_asrank['asnDegree_provider'].median())
peering_df_joined_with_asrank['cone_numberAsns'] = peering_df_joined_with_asrank['cone_numberAsns'].fillna(peering_df_joined_with_asrank['cone_numberAsns'].median())
peering_df_joined_with_asrank['cone_numberPrefixes'] = peering_df_joined_with_asrank['cone_numberPrefixes'].fillna(peering_df_joined_with_asrank['cone_numberPrefixes'].median())
peering_df_joined_with_asrank['cone_numberAddresses'] = peering_df_joined_with_asrank['cone_numberAddresses'].fillna(peering_df_joined_with_asrank['cone_numberAddresses'].median())

peering_df_joined_with_asrank.head()

Unnamed: 0,asn,org_name,country,source,info_type,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,4436,"GTT Americas, LLC",US,ARIN,NSP,78320.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,20940,Akamai International B.V.,NL,RIPE,Content,1894.0,485.0,14.0,366.0,105.0,15.0,8945.0,14612752.0
2,31800,DALnet,US,ARIN,Non-Profit,47745.0,78.0,0.0,74.0,4.0,1.0,2.0,512.0
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP,81.0,1273.0,166.0,1101.0,6.0,733.0,22131.0,42899794.0
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP,110.0,499.0,489.0,8.0,2.0,505.0,11982.0,31992440.0


## Load domains

In [6]:
ipinfo_df = pd.read_csv('../../preprocessing/data/ipinfo_domains/ipinfo_domains.csv')
ipinfo_df.head()

Unnamed: 0,ASN,domains
0,16509,139276485
1,13335,63477595
2,52925,32915972
3,396982,24543491
4,47846,17833760


## Join both

In [7]:
peering_df_joined_with_asrank_and_domains = pd.merge(
    peering_df_joined_with_asrank,
    ipinfo_df,
    left_on='asn',
    right_on='ASN',
    how='left'
)
peering_df_joined_with_asrank_and_domains['domains'] = peering_df_joined_with_asrank_and_domains['domains'].fillna(peering_df_joined_with_asrank_and_domains['domains'].median(), inplace=True)
peering_df_joined_with_asrank_and_domains.head()

Unnamed: 0,asn,org_name,country,source,info_type,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses,ASN,domains
0,4436,"GTT Americas, LLC",US,ARIN,NSP,78320.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,
1,20940,Akamai International B.V.,NL,RIPE,Content,1894.0,485.0,14.0,366.0,105.0,15.0,8945.0,14612752.0,20940.0,
2,31800,DALnet,US,ARIN,Non-Profit,47745.0,78.0,0.0,74.0,4.0,1.0,2.0,512.0,,
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP,81.0,1273.0,166.0,1101.0,6.0,733.0,22131.0,42899794.0,3303.0,
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP,110.0,499.0,489.0,8.0,2.0,505.0,11982.0,31992440.0,22773.0,


## Load geolocations

In [1]:
import clickhouse_connect

client = clickhouse_connect.get_client(
    host='localhost', port=8123,
    username='default', password=''
)

query = """
/* Gewichtetes Zentrum und umfangreiche Distanz-Statistiken (Kilometer) */
WITH base AS (
    SELECT
        asn,
        latitude,
        longitude,
        country,
        toUInt64(ip_end - ip_start + 1) AS w
    FROM ip_location_asn
    WHERE ip_version = 4
      AND origin = 'ipinfo'
),
country_entropy AS (
    SELECT
        asn,
        -- Gewichte je Land
        groupArray(wc)           AS arr_w,
        sum(wc)                  AS W,
        -- Shannon-Entropie (Bits)
        -arraySum(x -> (x / W) * log2(x / W), arr_w) AS country_entropy_bits
    FROM (
        SELECT asn, country, sum(w) AS wc
        FROM base
        GROUP BY asn, country
    )
    GROUP BY asn
),
vec AS (
    SELECT
        asn,
        sum(w * cos(radians(latitude)) * cos(radians(longitude))) AS X,
        sum(w * cos(radians(latitude)) * sin(radians(longitude))) AS Y,
        sum(w * sin(radians(latitude)))                           AS Z,
        sum(w)                                                    AS W
    FROM base
    GROUP BY asn
),
center AS (
    SELECT
        asn,
        degrees(atan2(Y, X))                       AS center_lon,
        degrees(atan2(Z, sqrt(X * X + Y * Y)))     AS center_lat
    FROM vec
),
joined AS (
    SELECT
        b.asn,
        b.w,
        b.country,
        c.center_lat,
        c.center_lon,
        greatCircleDistance(b.longitude, b.latitude, c.center_lon, c.center_lat) / 1000 AS d_km
    FROM base AS b
    INNER JOIN center AS c USING (asn)
),
stats AS (
    SELECT
        asn,
        any(center_lat) AS center_lat,
        any(center_lon) AS center_lon,
        sum(w)  AS total_weight,
        avgWeighted(d_km,       w) AS mean_km,
        avgWeighted(d_km * d_km, w) AS mean_sq_km2,
        quantileExactWeighted(0.25)(d_km, w) AS p25_km,
        quantileExactWeighted(0.50)(d_km, w) AS p50_km,
        quantileExactWeighted(0.75)(d_km, w) AS p75_km,
        quantileExactWeighted(0.90)(d_km, w) AS p90_km,
        quantileExactWeighted(0.95)(d_km, w) AS p95_km,
        quantileExactWeighted(0.99)(d_km, w) AS p99_km,
        min(d_km) AS min_km,
        max(d_km) AS max_km,
        sumIf(w, d_km <=  100) / sum(w) AS share_le_100km,
        sumIf(w, d_km <=  500) / sum(w) AS share_le_500km,
        sumIf(w, d_km <= 1000) / sum(w) AS share_le_1000km,
    FROM joined
    GROUP BY asn
),
geo_meta AS (
    SELECT
        asn,
        uniqExact((latitude, longitude)) AS unique_points,
        uniqExact(country)               AS country_count
    FROM base
    GROUP BY asn
)
SELECT
    s.asn,
    round(s.center_lat, 5) AS center_lat,
    round(s.center_lon, 5) AS center_lon,
    s.total_weight,
    gm.unique_points,
    gm.country_count,
    round(s.mean_km, 2)                         AS mean_km,
    round(greatest(s.mean_sq_km2 - s.mean_km * s.mean_km, 0), 2) AS var_km2,
    round(sqrt(greatest(s.mean_sq_km2 - s.mean_km * s.mean_km, 0)), 2) AS std_km,
    round(s.p75_km - s.p25_km, 2)               AS iqr_km,
    round(s.p25_km, 2)                          AS p25_km,
    round(s.p50_km, 2)                          AS p50_km,
    round(s.p75_km, 2)                          AS p75_km,
    round(s.p90_km, 2)                          AS p90_km,
    round(s.p95_km, 2)                          AS p95_km,
    round(s.p99_km, 2)                          AS p99_km,
    round(s.min_km, 2)                          AS min_km,
    round(s.max_km, 2)                          AS max_km,
    round(s.share_le_100km * 100, 2)            AS pct_ips_le_100km,
    round(s.share_le_500km * 100, 2)            AS pct_ips_le_500km,
    round(s.share_le_1000km * 100, 2)           AS pct_ips_le_1000km,
    cm.country_entropy_bits,
    round(cm.country_entropy_bits / nullIf(log2(gm.country_count), 0), 4) AS country_entropy_norm
FROM stats AS s
LEFT JOIN geo_meta AS gm USING (asn)
LEFT JOIN country_entropy cm USING (asn)
ORDER BY asn
"""

ch_df = client.query_df(query)
ch_df.head()


ModuleNotFoundError: No module named 'clickhouse_connect'

## Join both

In [10]:
peering_df_joined_with_asrank_and_domains_and_geoloc = pd.merge(
    peering_df_joined_with_asrank_and_domains,
    ch_df,
    left_on='asn',
    right_on='asn',
    how='left'
)
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'] = peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()
peering_df_joined_with_asrank_and_domains_and_geoloc.columns.tolist()

['asn',
 'org_name',
 'country',
 'source',
 'info_type',
 'rank',
 'asnDegree_total',
 'asnDegree_customer',
 'asnDegree_peer',
 'asnDegree_provider',
 'cone_numberAsns',
 'cone_numberPrefixes',
 'cone_numberAddresses',
 'ASN',
 'domains',
 'center_lat',
 'center_lon',
 'total_weight',
 'unique_points',
 'country_count',
 'mean_km',
 'var_km2',
 'std_km',
 'iqr_km',
 'p25_km',
 'p50_km',
 'p75_km',
 'p90_km',
 'p95_km',
 'p99_km',
 'min_km',
 'max_km',
 'pct_ips_le_100km',
 'pct_ips_le_500km',
 'pct_ips_le_1000km']

# Classification

## TF-IDF

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== Daten ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()
df["org_name"] = df["org_name"].fillna("unknown").str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)]  # sehr kleine Klassen raus (optional)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["org_name"], df["info_type"], test_size=0.13, random_state=42, stratify=df["info_type"]
)

# Gemeinsamer Vectorizer (fit nur auf Train!)
vec = TfidfVectorizer(analyzer="char", ngram_range=(1,6),
                      lowercase=True, min_df=1, sublinear_tf=True)

# ==== 1) SVM + Kalibrierung ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

svm_pipe = Pipeline([
    ("tfidf", vec),
    ("svm_cal", svm_cal)
])

svm_pipe.fit(X_train_text, y_train)
y_pred_svm = svm_pipe.predict(X_test_text)
print("\n=== SVM (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm))



=== SVM (calibrated) ===
Accuracy: 0.5964749536178108
Macro-F1: 0.36873452553342556
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.63      0.92      0.75      1633
             Content       0.48      0.32      0.39       330
Educational/Research       0.64      0.49      0.55       199
          Enterprise       0.51      0.20      0.29       233
          Government       0.54      0.39      0.45        18
                 NSP       0.39      0.19      0.26       542
    Network Services       0.33      0.04      0.07       110
          Non-Profit       0.72      0.40      0.52        84
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.59      0.32      0.42        81

            accuracy                           0.60      3234
           macro avg       0.48      0.33      0.37      3234
        weighted avg       0.56      0.60      0.54      3234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Combine

In [13]:
# === TF-IDF (org_name) + Numerik -> Calibrated LinearSVC ===
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ---------- Daten vorbereiten ----------
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Text normalisieren
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()

# Sehr kleine Klassen optional rausfiltern
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Zielspalte
y = df["info_type"].astype(str)

# Numerische Spalten bestimmen (alles außer Text/Kat/Label)
ignore = {"org_name", "info_type", "country", "source"}
num_candidates = [c for c in df.columns if c not in ignore]

# Nur numerisch verwertbare Spalten (coerce -> float)
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce")
num_df = num_df.replace([np.inf, -np.inf], np.nan)

num_cols = num_df.columns.tolist()

# Feature-DataFrame für Pipeline
X = pd.concat([df[["org_name"]].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ---------- Train/Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42, stratify=y
)

# ---------- Preprocessing ----------
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

num_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler())
])

pre = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                                 lowercase=True, sublinear_tf=True, min_df=1),
         "org_name"),
        ("num",  num_pipe, num_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

# ---------- SVM + Kalibrierung ----------
base_svm = LinearSVC(C=0.35, class_weight="balanced", random_state=42)
cal = CalibratedClassifierCV(base_svm, cv=3, method="sigmoid")  # robust

pipe = Pipeline([
    ("pre", pre),
    ("svm", cal)
])

# ---------- Trainieren ----------
pipe.fit(X_train, y_train)

# ---------- Evaluieren ----------
y_pred = pipe.predict(X_test)
print("\n=== TF-IDF(Text) + Numerik -> Calibrated LinearSVC ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))

# ---------- (Optional) Probas für Ensembling ----------
# P_test = pipe.predict_proba(X_test)   # shape [N, n_classes]
# classes_ = pipe.named_steps["svm"].classes_.tolist()





=== TF-IDF(Text) + Numerik -> Calibrated LinearSVC ===
Accuracy: 0.6221397649969078
Macro-F1: 0.4139509901721804
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.66      0.92      0.77      1633
             Content       0.49      0.35      0.41       330
Educational/Research       0.64      0.56      0.60       199
          Enterprise       0.51      0.25      0.34       233
          Government       0.64      0.50      0.56        18
                 NSP       0.48      0.25      0.33       542
    Network Services       0.40      0.04      0.07       110
          Non-Profit       0.74      0.40      0.52        84
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.60      0.51      0.55        81

            accuracy                           0.62      3234
           macro avg       0.52      0.38      0.41      3234
        weighted avg       0.59      0.62      0.58      3234



In [12]:
# ===============================================
# TF-IDF (org_name) + Numerik + country
# -> LinearSVC (unkalibriert)  und  SGD(modified_huber, mit Probas)
# ===============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import FunctionTransformer

def amplify_numeric(X, factor=3.0):
    return X * factor

# ==== 0) Quelle laden (passe ggf. den DF-Namen an) ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# ==== 1) Vorverarbeitung & Label-Filter ====
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Ziel
y = df["info_type"].astype(str)

# ==== 2) Feature-Spalten bestimmen ====
# Wir nehmen country dazu (starkes Signal) und alle numerisch konvertierbaren Spalten
ignore = {"org_name", "info_type", "source"}  # 'country' NICHT ignorieren
all_cols = df.columns.tolist()
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []

# numerische Kandidaten = alles außer Text/Label/Source/country
num_candidates = [c for c in all_cols if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

# Endgültiges X-DF
X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ==== 3) Train/Test Split ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ==== 4) Preprocessor bauen ====
# Numerik: Impute -> log1p -> MaxAbsScaler (gut in Kombi mit TF-IDF, bleibt sparse-freundlich)
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    # Median-Imputation passiert vorher; hier nur Sicherung
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler()),
    ("boost", FunctionTransformer(lambda X: X * 3.0, validate=False)),  # Gewichtung!
])

transformers = [
    ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                             lowercase=True, sublinear_tf=True, min_df=1), text_col),
]
if cat_cols:
    transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols))
if num_cols:
    transformers.append(("num", num_pipe, num_cols))

pre = ColumnTransformer(transformers, remainder="drop", sparse_threshold=0.3)

# ==== 5) MODELL A: LinearSVC (unkalibriert) ====
svm_linear = LinearSVC(
    C=0.35,
    class_weight="balanced",
    max_iter=5000,          # Konvergenz sichern
    random_state=42
)
pipe_svm = Pipeline([("pre", pre), ("clf", svm_linear)])
pipe_svm.fit(X_train, y_train)

y_pred_svm = pipe_svm.predict(X_test)
print("\n=== LinearSVC (TF-IDF + Numerik + country) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm, zero_division=0))

# ==== 6) MODELL B: SGDClassifier (modified_huber) -> liefert predict_proba ====
sgd = SGDClassifier(
    loss="modified_huber",   # SVM-ähnlich, aber mit Probas
    alpha=1e-4,
    class_weight="balanced",
    max_iter=5000,
    tol=1e-3,
    random_state=42
)
pipe_sgd = Pipeline([("pre", pre), ("clf", sgd)])
pipe_sgd.fit(X_train, y_train)

y_pred_sgd = pipe_sgd.predict(X_test)
print("\n=== SGD(modified_huber) (TF-IDF + Numerik + country) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Macro-F1:", f1_score(y_test, y_pred_sgd, average="macro"))
print(classification_report(y_test, y_pred_sgd, zero_division=0))

# Optional: Probas (für Ensembling/Routing)
# P_test = pipe_sgd.predict_proba(X_test)
# classes_ = pipe_sgd.named_steps["clf"].classes_.tolist()

# ==== 7) Mini-Tuning (optional, schnell) ====
# Wenn du noch 2-3 Punkte rausholen willst, probier leicht andere C/alpha:
#   - LinearSVC: C in [0.25, 0.35, 0.5, 0.75, 1.0]
#   - SGD alpha in [5e-5, 1e-4, 2e-4]
# Oder n-gram Range auf (2,7) testen.


: 

: 

In [None]:
# --- schnelle & stabile Pipeline ---
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Labels filtern
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)
y = df["info_type"].astype(str)

# Spalten
ignore = {"org_name", "info_type", "source"}
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []
num_candidates = [c for c in df.columns if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    A = np.clip(A, 0.0, None)
    return np.log1p(A)

# Numerik (klein → darf dicht sein)
NUM_BOOST = 8.0
num_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", StandardScaler()),
    ("boost", FunctionTransformer(lambda X: X * NUM_BOOST, validate=False)),
])

# Text: TF-IDF → SVD (SVD reduziert massiv! DANN ist Dichte ok)
text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char", ngram_range=(2, 6),
        lowercase=True, sublinear_tf=True,
        min_df=3,          # etwas strenger für Speed
        dtype=np.float32
    )),
    ("svd", TruncatedSVD(n_components=350, random_state=42, n_iter=5)),
])

# Country: OHE (wenig Kardinalität → dicht ok)
try:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

transformers = [("text", text_pipe, text_col)]
if cat_cols: transformers.append(("cat", cat_encoder, cat_cols))
if num_cols: transformers.append(("num", num_pipe, num_cols))

# WICHTIG: KEIN sparse_threshold=0.0 -> nichts wird unnötig verdichtet
pre = ColumnTransformer(transformers, remainder="drop")

clf = LogisticRegression(
    solver="saga",
    penalty="l2",
    C=2.0,
    max_iter=4000,               # dank SVD meist ausreichend
    class_weight="balanced",
    n_jobs=-1,
    multi_class="multinomial",
)

pipe = Pipeline([("pre", pre), ("clf", clf)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print("\n=== Fast LogisticRegression (TFIDF->SVD + OHE + Num[boost]) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# === Nur Numerik -> HistGradientBoostingClassifier (mit Log1p für schiefe Spalten) ===
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

# ---------- 1) Daten vorbereiten ----------
# Nimm den gleichen DF wie in deinen letzten Läufen:
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Optional: sehr kleine Klassen entfernen (wie zuvor)
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Ziel
y = df["info_type"].astype(str)

# Kandidaten für Numerik: alles außer offensichtlichem Text/Kat/Label
ignore = {"org_name", "info_type", "country", "source"}
num_candidates = [c for c in df.columns if c not in ignore]
num_candidates.remove("asn")  # ASN ist numerisch, aber keine sinnvolle Zahl

# In numerisch zwingen; Unendlichkeiten -> NaN
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)

# Falls Spalten komplett NaN sind, droppen
num_df = num_df.drop(columns=num_df.columns[num_df.isna().all()], errors="ignore")

# Endgültige Numerik-Spaltenliste
num_cols = num_df.columns.tolist()

# Feature-DF
X = num_df.copy()

# ---------- 2) Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ---------- 3) Spalten in log/linear trennen ----------
# Log1p nur für Spalten, die (nach Imputation) nichtnegativ sind
# Wir schätzen die Nichtnegativität vorab grob über die Trainingsdaten (NaN -> 0 für Prüfung)
_train_tmp = X_train.copy()
_train_tmp = _train_tmp.fillna(0)
log_cols = [c for c in _train_tmp.columns if _train_tmp[c].min() >= 0.0]
lin_cols = [c for c in _train_tmp.columns if c not in log_cols]

def log1p_array(A):
    A = np.asarray(A, dtype=float)
    # A kommt nach Imputation; zur Sicherheit clippen
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

# ---------- 4) Preprocessing ----------
num_log_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("sc",    StandardScaler(with_mean=False)),
])

num_lin_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc",  StandardScaler(with_mean=False)),
])

transformers = []
if log_cols:
    transformers.append(("num_log", num_log_pipe, log_cols))
if lin_cols:
    transformers.append(("num_lin", num_lin_pipe, lin_cols))

pre = ColumnTransformer(transformers, remainder="drop")

# ---------- 5) Modell ----------
hgb = HistGradientBoostingClassifier(
    learning_rate=0.06,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42
)

pipe = Pipeline([
    ("pre", pre),
    ("clf", hgb)
])

# ---------- 6) Train ----------
pipe.fit(X_train, y_train)

# ---------- 7) Eval ----------
y_pred = pipe.predict(X_test)
print("\n=== Nur Numerik -> HistGradientBoosting ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))

# ---------- 8) (Optional) Feature-Importance per Permutation ----------
# Hinweis: kann etwas dauern; zeigt dir, welche Numerik-Spalten wirklich tragen.
from sklearn.inspection import permutation_importance

res = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42)
# ColumnTransformer erzeugt eine kombinierte Featureliste:
feature_names = []
if log_cols:
    feature_names += [f"[log] {c}" for c in log_cols]
if lin_cols:
    feature_names += [f"[lin] {c}" for c in lin_cols]

imp = pd.Series(res.importances_mean, index=feature_names).sort_values(ascending=False)
print("\nTop-20 Numerik-Features (Permutation Importance):")
print(imp.head(20))



=== Nur Numerik -> HistGradientBoosting ===
Accuracy: 0.44712430426716143
Macro-F1: 0.26504239987683176
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.81      0.55      0.66      1633
             Content       0.41      0.39      0.40       330
Educational/Research       0.30      0.25      0.27       199
          Enterprise       0.23      0.25      0.24       233
          Government       0.05      0.28      0.08        18
                 NSP       0.40      0.32      0.36       542
    Network Services       0.10      0.18      0.13       110
          Non-Profit       0.18      0.33      0.23        84
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.16      0.93      0.28        81

            accuracy                           0.45      3234
           macro avg       0.26      0.35      0.27      3234
        weighted avg       0.57      0.45      0.49      3234



KeyboardInterrupt: 

In [None]:
# ===============================================
# TF-IDF (org_name) + Numerik + country
# -> LinearSVC (unkalibriert)  und  SGD(modified_huber, mit Probas)
# == Numerik gezielt boosten + Wirkung prüfen
# ===============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== 0) Quelle laden ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# ==== 1) Vorverarbeitung & Label-Filter ====
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

y = df["info_type"].astype(str)

# ==== 2) Feature-Spalten ====
ignore = {"org_name", "info_type", "source"}  # 'country' NICHT ignorieren
all_cols = df.columns.tolist()
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []

num_candidates = [c for c in all_cols if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ==== 3) Split ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ==== 4) Preprocessor ====
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

# >>> EINZIGER KNOPF: Boost-Faktor für Numerik <<<
NUM_BOOST = 8.0   # typ. 5–15 probieren; 8 ist guter Start

num_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler()),                             # sparse-freundlich
    ("boost", FunctionTransformer(lambda X: X * NUM_BOOST, # hier boosten!
                                  validate=False)),
])

# OneHotEncoder explizit SPARSE lassen (Performance, kein Densify)
try:
    cat_enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    cat_enc = OneHotEncoder(handle_unknown="ignore", sparse=True)

transformers = [
    ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                             lowercase=True, sublinear_tf=True, min_df=1,
                             dtype=np.float32), text_col),
]
if cat_cols:
    transformers.append(("cat", cat_enc, cat_cols))
if num_cols:
    transformers.append(("num", num_pipe, num_cols))

# Wichtig: NICHT künstlich verdichten
pre = ColumnTransformer(transformers, remainder="drop")  # kein sparse_threshold=0.0

# ==== 5) MODELL A: LinearSVC ====
svm_linear = LinearSVC(
    C=0.5,                      # etwas höher, weil wir Numerik pushen
    class_weight="balanced",
    max_iter=5000,
    random_state=42
)
pipe_svm = Pipeline([("pre", pre), ("clf", svm_linear)])
pipe_svm.fit(X_train, y_train)

y_pred_svm = pipe_svm.predict(X_test)
print("\n=== LinearSVC (TF-IDF + Numerik*{:.1f} + country) ===".format(NUM_BOOST))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm, zero_division=0))

# ==== 6) MODELL B: SGD (modified_huber) ====
sgd = SGDClassifier(
    loss="modified_huber",
    alpha=1e-4,                 # ggf. auf 5e-5 senken, wenn du noch mehr Numerik willst
    class_weight="balanced",
    max_iter=5000,
    tol=1e-3,
    random_state=42
)
pipe_sgd = Pipeline([("pre", pre), ("clf", sgd)])
pipe_sgd.fit(X_train, y_train)

y_pred_sgd = pipe_sgd.predict(X_test)
print("\n=== SGD(modified_huber) (TF-IDF + Numerik*{:.1f} + country) ===".format(NUM_BOOST))
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Macro-F1:", f1_score(y_test, y_pred_sgd, average="macro"))
print(classification_report(y_test, y_pred_sgd, zero_division=0))

# ==== 7) Kurzer Wirkungs-Check: tragen die Numerik-Koeffizienten jetzt mehr? ====
def block_norm_ratio(pipe):
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    names = pre.get_feature_names_out()

    # Indexe der Numerik-Features
    num_idx = np.array([i for i, n in enumerate(names) if n.startswith("num__")])
    if num_idx.size == 0 or not hasattr(clf, "coef_"):
        return np.nan

    # multi-class: L2-Norm pro Klasse mitteln
    W = clf.coef_          # shape [K, D] (bei binary ggf. [1, D])
    num_norm = np.mean(np.linalg.norm(W[:, num_idx], axis=1))
    all_norm = np.mean(np.linalg.norm(W, axis=1))
    return float(num_norm / (all_norm + 1e-12))

print("\n[Diagnose] Anteil der Numerik-Koeffizienten (L2-Norm) an Gesamt:")
print("LinearSVC  num/all =", block_norm_ratio(pipe_svm))
print("SGD        num/all =", block_norm_ratio(pipe_sgd))


In [None]:
peering_df_joined_with_asrank_and_domains_and_geoloc.columns

Index(['asn', 'org_name', 'country', 'source', 'info_type', 'rank',
       'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer',
       'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
       'cone_numberAddresses', 'ASN', 'domains', 'center_lat', 'center_lon',
       'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2',
       'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
       'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km',
       'pct_ips_le_1000km'],
      dtype='object')

In [None]:
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()


0                                        gtt americas, llc
1                                akamai international b.v.
2                                                   dalnet
3                                    swisscom (schweiz) ag
4                                  cox communications inc.
                               ...                        
24870    max technology & support services private limited
24871                                                  NaN
24872                                                  NaN
24873                                      bjoern schleyer
24874                                         kiwi telecom
Name: org_name, Length: 24875, dtype: object

In [None]:
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'] = peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler
import numpy as np
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report


df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()
X = df[['org_name', 'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
    'cone_numberAddresses', 'domains',  'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
    'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']]
y = df['info_type']
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)
text_col = 'org_name'
num_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
    'cone_numberAddresses', 'domains',  'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
    'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']

pct_cols = ['pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']

# Distanz-/Streumaße (km): oft stark rechtsschief
km_cols = ['mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km', 'p99_km',
           'min_km', 'max_km']

# Zähl-/Skalenwerte: Ränge, Degrees, Cones, Domains, Gewichte, Länder, Punkte
count_like_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
                   'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
                   'domains', 'total_weight', 'unique_points', 'country_count']

# Falls du ASN wirklich als Zahl nutzen willst: packe ihn hier rein
# count_like_cols += ['ASN']

# --- Teil-Pipelines ---

# 1) Count/Skalen: impute -> log1p -> robust scale
count_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda X: np.log1p(np.clip(X, a_min=0, a_max=None)), feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 2) km-Metriken: impute -> log1p -> robust scale
km_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda X: np.log1p(np.clip(X, a_min=0, a_max=None)), feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 3) Prozente: impute -> clip [0,1] -> standard scale
pct_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("clip01", FunctionTransformer(lambda X: np.clip(X, 0.0, 100.0), feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

# --- Text-Pipeline (deine) ---
from sklearn.feature_extraction.text import TfidfVectorizer
text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(2,5),
        min_df=1,
        lowercase=True
    ))
])

text_col = "org_name"

# --- ColumnTransformer zusammenbauen ---
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop",
    transformer_weights={
        "text": 1.0,
        "num_count": 10.0,
        "num_km": 8.0,
        "num_pct": 2.0,
    },
    sparse_threshold=0.3
)
from xgboost import XGBClassifier
clf = Pipeline([
    ("prep", preprocessor),
    ("model", XGBClassifier(
        tree_method="gpu_hist",           # "gpu_hist" falls wirklich GPU
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        reg_alpha=0.5,
        objective="multi:softprob",
        eval_metric="mlogloss",
        num_class=n_classes,
        random_state=42
    ))
])


# Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)
class_counts = pd.Series(y_enc).value_counts().sort_index().values
alpha = 0.5                      # 0.5–1.0 probieren
class_weights = (1.0 / (class_counts ** alpha))
sample_weight = class_weights[y_enc]

clf.fit(X_train, y_train, model__sample_weight=sample_weight[X_train.index])

# --- 3) Threshold-Tuning auf Validation ------------------------------------
P_val = clf.predict_proba(X_train)  # shape: (n_val, n_classes)
n_classes = P_val.shape[1]

from sklearn.metrics import f1_score
import numpy as np

def predict_with_class_thresholds(P, th_vec):
    th_safe = np.clip(th_vec, 1e-6, 1.0)   # Schutz gegen 0
    scores = P / th_safe                   # kleiner th_j begünstigt Klasse j
    return scores.argmax(axis=1)

def coordinate_descent_thresholds(P, y_true, n_classes, iters=3, grid=None, init=0.5):
    if grid is None:
        grid = np.linspace(0.2, 0.8, 13)
    th = np.full(n_classes, init, dtype=float)
    best_pred = predict_with_class_thresholds(P, th)
    best_f1 = f1_score(y_true, best_pred, average="macro")
    for _ in range(iters):
        improved = False
        for c in range(n_classes):
            best_th_c = th[c]
            best_local = best_f1
            for t in grid:
                trial = th.copy(); trial[c] = t
                pred = predict_with_class_thresholds(P, trial)
                f1 = f1_score(y_true, pred, average="macro")
                if f1 > best_local:
                    best_local = f1
                    best_th_c = t
            if best_th_c != th[c]:
                th[c] = best_th_c
                best_f1 = best_local
                improved = True
        if not improved:
            break
    return th, best_f1

best_th_vec, best_f1_val = coordinate_descent_thresholds(
    P_val, y_test, n_classes=n_classes, iters=3, grid=np.linspace(0.2, 0.8, 13), init=0.5
)
print(f"[Val] Macro-F1 mit per-Klasse-Thresholds: {best_f1_val:.4f}")
# Optional: die Schwellen je Klasse ansehen:
# for cls, th in zip(le.classes_, best_th_vec): print(cls, round(th, 2))


# --- 4) Finales Training auf ganzem Training + Test-Eval -------------------
class_counts_train = pd.Series(y_train).value_counts().sort_index().values
class_weights_train = (1.0 / (class_counts_train ** alpha))
sw_train = class_weights_train[y_train]

clf.fit(X_train, y_train, model__sample_weight=sw_train)

P_test = clf.predict_proba(X_test)

# a) Thresholded Predictions
y_pred_thr = predict_with_class_thresholds(P_test, best_th_vec)

# b) Vergleich: Argmax (Baseline)
y_pred_argmax = P_test.argmax(axis=1)

print("=== Thresholded ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print("Macro-F1:", f1_score(y_test, y_pred_thr, average="macro"))
print(classification_report(y_test, y_pred_thr, zero_division=0, target_names=le.classes_))

print("=== Argmax (Baseline) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_argmax))
print("Macro-F1:", f1_score(y_test, y_pred_argmax, average="macro"))
print(classification_report(y_test, y_pred_argmax, zero_division=0, target_names=le.classes_))


[Val] Macro-F1 mit per-Klasse-Thresholds: 0.2888


KeyboardInterrupt: 

In [None]:
# -*- coding: utf-8 -*-
# Voller Code: Tabular + TF-IDF Pipeline, XGBoost, per-Klasse Threshold-Tuning (ohne Test-Leakage)

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier


# ===== 0) Daten vorbereiten =====
# Erwartet: peering_df_joined_with_asrank_and_domains_and_geoloc (DataFrame)
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

X = df[['org_name',
        'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
        'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
        'domains', 'total_weight', 'unique_points', 'country_count',
        'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
        'p99_km', 'min_km', 'max_km',
        'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km'
       ]].copy()

y = df['info_type'].copy()
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)

text_col = 'org_name'

# Distanz-/Streumaße (km): oft rechtsschief
km_cols = ['mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
           'p99_km', 'min_km', 'max_km']

# Zähl-/Skalenwerte: Ränge, Degrees, Cones, Domains, Gewichte, Länder, Punkte
count_like_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
                   'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
                   'domains', 'total_weight', 'unique_points', 'country_count']

# Prozent-Features (aus SQL typischerweise 0..100)
pct_cols = ['pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']


# ===== 1) Teil-Pipelines =====

# 1a) Count/Skalen: impute -> log1p -> robust scale
count_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 1b) km-Metriken: impute -> log1p -> robust scale
km_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 1c) Prozent: 0..100 -> 0..1, dann standardisieren
pct_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("to01", FunctionTransformer(lambda A: np.clip(A / 100.0, 0.0, 1.0),
                                 feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

# 1d) Text: TF-IDF (Char-NGrams). Optional könntest du hier noch Word-NGrams hinzufügen.
text_pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(2, 5),
        min_df=1,
        lowercase=True
    ))
])


# ===== 2) ColumnTransformer =====
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop",
    transformer_weights={
        "text": 1.0,
        "num_count": 10.0,
        "num_km": 8.0,
        "num_pct": 2.0,
    },
    sparse_threshold=0.3
)


# ===== 3) Modell (XGBoost) =====
clf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", XGBClassifier(
        tree_method="hist",            # "gpu_hist" falls GPU vorhanden
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        reg_alpha=0.5,
        objective="multi:softprob",
        eval_metric="mlogloss",
        num_class=n_classes,
        random_state=42
    ))
])


# ===== 4) Splits =====
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

# Innerer Validation-Split NUR aus dem Training (für Threshold-Tuning)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)


# ===== 5) Class Imbalance via Sample Weights =====
alpha = 0.5  # 0.5–1.0 testen
class_counts_tr = pd.Series(y_tr).value_counts().sort_index().values
class_weights_tr = (1.0 / (class_counts_tr ** alpha))
sw_tr = class_weights_tr[y_tr]  # y_tr sind 0..K-1 → Indexierung passt

# Fit fürs Tuning
clf.fit(X_tr, y_tr, model__sample_weight=sw_tr)


# ===== 6) Threshold-Tuning auf Validation (per-Klasse) =====
P_val = clf.predict_proba(X_val)  # (n_val, n_classes)
n_classes = P_val.shape[1]

def predict_with_class_thresholds(P: np.ndarray, th_vec: np.ndarray) -> np.ndarray:
    """
    Skaliert pro Klasse die Wahrscheinlichkeiten mit 1/th.
    Kleinere th_j -> Klasse j wird bevorzugt.
    """
    th_safe = np.clip(th_vec, 1e-6, 1.0)
    scores = P / th_safe
    return scores.argmax(axis=1)

def coordinate_descent_thresholds(P: np.ndarray, y_true: np.ndarray,
                                  n_classes: int, iters: int = 3,
                                  grid: np.ndarray | None = None,
                                  init: float = 0.5):
    """
    Einfache koordinatenweise Suche nach per-Klasse-Thresholds, die Macro-F1 maximieren.
    """
    if grid is None:
        grid = np.linspace(0.2, 0.8, 13)
    th = np.full(n_classes, init, dtype=float)
    best_pred = predict_with_class_thresholds(P, th)
    best_f1 = f1_score(y_true, best_pred, average="macro")

    for _ in range(iters):
        improved = False
        for c in range(n_classes):
            best_th_c = th[c]
            best_local = best_f1
            for t in grid:
                trial = th.copy(); trial[c] = t
                pred = predict_with_class_thresholds(P, trial)
                f1 = f1_score(y_true, pred, average="macro")
                if f1 > best_local:
                    best_local = f1
                    best_th_c = t
            if best_th_c != th[c]:
                th[c] = best_th_c
                best_f1 = best_local
                improved = True
        if not improved:
            break
    return th, best_f1

best_th_vec, best_f1_val = coordinate_descent_thresholds(
    P_val, y_val, n_classes=n_classes, iters=3, grid=np.linspace(0.2, 0.8, 13), init=0.5
)
print(f"[Validation] Macro-F1 (per-Klasse Thresholds): {best_f1_val:.4f}")
# Optional: Thresholds je Klasse anschauen
# for cls, th in zip(le.classes_, best_th_vec):
#     print(f"{cls:20s} -> th={th:.2f}")


# ===== 7) Finaler Fit auf komplettem Training + Test-Evaluation =====
class_counts_train = pd.Series(y_train).value_counts().sort_index().values
class_weights_train = (1.0 / (class_counts_train ** alpha))
sw_train = class_weights_train[y_train]

clf.fit(X_train, y_train, model__sample_weight=sw_train)

# Test-Probas
P_test = clf.predict_proba(X_test)

# a) Thresholded Predictions
y_pred_thr = predict_with_class_thresholds(P_test, best_th_vec)

# b) Vergleich: Argmax (Baseline)
y_pred_argmax = P_test.argmax(axis=1)

print("\n=== Thresholded ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print("Macro-F1:", f1_score(y_test, y_pred_thr, average="macro"))
print(classification_report(y_test, y_pred_thr, zero_division=0, target_names=le.classes_))

print("\n=== Argmax (Baseline) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_argmax))
print("Macro-F1:", f1_score(y_test, y_pred_argmax, average="macro"))
print(classification_report(y_test, y_pred_argmax, zero_division=0, target_names=le.classes_))


In [13]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy:  0.6290400385914134
Macro-F1: 0.4178898751477149
              precision    recall  f1-score   support

           0       0.68      0.91      0.78      3140
           1       0.50      0.41      0.45       635
           2       0.61      0.61      0.61       382
           3       0.48      0.23      0.31       449
           4       0.50      0.21      0.29        34
           5       0.46      0.28      0.35      1042
           6       0.43      0.10      0.17       212
           7       0.67      0.35      0.46       161
           8       0.33      0.12      0.18         8
           9       0.59      0.56      0.58       156

    accuracy                           0.63      6219
   macro avg       0.53      0.38      0.42      6219
weighted avg       0.60      0.63      0.59      6219



In [17]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier

# -------- Spalten (dein Setting) --------
text_col = "org_name"
pct_cols  = ['pct_ips_le_100km','pct_ips_le_500km','pct_ips_le_1000km']
km_cols   = ['mean_km','var_km2','std_km','iqr_km','p25_km','p50_km','p75_km','p90_km','p95_km','p99_km','min_km','max_km']
count_cols= ['rank','asnDegree_total','asnDegree_customer','asnDegree_peer','asnDegree_provider',
             'cone_numberAsns','cone_numberPrefixes','cone_numberAddresses','domains',
             'total_weight','unique_points','country_count']
num_cols  = count_cols + km_cols + pct_cols

# -------- Pipelines --------
# Text → sparse TF-IDF (leicht begrenzen, weniger Rauschen)
text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char_wb", ngram_range=(3,5),
        min_df=3, max_features=100_000, sublinear_tf=True
    ))
])

# Numerik → impute + (optional) log1p für schiefe, am Ende in CSR, damit alles sparse bleibt
def _log1p_nonneg(X):
    return np.log1p(np.clip(X, a_min=0, a_max=None))

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(_log1p_nonneg, feature_names_out="one-to-one")),
    ("to_csr", FunctionTransformer(lambda X: csr_matrix(X)))
])

# ColumnTransformer → liefert sparse Matrix
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num",  num_pipe,  num_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0
)

# -------- Daten laden --------
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()
X = df[[text_col] + num_cols].copy()
X[text_col] = X[text_col].fillna("").astype(str)
y = df["info_type"]

# Label-Encode für XGBoost (multiclass)
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

# Preprocess
Xtr = preprocessor.fit_transform(X_train)
Xte = preprocessor.transform(X_test)

# Imbalance: Sample-Weights
w_train = compute_sample_weight(class_weight="balanced", y=y_train)

# -------- XGBoost (liefert Wahrscheinlichkeiten) --------
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=n_classes,
    eval_metric="mlogloss",
    n_estimators=800,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_lambda=1.0,
    tree_method="gpu_hist",        # "gpu_hist" falls NVIDIA-GPU
    n_jobs=-1
)

# -------- Kalibrierung der Wahrscheinlichkeiten --------
# Hinweis: CalibratedClassifierCV refittet das Basismodell intern (cv=3).
calibrated = CalibratedClassifierCV(xgb, method="isotonic", cv=3)
calibrated.fit(Xtr, y_train, sample_weight=w_train)

# Vorhersage + kalibrierte Probas
y_proba = calibrated.predict_proba(Xte)
y_pred  = y_proba.argmax(axis=1)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(le.inverse_transform(y_test),
                            le.inverse_transform(y_pred),
                            zero_division=0))

# -------- Top-k Prozent je Beispiel (z.B. Top-3) --------
topk = 3
top_idx = np.argsort(y_proba, axis=1)[:, -topk:][:, ::-1]     # Indizes der Top-k Klassen
top_lbl = le.inverse_transform(top_idx)
top_pct = np.take_along_axis(y_proba, top_idx, axis=1) * 100  # in %

# Beispiel-Output als DataFrame
out = pd.DataFrame({
    "org_name": X_test[text_col].values,
    "true_label": le.inverse_transform(y_test),
    "pred_label": le.inverse_transform(y_pred),
})
# Top-1..3 Spalten anhängen
for k in range(topk):
    out[f"top{k+1}_label"] = top_lbl[:, k]
    out[f"top{k+1}_pct"]   = top_pct[:, k].round(1)

# Zeig dir mal die ersten 10 Zeilen
print(out.head(10))


Accuracy: 0.37256793696735807
Macro-F1: 0.22631878427191832
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.76      0.49      0.60      3140
             Content       0.27      0.28      0.28       635
Educational/Research       0.30      0.52      0.38       382
          Enterprise       0.21      0.17      0.19       449
          Government       0.09      0.68      0.15        34
                 NSP       0.35      0.20      0.26      1042
    Network Services       0.08      0.10      0.09       212
          Non-Profit       0.12      0.20      0.15       161
     Route Collector       0.00      0.25      0.00         8
        Route Server       0.13      0.21      0.16       156

            accuracy                           0.37      6219
           macro avg       0.23      0.31      0.23      6219
        weighted avg       0.51      0.37      0.42      6219



ValueError: y should be a 1d array, got an array of shape (6219, 3) instead.

NameError: name 'n_classes' is not defined

In [17]:
# -*- coding: utf-8 -*-
# Voller Code: Tabular + TF-IDF Pipeline, XGBoost, per-Klasse Threshold-Tuning (ohne Test-Leakage)

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier


# ===== 0) Daten vorbereiten =====
# Erwartet: peering_df_joined_with_asrank_and_domains_and_geoloc (DataFrame)
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

X = df[['org_name',
        'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
        'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
        'domains', 'total_weight', 'unique_points', 'country_count',
        'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
        'p99_km', 'min_km', 'max_km',
        'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km'
       ]].copy()

y = df['info_type'].copy()
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)

text_col = 'org_name'

# Distanz-/Streumaße (km): oft rechtsschief
km_cols = ['mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
           'p99_km', 'min_km', 'max_km']

# Zähl-/Skalenwerte: Ränge, Degrees, Cones, Domains, Gewichte, Länder, Punkte
count_like_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
                   'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
                   'domains', 'total_weight', 'unique_points', 'country_count']

# Prozent-Features (aus SQL typischerweise 0..100)
pct_cols = ['pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']


# ===== 1) Teil-Pipelines =====

# 1a) Count/Skalen: impute -> log1p -> robust scale
count_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 1b) km-Metriken: impute -> log1p -> robust scale
km_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 1c) Prozent: 0..100 -> 0..1, dann standardisieren
pct_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("to01", FunctionTransformer(lambda A: np.clip(A / 100.0, 0.0, 1.0),
                                 feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

# 1d) Text: TF-IDF (Char-NGrams). Optional könntest du hier noch Word-NGrams hinzufügen.
text_pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(2, 5),
        min_df=1,
        lowercase=True
    ))
])


# ===== 2) ColumnTransformer =====
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop",
    transformer_weights={
        "text": 1.0,
        "num_count": 10.0,
        "num_km": 8.0,
        "num_pct": 2.0,
    },
    sparse_threshold=0.3
)


# ===== 3) Modell (XGBoost) =====
clf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", XGBClassifier(
        tree_method="gpu_hist",            # "gpu_hist" falls GPU vorhanden
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        reg_alpha=0.5,
        objective="multi:softprob",
        eval_metric="mlogloss",
        num_class=n_classes,
        random_state=42
    ))
])


# ===== 4) Splits =====
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

# Innerer Validation-Split NUR aus dem Training (für Threshold-Tuning)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)


# ===== 5) Class Imbalance via Sample Weights =====
alpha = 0.5  # 0.5–1.0 testen
class_counts_tr = pd.Series(y_tr).value_counts().sort_index().values
class_weights_tr = (1.0 / (class_counts_tr ** alpha))
sw_tr = class_weights_tr[y_tr]  # y_tr sind 0..K-1 → Indexierung passt

# Fit fürs Tuning
clf.fit(X_tr, y_tr, model__sample_weight=sw_tr)


# ===== 6) Threshold-Tuning auf Validation (per-Klasse) =====
P_val = clf.predict_proba(X_val)  # (n_val, n_classes)
n_classes = P_val.shape[1]

def predict_with_class_thresholds(P: np.ndarray, th_vec: np.ndarray) -> np.ndarray:
    """
    Skaliert pro Klasse die Wahrscheinlichkeiten mit 1/th.
    Kleinere th_j -> Klasse j wird bevorzugt.
    """
    th_safe = np.clip(th_vec, 1e-6, 1.0)
    scores = P / th_safe
    return scores.argmax(axis=1)

def coordinate_descent_thresholds(P: np.ndarray, y_true: np.ndarray,
                                  n_classes: int, iters: int = 3,
                                  grid: np.ndarray | None = None,
                                  init: float = 0.5):
    """
    Einfache koordinatenweise Suche nach per-Klasse-Thresholds, die Macro-F1 maximieren.
    """
    if grid is None:
        grid = np.linspace(0.2, 0.8, 13)
    th = np.full(n_classes, init, dtype=float)
    best_pred = predict_with_class_thresholds(P, th)
    best_f1 = f1_score(y_true, best_pred, average="macro")

    for _ in range(iters):
        improved = False
        for c in range(n_classes):
            best_th_c = th[c]
            best_local = best_f1
            for t in grid:
                trial = th.copy(); trial[c] = t
                pred = predict_with_class_thresholds(P, trial)
                f1 = f1_score(y_true, pred, average="macro")
                if f1 > best_local:
                    best_local = f1
                    best_th_c = t
            if best_th_c != th[c]:
                th[c] = best_th_c
                best_f1 = best_local
                improved = True
        if not improved:
            break
    return th, best_f1

best_th_vec, best_f1_val = coordinate_descent_thresholds(
    P_val, y_val, n_classes=n_classes, iters=3, grid=np.linspace(0.2, 0.8, 13), init=0.5
)
print(f"[Validation] Macro-F1 (per-Klasse Thresholds): {best_f1_val:.4f}")
# Optional: Thresholds je Klasse anschauen
# for cls, th in zip(le.classes_, best_th_vec):
#     print(f"{cls:20s} -> th={th:.2f}")


# ===== 7) Finaler Fit auf komplettem Training + Test-Evaluation =====
class_counts_train = pd.Series(y_train).value_counts().sort_index().values
class_weights_train = (1.0 / (class_counts_train ** alpha))
sw_train = class_weights_train[y_train]

clf.fit(X_train, y_train, model__sample_weight=sw_train)

# Test-Probas
P_test = clf.predict_proba(X_test)

# a) Thresholded Predictions
y_pred_thr = predict_with_class_thresholds(P_test, best_th_vec)

# b) Vergleich: Argmax (Baseline)
y_pred_argmax = P_test.argmax(axis=1)

print("\n=== Thresholded ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print("Macro-F1:", f1_score(y_test, y_pred_thr, average="macro"))
print(classification_report(y_test, y_pred_thr, zero_division=0, target_names=le.classes_))

print("\n=== Argmax (Baseline) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_argmax))
print("Macro-F1:", f1_score(y_test, y_pred_argmax, average="macro"))
print(classification_report(y_test, y_pred_argmax, zero_division=0, target_names=le.classes_))


[Validation] Macro-F1 (per-Klasse Thresholds): 0.2869

=== Thresholded ===
Accuracy: 0.5169641421450394
Macro-F1: 0.29090373800733715
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.73      0.71      0.72      3140
             Content       0.30      0.43      0.35       635
Educational/Research       0.45      0.48      0.47       382
          Enterprise       0.23      0.28      0.25       449
          Government       0.15      0.35      0.21        34
                 NSP       0.40      0.29      0.34      1042
    Network Services       0.08      0.03      0.04       212
          Non-Profit       0.25      0.20      0.22       161
     Route Collector       0.00      0.00      0.00         8
        Route Server       0.24      0.45      0.31       156

            accuracy                           0.52      6219
           macro avg       0.28      0.32      0.29      6219
        weighted avg       0.53      0.52      0.52      6

In [17]:
# -*- coding: utf-8 -*-
"""
Produktionsreife XGBoost Klassifikation
- OHNE skopt, torch, tabnet
- Nur: sklearn, xgboost, pandas, numpy, matplotlib, seaborn, shap
- Per-Klasse Threshold-Tuning mit smarter Grid-Suche
- Early Stopping, SHAP, Confusion Matrix, Modell-Save
"""

import re
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier

import shap
import warnings
warnings.filterwarnings("ignore")

# =============================================================================
# 0) Daten laden
# =============================================================================
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

X = df[['org_name',
        'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
        'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
        'domains', 'total_weight', 'unique_points', 'country_count',
        'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
        'p99_km', 'min_km', 'max_km',
        'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km'
       ]].copy()

y = df['info_type'].copy()
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)

# Spalten
text_col = 'org_name'
km_cols = ['mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
           'p99_km', 'min_km', 'max_km']
count_like_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
                   'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
                   'domains', 'total_weight', 'unique_points', 'country_count']
pct_cols = ['pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']

# =============================================================================
# 1) Text-Cleaning
# =============================================================================
def clean_org_name(X):
    def _clean(s):
        if not isinstance(s, str):
            return ""
        s = s.lower()
        s = re.sub(r'\b(gmbh|inc|ltd|llc|corp|co|ag|sa|limited|plc|bv)\b', ' ', s)
        s = re.sub(r'[^a-z0-9\s]', ' ', s)
        s = re.sub(r'\s+', ' ', s).strip()
        return s
    return np.array([_clean(x) for x in X.ravel()]).reshape(-1, 1)

# =============================================================================
# 2) Pipelines
# =============================================================================
count_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

km_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

pct_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("to01", FunctionTransformer(lambda x: np.clip(x / 100.0, 0.0, 1.0), feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

text_pipe = Pipeline([
    ("clean", FunctionTransformer(clean_org_name, validate=False)),
    ("tfidf", TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(3, 6),
        min_df=2,
        max_df=0.95,
        lowercase=True
    ))
])

# =============================================================================
# 3) Preprocessor
# =============================================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop",
    transformer_weights={
        "text": 1.0,
        "num_count": 12.0,
        "num_km": 10.0,
        "num_pct": 3.0,
    },
    sparse_threshold=0.3
)

# =============================================================================
# 4) Modell
# =============================================================================
clf = Pipeline([
    ("prep", preprocessor),
    ("model", XGBClassifier(
        tree_method="hist",
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=7,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        reg_alpha=0.5,
        objective="multi:softprob",
        eval_metric="mlogloss",
        num_class=n_classes,
        random_state=42,
        n_jobs=-1
    ))
])

# =============================================================================
# 5) Splits
# =============================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# =============================================================================
# 6) Sample Weights
# =============================================================================
alpha = 0.7
class_counts_tr = pd.Series(y_tr).value_counts().sort_index()
class_weights_tr = 1.0 / (class_counts_tr ** alpha)
class_weights_tr = class_weights_tr / class_weights_tr.mean()
sw_tr = class_weights_tr[y_tr].values

# =============================================================================
# 7) Fit mit Early Stopping
# =============================================================================
print("Training mit Early Stopping...")
prep_tr = preprocessor.fit(X_tr)
X_tr_prep = prep_tr.transform(X_tr)
X_val_prep = prep_tr.transform(X_val)

clf.named_steps['model'].fit(
    X_tr_prep, y_tr,
    sample_weight=sw_tr,
    eval_set=[(X_val_prep, y_val)],
    early_stopping_rounds=80,
    verbose=False
)
print(f"   → Best iteration: {clf.named_steps['model'].best_iteration}")

# =============================================================================
# 8) Threshold-Tuning: Smarte Grid-Suche (ohne skopt)
# =============================================================================
P_val = clf.predict_proba(X_val)

# Informierte Initialisierung
prior = class_counts_tr / class_counts_tr.sum()
init_th = np.clip(prior / prior.max(), 0.3, 0.9)

def predict_with_thresholds(P, th):
    th = np.clip(th, 1e-6, 1.0)
    return (P / th).argmax(axis=1)

def tune_thresholds_grid(P, y_true, init_th, grid_size=15, iters=3):
    grid = np.linspace(0.2, 1.0, grid_size)
    th = init_th.copy()
    best_f1 = f1_score(y_true, predict_with_thresholds(P, th), average='macro')
    
    for it in range(iters):
        improved = False
        for c in range(len(th)):
            best_local = -1
            best_th_c = th[c]
            for t in grid:
                th_trial = th.copy()
                th_trial[c] = t
                pred = predict_with_thresholds(P, th_trial)
                f1 = f1_score(y_true, pred, average='macro')
                if f1 > best_local:
                    best_local = f1
                    best_th_c = t
            if best_th_c != th[c]:
                th[c] = best_th_c
                best_f1 = best_local
                improved = True
        if not improved:
            break
    return th, best_f1

print("Threshold-Tuning mit smarter Grid-Suche...")
best_th_vec, best_f1_val = tune_thresholds_grid(P_val, y_val, init_th, grid_size=13, iters=3)
print(f"   → Validation Macro-F1: {best_f1_val:.4f}")

# Optional: Thresholds anzeigen
# for cls, th in zip(le.classes_, best_th_vec):
#     print(f"{cls:20s} -> th={th:.3f}")

# =============================================================================
# 9) Final Fit
# =============================================================================
class_counts_train = pd.Series(y_train).value_counts().sort_index()
class_weights_train = 1.0 / (class_counts_train ** alpha)
class_weights_train = class_weights_train / class_weights_train.mean()
sw_train = class_weights_train[y_train].values

prep_train = preprocessor.fit(X_train)
X_train_prep = prep_train.transform(X_train)
X_test_prep = prep_train.transform(X_test)

clf.named_steps['model'].fit(
    X_train_prep, y_train,
    sample_weight=sw_train,
    eval_set=[(prep_train.transform(X_val), y_val)],
    early_stopping_rounds=80,
    verbose=False
)

# =============================================================================
# 10) Evaluation
# =============================================================================
P_test = clf.predict_proba(X_test)
y_pred_thr = predict_with_thresholds(P_test, best_th_vec)
y_pred_argmax = P_test.argmax(axis=1)

print("\n" + "="*60)
print("FINAL TEST RESULTS")
print("="*60)
print("Thresholded:")
print(f"   Accuracy: {accuracy_score(y_test, y_pred_thr):.4f}")
print(f"   Macro-F1: {f1_score(y_test, y_pred_thr, average='macro'):.4f}")
print("\nArgmax:")
print(f"   Accuracy: {accuracy_score(y_test, y_pred_argmax):.4f}")
print(f"   Macro-F1: {f1_score(y_test, y_pred_argmax, average='macro'):.4f}")

print("\nClassification Report (Thresholded):")
print(classification_report(y_test, y_pred_thr, target_names=le.classes_, zero_division=0))

# =============================================================================
# 11) Confusion Matrix
# =============================================================================
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred_thr)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix (Thresholded)')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

# =============================================================================
# 12) SHAP
# =============================================================================
print("\nBerechne SHAP Values...")
explainer = shap.TreeExplainer(clf.named_steps['model'])
X_sample = shap.sample(X_train_prep, 100)
shap_values = explainer.shap_values(X_sample)
shap.summary_plot(shap_values, X_sample, plot_type="bar", max_display=20)
plt.title("SHAP Feature Importance")
plt.show()

# =============================================================================
# 13) Speichern
# =============================================================================
import os
os.makedirs("models", exist_ok=True)

joblib.dump(clf, 'models/xgboost_asn_classifier.pkl')
joblib.dump(le, 'models/label_encoder.pkl')
joblib.dump(best_th_vec, 'models/thresholds.npy')
joblib.dump(preprocessor, 'models/preprocessor.pkl')

print("\nModelle gespeichert in ./models/")

# =============================================================================
# 14) Inference-Funktion
# =============================================================================
def predict_asn_type(org_name, **kwargs):
    row = pd.DataFrame([{**{'org_name': org_name}, **kwargs}])
    row = row[X.columns]
    proba = clf.predict_proba(row)[0]
    pred = predict_with_thresholds(proba.reshape(1, -1), best_th_vec)[0]
    return {
        'class': le.classes_[pred],
        'probabilities': dict(zip(le.classes_, proba))
    }

# Beispiel:
# pred = predict_asn_type("Google LLC", rank=1, asnDegree_total=5000, ...)

Training mit Early Stopping...


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [16]:
# -*- coding: utf-8 -*-
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

# ===== Daten =====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

X = df[['org_name',
        'rank','asnDegree_total','asnDegree_customer','asnDegree_peer','asnDegree_provider',
        'cone_numberAsns','cone_numberPrefixes','cone_numberAddresses',
        'domains','total_weight','unique_points','country_count',
        'mean_km','var_km2','std_km','iqr_km','p25_km','p50_km','p75_km','p90_km','p95_km',
        'p99_km','min_km','max_km',
        'pct_ips_le_100km','pct_ips_le_500km','pct_ips_le_1000km']].copy()

y = df['info_type'].copy()
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)

text_col = 'org_name'
km_cols = ['mean_km','var_km2','std_km','iqr_km','p25_km','p50_km','p75_km','p90_km','p95_km','p99_km','min_km','max_km']
count_like_cols = ['rank','asnDegree_total','asnDegree_customer','asnDegree_peer','asnDegree_provider',
                   'cone_numberAsns','cone_numberPrefixes','cone_numberAddresses',
                   'domains','total_weight','unique_points','country_count']
pct_cols = ['pct_ips_le_100km','pct_ips_le_500km','pct_ips_le_1000km']

# ===== Teil-Pipelines =====
count_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

km_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda A: np.log1p(np.clip(A, a_min=0, a_max=None)),
                                  feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

pct_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("to01", FunctionTransformer(lambda A: np.clip(A / 100.0, 0.0, 1.0),
                                 feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(2,5),
        min_df=1,
        lowercase=True
    ))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop",
    transformer_weights={
        # gib dem Text etwas mehr Gewicht, hilft oft kleineren Klassen
        "text": 1.0,
        "num_count": 10.0,
        "num_km": 8.0,
        "num_pct": 8.0,
    },
    sparse_threshold=0.3
)

base_model = XGBClassifier(
    tree_method="gpu_hist",          # "gpu_hist" falls GPU
    n_estimators=1200,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    reg_alpha=0.5,
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=n_classes,
    random_state=42
)

clf = Pipeline([
    ("prep", preprocessor),
    ("model", base_model)
])

# ===== Splits =====
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

# Innerer Val-Split für Kalibrierung + Threshold-Tuning
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# ===== Class Imbalance (vorsichtig) =====
alpha = 0.35   # etwas schwächer gewichten, um Overfitting großer Klassen zu vermeiden
class_counts_tr = pd.Series(y_tr).value_counts().sort_index().values
class_weights_tr = (1.0 / (class_counts_tr ** alpha))
sw_tr = class_weights_tr[y_tr]

# Fit Basismodell auf (X_tr, y_tr)
clf.fit(X_tr, y_tr, model__sample_weight=sw_tr)

# ===== Kalibrierung der Wahrscheinlichkeiten =====
# 'prefit=True': nutzt die bereits trainierte Pipeline und kalibriert darauf.
# Methode: 'isotonic' (besser, aber langsamer) oder 'sigmoid' (robust, schnell).
cal = CalibratedClassifierCV(estimator=clf, method="isotonic", cv="prefit")
cal.fit(X_val, y_val)

# Probas (kalibriert) auf Validation
P_val = cal.predict_proba(X_val)  # (n_val, K)

# ===== Per-Klasse Thresholds über PR-Kurve (F1-optimal) =====
def f1_opt_threshold_for_class(y_true_bin, scores):
    # scores: Wahrscheinlichkeiten für Klasse c (kalibriert)
    p, r, th = precision_recall_curve(y_true_bin, scores)
    # precision_recall_curve gibt th mit Länge len(p)-1 zurück
    f1s = (2 * p[1:] * r[1:]) / np.clip(p[1:] + r[1:], 1e-12, None)
    if len(f1s) == 0:
        return 0.5  # fallback
    idx = np.nanargmax(f1s)
    return float(th[idx])

best_th_vec = np.zeros(n_classes, dtype=float)
for c in range(n_classes):
    y_bin = (y_val == c).astype(int)
    scores_c = P_val[:, c]
    best_th_vec[c] = f1_opt_threshold_for_class(y_bin, scores_c)

print("Per-Klasse-Thresholds (Validation):")
for cls, th in zip(le.classes_, best_th_vec):
    print(f"  {cls:20s} -> {th:.3f}")

# ===== Final: Refit auf ganzem Training, Kalibrieren auf X_val, Test evaluieren =====
# (Wir refitten absichtlich NICHT erneut vor der Kalibrierung, da cal schon 'prefit' nutzt.
#  Für maximale Strenge kannst du neu fitten auf X_train und dann erneut CalibratedCV(cv='prefit') auf X_val machen.)

# Kalibrierte Probas auf Test
P_test = cal.predict_proba(X_test)

# a) Thresholded Vorhersage: pro Klasse eigener Schwellwert
def predict_with_thresholds(P, th_vec):
    # weicher One-vs-Rest: nimm argmax der Scores, aber „zulassen“ nur wenn >= th der jeweiligen Klasse?
    # Für Multiclass ohne Reject reicht: klassifiziere zur Klasse mit größtem (P_c - th_c Bias).
    # Hier nutzen wir den OVR-Ansatz: Score = P_c - th_c, wähle argmax.
    scores = P - th_vec[None, :]
    return scores.argmax(axis=1)

y_pred_thr = predict_with_thresholds(P_test, best_th_vec)

# b) Argmax (Baseline)
y_pred_argmax = P_test.argmax(axis=1)

print("\n=== Thresholded (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print("Macro-F1:", f1_score(y_test, y_pred_thr, average="macro"))
print(classification_report(y_test, y_pred_thr, zero_division=0, target_names=le.classes_))

print("\n=== Argmax (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_argmax))
print("Macro-F1:", f1_score(y_test, y_pred_argmax, average="macro"))
print(classification_report(y_test, y_pred_argmax, zero_division=0, target_names=le.classes_))


Per-Klasse-Thresholds (Validation):
  Cable/DSL/ISP        -> 0.397
  Content              -> 0.167
  Educational/Research -> 0.355
  Enterprise           -> 0.193
  Government           -> 0.136
  NSP                  -> 0.202
  Network Services     -> 0.103
  Non-Profit           -> 0.129
  Route Collector      -> 0.054
  Route Server         -> 0.314

=== Thresholded (calibrated) ===
Accuracy: 0.5598970895642387
Macro-F1: 0.3188717783511447
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.71      0.81      0.76      3140
             Content       0.32      0.51      0.39       635
Educational/Research       0.55      0.47      0.51       382
          Enterprise       0.23      0.13      0.17       449
          Government       0.24      0.24      0.24        34
                 NSP       0.42      0.25      0.31      1042
    Network Services       0.06      0.02      0.03       212
          Non-Profit       0.25      0.26      0.26    

In [15]:
cal = CalibratedClassifierCV(estimator=clf, method="sigmoid", cv="prefit")
cal.fit(X_val, y_val)

# Probas (kalibriert) auf Validation
P_val = cal.predict_proba(X_val)  # (n_val, K)

# ===== Per-Klasse Thresholds über PR-Kurve (F1-optimal) =====
def f1_opt_threshold_for_class(y_true_bin, scores):
    # scores: Wahrscheinlichkeiten für Klasse c (kalibriert)
    p, r, th = precision_recall_curve(y_true_bin, scores)
    # precision_recall_curve gibt th mit Länge len(p)-1 zurück
    f1s = (2 * p[1:] * r[1:]) / np.clip(p[1:] + r[1:], 1e-12, None)
    if len(f1s) == 0:
        return 0.5  # fallback
    idx = np.nanargmax(f1s)
    return float(th[idx])

best_th_vec = np.zeros(n_classes, dtype=float)
for c in range(n_classes):
    y_bin = (y_val == c).astype(int)
    scores_c = P_val[:, c]
    best_th_vec[c] = f1_opt_threshold_for_class(y_bin, scores_c)

print("Per-Klasse-Thresholds (Validation):")
for cls, th in zip(le.classes_, best_th_vec):
    print(f"  {cls:20s} -> {th:.3f}")

# ===== Final: Refit auf ganzem Training, Kalibrieren auf X_val, Test evaluieren =====
# (Wir refitten absichtlich NICHT erneut vor der Kalibrierung, da cal schon 'prefit' nutzt.
#  Für maximale Strenge kannst du neu fitten auf X_train und dann erneut CalibratedCV(cv='prefit') auf X_val machen.)

# Kalibrierte Probas auf Test
P_test = cal.predict_proba(X_test)

# a) Thresholded Vorhersage: pro Klasse eigener Schwellwert
def predict_with_thresholds(P, th_vec):
    # weicher One-vs-Rest: nimm argmax der Scores, aber „zulassen“ nur wenn >= th der jeweiligen Klasse?
    # Für Multiclass ohne Reject reicht: klassifiziere zur Klasse mit größtem (P_c - th_c Bias).
    # Hier nutzen wir den OVR-Ansatz: Score = P_c - th_c, wähle argmax.
    scores = P - th_vec[None, :]
    return scores.argmax(axis=1)

y_pred_thr = predict_with_thresholds(P_test, best_th_vec)

# b) Argmax (Baseline)
y_pred_argmax = P_test.argmax(axis=1)

print("\n=== Thresholded (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print("Macro-F1:", f1_score(y_test, y_pred_thr, average="macro"))
print(classification_report(y_test, y_pred_thr, zero_division=0, target_names=le.classes_))

print("\n=== Argmax (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_argmax))
print("Macro-F1:", f1_score(y_test, y_pred_argmax, average="macro"))
print(classification_report(y_test, y_pred_argmax, zero_division=0, target_names=le.classes_))


Per-Klasse-Thresholds (Validation):
  Cable/DSL/ISP        -> 0.395
  Content              -> 0.138
  Educational/Research -> 0.169
  Enterprise           -> 0.105
  Government           -> 0.037
  NSP                  -> 0.214
  Network Services     -> 0.064
  Non-Profit           -> 0.065
  Route Collector      -> 0.048
  Route Server         -> 0.500

=== Thresholded (calibrated) ===
Accuracy: 0.5632738382376588
Macro-F1: 0.3322605330035301
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.71      0.80      0.75      3140
             Content       0.36      0.41      0.38       635
Educational/Research       0.49      0.57      0.52       382
          Enterprise       0.26      0.29      0.27       449
          Government       0.38      0.18      0.24        34
                 NSP       0.43      0.24      0.31      1042
    Network Services       0.07      0.05      0.06       212
          Non-Profit       0.31      0.28      0.30    