# Load the Data
## Peeringdb

In [1]:
import json
from pathlib import Path
import pandas as pd

filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


# Caida AS Names

In [2]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


## Join both

In [3]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


## Load AS Rank

In [4]:
as_rank_df = pd.read_csv('/workspaces/pytorch-gpu-2/preprocessing/data/asrank/as_rank_df.csv')
as_rank_df.head()

Unnamed: 0,asn,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,3356,1,6613,6545,68,0,53986,873410,3468642119
1,1299,2,2567,2509,58,0,41193,776707,3219679484
2,174,3,6723,6626,97,0,38887,730166,3034352967
3,3257,4,1853,1816,37,0,36040,612491,2791999209
4,2914,5,1541,1483,58,0,25179,576134,2918763154


## Join both

In [5]:
peering_df_joined_with_asrank = pd.merge(
    peering_df_joined,
    as_rank_df,
    left_on='asn',
    right_on='asn',
    how='left'
)
peering_df_joined_with_asrank['rank'].fillna(peering_df_joined_with_asrank['rank'].median(), inplace=True)
peering_df_joined_with_asrank['asnDegree_total'].fillna(peering_df_joined_with_asrank['asnDegree_total'].median(), inplace=True)
peering_df_joined_with_asrank['asnDegree_customer'].fillna(peering_df_joined_with_asrank['asnDegree_customer'].median(), inplace=True)
peering_df_joined_with_asrank['asnDegree_peer'].fillna(peering_df_joined_with_asrank['asnDegree_peer'].median(), inplace=True)
peering_df_joined_with_asrank['asnDegree_provider'].fillna(peering_df_joined_with_asrank['asnDegree_provider'].median(), inplace=True)
peering_df_joined_with_asrank['cone_numberAsns'].fillna(peering_df_joined_with_asrank['cone_numberAsns'].median(), inplace=True)
peering_df_joined_with_asrank['cone_numberPrefixes'].fillna(peering_df_joined_with_asrank['cone_numberPrefixes'].median(), inplace=True)
peering_df_joined_with_asrank['cone_numberAddresses'].fillna(peering_df_joined_with_asrank['cone_numberAddresses'].median(), inplace=True)

peering_df_joined_with_asrank.head()

Unnamed: 0,asn,org_name,country,source,info_type,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,4436,"GTT Americas, LLC",US,ARIN,NSP,78320.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,20940,Akamai International B.V.,NL,RIPE,Content,1894.0,485.0,14.0,366.0,105.0,15.0,8945.0,14612752.0
2,31800,DALnet,US,ARIN,Non-Profit,47745.0,78.0,0.0,74.0,4.0,1.0,2.0,512.0
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP,81.0,1273.0,166.0,1101.0,6.0,733.0,22131.0,42899794.0
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP,110.0,499.0,489.0,8.0,2.0,505.0,11982.0,31992440.0


## Load domains

In [6]:
ipinfo_df = pd.read_csv('../../preprocessing/data/ipinfo_domains/ipinfo_domains.csv')
ipinfo_df.head()

Unnamed: 0,ASN,domains
0,16509,139276485
1,13335,63477595
2,52925,32915972
3,396982,24543491
4,47846,17833760


## Join both

In [7]:
peering_df_joined_with_asrank_and_domains = pd.merge(
    peering_df_joined_with_asrank,
    ipinfo_df,
    left_on='asn',
    right_on='ASN',
    how='left'
)
peering_df_joined_with_asrank_and_domains['domains'].fillna(peering_df_joined_with_asrank_and_domains['domains'].median(), inplace=True)
peering_df_joined_with_asrank_and_domains.head()

Unnamed: 0,asn,org_name,country,source,info_type,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses,ASN,domains
0,4436,"GTT Americas, LLC",US,ARIN,NSP,78320.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,21.0
1,20940,Akamai International B.V.,NL,RIPE,Content,1894.0,485.0,14.0,366.0,105.0,15.0,8945.0,14612752.0,20940.0,3849287.0
2,31800,DALnet,US,ARIN,Non-Profit,47745.0,78.0,0.0,74.0,4.0,1.0,2.0,512.0,,21.0
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP,81.0,1273.0,166.0,1101.0,6.0,733.0,22131.0,42899794.0,3303.0,46521.0
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP,110.0,499.0,489.0,8.0,2.0,505.0,11982.0,31992440.0,22773.0,55711.0


## Load geolocations

In [8]:
import clickhouse_connect

client = clickhouse_connect.get_client(
    host='localhost', port=8123,
    username='default', password=''
)

query = """
/* Gewichtetes Zentrum und umfangreiche Distanz-Statistiken (Kilometer) */
WITH base AS (
    SELECT
        asn,
        latitude,
        longitude,
        country,
        toUInt64(ip_end - ip_start + 1) AS w
    FROM ip_location_asn
    WHERE ip_version = 4
      AND origin = 'ipinfo'
),
vec AS (
    SELECT
        asn,
        sum(w * cos(radians(latitude)) * cos(radians(longitude))) AS X,
        sum(w * cos(radians(latitude)) * sin(radians(longitude))) AS Y,
        sum(w * sin(radians(latitude)))                           AS Z,
        sum(w)                                                    AS W
    FROM base
    GROUP BY asn
),
center AS (
    SELECT
        asn,
        degrees(atan2(Y, X))                       AS center_lon,
        degrees(atan2(Z, sqrt(X * X + Y * Y)))     AS center_lat
    FROM vec
),
joined AS (
    SELECT
        b.asn,
        b.w,
        b.country,
        c.center_lat,
        c.center_lon,
        greatCircleDistance(b.longitude, b.latitude, c.center_lon, c.center_lat) / 1000 AS d_km
    FROM base AS b
    INNER JOIN center AS c USING (asn)
),
stats AS (
    SELECT
        asn,
        any(center_lat) AS center_lat,
        any(center_lon) AS center_lon,
        sum(w)  AS total_weight,
        avgWeighted(d_km,       w) AS mean_km,
        avgWeighted(d_km * d_km, w) AS mean_sq_km2,
        quantileExactWeighted(0.25)(d_km, w) AS p25_km,
        quantileExactWeighted(0.50)(d_km, w) AS p50_km,
        quantileExactWeighted(0.75)(d_km, w) AS p75_km,
        quantileExactWeighted(0.90)(d_km, w) AS p90_km,
        quantileExactWeighted(0.95)(d_km, w) AS p95_km,
        quantileExactWeighted(0.99)(d_km, w) AS p99_km,
        min(d_km) AS min_km,
        max(d_km) AS max_km,
        sumIf(w, d_km <=  100) / sum(w) AS share_le_100km,
        sumIf(w, d_km <=  500) / sum(w) AS share_le_500km,
        sumIf(w, d_km <= 1000) / sum(w) AS share_le_1000km
    FROM joined
    GROUP BY asn
),
geo_meta AS (
    SELECT
        asn,
        uniqExact((latitude, longitude)) AS unique_points,
        uniqExact(country)               AS country_count
    FROM base
    GROUP BY asn
)
SELECT
    s.asn,
    round(s.center_lat, 5) AS center_lat,
    round(s.center_lon, 5) AS center_lon,
    s.total_weight,
    gm.unique_points,
    gm.country_count,
    round(s.mean_km, 2)                         AS mean_km,
    round(greatest(s.mean_sq_km2 - s.mean_km * s.mean_km, 0), 2) AS var_km2,
    round(sqrt(greatest(s.mean_sq_km2 - s.mean_km * s.mean_km, 0)), 2) AS std_km,
    round(s.p75_km - s.p25_km, 2)               AS iqr_km,
    round(s.p25_km, 2)                          AS p25_km,
    round(s.p50_km, 2)                          AS p50_km,
    round(s.p75_km, 2)                          AS p75_km,
    round(s.p90_km, 2)                          AS p90_km,
    round(s.p95_km, 2)                          AS p95_km,
    round(s.p99_km, 2)                          AS p99_km,
    round(s.min_km, 2)                          AS min_km,
    round(s.max_km, 2)                          AS max_km,
    round(s.share_le_100km * 100, 2)            AS pct_ips_le_100km,
    round(s.share_le_500km * 100, 2)            AS pct_ips_le_500km,
    round(s.share_le_1000km * 100, 2)           AS pct_ips_le_1000km
FROM stats AS s
LEFT JOIN geo_meta AS gm USING (asn)
ORDER BY asn
"""

ch_df = client.query_df(query)
ch_df.head()


Unnamed: 0,asn,center_lat,center_lon,total_weight,unique_points,country_count,mean_km,var_km2,std_km,iqr_km,...,p50_km,p75_km,p90_km,p95_km,p99_km,min_km,max_km,pct_ips_le_100km,pct_ips_le_500km,pct_ips_le_1000km
0,1,25.968,92.5789,78170880,79,61,244.02,81524.56,285.53,85.42,...,202.41,269.52,270.88,323.86,2174.95,22.74,16533.53,14.53,96.74,97.75
1,2,33.23043,-61.91418,86016,4,4,3182.86,10932427.46,3306.42,0.0,...,1427.96,1427.96,8556.52,11221.18,11221.18,1427.96,11221.18,0.0,0.0,0.0
2,3,42.49447,-76.06584,46607360,21,16,702.31,781753.99,884.17,0.0,...,411.17,411.17,1267.02,3902.24,3906.49,227.04,15164.77,0.0,85.21,85.21
3,4,39.84525,-77.49818,162816,7,6,435.78,3240972.63,1800.27,0.42,...,112.86,112.86,112.86,112.86,12790.22,81.32,12790.22,17.61,95.6,95.6
4,5,36.12522,-113.40178,2304,4,4,1766.67,7530375.89,2744.15,0.33,...,801.17,801.17,9527.87,9527.87,9527.87,768.83,9527.87,0.0,0.0,88.89


## Join both

In [9]:
peering_df_joined_with_asrank_and_domains_and_geoloc = pd.merge(
    peering_df_joined_with_asrank_and_domains,
    ch_df,
    left_on='asn',
    right_on='asn',
    how='left'
)
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'] = peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()
peering_df_joined_with_asrank_and_domains_and_geoloc.columns.tolist()

['asn',
 'org_name',
 'country',
 'source',
 'info_type',
 'rank',
 'asnDegree_total',
 'asnDegree_customer',
 'asnDegree_peer',
 'asnDegree_provider',
 'cone_numberAsns',
 'cone_numberPrefixes',
 'cone_numberAddresses',
 'ASN',
 'domains',
 'center_lat',
 'center_lon',
 'total_weight',
 'unique_points',
 'country_count',
 'mean_km',
 'var_km2',
 'std_km',
 'iqr_km',
 'p25_km',
 'p50_km',
 'p75_km',
 'p90_km',
 'p95_km',
 'p99_km',
 'min_km',
 'max_km',
 'pct_ips_le_100km',
 'pct_ips_le_500km',
 'pct_ips_le_1000km']

In [10]:
category_map = {
    "NSP": "Transit",
    "Content": "Content",
    "Cable/DSL/ISP": "Access",
    "Enterprise": "Enterprise",
    "Educational/Research": "Education/Research",
    "Non-Profit": "Enterprise",
    "Government": "Enterprise",
    "Route Server": "Network Services",
    "Route Collector": "Network Services",
    "Network Services": "Network Services",
    "Not-Disclosed": "Unknown"
}

peering_df_joined_with_asrank_and_domains_and_geoloc["info_type"] = (
    peering_df_joined_with_asrank_and_domains_and_geoloc["info_type"]
    .map(category_map)
    .fillna(peering_df_joined_with_asrank_and_domains_and_geoloc["info_type"])
)
peering_df_joined_with_asrank_and_domains_and_geoloc["info_type"].value_counts()

Access                12561
Transit                4166
Enterprise             2575
Content                2540
Education/Research     1529
Network Services       1504
Name: info_type, dtype: int64

# Classification

## TF-IDF

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== Daten ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()
df["org_name"] = df["org_name"].fillna("unknown").str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)]  # sehr kleine Klassen raus (optional)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["org_name"], df["info_type"], test_size=0.13, random_state=42, stratify=df["info_type"]
)

# Gemeinsamer Vectorizer (fit nur auf Train!)
vec = TfidfVectorizer(analyzer="char", ngram_range=(1,6),
                      lowercase=True, min_df=1, sublinear_tf=True)

# ==== 1) SVM + Kalibrierung ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

svm_pipe = Pipeline([
    ("tfidf", vec),
    ("svm_cal", svm_cal)
])

svm_pipe.fit(X_train_text, y_train)
y_pred_svm = svm_pipe.predict(X_test_text)
print("\n=== SVM (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm))



=== SVM (calibrated) ===
Accuracy: 0.6004947433518862
Macro-F1: 0.4486005064207326
                    precision    recall  f1-score   support

            Access       0.64      0.91      0.75      1633
           Content       0.45      0.31      0.36       330
Education/Research       0.66      0.48      0.56       199
        Enterprise       0.53      0.32      0.40       335
  Network Services       0.64      0.25      0.36       195
           Transit       0.41      0.20      0.27       542

          accuracy                           0.60      3234
         macro avg       0.56      0.41      0.45      3234
      weighted avg       0.57      0.60      0.56      3234



## Combine

In [12]:
# === TF-IDF (org_name) + Numerik -> Calibrated LinearSVC ===
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ---------- Daten vorbereiten ----------
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Text normalisieren
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()

# Sehr kleine Klassen optional rausfiltern
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Zielspalte
y = df["info_type"].astype(str)

# Numerische Spalten bestimmen (alles außer Text/Kat/Label)
ignore = {"org_name", "info_type", "country", "source"}
num_candidates = [c for c in df.columns if c not in ignore]

# Nur numerisch verwertbare Spalten (coerce -> float)
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce")
num_df = num_df.replace([np.inf, -np.inf], np.nan)

num_cols = num_df.columns.tolist()

# Feature-DataFrame für Pipeline
X = pd.concat([df[["org_name"]].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ---------- Train/Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, random_state=42, stratify=y
)

# ---------- Preprocessing ----------
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

num_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler())
])

pre = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                                 lowercase=True, sublinear_tf=True, min_df=1),
         "org_name"),
        ("num",  num_pipe, num_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

# ---------- SVM + Kalibrierung ----------
base_svm = LinearSVC(C=0.35, class_weight="balanced", random_state=42)

pipe = Pipeline([
    ("pre", pre),
    ("svm", base_svm)
])

# ---------- Trainieren ----------
pipe.fit(X_train, y_train)

# ---------- Evaluieren ----------
y_pred = pipe.predict(X_test)
print("\n=== TF-IDF(Text) + Numerik -> Calibrated LinearSVC ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))

# ---------- (Optional) Probas für Ensembling ----------
# P_test = pipe.predict_proba(X_test)   # shape [N, n_classes]
# classes_ = pipe.named_steps["svm"].classes_.tolist()



=== TF-IDF(Text) + Numerik -> Calibrated LinearSVC ===
Accuracy: 0.6360544217687075
Macro-F1: 0.5344013616071771
                    precision    recall  f1-score   support

            Access       0.76      0.81      0.79      1633
           Content       0.45      0.52      0.48       330
Education/Research       0.61      0.63      0.62       199
        Enterprise       0.47      0.44      0.46       335
  Network Services       0.42      0.44      0.43       195
           Transit       0.52      0.37      0.43       542

          accuracy                           0.64      3234
         macro avg       0.54      0.54      0.53      3234
      weighted avg       0.63      0.64      0.63      3234



In [13]:
# ===============================================
# TF-IDF (org_name) + Numerik + country
# -> LinearSVC (unkalibriert)  und  SGD(modified_huber, mit Probas)
# ===============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import FunctionTransformer

def amplify_numeric(X, factor=3.0):
    return X * factor

# ==== 0) Quelle laden (passe ggf. den DF-Namen an) ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# ==== 1) Vorverarbeitung & Label-Filter ====
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Ziel
y = df["info_type"].astype(str)

# ==== 2) Feature-Spalten bestimmen ====
# Wir nehmen country dazu (starkes Signal) und alle numerisch konvertierbaren Spalten
ignore = {"org_name", "info_type", "source"}  # 'country' NICHT ignorieren
all_cols = df.columns.tolist()
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []

# numerische Kandidaten = alles außer Text/Label/Source/country
num_candidates = [c for c in all_cols if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

# Endgültiges X-DF
X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ==== 3) Train/Test Split ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ==== 4) Preprocessor bauen ====
# Numerik: Impute -> log1p -> MaxAbsScaler (gut in Kombi mit TF-IDF, bleibt sparse-freundlich)
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    # Median-Imputation passiert vorher; hier nur Sicherung
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler()),
    ("boost", FunctionTransformer(lambda X: X * 3.0, validate=False)),  # Gewichtung!
])

transformers = [
    ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                             lowercase=True, sublinear_tf=True, min_df=1), text_col),
]
if cat_cols:
    transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols))
if num_cols:
    transformers.append(("num", num_pipe, num_cols))

pre = ColumnTransformer(transformers, remainder="drop", sparse_threshold=0.3)

# ==== 5) MODELL A: LinearSVC (unkalibriert) ====
svm_linear = LinearSVC(
    C=0.35,
    class_weight="balanced",
    max_iter=5000,          # Konvergenz sichern
    random_state=42
)
pipe_svm = Pipeline([("pre", pre), ("clf", svm_linear)])
pipe_svm.fit(X_train, y_train)

y_pred_svm = pipe_svm.predict(X_test)
print("\n=== LinearSVC (TF-IDF + Numerik + country) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm, zero_division=0))

# ==== 6) MODELL B: SGDClassifier (modified_huber) -> liefert predict_proba ====
sgd = SGDClassifier(
    loss="modified_huber",   # SVM-ähnlich, aber mit Probas
    alpha=1e-4,
    class_weight="balanced",
    max_iter=5000,
    tol=1e-3,
    random_state=42
)
pipe_sgd = Pipeline([("pre", pre), ("clf", sgd)])
pipe_sgd.fit(X_train, y_train)

y_pred_sgd = pipe_sgd.predict(X_test)
print("\n=== SGD(modified_huber) (TF-IDF + Numerik + country) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Macro-F1:", f1_score(y_test, y_pred_sgd, average="macro"))
print(classification_report(y_test, y_pred_sgd, zero_division=0))

# Optional: Probas (für Ensembling/Routing)
# P_test = pipe_sgd.predict_proba(X_test)
# classes_ = pipe_sgd.named_steps["clf"].classes_.tolist()

# ==== 7) Mini-Tuning (optional, schnell) ====
# Wenn du noch 2-3 Punkte rausholen willst, probier leicht andere C/alpha:
#   - LinearSVC: C in [0.25, 0.35, 0.5, 0.75, 1.0]
#   - SGD alpha in [5e-5, 1e-4, 2e-4]
# Oder n-gram Range auf (2,7) testen.



=== LinearSVC (TF-IDF + Numerik + country) ===
Accuracy: 0.6499690785405071
Macro-F1: 0.5470150591016102
                    precision    recall  f1-score   support

            Access       0.78      0.82      0.80      1633
           Content       0.46      0.55      0.50       330
Education/Research       0.62      0.59      0.61       199
        Enterprise       0.51      0.48      0.49       335
  Network Services       0.41      0.44      0.42       195
           Transit       0.53      0.41      0.46       542

          accuracy                           0.65      3234
         macro avg       0.55      0.55      0.55      3234
      weighted avg       0.65      0.65      0.65      3234


=== SGD(modified_huber) (TF-IDF + Numerik + country) ===
Accuracy: 0.5714285714285714
Macro-F1: 0.48382855621293547
                    precision    recall  f1-score   support

            Access       0.81      0.69      0.75      1633
           Content       0.60      0.25      0.35    

In [None]:
# --- schnelle & stabile Pipeline ---
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Labels filtern
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)
y = df["info_type"].astype(str)

# Spalten
ignore = {"org_name", "info_type", "source"}
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []
num_candidates = [c for c in df.columns if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    A = np.clip(A, 0.0, None)
    return np.log1p(A)

# Numerik (klein → darf dicht sein)
NUM_BOOST = 8.0
num_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", StandardScaler()),
    ("boost", FunctionTransformer(lambda X: X * NUM_BOOST, validate=False)),
])

# Text: TF-IDF → SVD (SVD reduziert massiv! DANN ist Dichte ok)
text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char", ngram_range=(2, 6),
        lowercase=True, sublinear_tf=True,
        min_df=3,          # etwas strenger für Speed
        dtype=np.float32
    )),
    ("svd", TruncatedSVD(n_components=350, random_state=42, n_iter=5)),
])

# Country: OHE (wenig Kardinalität → dicht ok)
try:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

transformers = [("text", text_pipe, text_col)]
if cat_cols: transformers.append(("cat", cat_encoder, cat_cols))
if num_cols: transformers.append(("num", num_pipe, num_cols))

# WICHTIG: KEIN sparse_threshold=0.0 -> nichts wird unnötig verdichtet
pre = ColumnTransformer(transformers, remainder="drop")

clf = LogisticRegression(
    solver="saga",
    penalty="l2",
    C=2.0,
    max_iter=4000,               # dank SVD meist ausreichend
    class_weight="balanced",
    n_jobs=-1,
    multi_class="multinomial",
)

pipe = Pipeline([("pre", pre), ("clf", clf)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print("\n=== Fast LogisticRegression (TFIDF->SVD + OHE + Num[boost]) ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))


In [17]:
# === Nur Numerik -> HistGradientBoostingClassifier (mit Log1p für schiefe Spalten) ===
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

# ---------- 1) Daten vorbereiten ----------
# Nimm den gleichen DF wie in deinen letzten Läufen:
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# Optional: sehr kleine Klassen entfernen (wie zuvor)
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

# Ziel
y = df["info_type"].astype(str)

# Kandidaten für Numerik: alles außer offensichtlichem Text/Kat/Label
ignore = {"org_name", "info_type", "country", "source"}
num_candidates = [c for c in df.columns if c not in ignore]
num_candidates.remove("asn")  # ASN ist numerisch, aber keine sinnvolle Zahl

# In numerisch zwingen; Unendlichkeiten -> NaN
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)

# Falls Spalten komplett NaN sind, droppen
num_df = num_df.drop(columns=num_df.columns[num_df.isna().all()], errors="ignore")

# Endgültige Numerik-Spaltenliste
num_cols = num_df.columns.tolist()

# Feature-DF
X = num_df.copy()

# ---------- 2) Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ---------- 3) Spalten in log/linear trennen ----------
# Log1p nur für Spalten, die (nach Imputation) nichtnegativ sind
# Wir schätzen die Nichtnegativität vorab grob über die Trainingsdaten (NaN -> 0 für Prüfung)
_train_tmp = X_train.copy()
_train_tmp = _train_tmp.fillna(0)
log_cols = [c for c in _train_tmp.columns if _train_tmp[c].min() >= 0.0]
lin_cols = [c for c in _train_tmp.columns if c not in log_cols]

def log1p_array(A):
    A = np.asarray(A, dtype=float)
    # A kommt nach Imputation; zur Sicherheit clippen
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

# ---------- 4) Preprocessing ----------
num_log_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("sc",    StandardScaler(with_mean=False)),
])

num_lin_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc",  StandardScaler(with_mean=False)),
])

transformers = []
if log_cols:
    transformers.append(("num_log", num_log_pipe, log_cols))
if lin_cols:
    transformers.append(("num_lin", num_lin_pipe, lin_cols))

pre = ColumnTransformer(transformers, remainder="drop")

# ---------- 5) Modell ----------
hgb = HistGradientBoostingClassifier(
    learning_rate=0.06,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42
)

pipe = Pipeline([
    ("pre", pre),
    ("clf", hgb)
])

# ---------- 6) Train ----------
pipe.fit(X_train, y_train)

# ---------- 7) Eval ----------
y_pred = pipe.predict(X_test)
print("\n=== Nur Numerik -> HistGradientBoosting ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))

# ---------- 8) (Optional) Feature-Importance per Permutation ----------
# Hinweis: kann etwas dauern; zeigt dir, welche Numerik-Spalten wirklich tragen.
from sklearn.inspection import permutation_importance

res = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42)
# ColumnTransformer erzeugt eine kombinierte Featureliste:
feature_names = []
if log_cols:
    feature_names += [f"[log] {c}" for c in log_cols]
if lin_cols:
    feature_names += [f"[lin] {c}" for c in lin_cols]

imp = pd.Series(res.importances_mean, index=feature_names).sort_values(ascending=False)
print("\nTop-20 Numerik-Features (Permutation Importance):")
print(imp.head(20))



=== Nur Numerik -> HistGradientBoosting ===
Accuracy: 0.44712430426716143
Macro-F1: 0.26504239987683176
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.81      0.55      0.66      1633
             Content       0.41      0.39      0.40       330
Educational/Research       0.30      0.25      0.27       199
          Enterprise       0.23      0.25      0.24       233
          Government       0.05      0.28      0.08        18
                 NSP       0.40      0.32      0.36       542
    Network Services       0.10      0.18      0.13       110
          Non-Profit       0.18      0.33      0.23        84
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.16      0.93      0.28        81

            accuracy                           0.45      3234
           macro avg       0.26      0.35      0.27      3234
        weighted avg       0.57      0.45      0.49      3234



KeyboardInterrupt: 

In [None]:
# ===============================================
# TF-IDF (org_name) + Numerik + country
# -> LinearSVC (unkalibriert)  und  SGD(modified_huber, mit Probas)
# == Numerik gezielt boosten + Wirkung prüfen
# ===============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== 0) Quelle laden ====
df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()

# ==== 1) Vorverarbeitung & Label-Filter ====
df["org_name"] = df["org_name"].fillna("unknown").astype(str).str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)].reset_index(drop=True)

y = df["info_type"].astype(str)

# ==== 2) Feature-Spalten ====
ignore = {"org_name", "info_type", "source"}  # 'country' NICHT ignorieren
all_cols = df.columns.tolist()
text_col = "org_name"
cat_cols  = ["country"] if "country" in df.columns else []

num_candidates = [c for c in all_cols if c not in ignore.union({text_col}).union(set(cat_cols))]
num_df = df[num_candidates].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
num_cols = num_df.columns.tolist()

X = pd.concat([df[[text_col] + cat_cols].reset_index(drop=True),
               num_df.reset_index(drop=True)], axis=1)

# ==== 3) Split ====
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.13, stratify=y, random_state=42
)

# ==== 4) Preprocessor ====
def log1p_array(A):
    A = np.asarray(A, dtype=float)
    A = np.where(np.isfinite(A), A, np.nan)
    A = np.clip(A, a_min=0.0, a_max=None)
    return np.log1p(A)

# >>> EINZIGER KNOPF: Boost-Faktor für Numerik <<<
NUM_BOOST = 8.0   # typ. 5–15 probieren; 8 ist guter Start

num_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="median")),
    ("log1p", FunctionTransformer(log1p_array, validate=True)),
    ("scale", MaxAbsScaler()),                             # sparse-freundlich
    ("boost", FunctionTransformer(lambda X: X * NUM_BOOST, # hier boosten!
                                  validate=False)),
])

# OneHotEncoder explizit SPARSE lassen (Performance, kein Densify)
try:
    cat_enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    cat_enc = OneHotEncoder(handle_unknown="ignore", sparse=True)

transformers = [
    ("text", TfidfVectorizer(analyzer="char", ngram_range=(2,6),
                             lowercase=True, sublinear_tf=True, min_df=1,
                             dtype=np.float32), text_col),
]
if cat_cols:
    transformers.append(("cat", cat_enc, cat_cols))
if num_cols:
    transformers.append(("num", num_pipe, num_cols))

# Wichtig: NICHT künstlich verdichten
pre = ColumnTransformer(transformers, remainder="drop")  # kein sparse_threshold=0.0

# ==== 5) MODELL A: LinearSVC ====
svm_linear = LinearSVC(
    C=0.5,                      # etwas höher, weil wir Numerik pushen
    class_weight="balanced",
    max_iter=5000,
    random_state=42
)
pipe_svm = Pipeline([("pre", pre), ("clf", svm_linear)])
pipe_svm.fit(X_train, y_train)

y_pred_svm = pipe_svm.predict(X_test)
print("\n=== LinearSVC (TF-IDF + Numerik*{:.1f} + country) ===".format(NUM_BOOST))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm, zero_division=0))

# ==== 6) MODELL B: SGD (modified_huber) ====
sgd = SGDClassifier(
    loss="modified_huber",
    alpha=1e-4,                 # ggf. auf 5e-5 senken, wenn du noch mehr Numerik willst
    class_weight="balanced",
    max_iter=5000,
    tol=1e-3,
    random_state=42
)
pipe_sgd = Pipeline([("pre", pre), ("clf", sgd)])
pipe_sgd.fit(X_train, y_train)

y_pred_sgd = pipe_sgd.predict(X_test)
print("\n=== SGD(modified_huber) (TF-IDF + Numerik*{:.1f} + country) ===".format(NUM_BOOST))
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Macro-F1:", f1_score(y_test, y_pred_sgd, average="macro"))
print(classification_report(y_test, y_pred_sgd, zero_division=0))

# ==== 7) Kurzer Wirkungs-Check: tragen die Numerik-Koeffizienten jetzt mehr? ====
def block_norm_ratio(pipe):
    pre = pipe.named_steps["pre"]
    clf = pipe.named_steps["clf"]
    names = pre.get_feature_names_out()

    # Indexe der Numerik-Features
    num_idx = np.array([i for i, n in enumerate(names) if n.startswith("num__")])
    if num_idx.size == 0 or not hasattr(clf, "coef_"):
        return np.nan

    # multi-class: L2-Norm pro Klasse mitteln
    W = clf.coef_          # shape [K, D] (bei binary ggf. [1, D])
    num_norm = np.mean(np.linalg.norm(W[:, num_idx], axis=1))
    all_norm = np.mean(np.linalg.norm(W, axis=1))
    return float(num_norm / (all_norm + 1e-12))

print("\n[Diagnose] Anteil der Numerik-Koeffizienten (L2-Norm) an Gesamt:")
print("LinearSVC  num/all =", block_norm_ratio(pipe_svm))
print("SGD        num/all =", block_norm_ratio(pipe_sgd))


In [21]:
peering_df_joined_with_asrank_and_domains_and_geoloc.columns

Index(['asn', 'org_name', 'country', 'source', 'info_type', 'rank',
       'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer',
       'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
       'cone_numberAddresses', 'ASN', 'domains', 'center_lat', 'center_lon',
       'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2',
       'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
       'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km',
       'pct_ips_le_1000km'],
      dtype='object')

In [35]:
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()


0                                        gtt americas, llc
1                                akamai international b.v.
2                                                   dalnet
3                                    swisscom (schweiz) ag
4                                  cox communications inc.
                               ...                        
24870    max technology & support services private limited
24871                                                  NaN
24872                                                  NaN
24873                                      bjoern schleyer
24874                                         kiwi telecom
Name: org_name, Length: 24875, dtype: object

In [42]:
peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'] = peering_df_joined_with_asrank_and_domains_and_geoloc['org_name'].fillna('unknown').str.lower()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler
import numpy as np
from sklearn.preprocessing import FunctionTransformer, RobustScaler, StandardScaler, LabelEncoder

df = peering_df_joined_with_asrank_and_domains_and_geoloc.copy()
X = df[['org_name', 'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
    'cone_numberAddresses', 'domains',  'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
    'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']]
y = df['info_type']
le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)
text_col = 'org_name'
num_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes',
    'cone_numberAddresses', 'domains',  'total_weight', 'unique_points', 'country_count', 'mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km',
    'p99_km', 'min_km', 'max_km', 'pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']

pct_cols = ['pct_ips_le_100km', 'pct_ips_le_500km', 'pct_ips_le_1000km']

# Distanz-/Streumaße (km): oft stark rechtsschief
km_cols = ['mean_km', 'var_km2', 'std_km', 'iqr_km', 'p25_km', 'p50_km', 'p75_km', 'p90_km', 'p95_km', 'p99_km',
           'min_km', 'max_km']

# Zähl-/Skalenwerte: Ränge, Degrees, Cones, Domains, Gewichte, Länder, Punkte
count_like_cols = ['rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider',
                   'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses',
                   'domains', 'total_weight', 'unique_points', 'country_count']

# Falls du ASN wirklich als Zahl nutzen willst: packe ihn hier rein
# count_like_cols += ['ASN']

# --- Teil-Pipelines ---

# 1) Count/Skalen: impute -> log1p -> robust scale
count_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda X: np.log1p(np.clip(X, a_min=0, a_max=None)), feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 2) km-Metriken: impute -> log1p -> robust scale
km_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("log1p", FunctionTransformer(lambda X: np.log1p(np.clip(X, a_min=0, a_max=None)), feature_names_out="one-to-one")),
    ("scale", RobustScaler())
])

# 3) Prozente: impute -> clip [0,1] -> standard scale
pct_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median", add_indicator=True)),
    ("clip01", FunctionTransformer(lambda X: np.clip(X, 0.0, 1.0), feature_names_out="one-to-one")),
    ("scale", StandardScaler())
])

# --- Text-Pipeline (deine) ---
from sklearn.feature_extraction.text import TfidfVectorizer
text_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(2,4),
        min_df=1,
        lowercase=True
    ))
])

text_col = "org_name"

# --- ColumnTransformer zusammenbauen ---
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipe, text_col),
        ("num_count", count_pipe, count_like_cols),
        ("num_km", km_pipe, km_cols),
        ("num_pct", pct_pipe, pct_cols),
    ],
    remainder="drop"
)
from xgboost import XGBClassifier
clf = Pipeline([
    ("prep", preprocessor),
    ("model", XGBClassifier(
        tree_method="gpu_hist",         # oder "gpu_hist" bei NVIDIA
        n_estimators=500,
        learning_rate=0.1,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss"
    ))
])


# Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.25, random_state=42, stratify=y_enc
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred, zero_division=0))
