# Load the Data
## Peeringdb

In [1]:
import json
from pathlib import Path
import pandas as pd

filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


# Caida AS Names

In [2]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


## Join both

In [3]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


In [19]:
category_map = {
    "NSP": "Transit",
    "Content": "Content",
    "Cable/DSL/ISP": "Access",
    "Enterprise": "Enterprise",
    "Educational/Research": "Education/Research",
    "Non-Profit": "Enterprise",
    "Government": "Enterprise",
    "Route Server": "Network Services",
    "Route Collector": "Network Services",
    "Network Services": "Network Services",
    "Not-Disclosed": "Unknown"
}

peering_df_joined["info_type"] = (
    peering_df_joined["info_type"]
    .map(category_map)
    .fillna(peering_df_joined["info_type"])
)
peering_df_joined["info_type"].value_counts()

Access                11787
Transit                3982
Content                2486
Enterprise             2460
Network Services       1458
Education/Research     1457
Name: info_type, dtype: int64

# Classification

## TF-IDF

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== Daten ====
df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("unknown").str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)]  # sehr kleine Klassen raus (optional)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["org_name"], df["info_type"], test_size=0.13, random_state=42, stratify=df["info_type"]
)

# Gemeinsamer Vectorizer (fit nur auf Train!)
vec = TfidfVectorizer(analyzer="char", ngram_range=(1,6),
                      lowercase=True, min_df=1, sublinear_tf=True)

# ==== 1) SVM + Kalibrierung ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

svm_pipe = Pipeline([
    ("tfidf", vec),
    ("svm_cal", svm_cal)
])

svm_pipe.fit(X_train_text, y_train)
y_pred_svm = svm_pipe.predict(X_test_text)
print("\n=== SVM (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm))



=== SVM (calibrated) ===
Accuracy: 0.58984375
Macro-F1: 0.43285540823646373
                    precision    recall  f1-score   support

            Access       0.63      0.92      0.75      1532
           Content       0.45      0.33      0.38       323
Education/Research       0.67      0.49      0.56       189
        Enterprise       0.43      0.25      0.31       320
  Network Services       0.64      0.25      0.36       190
           Transit       0.40      0.16      0.23       518

          accuracy                           0.59      3072
         macro avg       0.54      0.40      0.43      3072
      weighted avg       0.55      0.59      0.54      3072



## Bert

In [21]:
# === Ersatz für den HF-Datasets-Teil (kein pyarrow/datasets nötig) ===
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer,
                          EarlyStoppingCallback, TextClassificationPipeline)

# --------- Konfig ---------
MODEL_NAME   = "xlm-roberta-base"   # multilingual, starkes Baseline-Modell
MAX_LENGTH   = 64                   # Org-Namen sind kurz -> 64 reicht
LR           = 1e-5
EPOCHS       = 20
BATCH_SIZE   = 32
WARMUP_RATIO = 0.06
SEED         = 42
OUT_DIR      = "xlmr_org_trainer_out"

tok = AutoTokenizer.from_pretrained(MODEL_NAME)
le = LabelEncoder()

le = LabelEncoder()
df = peering_df_joined
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = len(le.classes_)
print(f"Labels: {num_labels} Klassen ->", list(le.classes_))

df["label_id"] = le.fit_transform(df["info_type"])


df.fillna('Unknown', inplace=True)

# Prüfe GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# Train/Validation Split (stratifiziert)
train_df, eval_df = train_test_split(
    df[["org_name", "label_id"]],
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# Texte & Labels aus den bereits vorbereiteten DataFrames (train_df, eval_df)
train_texts = train_df["org_name"].tolist()
eval_texts  = eval_df["org_name"].tolist()
y_train_np  = train_df["label_id"].to_numpy()
y_eval_np   = eval_df["label_id"].to_numpy()
num_labels  = df["label_id"].nunique()



# Tokenisierung OHNE Padding (Padding macht später der DataCollator)
train_enc = tok(train_texts, truncation=True, max_length=MAX_LENGTH)
eval_enc  = tok(eval_texts,  truncation=True, max_length=MAX_LENGTH)

class SimpleHFLikeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.enc = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

ds_train = SimpleHFLikeDataset(train_enc, y_train_np)
ds_eval  = SimpleHFLikeDataset(eval_enc,  y_eval_np)

collator = DataCollatorWithPadding(tokenizer=tok)

valid_classes = sorted(df["info_type"].unique())

# ---- Modell + Class Weights wie gehabt ----
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={int(i): c for i, c in enumerate(valid_classes)},
    label2id={c: int(i) for i, c in enumerate(valid_classes)}
).to(device)

# Class-Weights aus dem Trainingssplit
class_counts = np.bincount(y_train_np, minlength=num_labels)
weights = class_counts.sum() / np.maximum(class_counts, 1)
weights = weights / weights.mean()
class_weights = torch.tensor(weights, dtype=torch.float, device=device)
print("Class weights:", np.round(weights, 3))

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir=OUT_DIR + "/checkpoints",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    warmup_ratio=WARMUP_RATIO,
    fp16=(device.type=="cuda"),
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    report_to=["none"],
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    return {
        "accuracy":  float(accuracy_score(labels, preds)),
        "f1_macro":  float(f1_score(labels, preds, average="macro")),
        "precision": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "recall":    float(recall_score(labels, preds, average="macro")),
    }

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
metrics = trainer.evaluate()
print("Eval:", metrics)

trainer.save_model(OUT_DIR + "/model")
tok.save_pretrained(OUT_DIR + "/model")


Labels: 6 Klassen -> ['Access', 'Content', 'Education/Research', 'Enterprise', 'Network Services', 'Transit']
Device: cuda
GPU-Name: NVIDIA GeForce RTX 4070 Laptop GPU


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: [0.202 0.959 1.636 0.969 1.636 0.599]




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,1.6261,1.544434,0.509766,0.308875,0.359607,0.367892
2,1.4927,1.415287,0.510417,0.414821,0.405639,0.45737
3,1.365,1.356411,0.493815,0.418727,0.402557,0.462343
4,1.354,1.371389,0.484049,0.430136,0.447004,0.476403
5,1.2863,1.350164,0.49707,0.430287,0.41067,0.486535
6,1.214,1.357503,0.500326,0.447261,0.427028,0.492556
7,1.245,1.394686,0.464844,0.424991,0.423324,0.477538
8,1.1771,1.448541,0.465495,0.423897,0.419283,0.466403


Eval: {'eval_loss': 1.357502818107605, 'eval_accuracy': 0.5003255208333334, 'eval_f1_macro': 0.44726056996683994, 'eval_precision': 0.42702826333251176, 'eval_recall': 0.49255561938059694, 'eval_runtime': 1.2179, 'eval_samples_per_second': 2522.412, 'eval_steps_per_second': 78.825, 'epoch': 8.0}


('xlmr_org_trainer_out/model/tokenizer_config.json',
 'xlmr_org_trainer_out/model/special_tokens_map.json',
 'xlmr_org_trainer_out/model/tokenizer.json')

In [22]:
# ==== Ensemble aus kalibrierter SVM + XLM-R (Late Fusion) ====
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 1) Hilfsfunktionen: Probas holen und Klassenreihenfolge erzwingen
label_names = list(valid_classes)  # <- gleiche Reihenfolge wie beim HF-Modell (id2label/label2id)
label_index = {lbl: i for i, lbl in enumerate(label_names)}

def svm_proba(texts):
    """Kalibrierte SVM-Probas in label_names-Reihenfolge."""
    # svm_pipe.classes_ enthält die Klassenreihenfolge des SVM-Teils
    svm_labels = list(svm_pipe.named_steps["svm_cal"].classes_) if hasattr(svm_pipe.named_steps["svm_cal"], "classes_") \
                 else list(svm_pipe.classes_)
    proba = svm_pipe.predict_proba(texts)  # shape: [N, n_classes_svm]
    # Auf label_names umsortieren
    idx_map = [svm_labels.index(lbl) for lbl in label_names]
    proba_sorted = proba[:, idx_map]
    return proba_sorted

@torch.no_grad()
def xlmr_proba(texts, batch_size=64, max_length=256):
    """Transformer-Softmax-Probas in label_names-Reihenfolge (id2label stimmt auf valid_classes)."""
    model.eval()
    all_probs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tok(batch, truncation=True, max_length=max_length, padding=True, return_tensors="pt")
        enc = {k: v.to(model.device) for k, v in enc.items()}
        logits = model(**enc).logits  # [B, num_labels]
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        all_probs.append(probs)
    return np.vstack(all_probs)  # [N, num_labels] schon in label_names-Reihenfolge

# 2) Probas auf deinem (Ensemble-)Validierungs/Testsplit erzeugen
X_eval = eval_df["org_name"].tolist()
y_eval = y_eval_np  # ints passend zur Reihenfolge in label_names

P_svm  = svm_proba(X_eval)              # [N, C]
P_xlmr = xlmr_proba(X_eval, max_length=MAX_LENGTH)  # [N, C]

# 3) Gewicht per einfacher Grid-Search finden (0..1)
grid = np.linspace(0.0, 1.0, 21)  # 0.00, 0.05, ..., 1.00
best = {"w": None, "f1": -1.0, "acc": 0.0}

for w in grid:
    P_ens = w * P_svm + (1.0 - w) * P_xlmr
    y_hat = P_ens.argmax(axis=1)
    f1 = f1_score(y_eval, y_hat, average="macro")
    acc = accuracy_score(y_eval, y_hat)
    if f1 > best["f1"] or (f1 == best["f1"] and acc > best["acc"]):
        best.update({"w": float(w), "f1": float(f1), "acc": float(acc)})

print(f"\n=== Ensemble-Gewichtssuche ===")
print(f"Bestes w (SVM-Anteil): {best['w']:.2f} | Macro-F1: {best['f1']:.4f} | Acc: {best['acc']:.4f}")

# 4) Finale Ensemble-Vorhersage + Report
w = best["w"]
P_ens = w * P_svm + (1.0 - w) * P_xlmr
y_pred = P_ens.argmax(axis=1)

print("\n=== Ensemble (SVM^w + XLM-R^(1-w)) auf Eval ===")
print("Accuracy:", accuracy_score(y_eval, y_pred))
print("Macro-F1:", f1_score(y_eval, y_pred, average="macro"))
print(classification_report(y_eval, y_pred, target_names=label_names))

# 5) Praktische Inferenzfunktion fürs spätere Nutzen
def ensemble_predict(texts, return_proba=False, batch_size=64):
    Ps = svm_proba(texts)
    Pt = xlmr_proba(texts, batch_size=batch_size, max_length=MAX_LENGTH)
    P = w * Ps + (1.0 - w) * Pt
    preds = P.argmax(axis=1)
    if return_proba:
        return preds, P
    return preds

# Beispiel:
# preds, proba = ensemble_predict(["google llc", "university of oxford"], return_proba=True)



=== Ensemble-Gewichtssuche ===
Bestes w (SVM-Anteil): 0.50 | Macro-F1: 0.4886 | Acc: 0.5977

=== Ensemble (SVM^w + XLM-R^(1-w)) auf Eval ===
Accuracy: 0.59765625
Macro-F1: 0.4885708991717628
                    precision    recall  f1-score   support

            Access       0.72      0.81      0.76      1532
           Content       0.40      0.48      0.44       323
Education/Research       0.56      0.76      0.65       189
        Enterprise       0.40      0.39      0.40       320
  Network Services       0.56      0.35      0.43       190
           Transit       0.39      0.20      0.26       518

          accuracy                           0.60      3072
         macro avg       0.51      0.50      0.49      3072
      weighted avg       0.58      0.60      0.58      3072



In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

# ==== Beispiel-Daten ====
df = peering_df_joined.copy()
# df["org_name"], df["country"], df["asn"], df["ix_count"], df["info_type"]

df["org_name"] = df["org_name"].fillna("unknown").str.lower()

# ==== Hilfsfunktion für Regex-Features aus dem Namen ====
def regex_features(X):
    X = X["org_name"].astype(str).str.lower()
    return pd.DataFrame({
        "has_isp": X.str.contains(r"isp|telecom|broadband|internet").astype(int),
        "has_univ": X.str.contains(r"univ|college|schule|academy").astype(int),
        "has_gov": X.str.contains(r"gov|ministerium|city|state|municipal").astype(int),
        "has_ix": X.str.contains(r"\bix\b|exchange|route").astype(int),
        "has_asn": X.str.contains(r"as\d+").astype(int),
    })

# ==== Spalten definieren ====
text_col = "org_name"
regex_col = ["org_name"]

# ==== ColumnTransformer aufbauen ====
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2,6), sublinear_tf=True), text_col),
    ("regex", FunctionTransformer(regex_features), regex_col)
])

# ==== Modell ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

pipe = Pipeline([
    ("features", preprocessor),
    ("clf", svm_cal)
])

# ==== Train/Test Split ====
X_train, X_test, y_train, y_test = train_test_split(
    df[["org_name"]],
    df["info_type"],
    test_size=0.13,
    stratify=df["info_type"],
    random_state=42
)

# ==== Trainieren + Bewerten ====
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))

Accuracy: 0.5846354166666666
Macro-F1: 0.340165375175505
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.62      0.92      0.74      1532
             Content       0.42      0.34      0.38       323
Educational/Research       0.68      0.48      0.56       189
          Enterprise       0.41      0.13      0.20       224
          Government       0.50      0.25      0.33        16
                 NSP       0.41      0.17      0.24       518
    Network Services       0.00      0.00      0.00       105
          Non-Profit       0.67      0.30      0.41        80
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.62      0.48      0.54        81

            accuracy                           0.58      3072
           macro avg       0.43      0.31      0.34      3072
        weighted avg       0.53      0.58      0.52      3072



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("unknown").str.lower()
X_train, X_test, y_train, y_test = train_test_split(
    df[["org_name"]],
    df["info_type"],
    test_size=0.13,
    stratify=df["info_type"],
    random_state=42
)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    loss_function='MultiClass',
    text_features=['org_name'],
    auto_class_weights='Balanced',
    eval_metric='TotalF1',
    verbose=100
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro-F1:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))
#!/usr/bin/env python3

0:	learn: 0.1464868	total: 997ms	remaining: 16m 35s
100:	learn: 0.3638968	total: 1m 32s	remaining: 13m 43s
200:	learn: 0.3849558	total: 3m 4s	remaining: 12m 12s
300:	learn: 0.4635967	total: 4m 37s	remaining: 10m 43s
400:	learn: 0.4926904	total: 6m 11s	remaining: 9m 14s
500:	learn: 0.5089481	total: 7m 45s	remaining: 7m 43s
600:	learn: 0.5220419	total: 9m 19s	remaining: 6m 11s
700:	learn: 0.5338786	total: 10m 58s	remaining: 4m 40s
800:	learn: 0.5424692	total: 12m 32s	remaining: 3m 6s
900:	learn: 0.5521177	total: 14m 8s	remaining: 1m 33s
999:	learn: 0.5574348	total: 15m 47s	remaining: 0us
Accuracy: 0.4248046875
Macro-F1: 0.26779344876719907
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.78      0.55      0.65      1532
             Content       0.25      0.45      0.32       323
Educational/Research       0.45      0.39      0.41       189
          Enterprise       0.20      0.28      0.23       224
          Government       0.04      0.56  

In [7]:
# === SBERT-Embeddings + LogisticRegression-Klassifikator ===
# Setzt voraus: pip install sentence-transformers scikit-learn joblib

import os
import re
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump, load
from sentence_transformers import SentenceTransformer

# --------- Konfig ---------
MODEL_NAME   = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"  # sehr gutes Multilingual-SBERT
MAX_LENGTH   = 64
SEED         = 42
OUT_DIR      = "sbert_org_cls"
os.makedirs(OUT_DIR, exist_ok=True)

rng = np.random.RandomState(SEED)
torch.manual_seed(SEED)

# --------- Daten ---------
# Erwartet: peering_df_joined mit Spalten ["org_name", "info_type"]
df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("Unknown").astype(str).str.strip()
df["info_type"] = df["info_type"].fillna("Unknown").astype(str).str.strip()

# (Optional) leichte Normalisierung von Rechtsformen & Sonderzeichen
def normalize_org_name(s: str) -> str:
    s = re.sub(r"[.,;:()\-_/]+", " ", s)               # Satzzeichen -> Leerzeichen
    s = re.sub(r"\b(ag|gmbh|mbh|ltd|llc|inc|s\.a\.|sa|sarl|co|kg|kgaa|se|oy|ab)\b", "", s, flags=re.I)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["org_name_norm"] = df["org_name"].map(normalize_org_name)

# Label-Encoding
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = len(le.classes_)
print(f"Labels: {num_labels} Klassen ->", list(le.classes_))

# Stratified Split
train_df, eval_df = train_test_split(
    df[["org_name_norm", "label_id"]],
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# --------- SBERT laden & Embeddings berechnen ---------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
model = SentenceTransformer(MODEL_NAME, device=device)

def embed(texts, batch_size=256):
    # normalize_embeddings=True kann bei Cosine-Ähnlichkeit helfen – für LR ist es optional.
    return model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=False
    )

X_train = embed(train_df["org_name_norm"].tolist())
X_eval  = embed(eval_df["org_name_norm"].tolist())
y_train = train_df["label_id"].to_numpy()
y_eval  = eval_df["label_id"].to_numpy()

# --------- Klassifikator: Logistic Regression (mit Klassenbalancierung) ---------
# Tipp: 'saga' oder 'lbfgs' funktionieren gut; C kann man via CV tunen.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = {c: w for c, w in zip(classes, class_weights)}
print("Class weights (LR):", np.round(class_weights, 3))

clf = LogisticRegression(
    multi_class="auto",
    solver="lbfgs",
    C=2.0,
    max_iter=2000,
    n_jobs=-1,
    class_weight=class_weight_dict,
    random_state=SEED,
)
clf.fit(X_train, y_train)

# --------- Evaluation ---------
y_pred = clf.predict(X_eval)
metrics = {
    "accuracy":  float(accuracy_score(y_eval, y_pred)),
    "f1_macro":  float(f1_score(y_eval, y_pred, average="macro")),
    "precision": float(precision_score(y_eval, y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(y_eval, y_pred, average="macro")),
}
print("Eval:", metrics)
print("\nClassification report:\n", classification_report(y_eval, y_pred, target_names=le.classes_))

# --------- Persistenz ---------
dump(clf, os.path.join(OUT_DIR, "clf.joblib"))
dump(le,  os.path.join(OUT_DIR, "label_encoder.joblib"))

# Hinweis: Das SBERT-Basis-Modell selbst wird nicht verändert. Für Reproduzierbarkeit:
with open(os.path.join(OUT_DIR, "MODEL_NAME.txt"), "w", encoding="utf-8") as f:
    f.write(MODEL_NAME)

# --------- Inferenz-Funktion ---------
def predict_org_classes(org_names):
    names_norm = [normalize_org_name(x or "Unknown") for x in org_names]
    X = embed(names_norm)
    yhat = clf.predict(X)
    labels = le.inverse_transform(yhat)
    return labels

# Beispiel:
# print(predict_org_classes(["Siemens AG", "Acme GmbH", "Universität Zürich"]))


  from .autonotebook import tqdm as notebook_tqdm


Labels: 10 Klassen -> ['Cable/DSL/ISP', 'Content', 'Educational/Research', 'Enterprise', 'Government', 'NSP', 'Network Services', 'Non-Profit', 'Route Collector', 'Route Server']
Device: cuda


Batches: 100%|██████████| 81/81 [00:02<00:00, 37.04it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 46.03it/s]


Class weights (LR): [ 0.2    0.95   1.621  1.373 18.689  0.593  2.941  3.857 76.141  3.793]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Eval: {'accuracy': 0.3688151041666667, 'f1_macro': 0.2633710874882087, 'precision': 0.24591686784053568, 'recall': 0.3843455165550887}

Classification report:
                       precision    recall  f1-score   support

       Cable/DSL/ISP       0.74      0.43      0.55      1532
             Content       0.25      0.34      0.29       323
Educational/Research       0.46      0.56      0.51       189
          Enterprise       0.18      0.28      0.22       224
          Government       0.09      0.50      0.16        16
                 NSP       0.25      0.18      0.21       518
    Network Services       0.06      0.15      0.08       105
          Non-Profit       0.16      0.39      0.23        80
     Route Collector       0.06      0.50      0.11         4
        Route Server       0.19      0.51      0.28        81

            accuracy                           0.37      3072
           macro avg       0.25      0.38      0.26      3072
        weighted avg       0.49 

In [8]:
# === SBERT-Embeddings + LogisticRegression-Klassifikator ===
# Setzt voraus: pip install sentence-transformers scikit-learn joblib

import os
import re
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump, load
from sentence_transformers import SentenceTransformer

# --------- Konfig ---------
MODEL_NAME   = "Vsevolod/company-names-similarity-sentence-transformer"  # sehr gutes Multilingual-SBERT
MAX_LENGTH   = 64
SEED         = 42
OUT_DIR      = "sbert_org_cls"
os.makedirs(OUT_DIR, exist_ok=True)

rng = np.random.RandomState(SEED)
torch.manual_seed(SEED)

# --------- Daten ---------
# Erwartet: peering_df_joined mit Spalten ["org_name", "info_type"]
df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("Unknown").astype(str).str.strip()
df["info_type"] = df["info_type"].fillna("Unknown").astype(str).str.strip()

# (Optional) leichte Normalisierung von Rechtsformen & Sonderzeichen
def normalize_org_name(s: str) -> str:
    s = re.sub(r"[.,;:()\-_/]+", " ", s)               # Satzzeichen -> Leerzeichen
    s = re.sub(r"\b(ag|gmbh|mbh|ltd|llc|inc|s\.a\.|sa|sarl|co|kg|kgaa|se|oy|ab)\b", "", s, flags=re.I)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["org_name_norm"] = df["org_name"].map(normalize_org_name)

# Label-Encoding
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = len(le.classes_)
print(f"Labels: {num_labels} Klassen ->", list(le.classes_))

# Stratified Split
train_df, eval_df = train_test_split(
    df[["org_name_norm", "label_id"]],
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# --------- SBERT laden & Embeddings berechnen ---------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
model = SentenceTransformer(MODEL_NAME, device=device)

def embed(texts, batch_size=256):
    # normalize_embeddings=True kann bei Cosine-Ähnlichkeit helfen – für LR ist es optional.
    return model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=False
    )

X_train = embed(train_df["org_name_norm"].tolist())
X_eval  = embed(eval_df["org_name_norm"].tolist())
y_train = train_df["label_id"].to_numpy()
y_eval  = eval_df["label_id"].to_numpy()

# --------- Klassifikator: Logistic Regression (mit Klassenbalancierung) ---------
# Tipp: 'saga' oder 'lbfgs' funktionieren gut; C kann man via CV tunen.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = {c: w for c, w in zip(classes, class_weights)}
print("Class weights (LR):", np.round(class_weights, 3))

clf = LogisticRegression(
    multi_class="auto",
    solver="lbfgs",
    C=2.0,
    max_iter=2000,
    n_jobs=-1,
    class_weight=class_weight_dict,
    random_state=SEED,
)
clf.fit(X_train, y_train)

# --------- Evaluation ---------
y_pred = clf.predict(X_eval)
metrics = {
    "accuracy":  float(accuracy_score(y_eval, y_pred)),
    "f1_macro":  float(f1_score(y_eval, y_pred, average="macro")),
    "precision": float(precision_score(y_eval, y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(y_eval, y_pred, average="macro")),
}
print("Eval:", metrics)
print("\nClassification report:\n", classification_report(y_eval, y_pred, target_names=le.classes_))

# --------- Persistenz ---------
dump(clf, os.path.join(OUT_DIR, "clf.joblib"))
dump(le,  os.path.join(OUT_DIR, "label_encoder.joblib"))

# Hinweis: Das SBERT-Basis-Modell selbst wird nicht verändert. Für Reproduzierbarkeit:
with open(os.path.join(OUT_DIR, "MODEL_NAME.txt"), "w", encoding="utf-8") as f:
    f.write(MODEL_NAME)

# --------- Inferenz-Funktion ---------
def predict_org_classes(org_names):
    names_norm = [normalize_org_name(x or "Unknown") for x in org_names]
    X = embed(names_norm)
    yhat = clf.predict(X)
    labels = le.inverse_transform(yhat)
    return labels

# Beispiel:
# print(predict_org_classes(["Siemens AG", "Acme GmbH", "Universität Zürich"]))


Labels: 10 Klassen -> ['Cable/DSL/ISP', 'Content', 'Educational/Research', 'Enterprise', 'Government', 'NSP', 'Network Services', 'Non-Profit', 'Route Collector', 'Route Server']
Device: cuda


  return self.fget.__get__(instance, owner)()
Batches: 100%|██████████| 81/81 [00:01<00:00, 52.61it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 74.88it/s]


Class weights (LR): [ 0.2    0.95   1.621  1.373 18.689  0.593  2.941  3.857 76.141  3.793]
Eval: {'accuracy': 0.2884114583333333, 'f1_macro': 0.21081945105245822, 'precision': 0.21046043930577016, 'recall': 0.36153178433530075}

Classification report:
                       precision    recall  f1-score   support

       Cable/DSL/ISP       0.73      0.30      0.43      1532
             Content       0.27      0.33      0.30       323
Educational/Research       0.30      0.48      0.37       189
          Enterprise       0.16      0.24      0.19       224
          Government       0.06      0.69      0.10        16
                 NSP       0.24      0.15      0.19       518
    Network Services       0.03      0.10      0.05       105
          Non-Profit       0.10      0.33      0.15        80
     Route Collector       0.04      0.50      0.08         4
        Route Server       0.17      0.51      0.25        81

            accuracy                           0.29      3072


In [9]:
# ===================== Sentence-BERT (frozen) + klassischer Klassifikator =====================
# Setup
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
import joblib

from sentence_transformers import SentenceTransformer

# --------- Konfig ---------
SENTENCE_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"  # stark & multilingual
BATCH_SIZE_EMB = 256         # Embedding-Batchsize (nur Encoding, kein Finetune)
SEED          = 42
TEST_SIZE     = 0.13
OUT_DIR       = "sbert_org_cls_out"
os.makedirs(OUT_DIR, exist_ok=True)

# --------- Daten vorbereiten ---------
df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("Unknown")
df["info_type"] = df["info_type"].fillna("Unknown")

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
classes = le.classes_
num_labels = len(classes)
print(f"Labels: {num_labels} -> {list(classes)}")

# Stratified Split
train_df, eval_df = train_test_split(
    df[["org_name", "label_id"]],
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

X_train = train_df["org_name"].tolist()
y_train = train_df["label_id"].to_numpy()
X_eval  = eval_df["org_name"].tolist()
y_eval  = eval_df["label_id"].to_numpy()

# --------- SBERT Modell laden (nur Encoding) ---------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(SENTENCE_MODEL, device=device)

# Helper: Batched Encoding
def encode_batched(texts, batch_size=BATCH_SIZE_EMB):
    embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        with torch.inference_mode():
            emb = model.encode(
                batch,
                batch_size=len(batch),  # SBERT handhabt intern batching; hier kein Overhead
                convert_to_numpy=True,
                normalize_embeddings=True  # i.d.R. gut für Cosine/LR
            )
        embs.append(emb)
    return np.vstack(embs) if embs else np.empty((0, model.get_sentence_embedding_dimension()))

print("→ Embeddings berechnen (Train)…")
X_train_emb = encode_batched(X_train)
print("→ Embeddings berechnen (Eval)…")
X_eval_emb  = encode_batched(X_eval)

# --------- Klassifikator (Logistic Regression) mit Klassen-Gewichten ---------
cls_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(num_labels),
    y=y_train
)
class_weight_dict = {i: w for i, w in enumerate(cls_weights)}
print("Class weights:", np.round(cls_weights, 3))

# C und max_iter ggf. anpassen (Bremsen lösen, falls Konvergenz-Warnung)
clf = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    class_weight=class_weight_dict,
    max_iter=2000,
    n_jobs=-1,
    random_state=SEED
)
clf.fit(X_train_emb, y_train)

# --------- Evaluation ---------
y_pred = clf.predict(X_eval_emb)
metrics = {
    "accuracy":  float(accuracy_score(y_eval, y_pred)),
    "f1_macro":  float(f1_score(y_eval, y_pred, average="macro")),
    "precision": float(precision_score(y_eval, y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(y_eval, y_pred, average="macro")),
}
print("Eval:", metrics)

# --------- Speichern (Klassifikator + LabelEncoder + Meta) ---------
joblib.dump(clf, os.path.join(OUT_DIR, "clf.joblib"))
joblib.dump(le,  os.path.join(OUT_DIR, "label_encoder.joblib"))
with open(os.path.join(OUT_DIR, "sbert_model_name.txt"), "w", encoding="utf-8") as f:
    f.write(SENTENCE_MODEL)

# --------- Inferenz-Funktion ---------
def predict_org_classes(org_names):
    """
    org_names: List[str]
    returns: List[{"org_name", "pred_label", "pred_id", "proba"}]
    """
    if isinstance(org_names, str):
        org_names = [org_names]

    embs = encode_batched(org_names)
    proba = clf.predict_proba(embs)
    pred_ids = proba.argmax(axis=1)
    pred_labels = le.inverse_transform(pred_ids)

    out = []
    for i, name in enumerate(org_names):
        out.append({
            "org_name": name,
            "pred_label": pred_labels[i],
            "pred_id": int(pred_ids[i]),
            "proba": float(proba[i, pred_ids[i]])
        })
    return out

# Beispiel:
print(predict_org_classes(["Siemens AG", "Harvard University", "ACME GmbH"]))


Labels: 10 -> ['Cable/DSL/ISP', 'Content', 'Educational/Research', 'Enterprise', 'Government', 'NSP', 'Network Services', 'Non-Profit', 'Route Collector', 'Route Server']




→ Embeddings berechnen (Train)…
→ Embeddings berechnen (Eval)…
Class weights: [ 0.2    0.95   1.621  1.373 18.689  0.593  2.941  3.857 76.141  3.793]
Eval: {'accuracy': 0.37890625, 'f1_macro': 0.26589287257832417, 'precision': 0.25268507217483904, 'recall': 0.4105633100295777}
[{'org_name': 'Siemens AG', 'pred_label': 'Enterprise', 'pred_id': 3, 'proba': 0.44211757165396004}, {'org_name': 'Harvard University', 'pred_label': 'Educational/Research', 'pred_id': 2, 'proba': 0.9297410543876621}, {'org_name': 'ACME GmbH', 'pred_label': 'Enterprise', 'pred_id': 3, 'proba': 0.2695421460279461}]


In [13]:
# ================= SBERT Multiclass Fine-Tuning (SoftmaxLoss) =================
import os, numpy as np, pandas as pd, torch, random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Modellwahl: multilingual & stark für kurze Texte
SBERT_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
OUT_DIR = "sbert_softmax_out"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Daten ---
df = peering_df_joined.copy()
df["org_name"]  = df["org_name"].fillna("Unknown")
df["info_type"] = df["info_type"].fillna("Unknown")

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = df["label_id"].nunique()

train_df, eval_df = train_test_split(
    df[["org_name","label_id"]],
    test_size=0.13, random_state=SEED, stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# --- InputExamples bauen ---
train_examples = [InputExample(texts=[t], label=int(l)) for t,l in zip(train_df["org_name"], train_df["label_id"])]
eval_sents  = eval_df["org_name"].tolist()
eval_labels = eval_df["label_id"].tolist()

# --- Modell + Loss ---
model = SentenceTransformer(SBERT_NAME, device=DEVICE)
loss_fn = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=num_labels
)

# Class weighting für kurzen Text hilft selten, aber EarlyStopping & Eval helfen
batch_size = 128
train_loader = DataLoader(train_examples, shuffle=True, batch_size=batch_size, drop_last=False)

# Evaluator (Accuracy auf Dev)
evaluator = evaluation.LabelAccuracyEvaluator(
    eval_sents,
    labels=eval_labels,
    name="dev"
)
# --- Training ---
epochs = 8
warmup_ratio = 0.06
eval_steps = max(1, len(train_loader)//2)  # 2x pro Epoche evaluieren

model.fit(
    train_objectives=[(train_loader, loss_fn)],
    evaluator=evaluator,
    epochs=epochs,
    warmup_ratio=warmup_ratio,
    evaluation_steps=eval_steps,
    output_path=OUT_DIR,
    use_amp=torch.cuda.is_available(),
    checkpoint_path=os.path.join(OUT_DIR, "ckpts"),
    checkpoint_save_steps=eval_steps,
)

# --- Eval (Macro-F1 etc.) ---
model = SentenceTransformer(OUT_DIR, device=DEVICE)  # bestes Modell laden
with torch.inference_mode():
    X_eval_emb = model.encode(eval_sents, batch_size=512, convert_to_numpy=True, normalize_embeddings=True)

# Ein lineares Köpfchen für reporting (optional)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(
    multi_class="multinomial", solver="lbfgs", max_iter=2000, n_jobs=-1, random_state=SEED
).fit(
    model.encode(train_df["org_name"].tolist(), batch_size=512, convert_to_numpy=True, normalize_embeddings=True),
    train_df["label_id"].to_numpy()
)
y_pred = clf.predict(X_eval_emb)
print({
    "accuracy":  float(accuracy_score(eval_labels, y_pred)),
    "f1_macro":  float(f1_score(eval_labels, y_pred, average="macro")),
    "precision": float(precision_score(eval_labels, y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(eval_labels, y_pred, average="macro")),
})


SoftmaxLoss requires transformers >= 4.43.0 to work correctly. Otherwise, the classifier layer that maps embeddings to the labels cannot be updated. Consider updating transformers with `pip install transformers>=4.43.0`.


TypeError: LabelAccuracyEvaluator.__init__() got an unexpected keyword argument 'labels'

In [17]:
# ================= Stacking: SBERT + Char TF-IDF =================
import os, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from scipy.sparse import csr_matrix, hstack

from sentence_transformers import SentenceTransformer

SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SBERT_NAME = "Vsevolod/company-names-similarity-sentence-transformer"
OUT_DIR = "sbert2_tfidf_stack_out"
os.makedirs(OUT_DIR, exist_ok=True)

df = peering_df_joined.copy()
df["org_name"]  = df["org_name"].fillna("Unknown")
df["info_type"] = df["info_type"].fillna("Unknown")

# Einfache Normalisierung (hilft stark):
def normalize_org(s: str) -> str:
    s = s.strip()
    s = s.replace("&", " und ")
    # Rechtsformen entfernen/vereinheitlichen
    for suf in [" gmbh", " ag", " inc", " ltd", " llc", " s.a.", " s.a", " srl", " co.", " co", " company"]:
        if s.lower().endswith(suf):
            s = s[: -len(suf)]
    return " ".join(s.split())

df["org_norm"] = df["org_name"].astype(str).map(normalize_org)

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
y = df["label_id"].to_numpy()

train_df, eval_df = train_test_split(
    df[["org_name","org_norm","label_id"]],
    test_size=0.13, random_state=SEED, stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# --- SBERT embeddings ---
model = SentenceTransformer(SBERT_NAME, device=DEVICE)
with torch.inference_mode():
    X_train_emb = model.encode(train_df["org_name"].tolist(), batch_size=512, convert_to_numpy=True, normalize_embeddings=True)
    X_eval_emb  = model.encode(eval_df["org_name"].tolist(),  batch_size=512, convert_to_numpy=True, normalize_embeddings=True)

# --- Char TF-IDF (3–5-gram ist ein robuster Sweet-Spot) ---
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2)
X_train_tf = tfidf.fit_transform(train_df["org_norm"])
X_eval_tf  = tfidf.transform(eval_df["org_norm"])

# --- Stack: [TF-IDF | SBERT] ---
X_train_stack = hstack([X_train_tf, csr_matrix(X_train_emb)], format="csr")
X_eval_stack  = hstack([X_eval_tf,  csr_matrix(X_eval_emb)],  format="csr")

# --- Class weights ---
num_labels = df["label_id"].nunique()
cls_w = compute_class_weight("balanced", classes=np.arange(num_labels), y=train_df["label_id"].to_numpy())
cls_w = {i:w for i,w in enumerate(cls_w)}

# --- Klassifikator ---
clf = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    class_weight=cls_w,
    max_iter=3000,
    n_jobs=-1,
    random_state=SEED
)
clf.fit(X_train_stack, train_df["label_id"].to_numpy())

y_pred = clf.predict(X_eval_stack)
print("Logistic Regression Stacked Eval:")
print({
    "accuracy":  float(accuracy_score(eval_df["label_id"], y_pred)),
    "f1_macro":  float(f1_score(eval_df["label_id"], y_pred, average="macro")),
    "precision": float(precision_score(eval_df["label_id"], y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(eval_df["label_id"], y_pred, average="macro")),
})
print("\nClassification Report:\n", classification_report(eval_df["label_id"], y_pred, target_names=le.classes_))

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

svm = LinearSVC(
    C=1.0,
    class_weight=cls_w,      # wie oben berechnet
    loss="squared_hinge",
    dual=True,               # bei vielen Features (TF-IDF) meist besser
    random_state=SEED
)
svm.fit(X_train_stack, train_df["label_id"].to_numpy())

y_pred = svm.predict(X_eval_stack)
print("SVM Stacked Eval:")
print({
    "accuracy":  float(accuracy_score(eval_df["label_id"], y_pred)),
    "f1_macro":  float(f1_score(eval_df["label_id"], y_pred, average="macro")),
    "precision": float(precision_score(eval_df["label_id"], y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(eval_df["label_id"], y_pred, average="macro")),
})
print("\nClassification Report:\n", classification_report(eval_df["label_id"], y_pred, target_names=le.classes_))




Logistic Regression Stacked Eval:
{'accuracy': 0.453125, 'f1_macro': 0.3560013073085809, 'precision': 0.32372381753312884, 'recall': 0.4334219119795401}

Classification Report:
                       precision    recall  f1-score   support

       Cable/DSL/ISP       0.77      0.52      0.62      1532
             Content       0.32      0.45      0.37       323
Educational/Research       0.45      0.65      0.53       189
          Enterprise       0.24      0.36      0.29       224
          Government       0.31      0.56      0.40        16
                 NSP       0.32      0.30      0.31       518
    Network Services       0.05      0.10      0.06       105
          Non-Profit       0.30      0.39      0.34        80
     Route Collector       0.17      0.50      0.25         4
        Route Server       0.32      0.52      0.40        81

            accuracy                           0.45      3072
           macro avg       0.32      0.43      0.36      3072
        weight



In [None]:
# ================= Stacking: SBERT + Char TF-IDF =================
import os, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from scipy.sparse import csr_matrix, hstack

from sentence_transformers import SentenceTransformer

SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SBERT_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
OUT_DIR = "sbert_tfidf_stack_out"
os.makedirs(OUT_DIR, exist_ok=True)

df = peering_df_joined.copy()
df["org_name"]  = df["org_name"].fillna("Unknown")
df["info_type"] = df["info_type"].fillna("Unknown")

# Einfache Normalisierung (hilft stark):
def normalize_org(s: str) -> str:
    s = s.strip()
    s = s.replace("&", " und ")
    # Rechtsformen entfernen/vereinheitlichen
    for suf in [" gmbh", " ag", " inc", " ltd", " llc", " s.a.", " s.a", " srl", " co.", " co", " company"]:
        if s.lower().endswith(suf):
            s = s[: -len(suf)]
    return " ".join(s.split())

df["org_norm"] = df["org_name"].astype(str).map(normalize_org)

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"])
y = df["label_id"].to_numpy()

train_df, eval_df = train_test_split(
    df[["org_name","org_norm","label_id"]],
    test_size=0.13, random_state=SEED, stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# --- SBERT embeddings ---
model = SentenceTransformer(SBERT_NAME, device=DEVICE)
with torch.inference_mode():
    X_train_emb = model.encode(train_df["org_name"].tolist(), batch_size=512, convert_to_numpy=True, normalize_embeddings=True)
    X_eval_emb  = model.encode(eval_df["org_name"].tolist(),  batch_size=512, convert_to_numpy=True, normalize_embeddings=True)

# --- Char TF-IDF (3–5-gram ist ein robuster Sweet-Spot) ---
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2)
X_train_tf = tfidf.fit_transform(train_df["org_norm"])
X_eval_tf  = tfidf.transform(eval_df["org_norm"])

# --- Stack: [TF-IDF | SBERT] ---
X_train_stack = hstack([X_train_tf, csr_matrix(X_train_emb)], format="csr")
X_eval_stack  = hstack([X_eval_tf,  csr_matrix(X_eval_emb)],  format="csr")

# --- Class weights ---
num_labels = df["label_id"].nunique()
cls_w = compute_class_weight("balanced", classes=np.arange(num_labels), y=train_df["label_id"].to_numpy())
cls_w = {i:w for i,w in enumerate(cls_w)}

# --- Klassifikator ---
clf = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    class_weight=cls_w,
    max_iter=3000,
    n_jobs=-1,
    random_state=SEED
)
clf.fit(X_train_stack, train_df["label_id"].to_numpy())

y_pred = clf.predict(X_eval_stack)
print({
    "accuracy":  float(accuracy_score(eval_df["label_id"], y_pred)),
    "f1_macro":  float(f1_score(eval_df["label_id"], y_pred, average="macro")),
    "precision": float(precision_score(eval_df["label_id"], y_pred, average="macro", zero_division=0)),
    "recall":    float(recall_score(eval_df["label_id"], y_pred, average="macro")),
})
