# Load the Data
## Peeringdb

In [2]:
import json
from pathlib import Path
import pandas as pd

filepath = Path('../../preprocessing/data/peeringdb/peeringdb_2_dump_2025_10_21.json')

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


# Caida AS Names

In [3]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


## Join both

In [4]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


# Classification

## TF-IDF

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

# ==== Daten ====
df = peering_df_joined.copy()
df["org_name"] = df["org_name"].fillna("unknown").str.lower()
valid = df["info_type"].value_counts()
df = df[df["info_type"].isin(valid[valid >= 5].index)]  # sehr kleine Klassen raus (optional)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["org_name"], df["info_type"], test_size=0.13, random_state=42, stratify=df["info_type"]
)

# Gemeinsamer Vectorizer (fit nur auf Train!)
vec = TfidfVectorizer(analyzer="char", ngram_range=(1,6),
                      lowercase=True, min_df=1, sublinear_tf=True)

# ==== 1) SVM + Kalibrierung ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

svm_pipe = Pipeline([
    ("tfidf", vec),
    ("svm_cal", svm_cal)
])

svm_pipe.fit(X_train_text, y_train)
y_pred_svm = svm_pipe.predict(X_test_text)
print("\n=== SVM (calibrated) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Macro-F1:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm))



=== SVM (calibrated) ===
Accuracy: 0.5869140625
Macro-F1: 0.3405334971016842
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.62      0.93      0.74      1532
             Content       0.43      0.34      0.38       323
Educational/Research       0.67      0.48      0.56       189
          Enterprise       0.43      0.13      0.20       224
          Government       0.44      0.25      0.32        16
                 NSP       0.42      0.17      0.24       518
    Network Services       0.00      0.00      0.00       105
          Non-Profit       0.65      0.30      0.41        80
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.65      0.48      0.55        81

            accuracy                           0.59      3072
           macro avg       0.43      0.31      0.34      3072
        weighted avg       0.53      0.59      0.53      3072



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Bert

In [6]:
# === Ersatz für den HF-Datasets-Teil (kein pyarrow/datasets nötig) ===
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer,
                          EarlyStoppingCallback, TextClassificationPipeline)

# --------- Konfig ---------
MODEL_NAME   = "xlm-roberta-base"   # multilingual, starkes Baseline-Modell
MAX_LENGTH   = 64                   # Org-Namen sind kurz -> 64 reicht
LR           = 1e-5
EPOCHS       = 20
BATCH_SIZE   = 32
WARMUP_RATIO = 0.06
SEED         = 42
OUT_DIR      = "xlmr_org_trainer_out"

tok = AutoTokenizer.from_pretrained(MODEL_NAME)
le = LabelEncoder()

le = LabelEncoder()
df = peering_df_joined
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = len(le.classes_)
print(f"Labels: {num_labels} Klassen ->", list(le.classes_))

df["label_id"] = le.fit_transform(df["info_type"])


df.fillna('Unknown', inplace=True)

# Prüfe GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# Train/Validation Split (stratifiziert)
train_df, eval_df = train_test_split(
    df[["org_name", "label_id"]],
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# Texte & Labels aus den bereits vorbereiteten DataFrames (train_df, eval_df)
train_texts = train_df["org_name"].tolist()
eval_texts  = eval_df["org_name"].tolist()
y_train_np  = train_df["label_id"].to_numpy()
y_eval_np   = eval_df["label_id"].to_numpy()
num_labels  = df["label_id"].nunique()



# Tokenisierung OHNE Padding (Padding macht später der DataCollator)
train_enc = tok(train_texts, truncation=True, max_length=MAX_LENGTH)
eval_enc  = tok(eval_texts,  truncation=True, max_length=MAX_LENGTH)

class SimpleHFLikeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.enc = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

ds_train = SimpleHFLikeDataset(train_enc, y_train_np)
ds_eval  = SimpleHFLikeDataset(eval_enc,  y_eval_np)

collator = DataCollatorWithPadding(tokenizer=tok)

valid_classes = sorted(df["info_type"].unique())

# ---- Modell + Class Weights wie gehabt ----
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={int(i): c for i, c in enumerate(valid_classes)},
    label2id={c: int(i) for i, c in enumerate(valid_classes)}
).to(device)

# Class-Weights aus dem Trainingssplit
class_counts = np.bincount(y_train_np, minlength=num_labels)
weights = class_counts.sum() / np.maximum(class_counts, 1)
weights = weights / weights.mean()
class_weights = torch.tensor(weights, dtype=torch.float, device=device)
print("Class weights:", np.round(weights, 3))

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir=OUT_DIR + "/checkpoints",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    warmup_ratio=WARMUP_RATIO,
    fp16=(device.type=="cuda"),
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    report_to=["none"],
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    return {
        "accuracy":  float(accuracy_score(labels, preds)),
        "f1_macro":  float(f1_score(labels, preds, average="macro")),
        "precision": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "recall":    float(recall_score(labels, preds, average="macro")),
    }

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
metrics = trainer.evaluate()
print("Eval:", metrics)

trainer.save_model(OUT_DIR + "/model")
tok.save_pretrained(OUT_DIR + "/model")


  from .autonotebook import tqdm as notebook_tqdm


Labels: 10 Klassen -> ['Cable/DSL/ISP', 'Content', 'Educational/Research', 'Enterprise', 'Government', 'NSP', 'Network Services', 'Non-Profit', 'Route Collector', 'Route Server']
Device: cuda
GPU-Name: NVIDIA GeForce RTX 4070 Laptop GPU


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: [0.018 0.086 0.147 0.125 1.697 0.054 0.267 0.35  6.912 0.344]




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,2.1687,2.095237,0.420898,0.164606,0.201883,0.220511
2,1.8373,1.827948,0.469401,0.28203,0.34493,0.338224
3,1.7555,1.742485,0.467773,0.305559,0.30218,0.375189
4,1.725,1.708697,0.527018,0.335315,0.350427,0.392946
5,1.6744,1.727177,0.497396,0.327816,0.314828,0.383184
6,1.4727,1.822304,0.512044,0.35761,0.354573,0.391223
7,1.5011,1.738105,0.536784,0.354389,0.346468,0.390336
8,1.3721,1.74905,0.466146,0.341322,0.323694,0.382001


Eval: {'eval_loss': 1.822304368019104, 'eval_accuracy': 0.5120442708333334, 'eval_f1_macro': 0.3576104467418522, 'eval_precision': 0.35457311486282983, 'eval_recall': 0.39122302576300466, 'eval_runtime': 1.157, 'eval_samples_per_second': 2655.132, 'eval_steps_per_second': 82.973, 'epoch': 8.0}


('xlmr_org_trainer_out/model/tokenizer_config.json',
 'xlmr_org_trainer_out/model/special_tokens_map.json',
 'xlmr_org_trainer_out/model/tokenizer.json')

In [7]:
# ==== Ensemble aus kalibrierter SVM + XLM-R (Late Fusion) ====
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 1) Hilfsfunktionen: Probas holen und Klassenreihenfolge erzwingen
label_names = list(valid_classes)  # <- gleiche Reihenfolge wie beim HF-Modell (id2label/label2id)
label_index = {lbl: i for i, lbl in enumerate(label_names)}

def svm_proba(texts):
    """Kalibrierte SVM-Probas in label_names-Reihenfolge."""
    # svm_pipe.classes_ enthält die Klassenreihenfolge des SVM-Teils
    svm_labels = list(svm_pipe.named_steps["svm_cal"].classes_) if hasattr(svm_pipe.named_steps["svm_cal"], "classes_") \
                 else list(svm_pipe.classes_)
    proba = svm_pipe.predict_proba(texts)  # shape: [N, n_classes_svm]
    # Auf label_names umsortieren
    idx_map = [svm_labels.index(lbl) for lbl in label_names]
    proba_sorted = proba[:, idx_map]
    return proba_sorted

@torch.no_grad()
def xlmr_proba(texts, batch_size=64, max_length=256):
    """Transformer-Softmax-Probas in label_names-Reihenfolge (id2label stimmt auf valid_classes)."""
    model.eval()
    all_probs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tok(batch, truncation=True, max_length=max_length, padding=True, return_tensors="pt")
        enc = {k: v.to(model.device) for k, v in enc.items()}
        logits = model(**enc).logits  # [B, num_labels]
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        all_probs.append(probs)
    return np.vstack(all_probs)  # [N, num_labels] schon in label_names-Reihenfolge

# 2) Probas auf deinem (Ensemble-)Validierungs/Testsplit erzeugen
X_eval = eval_df["org_name"].tolist()
y_eval = y_eval_np  # ints passend zur Reihenfolge in label_names

P_svm  = svm_proba(X_eval)              # [N, C]
P_xlmr = xlmr_proba(X_eval, max_length=MAX_LENGTH)  # [N, C]

# 3) Gewicht per einfacher Grid-Search finden (0..1)
grid = np.linspace(0.0, 1.0, 21)  # 0.00, 0.05, ..., 1.00
best = {"w": None, "f1": -1.0, "acc": 0.0}

for w in grid:
    P_ens = w * P_svm + (1.0 - w) * P_xlmr
    y_hat = P_ens.argmax(axis=1)
    f1 = f1_score(y_eval, y_hat, average="macro")
    acc = accuracy_score(y_eval, y_hat)
    if f1 > best["f1"] or (f1 == best["f1"] and acc > best["acc"]):
        best.update({"w": float(w), "f1": float(f1), "acc": float(acc)})

print(f"\n=== Ensemble-Gewichtssuche ===")
print(f"Bestes w (SVM-Anteil): {best['w']:.2f} | Macro-F1: {best['f1']:.4f} | Acc: {best['acc']:.4f}")

# 4) Finale Ensemble-Vorhersage + Report
w = best["w"]
P_ens = w * P_svm + (1.0 - w) * P_xlmr
y_pred = P_ens.argmax(axis=1)

print("\n=== Ensemble (SVM^w + XLM-R^(1-w)) auf Eval ===")
print("Accuracy:", accuracy_score(y_eval, y_pred))
print("Macro-F1:", f1_score(y_eval, y_pred, average="macro"))
print(classification_report(y_eval, y_pred, target_names=label_names))

# 5) Praktische Inferenzfunktion fürs spätere Nutzen
def ensemble_predict(texts, return_proba=False, batch_size=64):
    Ps = svm_proba(texts)
    Pt = xlmr_proba(texts, batch_size=batch_size, max_length=MAX_LENGTH)
    P = w * Ps + (1.0 - w) * Pt
    preds = P.argmax(axis=1)
    if return_proba:
        return preds, P
    return preds

# Beispiel:
# preds, proba = ensemble_predict(["google llc", "university of oxford"], return_proba=True)



=== Ensemble-Gewichtssuche ===
Bestes w (SVM-Anteil): 0.60 | Macro-F1: 0.3793 | Acc: 0.5993

=== Ensemble (SVM^w + XLM-R^(1-w)) auf Eval ===
Accuracy: 0.5992838541666666
Macro-F1: 0.3792544149591472
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.67      0.88      0.76      1532
             Content       0.43      0.41      0.42       323
Educational/Research       0.59      0.74      0.66       189
          Enterprise       0.34      0.24      0.28       224
          Government       0.40      0.50      0.44        16
                 NSP       0.41      0.17      0.24       518
    Network Services       0.00      0.00      0.00       105
          Non-Profit       0.67      0.30      0.41        80
     Route Collector       0.00      0.00      0.00         4
        Route Server       0.57      0.58      0.57        81

            accuracy                           0.60      3072
           macro avg       0.41      0.38      0.38   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import torch
import numpy as np
from sklearn.metrics import log_loss

@torch.no_grad()
def xlmr_logits(texts, batch_size=64, max_length=256):
    model.eval()
    outs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tok(batch, truncation=True, max_length=max_length, padding=True, return_tensors="pt")
        enc = {k: v.to(model.device) for k, v in enc.items()}
        outs.append(model(**enc).logits.detach().cpu().numpy())
    return np.vstack(outs)

# 1) Roh-Logits holen
Z = xlmr_logits(eval_df["org_name"].tolist(), max_length=MAX_LENGTH)  # [N, C]

# 2) Optimale Temperatur per NLL auf Eval suchen
def softmax(z): 
    z = z - z.max(axis=1, keepdims=True)
    ez = np.exp(z)
    return ez / ez.sum(axis=1, keepdims=True)

def best_temperature(logits, y, grid=np.linspace(0.5, 3.0, 26)):
    best_T, best_nll = 1.0, 1e9
    for T in grid:
        P = softmax(logits / T)
        nll = log_loss(y, P, labels=list(range(P.shape[1])))
        if nll < best_nll:
            best_T, best_nll = float(T), float(nll)
    return best_T

T = best_temperature(Z, y_eval)
print(f"\n=== Temperature Scaling ===\nBeste Temperatur T: {T:.2f}")

# 3) Kalibrierte XLM-R-Probas neu berechnen und erneut ensemblen
P_xlmr_cal = softmax(Z / T)
# Danach z.B. mit Logit-Blending (s. oben) erneut w suchen:
Ls = safe_logits(P_svm); Lt = safe_logits(P_xlmr_cal)
# ... identischer Grid-Search-Code wie oben ...



=== Temperature Scaling ===
Beste Temperatur T: 1.00


NameError: name 'safe_logits' is not defined

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

# ==== Beispiel-Daten ====
df = peering_df_joined.copy()
# df["org_name"], df["country"], df["asn"], df["ix_count"], df["info_type"]

df["org_name"] = df["org_name"].fillna("unknown").str.lower()
df["country"] = df["country"].fillna("??")
df["ix_count"] = df["ix_count"].fillna(0)

# ==== Hilfsfunktion für Regex-Features aus dem Namen ====
def regex_features(X):
    X = X["org_name"].astype(str).str.lower()
    return pd.DataFrame({
        "has_isp": X.str.contains(r"isp|telecom|broadband|internet").astype(int),
        "has_univ": X.str.contains(r"univ|college|schule|academy").astype(int),
        "has_gov": X.str.contains(r"gov|ministerium|city|state|municipal").astype(int),
        "has_ix": X.str.contains(r"\bix\b|exchange|route").astype(int),
        "has_asn": X.str.contains(r"as\d+").astype(int),
    })

# ==== Spalten definieren ====
text_col = "org_name"
regex_col = ["org_name"]

# ==== ColumnTransformer aufbauen ====
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2,6), sublinear_tf=True), text_col),
    ("regex", FunctionTransformer(regex_features), regex_col)
])

# ==== Modell ====
svm = LinearSVC(C=0.35, class_weight="balanced")
svm_cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)

pipe = Pipeline([
    ("features", preprocessor),
    ("clf", svm_cal)
])

NameError: name 'df' is not defined