# Load the Data
## Peeringdb

In [2]:
import json
from pathlib import Path
import pandas as pd

filepath = Path('peeringdb/peeringdb_2_dump_2025_10_21.json')

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


# Caida AS Names

In [3]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


## Join both

In [4]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


## Caida AS Rank

In [None]:
import requests
import json
import pandas as pd
next_page = True
nodes = []
first=5000
offset=0
while(next_page):
    print(f"Fetching AS Rank data: first={first}, offset={offset}")
    response = requests.get(f"https://api.asrank.caida.org/v2/restful/asns/?first={first}&offset={offset}")

    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}")

    as_rank_dump = response.json()
    nodes.extend(as_rank_dump['data']['asns']['edges'])
    if not as_rank_dump['data']['asns']['pageInfo']['hasNextPage']:
        next_page = False
    offset += first

edges = [e['node'] for e in nodes]
as_rank_df = pd.DataFrame(edges)
# 1️⃣ asnDegree (dict) in eigene Spalten auflösen
asnDegree_df = as_rank_df["asnDegree"].apply(pd.Series)
asnDegree_df.columns = [f"asnDegree_{c}" for c in asnDegree_df.columns]

# 2️⃣ wieder an den Haupt-DataFrame anhängen
as_rank_df = pd.concat([as_rank_df.drop(columns=["asnDegree"]), asnDegree_df], axis=1)

# 1️⃣ asnDegree (dict) in eigene Spalten auflösen
asnCone_df = as_rank_df["cone"].apply(pd.Series)
asnCone_df.columns = [f"cone_{c}" for c in asnCone_df.columns]

# 2️⃣ wieder an den Haupt-DataFrame anhängen
as_rank_df = pd.concat([as_rank_df.drop(columns=["cone"]), asnCone_df], axis=1)

# show a quick preview
as_rank_df

Fetching AS Rank data: first=5000, offset=0
Fetching AS Rank data: first=5000, offset=5000
Fetching AS Rank data: first=5000, offset=10000
Fetching AS Rank data: first=5000, offset=15000
Fetching AS Rank data: first=5000, offset=20000
Fetching AS Rank data: first=5000, offset=25000
Fetching AS Rank data: first=5000, offset=30000
Fetching AS Rank data: first=5000, offset=35000
Fetching AS Rank data: first=5000, offset=40000
Fetching AS Rank data: first=5000, offset=45000
Fetching AS Rank data: first=5000, offset=50000
Fetching AS Rank data: first=5000, offset=55000
Fetching AS Rank data: first=5000, offset=60000
Fetching AS Rank data: first=5000, offset=65000
Fetching AS Rank data: first=5000, offset=70000
Fetching AS Rank data: first=5000, offset=75000
Fetching AS Rank data: first=5000, offset=80000
Fetching AS Rank data: first=5000, offset=85000
Fetching AS Rank data: first=5000, offset=90000
Fetching AS Rank data: first=5000, offset=95000
Fetching AS Rank data: first=5000, offset=100

KeyError: 'asnDegree'

In [49]:
edges = [e['node'] for e in nodes]
as_rank_df = pd.DataFrame(edges)
asnDegree_df = as_rank_df["asnDegree"].apply(pd.Series)
asnDegree_df.columns = [f"asnDegree_{c}" for c in asnDegree_df.columns]

# 2️⃣ wieder an den Haupt-DataFrame anhängen
as_rank_df = pd.concat([as_rank_df.drop(columns=["asnDegree"]), asnDegree_df], axis=1)

# 1️⃣ asnDegree (dict) in eigene Spalten auflösen
asnCone_df = as_rank_df["cone"].apply(pd.Series)
asnCone_df.columns = [f"cone_{c}" for c in asnCone_df.columns]

# 2️⃣ wieder an den Haupt-DataFrame anhängen
as_rank_df = pd.concat([as_rank_df.drop(columns=["cone"]), asnCone_df], axis=1)

as_rank_df['asn'] = as_rank_df['asn'].astype(int)


as_rank_df = as_rank_df[['asn', 'rank', 'asnDegree_total', 'asnDegree_customer', 'asnDegree_peer', 'asnDegree_provider', 'cone_numberAsns', 'cone_numberPrefixes', 'cone_numberAddresses']]

# show a quick preview
as_rank_df

Unnamed: 0,asn,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,3356,1,6613,6545,68,0,53986,873410,3468642119
1,1299,2,2567,2509,58,0,41193,776707,3219679484
2,174,3,6723,6626,97,0,38887,730166,3034352967
3,3257,4,1853,1816,37,0,36040,612491,2791999209
4,2914,5,1541,1483,58,0,25179,576134,2918763154
...,...,...,...,...,...,...,...,...,...
119390,56279,78320,0,0,0,0,1,0,0
119391,215758,78320,0,0,0,0,1,0,0
119392,144817,78320,0,0,0,0,1,0,0
119393,144068,78320,0,0,0,0,1,0,0


In [6]:
peering_df_joined_with_asrank = pd.merge(
    peering_df_joined,
    as_rank_df,
    left_on='asn',
    right_on='asn',
    how='left'
)
peering_df_joined_with_asrank

Unnamed: 0,asn,org_name,country,source,info_type,rank,asnDegree_total,asnDegree_customer,asnDegree_peer,asnDegree_provider,cone_numberAsns,cone_numberPrefixes,cone_numberAddresses
0,4436,"GTT Americas, LLC",US,ARIN,NSP,78320.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,20940,Akamai International B.V.,NL,RIPE,Content,1894.0,485.0,14.0,366.0,105.0,15.0,8945.0,14612752.0
2,31800,DALnet,US,ARIN,Non-Profit,47745.0,78.0,0.0,74.0,4.0,1.0,2.0,512.0
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP,81.0,1273.0,166.0,1101.0,6.0,733.0,22131.0,42899794.0
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP,110.0,499.0,489.0,8.0,2.0,505.0,11982.0,31992440.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23625,154232,MAX TECHNOLOGY & SUPPORT SERVICES PRIVATE LIMITED,IN,APNIC,Cable/DSL/ISP,,,,,,,,
23626,204856,,,,Educational/Research,,,,,,,,
23627,204917,,,,Cable/DSL/ISP,,,,,,,,
23628,210796,Bjoern Schleyer,DE,RIPE,NSP,,,,,,,,


# Classification

## TF-IDF

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
from tqdm import tqdm

# DataFrame (dein echter Datensatz, hier Beispiel
df = peering_df_joined
# Preprocessing
df['org_name'] = df['org_name'].fillna('Unknown').str.lower()

# Filtere Klassen mit <2 Einträgen
class_counts = df['info_type'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['info_type'].isin(valid_classes)]
print(df)
print(f"Verwendete Klassen: {valid_classes.tolist()}")
print(f"DataFrame nach Filterung: {len(df)} Zeilen")

# --- TF-IDF-Modell ---
print("\n=== TF-IDF-Modell ===")
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3), lowercase=True)
X_tfidf = vectorizer.fit_transform(df['org_name'])
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, df['info_type'], test_size=0.13, random_state=42, stratify=df['info_type'])

# Balancing
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)
X_train_bal, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_tfidf = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_tfidf.fit(X_train_bal, y_train_bal)

# Evaluation
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

          asn                                           org_name country  \
0        4436                                  gtt americas, llc      US   
1       20940                          akamai international b.v.      NL   
2       31800                                             dalnet      US   
3        3303                              swisscom (schweiz) ag      CH   
4       22773                            cox communications inc.      US   
...       ...                                                ...     ...   
23625  154232  max technology & support services private limited      IN   
23626  204856                                            unknown     NaN   
23627  204917                                            unknown     NaN   
23628  210796                                    bjoern schleyer      DE   
23629  400926                                       kiwi telecom      US   

      source             info_type  
0       ARIN                   NSP  
1       RIPE 

## Bert

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
import sys


# Initialize parallel_pandas

# Prüfe GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# DataFrame (dein echter Datensatz, hier Beispiel
df = peering_df_joined
# Preprocessing
df['org_name'] = df['org_name'].fillna('Unknown').str.lower()

# Filtere Klassen mit <2 Einträgen
class_counts = df['info_type'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['info_type'].isin(valid_classes)]
print(df)
print(f"Verwendete Klassen: {valid_classes.tolist()}")
print(f"DataFrame nach Filterung: {len(df)} Zeilen")

# Deduplizierung
unique_df = df.drop_duplicates(subset=['org_name'])
print(f"Eindeutige org_name: {len(unique_df)}")


# --- BERT-Modell ---
print("\n=== BERT-Modell ===")
# BERT-Tokenizer und Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()

# BERT Embeddings
def get_bert_embedding(text, batch_size=64):
    embeddings = []
    for i in tqdm(range(0, len(text), batch_size), desc="BERT-Embeddings"):
        batch = text[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Training
X_bert = get_bert_embedding(unique_df['org_name'].tolist())
y = unique_df['info_type']
X_train_bert, X_test_bert, y_train, y_test = train_test_split(X_bert, y, test_size=0.13, random_state=42, stratify=y)

# Balancing
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_bert, y_train)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_bal, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_bert = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_bert.fit(X_train_bal, y_train_bal)

# Evaluation
y_pred_bert = classifier_bert.predict(X_test_bert)
print("BERT Accuracy:", accuracy_score(y_test, y_pred_bert))
print("BERT Classification Report:")
print(classification_report(y_test, y_pred_bert))

Device: cuda
GPU-Name: NVIDIA GeForce RTX 4070 Laptop GPU
          asn                                           org_name country  \
0        4436                                  gtt americas, llc      US   
1       20940                          akamai international b.v.      NL   
2       31800                                             dalnet      US   
3        3303                              swisscom (schweiz) ag      CH   
4       22773                            cox communications inc.      US   
...       ...                                                ...     ...   
23625  154232  max technology & support services private limited      IN   
23626  204856                                            unknown     NaN   
23627  204917                                            unknown     NaN   
23628  210796                                    bjoern schleyer      DE   
23629  400926                                       kiwi telecom      US   

      source             info

BERT-Embeddings: 100%|██████████| 326/326 [00:09<00:00, 34.45it/s]


BERT Accuracy: 0.34403839055001845
BERT Classification Report:
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.77      0.41      0.53      1418
             Content       0.21      0.29      0.24       277
Educational/Research       0.38      0.47      0.42       170
          Enterprise       0.15      0.23      0.18       201
          Government       0.38      0.50      0.43        16
                 NSP       0.24      0.23      0.23       436
    Network Services       0.03      0.12      0.05        91
          Non-Profit       0.11      0.31      0.16        61
     Route Collector       0.00      0.00      0.00         2
        Route Server       0.10      0.32      0.15        37

            accuracy                           0.34      2709
           macro avg       0.24      0.29      0.24      2709
        weighted avg       0.51      0.34      0.39      2709



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# === Ersatz für den HF-Datasets-Teil (kein pyarrow/datasets nötig) ===
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer,
                          EarlyStoppingCallback, TextClassificationPipeline)

# --------- Konfig ---------
MODEL_NAME   = "xlm-roberta-base"   # multilingual, starkes Baseline-Modell
MAX_LENGTH   = 256                   # Org-Namen sind kurz -> 256 reicht
LR           = 1e-5
EPOCHS       = 20
BATCH_SIZE   = 32
WARMUP_RATIO = 0.06
SEED         = 42
OUT_DIR      = "xlmr_org_trainer_out"

tok = AutoTokenizer.from_pretrained(MODEL_NAME)
le = LabelEncoder()

le = LabelEncoder()
df = peering_df_joined
df["label_id"] = le.fit_transform(df["info_type"])
num_labels = len(le.classes_)
print(f"Labels: {num_labels} Klassen ->", list(le.classes_))

df["label_id"] = le.fit_transform(df["info_type"])


df.fillna('Unknown', inplace=True)

# Prüfe GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# Train/Validation Split (stratifiziert)
train_df, eval_df = train_test_split(
    df[["org_name", "label_id"]],
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_df = train_df.reset_index(drop=True)
eval_df  = eval_df.reset_index(drop=True)

# Texte & Labels aus den bereits vorbereiteten DataFrames (train_df, eval_df)
train_texts = train_df["org_name"].tolist()
eval_texts  = eval_df["org_name"].tolist()
y_train_np  = train_df["label_id"].to_numpy()
y_eval_np   = eval_df["label_id"].to_numpy()
num_labels  = df["label_id"].nunique()



# Tokenisierung OHNE Padding (Padding macht später der DataCollator)
train_enc = tok(train_texts, truncation=True, max_length=MAX_LENGTH)
eval_enc  = tok(eval_texts,  truncation=True, max_length=MAX_LENGTH)

class SimpleHFLikeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.enc = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

ds_train = SimpleHFLikeDataset(train_enc, y_train_np)
ds_eval  = SimpleHFLikeDataset(eval_enc,  y_eval_np)

collator = DataCollatorWithPadding(tokenizer=tok)

valid_classes = sorted(df["info_type"].unique())

# ---- Modell + Class Weights wie gehabt ----
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label={int(i): c for i, c in enumerate(valid_classes)},
    label2id={c: int(i) for i, c in enumerate(valid_classes)}
).to(device)

# Class-Weights aus dem Trainingssplit
class_counts = np.bincount(y_train_np, minlength=num_labels)
weights = class_counts.sum() / np.maximum(class_counts, 1)
weights = weights / weights.mean()
class_weights = torch.tensor(weights, dtype=torch.float, device=device)
print("Class weights:", np.round(weights, 3))

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir=OUT_DIR + "/checkpoints",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    warmup_ratio=WARMUP_RATIO,
    fp16=(device.type=="cuda"),
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    report_to=["none"],
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    return {
        "accuracy":  float(accuracy_score(labels, preds)),
        "f1_macro":  float(f1_score(labels, preds, average="macro")),
        "precision": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "recall":    float(recall_score(labels, preds, average="macro")),
    }

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
metrics = trainer.evaluate()
print("Eval:", metrics)

trainer.save_model(OUT_DIR + "/model")
tok.save_pretrained(OUT_DIR + "/model")


Labels: 10 Klassen -> ['Cable/DSL/ISP', 'Content', 'Educational/Research', 'Enterprise', 'Government', 'NSP', 'Network Services', 'Non-Profit', 'Route Collector', 'Route Server']
Device: cuda
GPU-Name: NVIDIA GeForce RTX 4070 Laptop GPU


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: [0.018 0.086 0.147 0.125 1.697 0.054 0.267 0.35  6.912 0.344]




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,2.172,2.033163,0.435872,0.218902,0.254875,0.232359
2,1.8301,1.796867,0.457031,0.296792,0.312804,0.352578
3,1.7345,1.769686,0.430339,0.29257,0.287761,0.372117
4,1.6681,1.733563,0.511068,0.329118,0.325248,0.377533
5,1.6421,1.770867,0.47526,0.327483,0.30637,0.384421
6,1.4849,1.820235,0.504232,0.345382,0.35261,0.376024
7,1.4751,1.793263,0.508789,0.340168,0.326939,0.381363
8,1.3682,1.797449,0.455729,0.330194,0.311678,0.372623


Eval: {'eval_loss': 1.8202348947525024, 'eval_accuracy': 0.5042317708333334, 'eval_f1_macro': 0.34538187490101324, 'eval_precision': 0.3526095238775851, 'eval_recall': 0.3760243040019605, 'eval_runtime': 1.2399, 'eval_samples_per_second': 2477.658, 'eval_steps_per_second': 77.427, 'epoch': 8.0}


('xlmr_org_trainer_out/model/tokenizer_config.json',
 'xlmr_org_trainer_out/model/special_tokens_map.json',
 'xlmr_org_trainer_out/model/tokenizer.json')

In [17]:
# Calculate the maximum length of org_name strings in peering_df_joined, ignoring NaN values
max_org_name_length = joined_df['org_name'].dropna().str.len().max()
print(f"The biggest length of org_name is: {max_org_name_length}")

The biggest length of org_name is: 203


In [None]:
# ==== Text + numerische AS-Rank-Features in einem HF-Trainer ====
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    AutoTokenizer, AutoModel, TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# --------- Konfig ---------
MODEL_NAME   = "xlm-roberta-base"
MAX_LENGTH   = 64           # 64 reicht für Org-Namen
LR           = 2e-5
EPOCHS       = 25
BATCH_SIZE   = 32
WARMUP_RATIO = 0.06
SEED         = 100
OUT_DIR      = "xlmr_org_trainer_out_mixed"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# --------- Daten ---------
df = peering_df_joined_with_asrank.copy()

# Label-Encode
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["info_type"].astype(str))
num_labels = len(le.classes_)
id2label = {i: c for i,c in enumerate(le.classes_)}
label2id = {c: i for i,c in enumerate(le.classes_)}

# Text
df["org_name"] = df["org_name"].fillna("Unknown").astype(str).str.strip()

# Numerische Featureliste (deine Spalten)
FEAT_COLS = [
    "rank",
    "asnDegree_total", "asnDegree_customer", "asnDegree_peer", "asnDegree_provider",
    "cone_numberAsns", "cone_numberPrefixes", "cone_numberAddresses",
]
num_feats = scaler.fit_transform(df[FEAT_COLS].fillna(0))
# NaNs -> 0, sinnvolle Skalen (log1p für stark schiefe Größen)
num_df = df[FEAT_COLS].copy()
for c in FEAT_COLS:
    if c in ("cone_numberPrefixes","cone_numberAddresses","cone_numberAsns"):
        num_df[c] = np.log1p(pd.to_numeric(num_df[c], errors="coerce").fillna(0))
    else:
        num_df[c] = pd.to_numeric(num_df[c], errors="coerce").fillna(0)

# Split
train_idx, eval_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.13,
    random_state=SEED,
    stratify=df["label_id"]
)
train_texts = df.loc[train_idx, "org_name"].tolist()
eval_texts  = df.loc[eval_idx,  "org_name"].tolist()
y_train_np  = df.loc[train_idx, "label_id"].to_numpy()
y_eval_np   = df.loc[eval_idx,  "label_id"].to_numpy()

Xnum_train = num_df.loc[train_idx].to_numpy(dtype=np.float32)
Xnum_eval  = num_df.loc[eval_idx].to_numpy(dtype=np.float32)

# Standardisieren (nur auf Train fitten!)
scaler = StandardScaler()
Xnum_train = scaler.fit_transform(Xnum_train)
Xnum_eval  = scaler.transform(Xnum_eval)

# Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenisierung (ohne Padding – das macht der Collator)
enc_train = tok(train_texts, truncation=True, max_length=MAX_LENGTH)
enc_eval  = tok(eval_texts,  truncation=True, max_length=MAX_LENGTH)

# Dataset: gibt Listen (keine Tensors) zurück -> Collator kümmert sich ums Padding/Stacking
class TextNumDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, num_feats, labels):
        self.enc = encodings
        self.num = num_feats
        self.y   = labels
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        item = {k: self.enc[k][idx] for k in self.enc}  # ids/attn as plain lists
        item["features"] = self.num[idx]
        item["labels"]   = int(self.y[idx])
        return item

ds_train = TextNumDataset(enc_train, Xnum_train, y_train_np)
ds_eval  = TextNumDataset(enc_eval,  Xnum_eval,  y_eval_np)

# Custom Collator: pad Text + stapel numerische Features
class MixedCollator:
    def __init__(self, tokenizer): self.tok = tokenizer
    def __call__(self, batch):
        text = {k: [b[k] for b in batch] for k in ["input_ids","attention_mask"]}
        text = self.tok.pad(text, return_tensors="pt")
        feats = torch.tensor([b["features"] for b in batch], dtype=torch.float)
        labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
        text["features"] = feats
        text["labels"]   = labels
        return text

collator = MixedCollator(tok)

# Class-Weights (aus Train)
class_counts = np.bincount(y_train_np, minlength=num_labels)
w = class_counts.sum() / np.maximum(class_counts, 1)
w = w / w.mean()
class_weights = torch.tensor(w, dtype=torch.float, device=device)
print("Class weights:", np.round(w, 3))

# Modell: XLM-R Encoder + mean pooling + numerische Features -> MLP-Classifier
class TextPlusNumClassifier(nn.Module):
    def __init__(self, model_name, num_labels, num_num_feats, dropout=0.2, use_attn_pool=False):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size  # z.B. 768

        # 🔸 numerische Features erst in einen eigenen Raum projizieren
        self.num_proj = nn.Sequential(
            nn.Linear(num_num_feats, 128),
            nn.ReLU(),
            nn.LayerNorm(128),
        )

        # (optional) Attention-Pooling statt Mean-Pooling
        self.use_attn_pool = use_attn_pool
        if self.use_attn_pool:
            self.attn = nn.Linear(hidden, 1)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden + 128, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, num_labels),
        )
        self.num_labels = num_labels

    def _pool(self, last_hidden, attn_mask):
        if not self.use_attn_pool:
            # Mean-Pooling
            mask = attn_mask.unsqueeze(-1)                   # [B,T,1]
            return (last_hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
        # Attention-Pooling
        scores = self.attn(last_hidden).squeeze(-1)          # [B,T]
        scores = scores.masked_fill(attn_mask == 0, -1e9)
        weights = scores.softmax(dim=-1).unsqueeze(-1)       # [B,T,1]
        return (last_hidden * weights).sum(1)                # [B,H]

    def forward(self, input_ids=None, attention_mask=None, features=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self._pool(out.last_hidden_state, attention_mask)      # [B,H]

        # 🔸 numerische Features projizieren & mit Text-Embedding konkatenieren
        num_emb = self.num_proj(features)                                # [B,128]
        z = torch.cat([pooled, num_emb], dim=1)                          # [B,H+128]

        logits = self.classifier(self.dropout(z))
        loss = None
        if labels is not None:
            # Label smoothing hilft oft bei Imbalance leicht
            loss = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)(logits, labels)
        return {"loss": loss, "logits": logits}

model = TextPlusNumClassifier(MODEL_NAME, num_labels=num_labels, num_num_feats=Xnum_train.shape[1]).to(device)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  float(accuracy_score(labels, preds)),
        "f1_macro":  float(f1_score(labels, preds, average="macro")),
        "precision": float(precision_score(labels, preds, average="macro", zero_division=0)),
        "recall":    float(recall_score(labels, preds, average="macro")),
    }

# TrainingArguments
args = TrainingArguments(
    output_dir=OUT_DIR + "/checkpoints",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    warmup_ratio=WARMUP_RATIO,
    gradient_accumulation_steps=1,
    fp16=(device.type=="cuda"),
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    report_to=["none"],
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()
print("Eval:", trainer.evaluate())

# Modelle speichern (Backbone+Head + LabelEncoder & Scaler separat speichern, falls gewünscht)
trainer.save_model(OUT_DIR + "/model")
tok.save_pretrained(OUT_DIR + "/model")

# Tipp: Speichere auch den StandardScaler, damit du bei Inferenz die numerischen Features identisch transformierst
import joblib, os
os.makedirs(OUT_DIR + "/model", exist_ok=True)
joblib.dump({"scaler": scaler, "feat_cols": FEAT_COLS, "label_encoder": le}, OUT_DIR + "/model/aux.pkl")
print("Aux artefacts saved to", OUT_DIR + "/model/aux.pkl")


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
Class weights: [0.018 0.086 0.147 0.125 1.697 0.054 0.267 0.35  6.912 0.344]


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  feats = torch.tensor([b["features"] for b in batch], dtype=torch.float)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision,Recall
1,4.1651,3.971931,0.032552,0.084598,0.090505,0.213724
2,4.0537,3.797562,0.041992,0.171292,0.236636,0.251107
3,3.8311,3.80407,0.058268,0.181972,0.205395,0.279771


In [56]:
as_rank_df.to_csv('as_rank_df.csv', index=False)

In [5]:
as_rank_df = pd.read_csv('as_rank_df.csv')