In [1]:
import json
from pathlib import Path
import pandas as pd

# locate a likely PeeringDB dump JSON in the workspace
candidates = list(Path('.').glob('*peeringdb*dump*.json')) + list(Path('.').glob('peeringdb*.json'))
if not candidates:
    raise FileNotFoundError("PeeringDB dump JSON not found. Place a file like 'peeringdb_dump.json' in the working directory.")
filepath = candidates[0]

with filepath.open('r', encoding='utf-8') as f:
    dump = json.load(f)

# extract the net.data section and load into a DataFrame
net_data = dump.get('net', {}).get('data')
if net_data is None:
    raise KeyError("JSON does not contain 'net' -> 'data' structure")

net_df = pd.DataFrame(net_data)
net_df['asn'] = net_df['asn'].astype(int)
net_df = net_df[net_df['info_type'] != '']

# show a quick preview
net_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,policy_ratio,policy_contracts,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,Not Required,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,Not Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,True,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,Required,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok


In [2]:
# show all columns in net_df (net_df is defined in a previous cell)
print(f"{len(net_df.columns)} columns:")
pd.DataFrame(net_df.columns, columns=["column"])

41 columns:


Unnamed: 0,column
0,id
1,org_id
2,name
3,aka
4,name_long
5,website
6,social_media
7,asn
8,looking_glass
9,route_server


In [3]:
def date_stats(dataframe, column_name):
    numeric_data = pd.to_datetime(dataframe[column_name]).astype(int) / 10**9
    stats = {
        'min': pd.to_datetime(numeric_data.min(), unit='s'),
        'max': pd.to_datetime(numeric_data.max(), unit='s'),
        'median': pd.to_datetime(numeric_data.median(), unit='s'),
        'mean': pd.to_datetime(numeric_data.mean(), unit='s'),
        '10-percentile': pd.to_datetime(numeric_data.quantile(0.1), unit='s'),
        '20-percentile': pd.to_datetime(numeric_data.quantile(0.2), unit='s'),
        '30-percentile': pd.to_datetime(numeric_data.quantile(0.3), unit='s'),
        '40-percentile': pd.to_datetime(numeric_data.quantile(0.4), unit='s'),
        '50-percentile': pd.to_datetime(numeric_data.quantile(0.5), unit='s'),
        '60-percentile': pd.to_datetime(numeric_data.quantile(0.6), unit='s'),
        '70-percentile': pd.to_datetime(numeric_data.quantile(0.7), unit='s'),
        '80-percentile': pd.to_datetime(numeric_data.quantile(0.8), unit='s'),
        '90-percentile': pd.to_datetime(numeric_data.quantile(0.9), unit='s'),
    }
    for key, value in stats.items():
        stats[key] = pd.to_datetime(value, unit='s')
    return stats

In [4]:
date_stats(net_df, 'updated')

{'min': Timestamp('2022-07-27 05:33:15'),
 'max': Timestamp('2025-10-22 03:59:18'),
 'median': Timestamp('2024-03-23 06:38:05.500000'),
 'mean': Timestamp('2024-01-10 22:15:54.764705792'),
 '10-percentile': Timestamp('2022-07-27 05:34:16'),
 '20-percentile': Timestamp('2022-07-27 05:35:47'),
 '30-percentile': Timestamp('2022-07-27 05:37:01.700000'),
 '40-percentile': Timestamp('2023-04-07 12:59:21'),
 '50-percentile': Timestamp('2024-03-23 06:38:05.500000'),
 '60-percentile': Timestamp('2024-09-18 05:03:41.600000'),
 '70-percentile': Timestamp('2025-02-10 14:26:01.600000'),
 '80-percentile': Timestamp('2025-05-26 12:18:01.400000'),
 '90-percentile': Timestamp('2025-08-22 10:22:03.400000')}

In [5]:
date_stats(net_df, 'created')

{'min': Timestamp('2004-07-28 00:00:00'),
 'max': Timestamp('2025-10-21 14:40:30'),
 'median': Timestamp('2019-06-05 06:46:23.500000'),
 'mean': Timestamp('2018-10-16 00:34:18.902496768'),
 '10-percentile': Timestamp('2011-10-27 13:48:24.900000'),
 '20-percentile': Timestamp('2015-09-17 13:19:00.200000'),
 '30-percentile': Timestamp('2017-02-03 04:00:07.300000'),
 '40-percentile': Timestamp('2018-04-03 09:04:01.200000'),
 '50-percentile': Timestamp('2019-06-05 06:46:23.500000'),
 '60-percentile': Timestamp('2020-07-04 02:40:40.200000'),
 '70-percentile': Timestamp('2021-08-19 12:40:45.400000'),
 '80-percentile': Timestamp('2022-12-20 15:34:28.400000'),
 '90-percentile': Timestamp('2024-05-08 20:59:50.900000')}

In [6]:
date_stats(net_df, 'rir_status_updated')

{'min': NaT,
 'max': Timestamp('2025-10-21 22:55:08'),
 'median': Timestamp('2024-06-26 04:47:55'),
 'mean': Timestamp('2024-07-16 02:43:55.537119744'),
 '10-percentile': Timestamp('2024-06-26 04:47:55'),
 '20-percentile': Timestamp('2024-06-26 04:47:55'),
 '30-percentile': Timestamp('2024-06-26 04:47:55'),
 '40-percentile': Timestamp('2024-06-26 04:47:55'),
 '50-percentile': Timestamp('2024-06-26 04:47:55'),
 '60-percentile': Timestamp('2024-06-26 04:47:55'),
 '70-percentile': Timestamp('2024-06-26 04:47:55'),
 '80-percentile': Timestamp('2024-06-26 04:47:55'),
 '90-percentile': Timestamp('2024-07-22 12:47:41')}

In [2]:
import pandas as pd
from pathlib import Path

filepath = '../nro-delegated-stats/nro-delegated-stats'

# load the delegated stats into a DataFrame
# assuming it's a pipe-separated file with no header
nro_df = pd.read_csv(filepath, sep='|', header=None, names=['rir', 'country', 'type', 'asn', 'size', 'ignore1', 'status', 'ignore2', 'ignore3'], on_bad_lines='skip')
nro_df = nro_df[nro_df['type'] == 'asn']
nro_df.dropna(inplace=True)
nro_df = nro_df[nro_df['country'] != 'ZZ']

# expand nro_df to include individual ASNs based on size
expanded_rows = []
for _, row in nro_df.iterrows():
    start_asn = int(row['asn'])
    count = row['size']
    for i in range(count):
        new_row = row.copy()
        new_row['asn'] = str(start_asn + i)
        expanded_rows.append(new_row)
nro_df = pd.DataFrame(expanded_rows)

# show a quick preview
nro_df.head()

  nro_df = pd.read_csv(filepath, sep='|', header=None, names=['rir', 'country', 'type', 'asn', 'size', 'ignore1', 'status', 'ignore2', 'ignore3'], on_bad_lines='skip')


KeyboardInterrupt: 

In [8]:
nro_df['rir'].value_counts()

ripencc    38960
arin       33517
apnic      30532
lacnic     13912
afrinic     2603
Name: rir, dtype: int64

In [9]:
nro_df[nro_df['asn'] == '3320']

Unnamed: 0,rir,country,type,asn,size,ignore1,status,ignore2,ignore3
2669,ripencc,DE,asn,3320,1,19950223,assigned,4227b4f5-07d9-4852-b1d5-26559261d043,e-stats


In [10]:
# Merge net_df with nro_df on 'asn' to add RIR and country information
net_df['asn'] = net_df['asn'].astype(str)
merged_df = net_df.merge(nro_df[['asn', 'rir', 'country']], on='asn', how='left')
merged_df.head()

Unnamed: 0,id,org_id,name,aka,name_long,website,social_media,asn,looking_glass,route_server,...,allow_ixp_update,status_dashboard,rir_status,rir_status_updated,logo,created,updated,status,rir,country
0,1,8897,GTT Communications (AS4436),Formerly known as nLayer Communications,,http://www.gtt.net,"[{'service': 'website', 'identifier': 'http://...",4436,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-07-27T05:33:22Z,ok,arin,US
1,2,14,Akamai Technologies,,,https://www.akamai.com/,"[{'service': 'website', 'identifier': 'https:/...",20940,,,...,False,https://www.akamaistatus.com/,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-10-20T12:16:12Z,ok,ripencc,NL
2,3,17,DALnet IRC Network,,,http://www.dal.net,"[{'service': 'website', 'identifier': 'http://...",31800,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-01-09T13:42:07Z,ok,arin,US
3,5,9350,Swisscom,IP-Plus,,http://www.swisscom.com,"[{'service': 'website', 'identifier': 'http://...",3303,,telnet://route-server.ip-plus.net,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2025-08-12T06:33:30Z,ok,ripencc,CH
4,6,23,Cox Communications,Cox Communications,,http://www.cox.com/peering,"[{'service': 'website', 'identifier': 'http://...",22773,,,...,False,,ok,2024-06-26T04:47:55Z,,2004-07-28T00:00:00Z,2022-11-28T22:55:17Z,ok,arin,US


In [11]:
merged_df['rir'].value_counts()

ripencc    8396
lacnic     5292
apnic      5165
arin       3758
afrinic     928
Name: rir, dtype: int64

In [2]:
import io
with open('/workspaces/pytorch-gpu-2/preprocessing/data/caida/20251001.as-org2info.txt', 'r', newline='', encoding='utf-8') as input_file:
    lines = input_file.readlines()   
    # Buffers initialisieren
    aut_lines = []
    org_lines = []
    mode = None
    total_lines = len(lines)
    aut_count = 0
    org_count = 0 

    for i, line in enumerate(lines):
        line = line.strip()
        if line.startswith("# format:aut"):
            mode = "aut"
            continue
        elif line.startswith("# format:org_id"):
            mode = "org"
            continue
        elif line.startswith("#") or not line:
            # Andere Kommentar- oder Leerzeilen überspringen
            continue      
        if mode == "aut":
            aut_lines.append(line)
            aut_count += 1
        elif mode == "org":
            org_lines.append(line)
            org_count += 1
    # StringIO-Objekte aus den gesammelten Zeilen bauen
    aut_buffer = io.StringIO("\n".join(aut_lines))
    org_buffer = io.StringIO("\n".join(org_lines))
    # DataFrames einlesen
    aut_df = pd.read_csv(aut_buffer, sep="|",
                        names=["aut", "changed", "aut_name", "org_id", "opaque_id", "source"], usecols=["aut", "org_id", "source", "changed"])
    org_df = pd.read_csv(org_buffer, sep="|",
                        names=["org_id", "changed", "org_name", "country", "source"], usecols=["org_id", "org_name", "country"])

    # Join the DataFrames
    joined_df = pd.merge(aut_df, org_df, on="org_id", how="left")


In [5]:
joined_df.head()

Unnamed: 0,aut,changed,org_id,source,org_name,country
0,1,20240618.0,LPL-141-ARIN,ARIN,"Level 3 Parent, LLC",US
1,2,20231108.0,UNIVER-19-Z-ARIN,ARIN,University of Delaware,US
2,3,20100927.0,MIT-2-ARIN,ARIN,Massachusetts Institute of Technology,US
3,4,20230929.0,USC-32-Z-ARIN,ARIN,University of Southern California,US
4,5,20200723.0,WGL-117-ARIN,ARIN,WFA Group LLC,US


In [3]:
peering_df_joined = pd.merge(net_df, joined_df, left_on='asn', right_on='aut', how='left')
peering_df_joined = peering_df_joined[['asn', 'org_name', 'country', 'source', 'info_type']]
peering_df_joined.head()

Unnamed: 0,asn,org_name,country,source,info_type
0,4436,"GTT Americas, LLC",US,ARIN,NSP
1,20940,Akamai International B.V.,NL,RIPE,Content
2,31800,DALnet,US,ARIN,Non-Profit
3,3303,Swisscom (Schweiz) AG,CH,RIPE,Cable/DSL/ISP
4,22773,Cox Communications Inc.,US,ARIN,Cable/DSL/ISP


In [7]:
peering_df_joined['info_type'].value_counts()

Cable/DSL/ISP           11787
NSP                      3982
Content                  2486
Enterprise               1721
Educational/Research     1457
Network Services          804
Route Server              623
Non-Profit                613
Government                126
Route Collector            31
Name: info_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Angenommen, dein DataFrame ist 'df' mit Spalten 'asn', 'org_name', 'country', 'source' und 'label' (die Klasse)
# Beispiel-DataFrame (ersetze durch deinen)
df = peering_df_joined
df['org_name'].fillna('Unknown', inplace=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# BERT-Tokenizer und Model laden (pre-trained bert-base-uncased)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()  # Für Inference

# Funktion für BERT-Embeddings (CLS-Token als Sentence-Embedding)
def get_bert_embedding(text, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(text), batch_size)):
        batch = text[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS-Token
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Extrahiere Embeddings für org_name
X_embeddings = get_bert_embedding(df['org_name'].tolist())

# Labels
y = df['info_type']

# Train/Test-Split
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.13, random_state=42, stratify=y)

X_train.shape, X_test.shape

# Balancing
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_bal, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
classifier.fit(X_train_bal, y_train_bal)

# Evaluation
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Anwendung auf net_df
output_df_bert = pd.DataFrame({'org_name': ['New Org LLC', 'University XYZ']})
output_df_bert['org_name'] = output_df_bert['org_name'].str.lower()
new_embeddings = get_bert_embedding(output_df_bert['org_name'].tolist())
output_df_bert['predicted_label'] = classifier.predict(new_embeddings)
print(output_df_bert)

# Optional: ClickHouse Export
output_df_bert.to_csv('predictions.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
100%|██████████| 739/739 [02:35<00:00,  4.75it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.3470052083333333
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.75      0.39      0.52      1532
             Content       0.25      0.29      0.27       323
Educational/Research       0.40      0.54      0.46       189
          Enterprise       0.16      0.28      0.21       224
          Government       0.17      0.25      0.21        16
                 NSP       0.27      0.22      0.24       518
    Network Services       0.04      0.12      0.06       105
          Non-Profit       0.15      0.36      0.21        80
     Route Collector       0.08      0.25      0.12         4
        Route Server       0.19      0.54      0.28        81

            accuracy                           0.35      3072
           macro avg       0.25      0.33      0.26      3072
        weighted avg       0.49      0.35      0.39      3072



100%|██████████| 1/1 [00:00<00:00, 38.16it/s]


         org_name       predicted_label
0     new org llc      Network Services
1  university xyz  Educational/Research


FileNotFoundError: [Errno 2] No such file or directory: 'clickhouse-client'

In [None]:
print(joined_df)
bert_df = joined_df[['aut', 'org_name']]
print(bert_df)
bert_df.fillna('unknown', inplace=True)
new_embeddings = get_bert_embedding(bert_df.tolist())
bert_df['label'] = classifier.predict(new_embeddings)
print(bert_df)
bert_df.to_csv('all_asn.csv', index=False)
bert_df.head()

           aut     changed            org_id source  \
0            1  20240618.0      LPL-141-ARIN   ARIN   
1            2  20231108.0  UNIVER-19-Z-ARIN   ARIN   
2            3  20100927.0        MIT-2-ARIN   ARIN   
3            4  20230929.0     USC-32-Z-ARIN   ARIN   
4            5  20200723.0      WGL-117-ARIN   ARIN   
...        ...         ...               ...    ...   
119408  402035  20250930.0      SL-2249-ARIN   ARIN   
119409  402036  20251001.0      GOAUT-1-ARIN   ARIN   
119410  402037  20251001.0       @del-402037   ARIN   
119411  402331  20250418.0      UHS-206-ARIN   ARIN   
119412  402332  20241030.0       IH-165-ARIN   ARIN   

                                     org_name country  
0                         Level 3 Parent, LLC      US  
1                      University of Delaware      US  
2       Massachusetts Institute of Technology      US  
3           University of Southern California      US  
4                               WFA Group LLC      US  
...

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_df.fillna('unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_df['label'] = classifier.predict(new_embeddings)


           aut                               org_name                 label
0            1                    Level 3 Parent, LLC               Content
1            2                 University of Delaware  Educational/Research
2            3  Massachusetts Institute of Technology  Educational/Research
3            4      University of Southern California  Educational/Research
4            5                          WFA Group LLC            Enterprise
...        ...                                    ...                   ...
119408  402035             Sergeant Laboratories, Inc               Content
119409  402036                         GoAutomate Inc            Enterprise
119410  402037                                unknown  Educational/Research
119411  402331              UNIVERSAL HEALTH SERVICES          Route Server
119412  402332                     IQVIA Holdings Inc            Non-Profit

[119413 rows x 3 columns]


Unnamed: 0,aut,org_name,label
0,1,"Level 3 Parent, LLC",Content
1,2,University of Delaware,Educational/Research
2,3,Massachusetts Institute of Technology,Educational/Research
3,4,University of Southern California,Educational/Research
4,5,WFA Group LLC,Enterprise


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
import sys


# Initialize parallel_pandas

# Prüfe GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU-Name: {torch.cuda.get_device_name(0)}")
else:
    print("Warnung: Keine GPU verfügbar, CPU wird verwendet.")

# DataFrame (dein echter Datensatz, hier Beispiel
df = peering_df_joined
# Preprocessing
df['org_name'] = df['org_name'].fillna('Unknown').str.lower()

# Filtere Klassen mit <2 Einträgen
class_counts = df['info_type'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['info_type'].isin(valid_classes)]
print(df)
print(f"Verwendete Klassen: {valid_classes.tolist()}")
print(f"DataFrame nach Filterung: {len(df)} Zeilen")

# Deduplizierung
unique_df = df.drop_duplicates(subset=['org_name'])
print(f"Eindeutige org_name: {len(unique_df)}")


# --- BERT-Modell ---
print("\n=== BERT-Modell ===")
# BERT-Tokenizer und Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.eval()

# BERT Embeddings
def get_bert_embedding(text, batch_size=64):
    embeddings = []
    for i in tqdm(range(0, len(text), batch_size), desc="BERT-Embeddings"):
        batch = text[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Training
X_bert = get_bert_embedding(unique_df['org_name'].tolist())
y = unique_df['info_type']
X_train_bert, X_test_bert, y_train, y_test = train_test_split(X_bert, y, test_size=0.13, random_state=42, stratify=y)

# Balancing
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_bert, y_train)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_bal, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_bert = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_bert.fit(X_train_bal, y_train_bal)

# Evaluation
y_pred_bert = classifier_bert.predict(X_test_bert)
print("BERT Accuracy:", accuracy_score(y_test, y_pred_bert))
print("BERT Classification Report:")
print(classification_report(y_test, y_pred_bert))

# --- TF-IDF-Modell ---
print("\n=== TF-IDF-Modell ===")
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3), lowercase=True)
X_tfidf = vectorizer.fit_transform(unique_df['org_name'])
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.13, random_state=42, stratify=y)

# Balancing
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)
X_train_bal, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_tfidf = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_tfidf.fit(X_train_bal, y_train_bal)

# Evaluation
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

# Ad-hoc-Test
test_names = ['Google LLC', 'University XYZ', 'Verizon Business', 'Cloudflare, Inc.', 'MIT', 'Amazon.com, Inc.', 'Unknown Org']
test_df = pd.DataFrame({'org_name': test_names})
test_df['org_name'] = test_df['org_name'].str.lower()
print(f"Test-Datensatz Länge: {len(test_df)}")

# BERT Vorhersagen
test_embeddings = get_bert_embedding(test_df['org_name'].tolist())
print(f"BERT Embedding Länge: {test_embeddings.shape[0]}")
test_df['predicted_label_bert'] = classifier_bert.predict(test_embeddings)

# TF-IDF Vorhersagen
test_tfidf = vectorizer.transform(test_df['org_name'])
test_df['predicted_label_tfidf'] = classifier_tfidf.predict(test_tfidf)

print("\nAd-hoc-Test Ergebnisse:")
print(test_df[['org_name', 'predicted_label_bert', 'predicted_label_tfidf']])

# ClickHouse Export
test_df.to_csv('predictions.csv', index=False)

Device: cuda
GPU-Name: NVIDIA GeForce RTX 4070 Laptop GPU
          asn                                           org_name country  \
0        4436                                  gtt americas, llc      US   
1       20940                          akamai international b.v.      NL   
2       31800                                             dalnet      US   
3        3303                              swisscom (schweiz) ag      CH   
4       22773                            cox communications inc.      US   
...       ...                                                ...     ...   
23625  154232  max technology & support services private limited      IN   
23626  204856                                            unknown     NaN   
23627  204917                                            unknown     NaN   
23628  210796                                    bjoern schleyer      DE   
23629  400926                                       kiwi telecom      US   

      source             info

BERT-Embeddings: 100%|██████████| 326/326 [00:09<00:00, 35.55it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


BERT Accuracy: 0.34403839055001845
BERT Classification Report:
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.77      0.41      0.53      1418
             Content       0.21      0.29      0.24       277
Educational/Research       0.38      0.47      0.42       170
          Enterprise       0.15      0.23      0.18       201
          Government       0.38      0.50      0.43        16
                 NSP       0.24      0.23      0.23       436
    Network Services       0.03      0.12      0.05        91
          Non-Profit       0.11      0.31      0.16        61
     Route Collector       0.00      0.00      0.00         2
        Route Server       0.10      0.32      0.15        37

            accuracy                           0.34      2709
           macro avg       0.24      0.29      0.24      2709
        weighted avg       0.51      0.34      0.39      2709


=== TF-IDF-Modell ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TF-IDF Accuracy: 0.5197489848652639
TF-IDF Classification Report:
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.65      0.78      0.71      1418
             Content       0.27      0.31      0.29       277
Educational/Research       0.48      0.43      0.45       170
          Enterprise       0.23      0.14      0.17       201
          Government       0.33      0.19      0.24        16
                 NSP       0.26      0.21      0.23       436
    Network Services       0.06      0.01      0.02        91
          Non-Profit       0.50      0.13      0.21        61
     Route Collector       0.00      0.00      0.00         2
        Route Server       0.57      0.32      0.41        37

            accuracy                           0.52      2709
           macro avg       0.34      0.25      0.27      2709
        weighted avg       0.48      0.52      0.49      2709

Test-Datensatz Länge: 7


BERT-Embeddings: 100%|██████████| 1/1 [00:00<00:00, 77.71it/s]

BERT Embedding Länge: 7

Ad-hoc-Test Ergebnisse:
           org_name  predicted_label_bert predicted_label_tfidf
0        google llc  Educational/Research               Content
1    university xyz      Network Services  Educational/Research
2  verizon business            Enterprise                   NSP
3  cloudflare, inc.               Content               Content
4               mit  Educational/Research         Cable/DSL/ISP
5  amazon.com, inc.               Content            Enterprise
6       unknown org            Non-Profit            Non-Profit





In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertModel
from rank_bm25 import BM25Okapi
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
import sys

# DataFrame (dein echter Datensatz)
df = peering_df_joined

# Preprocessing
df['org_name'] = df['org_name'].fillna('unknown').str.lower()


# Filtere Klassen mit <2 Einträgen
class_counts = df['info_type'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['info_type'].isin(valid_classes)]
print(f"Verwendete Klassen: {valid_classes.tolist()}")
print(f"DataFrame nach Filterung: {len(df)} Zeilen")

# Deduplizierung
unique_df = df.drop_duplicates(subset=['org_name'])
print(f"Eindeutige org_name: {len(unique_df)}")

# Prüfe Klassenanzahl
if len(unique_df['info_type'].unique()) < 2:
    print("Fehler: Nur eine Klasse nach Aggregation. Verwende stratify=None.")
    stratify = None
else:
    stratify = unique_df['info_type']

# --- BERT-Modell ---
print("\n=== BERT-Modell ===")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

def get_bert_embedding(text, batch_size=64):
    embeddings = []
    for i in tqdm(range(0, len(text), batch_size), desc="BERT-Embeddings"):
        batch = text[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

X_bert = get_bert_embedding(unique_df['org_name'].tolist())
y = unique_df['info_type']
X_train_bert, X_test_bert, y_train, y_test = train_test_split(X_bert, y, test_size=0.13, random_state=42, stratify=stratify)

# Skalierung
scaler = StandardScaler()
X_train_bal_bert = scaler.fit_transform(X_train_bert)
X_test_bert = scaler.transform(X_test_bert)

# Balancing
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_bal_bert, y_train)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_bal_bert, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_bert = LogisticRegression(max_iter=5000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_bert.fit(X_train_bal_bert, y_train_bal)

# Evaluation
y_pred_bert = classifier_bert.predict(X_test_bert)
print("BERT Accuracy:", accuracy_score(y_test, y_pred_bert))
print("BERT Classification Report:")
print(classification_report(y_test, y_pred_bert, zero_division=0))

# --- TF-IDF-Modell ---
print("\n=== TF-IDF-Modell ===")
vectorizer = TfidfVectorizer(ngram_range=(1, 3), lowercase=True, max_features=5000)
X_tfidf = vectorizer.fit_transform(unique_df['org_name'])
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.13, random_state=42, stratify=stratify)

# Balancing
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)
X_train_bal_tfidf, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_tfidf = LogisticRegression(max_iter=5000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_tfidf.fit(X_train_bal_tfidf, y_train_bal)

# Evaluation
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf)
print("TF-IDF Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf, zero_division=0))

# --- BM25-Modell ---
print("\n=== BM25-Modell ===")
# Tokenisierung für BM25
tokenized_corpus = [doc.split() for doc in unique_df['org_name']]
bm25 = BM25Okapi(tokenized_corpus)

# BM25-Features berechnen
X_bm25 = np.array([bm25.get_scores(doc.split()) for doc in unique_df['org_name']])
X_train_bm25, X_test_bm25, y_train, y_test = train_test_split(X_bm25, y, test_size=0.13, random_state=42, stratify=stratify)

# Balancing
X_train_res, y_train_res = smote.fit_resample(X_train_bm25, y_train)
X_train_bal_bm25, y_train_bal = rus.fit_resample(X_train_res, y_train_res)

# Classifier
classifier_bm25 = LogisticRegression(max_iter=5000, multi_class='multinomial', solver='lbfgs', class_weight='balanced')
classifier_bm25.fit(X_train_bal_bm25, y_train_bal)

# Evaluation
y_pred_bm25 = classifier_bm25.predict(X_test_bm25)
print("BM25 Accuracy:", accuracy_score(y_test, y_pred_bm25))
print("BM25 Classification Report:")
print(classification_report(y_test, y_pred_bm25, zero_division=0))

# Ad-hoc-Test
test_names = ['Google LLC', 'University XYZ', 'Verizon Business', 'Cloudflare, Inc.', 'MIT', 'Amazon.com, Inc.', 'Unknown Org']
test_df = pd.DataFrame({'org_name': test_names})
test_df['org_name'] = test_df['org_name'].str.lower()

# BERT Vorhersagen
test_embeddings = get_bert_embedding(test_df['org_name'].tolist())
test_df['predicted_label_bert'] = classifier_bert.predict(scaler.transform(test_embeddings))

# TF-IDF Vorhersagen
test_tfidf = vectorizer.transform(test_df['org_name'])
test_df['predicted_label_tfidf'] = classifier_tfidf.predict(test_tfidf)

# BM25 Vorhersagen
test_bm25 = np.array([bm25.get_scores(doc.split()) for doc in test_df['org_name']])
test_df['predicted_label_bm25'] = classifier_bm25.predict(test_bm25)

# Kombiniere regelbasierte und modellbasierte Vorhersagen
test_df['final_label'] = test_df.apply(lambda x: x['rule_label'] if x['rule_label'] else x['predicted_label_tfidf'], axis=1)

print("\nAd-hoc-Test Ergebnisse:")
print(test_df[['org_name', 'rule_label', 'predicted_label_bert', 'predicted_label_tfidf', 'predicted_label_bm25', 'final_label']])

# ClickHouse Export
test_df.to_csv('predictions.csv', index=False)

Verwendete Klassen: ['Cable/DSL/ISP', 'NSP', 'Content', 'Enterprise', 'Educational/Research', 'Network Services', 'Route Server', 'Non-Profit', 'Government', 'Route Collector']
DataFrame nach Filterung: 23630 Zeilen
Eindeutige org_name: 20832

=== BERT-Modell ===


BERT-Embeddings: 100%|██████████| 326/326 [02:52<00:00,  1.89it/s]


BERT Accuracy: 0.3359173126614987
BERT Classification Report:
                      precision    recall  f1-score   support

       Cable/DSL/ISP       0.76      0.40      0.53      1418
             Content       0.21      0.27      0.23       277
Educational/Research       0.33      0.43      0.37       170
          Enterprise       0.15      0.23      0.18       201
          Government       0.36      0.56      0.44        16
                 NSP       0.24      0.22      0.23       436
    Network Services       0.04      0.13      0.06        91
          Non-Profit       0.08      0.25      0.13        61
     Route Collector       0.00      0.00      0.00         2
        Route Server       0.08      0.27      0.12        37

            accuracy                           0.34      2709
           macro avg       0.22      0.28      0.23      2709
        weighted avg       0.50      0.34      0.38      2709


=== TF-IDF-Modell ===
TF-IDF Accuracy: 0.39313399778516056
TF-IDF 

KeyboardInterrupt: 