In [1]:
import numpy as np
import pandas as pd
import os

# Для классификации
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Для балансировки классов
#!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Модели
#!pip install xgboost lightgbm
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Для аномалий
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import IsolationForest

Loading dataset from csv, adding labels and device ids.

In [33]:
# Папка, где лежат файлы датасета

# Пока берется только первое устройство, что бы добавить все, нужно убрать проверку на == "1" 
DATA_DIR = "archive"  
csv_files = [i for i in os.listdir(DATA_DIR) if i.split(".")[-1] == "csv" and i.split(".")[0].isdigit() and i.split(".")[0] == "1"]


def get_label_and_device(filename: str) -> tuple[str]:
    """
    Определяем метку (label) на основе названия файла.
    """
    filename = filename.split(".")
    return filename[1] if filename[1] == "benign" else filename[1] + "_" +  filename[2], int(filename[0]) 


    
df_list = []
for file in csv_files:
    print(file)
    temp_df = pd.read_csv(os.path.join(DATA_DIR, file))
    # Добавим столбец с меткой (если её нет в самих данных)
    print(temp_df.shape)
    label, device = get_label_and_device(file)
    print(label,  device)
    temp_df['label'], temp_df['device_id'] = label, device 
    
    df_list.append(temp_df.copy())


# Объединяем все фреймы в один
df = pd.concat(df_list, ignore_index=True)

print("Размер объединённого датасета:", df.shape)
print(df.head())

1.mirai.syn.csv
(122573, 115)
mirai_syn 1
1.benign.csv
(49548, 115)
benign 1
1.gafgyt.scan.csv
(29849, 115)
gafgyt_scan 1
1.gafgyt.combo.csv
(59718, 115)
gafgyt_combo 1
1.mirai.ack.csv
(102195, 115)
mirai_ack 1
1.gafgyt.udp.csv
(105874, 115)
gafgyt_udp 1
1.mirai.scan.csv
(107685, 115)
mirai_scan 1
1.mirai.udpplain.csv
(81982, 115)
mirai_udpplain 1
1.gafgyt.junk.csv
(29068, 115)
gafgyt_junk 1
1.mirai.udp.csv
(237665, 115)
mirai_udp 1
1.gafgyt.tcp.csv
(92141, 115)
gafgyt_tcp 1
Размер объединённого датасета: (1018298, 117)
   MI_dir_L5_weight  MI_dir_L5_mean  MI_dir_L5_variance  MI_dir_L3_weight  \
0          1.000000            74.0                 0.0          1.000000   
1          1.992577            74.0                 0.0          1.995540   
2          2.921050            74.0                 0.0          2.952246   
3          3.921040            74.0                 0.0          3.952240   
4          4.921040            74.0                 0.0          4.952240   

   MI_dir_L

Clear null values.

In [34]:
# Проверка пропусков
missing_count = df.isnull().sum()
print("Число пропусков в каждом признаке:\n", missing_count[missing_count > 0])


# Здесь выберем заполнение медианой:
df.fillna(df.median(numeric_only=True), inplace=True)

# Повторно проверим пропуски
print("\nПосле заполнения пропусков:\n", df.isnull().sum().max())


Число пропусков в каждом признаке:
 Series([], dtype: int64)

После заполнения пропусков:
 0


Clear duplicates.

In [35]:
initial_size = df.shape[0]
df.drop_duplicates(inplace=True)
print("Удалено дубликатов:", initial_size - df.shape[0])


Удалено дубликатов: 21759


Clear by using iqr.

In [36]:
def remove_outliers_iqr(data, columns, k=1.5):
    """Удаление выбросов по методу IQR для указанных столбцов."""
    for col in columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR
        # Оставляем только те, что внутри границ
        data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
    return data

# Сохраним в отдельный датафрейм "df_classif" для классификации (чтобы не портить общий DataFrame для аномалий):
df_classif = df.copy()

# # Уберем выбросы по всем числовым признакам, кроме целевого:
# features_numeric = df_classif.select_dtypes(include=[np.number]).columns
# # Предположим, что у нас последняя колонка 'label' — это целевой признак:
# features_numeric = [col for col in features_numeric if col != 'label']

# df_classif = remove_outliers_iqr(df_classif, features_numeric, k=1.5)
# print("Размер датасета после удаления выбросов (для классификации):", df_classif.shape)


In [37]:
names = {}
for i in df_classif["label"]:
    if names.get(i) is None:
        names[i] = 0
    else:
        names[i] += 1
print(names)

{'mirai_syn': 122572, 'benign': 40394, 'gafgyt_scan': 29848, 'gafgyt_combo': 59717, 'mirai_ack': 102194, 'gafgyt_udp': 100181, 'mirai_scan': 107684, 'mirai_udpplain': 81981, 'gafgyt_junk': 29067, 'mirai_udp': 237664, 'gafgyt_tcp': 85226}


# Binary XGBoost / LightGBM classification

Adding binary lables, where benign is 0 and all malicious is 1

In [38]:
df_binary = df_classif.copy()



# Создадим новый столбец binary_label, где 0 = benign, 1 = malicious
df_binary['binary_label'] = df_binary['label'].apply(lambda x: 0 if x.lower() == 'benign' else 1)

# Уберем исходный label, оставив его только как справочный
df_binary.drop(columns=['label'], inplace=True)

print(df_binary['binary_label'].value_counts())

# print([i for i in df_binary['binary_label']])

binary_label
1    956144
0     40395
Name: count, dtype: int64


Separating train and tets dataset, and applying SMOTE to balance classes

In [39]:
X_binary = df_binary.drop(columns=['binary_label'])
y_binary = df_binary['binary_label']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_binary, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

print("Train size:", X_train_b.shape, "Test size:", X_test_b.shape)

# print([i for i in y_train_b])

smote = SMOTE(random_state=42)
X_train_b_bal, y_train_b_bal = smote.fit_resample(X_train_b, y_train_b)
print("После SMOTE:", np.bincount(y_train_b_bal))

Train size: (797231, 116) Test size: (199308, 116)
После SMOTE: [764915 764915]


Train XGBoost classifier

In [40]:
# Пример: XGBoost
xgb_model_b = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb_model_b.fit(X_train_b_bal, y_train_b_bal)

# Предсказание
y_pred_b = xgb_model_b.predict(X_test_b)

# Оценка
print("=== XGBoost (Binary) ===")
print("Accuracy:", accuracy_score(y_test_b, y_pred_b))
print("F1-score:", f1_score(y_test_b, y_pred_b))
print(confusion_matrix(y_test_b, y_pred_b))
print(classification_report(y_test_b, y_pred_b))


=== XGBoost (Binary) ===
Accuracy: 0.9999899652798684
F1-score: 0.9999947706676289
[[  8078      1]
 [     1 191228]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8079
           1       1.00      1.00      1.00    191229

    accuracy                           1.00    199308
   macro avg       1.00      1.00      1.00    199308
weighted avg       1.00      1.00      1.00    199308



Train LightGBM classifier

In [41]:
lgb_model_b = LGBMClassifier(n_estimators=100, max_depth=6, random_state=42)
lgb_model_b.fit(X_train_b_bal, y_train_b_bal)

y_pred_b_lgb = lgb_model_b.predict(X_test_b)

print("=== LightGBM (Binary) ===")
print("Accuracy:", accuracy_score(y_test_b, y_pred_b_lgb))
print("F1-score:", f1_score(y_test_b, y_pred_b_lgb))
print(confusion_matrix(y_test_b, y_pred_b_lgb))
print(classification_report(y_test_b, y_pred_b_lgb))


[LightGBM] [Info] Number of positive: 764915, number of negative: 764915
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28764
[LightGBM] [Info] Number of data points in the train set: 1529830, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
=== LightGBM (Binary) ===
Accuracy: 0.9999849479198025
F1-score: 0.9999921559399145
[[  8079      0]
 [     3 191226]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8079
           1       1.00      1.00      1.00    191229

    accuracy                           1.00    199308
   macro avg       1.00      1.00      1.00    199308
weighted avg       1.00      1.00      1.00    199308



# Multiclass XGBoost / LightGBM classification

Adding multiclass lables

In [42]:
df_multi = df_classif.copy()
# Преобразуем строковые метки в числа
# Пример: benign -> 0, mirai_ack -> 1, mirai_scan -> 2, ...
le = LabelEncoder()
df_multi['multi_label'] = le.fit_transform(df_multi['label'])

# При желании можно удалить старый столбец label
df_multi.drop(columns=['label'], inplace=True)

print(df_multi['multi_label'].value_counts())
print("Коды меток:", list(le.classes_))


multi_label
9     237665
8     122573
7     107685
6     102195
5     100182
4      85227
10     81982
1      59718
0      40395
3      29849
2      29068
Name: count, dtype: int64
Коды меток: ['benign', 'gafgyt_combo', 'gafgyt_junk', 'gafgyt_scan', 'gafgyt_tcp', 'gafgyt_udp', 'mirai_ack', 'mirai_scan', 'mirai_syn', 'mirai_udp', 'mirai_udpplain']


Separating train and tets dataset, and applying SMOTE to balance classes

In [43]:
X_multi = df_multi.drop(columns=['multi_label'])
y_multi = df_multi['multi_label']

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)

print("Train size:", X_train_m.shape, "Test size:", X_test_m.shape)


smote_m = SMOTE(random_state=42)
X_train_m_bal, y_train_m_bal = smote_m.fit_resample(X_train_m, y_train_m)
print("После SMOTE (мультикласс):", np.bincount(y_train_m_bal))



Train size: (797231, 116) Test size: (199308, 116)
После SMOTE (мультикласс): [190132 190132 190132 190132 190132 190132 190132 190132 190132 190132
 190132]


Train XGBoost classifier

In [44]:
xgb_model_m = XGBClassifier(n_estimators=100, max_depth=6, random_state=42, objective='multi:softmax')
xgb_model_m.fit(X_train_m_bal, y_train_m_bal)

y_pred_m = xgb_model_m.predict(X_test_m)

print("=== XGBoost (Multi) ===")
print("Accuracy:", accuracy_score(y_test_m, y_pred_m))
print("F1-score (macro):", f1_score(y_test_m, y_pred_m, average='macro'))
print(confusion_matrix(y_test_m, y_pred_m))
print(classification_report(y_test_m, y_pred_m))


=== XGBoost (Multi) ===
Accuracy: 0.9998193750376302
F1-score (macro): 0.9997632850233248
[[ 8079     0     0     0     0     0     0     0     0     0     0]
 [    0 11937     5     0     2     0     0     0     0     0     0]
 [    0     0  5813     0     1     0     0     0     0     0     0]
 [    0     0     0  5970     0     0     0     0     0     0     0]
 [    0     0     0     1 17022    22     0     0     0     0     0]
 [    2     0     0     0     1 20033     0     0     0     0     0]
 [    0     0     0     0     0     0 20439     0     0     0     0]
 [    0     0     0     0     0     0     0 21537     0     0     0]
 [    0     0     0     0     0     0     0     0 24515     0     0]
 [    0     0     0     0     0     0     0     0     0 47533     0]
 [    0     0     1     0     0     0     1     0     0     0 16394]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8079
           1       1.00      1.00      1.

Train LightGBM classifier

In [45]:
lgb_model_m = LGBMClassifier(n_estimators=100, max_depth=6, random_state=42)
lgb_model_m.fit(X_train_m_bal, y_train_m_bal)

y_pred_m_lgb = lgb_model_m.predict(X_test_m)

print("=== LightGBM (Multi) ===")
print("Accuracy:", accuracy_score(y_test_m, y_pred_m_lgb))
print("F1-score (macro):", f1_score(y_test_m, y_pred_m_lgb, average='macro'))
print(confusion_matrix(y_test_m, y_pred_m_lgb))
print(classification_report(y_test_m, y_pred_m_lgb))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28993
[LightGBM] [Info] Number of data points in the train set: 2091452, number of used features: 115
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
[LightGBM] [Info] Start training from score -2.397895
=== LightGBM (Multi) ===
Accuracy: 0.997039757561

# Isolation Forest for searching anomalies in data 

Preparing lables and separating train/test datasets.

In [46]:
df_ano = df.copy()  # Предположим, что здесь уже нет пропусков

# Отделим benign-трафик
df_benign = df_ano[df_ano['label'].str.lower() == 'benign'].copy()
df_malicious = df_ano[df_ano['label'].str.lower() != 'benign'].copy()

# X_ano_train = только benign
X_ano_train = df_benign.drop(columns=['label'])
# X_ano_test = benign + malicious (для проверки)
X_ano_test = df_ano.drop(columns=['label'])
y_ano_test = df_ano['label'].apply(lambda x: 0 if x.lower() == 'benign' else 1)  
# 0 = нормальный, 1 = аномалия

print("benign train shape:", X_ano_train.shape, "test shape:", X_ano_test.shape)


benign train shape: (40395, 116) test shape: (996539, 116)


Train Isolation Forest

In [47]:
iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)

# Обучаем только на benign
iso_forest.fit(X_ano_train)

# Предсказываем: -1 = аномалия, 1 = нормально
y_pred_if = iso_forest.predict(X_ano_test)

# Переведём: 1 -> 0 (норма), -1 -> 1 (аномалия)
y_pred_if = np.where(y_pred_if == 1, 0, 1)

print("=== Isolation Forest Anomaly Detection ===")
print("Accuracy:", accuracy_score(y_ano_test, y_pred_if))
print("F1-score:", f1_score(y_ano_test, y_pred_if))
print(confusion_matrix(y_ano_test, y_pred_if))
print(classification_report(y_ano_test, y_pred_if))


=== Isolation Forest Anomaly Detection ===
Accuracy: 0.9977000398378789
F1-score: 0.9988028681023893
[[ 38105   2290]
 [     2 956142]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97     40395
           1       1.00      1.00      1.00    956144

    accuracy                           1.00    996539
   macro avg       1.00      0.97      0.98    996539
weighted avg       1.00      1.00      1.00    996539

