

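*   Binary classification of the CIC IoT Dataset 2023 intrusion data: `BenignTraffic` versus all other traffic.
*   Compares per-class metrics, training time and prediction time of several classifiers.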



In [None]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the intrusion CSV. The original absolute path is kept as the default so
# existing setups keep working; set the IOT_INTRUSION_CSV environment variable to load
# the file from a different location without editing the notebook.
DATA_PATH = os.environ.get(
    "IOT_INTRUSION_CSV",
    "/home/nail/Documents/Natalia/IoT/Datas/CIC_IOT_Dataset2023/IoT_Intrusion/IoT_Intrusion.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Features: every column except the last one, with missing values replaced by 0
# and the row index reset to 0..n-1.
feature_columns = df.columns[:-1]
X = df[feature_columns].fillna(0).reset_index(drop=True)

# Target: boolean Series that is True where the 'label' column equals 'BenignTraffic'.
y = df['label'].eq('BenignTraffic')

In [None]:

# Drop constant features: a column is kept only if at least one row differs
# from the value found in the first row of that column.
first_row = X.iloc[0]
is_variable = X.ne(first_row).any()
X = X.loc[:, is_variable]

# Standardize the features (recommended for algorithms that are sensitive to feature scale).
# NOTE(review): the scaler is fit on the whole dataset, before the train/validation/test
# split done in the next cell, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split into train / validation / test: 30% is held out first, then that hold-out
# is halved into validation and test. Both splits use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, test_size=0.3, random_state=5
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=5
)

# Feature selection: keep the N best features as scored by f_classif.
# The selector is fitted on the training split only and then reused for the other splits.
N = 10  # number of features to keep
selector = SelectKBest(score_func=f_classif, k=N)
X_train_selected = selector.fit_transform(X_train, y_train)
X_val_selected, X_test_selected = (
    selector.transform(held_out) for held_out in (X_val, X_test)
)

# Report the size of each split.
print(f"Train size: {len(X_train_selected)}, Validation size: {len(X_val_selected)}, Test size: {len(X_test_selected)}")


Train size: 734002, Validation size: 157286, Test size: 157287


In [None]:
import time

# Train a random forest on the selected features. The fit is performed once and timed;
# random_state is fixed so the reported accuracy is reproducible between runs.
model = RandomForestClassifier(random_state=42)
start_train_time = time.perf_counter()
model.fit(X_train_selected, y_train)
end_train_time = time.perf_counter()

# Predict the validation split once (timed). The same predictions are used for both
# the accuracy score and the shape check below.
start_predict_time = time.perf_counter()
y_pred = model.predict(X_val_selected)
end_predict_time = time.perf_counter()

# Accuracy on the validation split.
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy with top {N} features: {accuracy:.4f}")

# Shapes of the true and predicted label vectors.
print(f"Размеры y_val: {y_val.shape}, y_pred: {y_pred.shape}")

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

Accuracy with top 10 features: 0.9911
Размеры y_val: (157286,), y_pred_logreg: (157286,)
Training time: 55.1111 seconds
Prediction time: 0.7286 seconds


In [None]:
# Convert each split to PyTorch tensors for later use in neural networks:
# float32 tensors for the selected features, long (int64) tensors for the labels.
def _as_tensor_pair(features, labels):
    """Return (float32 feature tensor, long label tensor) for one data split."""
    inputs = torch.tensor(features, dtype=torch.float32)
    targets = torch.tensor(labels.to_numpy(), dtype=torch.long)
    return inputs, targets


train_input, train_label = _as_tensor_pair(X_train_selected, y_train)
val_input, val_label = _as_tensor_pair(X_val_selected, y_val)
test_input, test_label = _as_tensor_pair(X_test_selected, y_test)

# Report the shapes of the resulting tensors.
print(f"Train input shape: {train_input.shape}, Train label shape: {train_label.shape}")
print(f"Validation input shape: {val_input.shape}, Validation label shape: {val_label.shape}")
print(f"Test input shape: {test_input.shape}, Test label shape: {test_label.shape}")

Train input shape: torch.Size([734002, 10]), Train label shape: torch.Size([734002])
Validation input shape: torch.Size([157286, 10]), Validation label shape: torch.Size([157286])
Test input shape: torch.Size([157287, 10]), Test label shape: torch.Size([157287])


In [None]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

# Logistic-regression baseline on the top-N selected features.
# Assumes the DataFrame `df` has already been loaded.

# Features: every column except the last, with missing values replaced by 0.
X = df[df.columns[0:-1]].fillna(0).reset_index(drop=True)
y = (df['label'] == 'BenignTraffic').astype(int)  # binary target: 1 = 'BenignTraffic', 0 = anything else

# Drop constant features (columns in which no row differs from the first row).
X = X.loc[:, (X != X.iloc[0]).any()]

# Split BEFORE scaling so that validation/test rows cannot influence the scaler.
# The row partition depends only on the number of rows and random_state, so it is the
# same partition as when the already-scaled matrix was split.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=5)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=5)

# Standardize using statistics of the training split only, then apply the same
# transformation to the held-out splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_temp = scaler.transform(X_temp_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that later code relying on X_scaled still finds it; it is the full feature
# matrix scaled with the training-split statistics.
X_scaled = scaler.transform(X)

# Keep the N best features as scored by f_classif; the selector is fitted on the
# training split and reused for validation and test so all three stay consistent.
N = 10
selector = SelectKBest(score_func=f_classif, k=N)
X_train_selected = selector.fit_transform(X_train, y_train)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Train logistic regression (timed). max_iter is raised to 1000 to give the solver
# more iterations to converge.
start_train_time = time.perf_counter()
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_selected, y_train)
end_train_time = time.perf_counter()

# Predict the test split (timed).
start_predict_time = time.perf_counter()
y_pred_logreg = logreg.predict(X_test_selected)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg))

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy: {accuracy:.4f}")

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

Logistic Regression:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    153679
           1       0.66      0.57      0.61      3608

    accuracy                           0.98    157287
   macro avg       0.82      0.78      0.80    157287
weighted avg       0.98      0.98      0.98    157287

Accuracy: 0.9833
Training time: 4.2741 seconds
Prediction time: 0.0036 seconds


In [None]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Benchmark a random forest on the scaled training split (all features, no selection).
# random_state is fixed, as for the other seeded estimators in this notebook, so the
# report below is reproducible across kernel restarts.
start_train_time = time.perf_counter()
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_rf = rf_clf.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

Random Forest Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.93      0.94      0.94      3608

    accuracy                           1.00    157287
   macro avg       0.97      0.97      0.97    157287
weighted avg       1.00      1.00      1.00    157287

Training time: 83.1187 seconds
Prediction time: 0.6880 seconds


In [None]:
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Benchmark a single decision tree on the scaled training split.
# random_state is fixed, as for the other seeded estimators in this notebook, so the
# fitted tree and the report below are reproducible across kernel restarts.
start_train_time = time.perf_counter()
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_dt = dt_clf.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Decision Tree Classifier:")
print(classification_report(y_test, y_pred_dt))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

Decision Tree Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.91      0.92      0.92      3608

    accuracy                           1.00    157287
   macro avg       0.96      0.96      0.96    157287
weighted avg       1.00      1.00      1.00    157287

Training time: 4.9156 seconds
Prediction time: 0.0123 seconds


In [None]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Fit a default k-nearest-neighbours classifier on the training split;
# the timed section covers construction and fit together.
start_train_time = time.perf_counter()
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_knn = knn_clf.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("K-Nearest Neighbors Classifier:")
print(classification_report(y_test, y_pred_knn))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

K-Nearest Neighbors Classifier:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    153679
           1       0.79      0.88      0.83      3608

    accuracy                           0.99    157287
   macro avg       0.89      0.94      0.91    157287
weighted avg       0.99      0.99      0.99    157287

Training time: 0.0411 seconds
Prediction time: 204.9660 seconds


In [None]:
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Benchmark gradient boosting on the scaled training split.
# random_state is fixed, as for the other seeded estimators in this notebook, so the
# report below is reproducible across kernel restarts.
start_train_time = time.perf_counter()
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_gb = gb_clf.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Gradient Boosting Classifier:")
print(classification_report(y_test, y_pred_gb))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")


Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.92      0.94      0.93      3608

    accuracy                           1.00    157287
   macro avg       0.96      0.97      0.96    157287
weighted avg       1.00      1.00      1.00    157287

Training time: 352.8267 seconds
Prediction time: 0.1837 seconds


In [None]:
import time
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Fit a default XGBoost classifier on the training split;
# the timed section covers construction and fit together.
start_train_time = time.perf_counter()
xgb_clf = XGBClassifier().fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_xgb = xgb_clf.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("XGB Classifier:")
print(classification_report(y_test, y_pred_xgb))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

XGB Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.91      0.95      0.93      3608

    accuracy                           1.00    157287
   macro avg       0.95      0.97      0.96    157287
weighted avg       1.00      1.00      1.00    157287

Training time: 3.9836 seconds
Prediction time: 0.0729 seconds


In [None]:
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Fit a Gaussian naive Bayes classifier on the training split;
# the timed section covers construction and fit together.
start_train_time = time.perf_counter()
gnb = GaussianNB().fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_gnb = gnb.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Gaussian Naive Bayes:")
print(classification_report(y_test, y_pred_gnb))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

Gaussian Naive Bayes:
              precision    recall  f1-score   support

           0       1.00      0.53      0.69    153679
           1       0.05      1.00      0.09      3608

    accuracy                           0.54    157287
   macro avg       0.52      0.76      0.39    157287
weighted avg       0.98      0.54      0.68    157287

Training time: 0.4370 seconds
Prediction time: 0.0799 seconds


In [None]:
import time
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Fit a multi-layer perceptron on the training split; the timed section covers
# construction and fit together. max_iter caps the number of training iterations
# and can be tuned; random_state is fixed at 42.
start_train_time = time.perf_counter()
mlp = MLPClassifier(max_iter=300, random_state=42).fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_mlp = mlp.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("MLP Classifier:")
print(classification_report(y_test, y_pred_mlp))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

MLP Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.83      0.92      0.87      3608

    accuracy                           0.99    157287
   macro avg       0.91      0.96      0.93    157287
weighted avg       0.99      0.99      0.99    157287

Training time: 427.7701 seconds
Prediction time: 1.1884 seconds


In [None]:
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# Fit AdaBoost on the training split; the timed section covers construction and
# fit together. n_estimators is the number of weak learners; random_state is fixed at 42.
start_train_time = time.perf_counter()
ada_boost = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_ada_boost = ada_boost.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("AdaBoost Classifier:")
print(classification_report(y_test, y_pred_ada_boost))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")

AdaBoost Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153679
           1       0.90      0.93      0.91      3608

    accuracy                           1.00    157287
   macro avg       0.95      0.97      0.96    157287
weighted avg       1.00      1.00      1.00    157287

Training time: 72.8655 seconds
Prediction time: 0.9829 seconds


In [None]:
import time
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit a linear-kernel support vector classifier on the training split; the timed
# section covers construction and fit together. Other kernels ('rbf', 'poly', ...)
# can be selected through the `kernel` argument.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least quadratically
# with the number of samples; the training split printed earlier in this notebook has
# 734002 rows, so this cell may not finish in reasonable time. Consider LinearSVC or
# SGDClassifier; confirm before relying on this benchmark.
start_train_time = time.perf_counter()
svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
end_train_time = time.perf_counter()

# Time the prediction over the test split.
start_predict_time = time.perf_counter()
y_pred_svm = svm.predict(X_test)
end_predict_time = time.perf_counter()

# Per-class metrics on the test split.
print("Support Vector Machine (SVM):")
print(classification_report(y_test, y_pred_svm))

# Elapsed time of the two timed sections.
print(f"Training time: {end_train_time - start_train_time:.4f} seconds")
print(f"Prediction time: {end_predict_time - start_predict_time:.4f} seconds")