In [1]:
from tqdm import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


SEED = 42
plt.style.use("ggplot")
pd.options.display.max_columns = None

In [2]:
# Chargeaons les données pré-traitées
train_df = pd.read_csv("train_pre.csv")
test_df = pd.read_csv("test_pre.csv")

In [3]:
train_df.head()

Unnamed: 0,id,dur,proto,service,sbytes,dbytes,rate,sload,dload,sinpkt,dinpkt,sjit,djit,stcpb,dtcpb,tcprtt,synack,ackdat,smean,dmean,response_body_len,ct_srv_src,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_src_ltm,ct_srv_dst,attack_cat
0,1,0.002025,90,12,1.8e-05,1.2e-05,7.4e-05,2.364553e-06,0.000379,0.000288,0.000148,2.1e-05,4.1e-05,0.144768,0.512828,0.0,0.0,0.0,0.010163,0.029492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1,2,0.010832,90,12,5.4e-05,0.002867,7.8e-05,1.401989e-06,0.022458,0.000592,0.000272,4.2e-05,0.004796,0.330128,0.716524,0.0,0.0,0.0,0.01626,0.758573,0.0,0.677419,0.0,0.0,0.0,0.015625,0.0,0.081967,Normal
2,3,0.027052,90,12,2.6e-05,0.0009,1.4e-05,2.625704e-07,0.002717,0.002748,0.001811,0.011763,0.039466,0.492706,0.689918,0.044423,0.029261,0.033164,0.012195,0.565158,0.0,0.096774,0.02,0.0,0.0,0.03125,0.016949,0.081967,Normal
3,4,0.028027,90,2,4.6e-05,5.3e-05,1.4e-05,4.576117e-07,0.00015,0.001812,0.001591,0.000177,0.017249,0.257772,0.243882,0.0,0.0,0.0,0.01626,0.043896,0.0,0.0,0.02,0.0,0.0,0.03125,0.016949,0.0,Normal
4,5,0.007491,90,12,3.9e-05,1.8e-05,3.3e-05,1.429776e-06,0.000178,0.000566,0.001334,0.001654,0.0004,0.567209,0.460351,0.050967,0.033874,0.037632,0.016938,0.030864,0.0,0.677419,0.02,0.02,0.0,0.609375,0.016949,0.622951,Normal


In [4]:
# Nous avons fait une sélection de caractériqtiques.Et 27 caractéristiques ont été retenues
print(f"Train Shape: {train_df.shape}\nTest Shape: {test_df.shape}") 

Train Shape: (139125, 29)
Test Shape: (82176, 29)


## 1. Split Data & Cross Validation

In [5]:
# Variables d'entraînement et variable réponse
main_cols = train_df.columns.difference(["id", "attack_cat"]).values
X = train_df[main_cols]
y = train_df["attack_cat"]

# Encodons la variable réponse pour pouvoir entraîner XGBoost
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Définissons notre validation croisée
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
main_cols, len(main_cols)

(array(['ackdat', 'ct_dst_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
        'ct_src_dport_ltm', 'ct_src_ltm', 'ct_srv_dst', 'ct_srv_src',
        'dbytes', 'dinpkt', 'djit', 'dload', 'dmean', 'dtcpb', 'dur',
        'proto', 'rate', 'response_body_len', 'sbytes', 'service',
        'sinpkt', 'sjit', 'sload', 'smean', 'stcpb', 'synack', 'tcprtt'],
       dtype=object),
 27)

## 2. Modeling

In [6]:
# Liste des modèles à tester pour sélectionner celui qui aura le meilleur score
# et la meilleur capacité de généralisation
models = []
models.append(('LR', LogisticRegression(max_iter=100, class_weight="balanced")))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('XGB', XGBClassifier(n_estimators=100, random_state=SEED)))
models.append(('LGB', LGBMClassifier(n_estimators=100, random_state=SEED)))
models.append(('CAB', CatBoostClassifier(n_estimators=100, verbose=0, random_state=SEED, auto_class_weights="Balanced")))

In [7]:
for name, model in models:
    results = cross_val_score(model, X, y_enc, scoring="f1_weighted", cv=skf)
    print(f"{name} -- F1-Weighted: {np.mean(results):.4f}({np.std(results):.4f})")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR -- F1-Weighted: 0.2622(0.0100)
LDA -- F1-Weighted: 0.6264(0.0022)




QDA -- F1-Weighted: 0.5035(0.0017)
XGB -- F1-Weighted: 0.7908(0.0019)
LGB -- F1-Weighted: 0.7668(0.0051)
CAB -- F1-Weighted: 0.7353(0.0026)


Notre choix se tournera vers XGBoost