In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, normalize, StandardScaler
import numpy as np

## Chargement de la donnée

In [2]:
data = pd.read_csv("data/Tuesday-WorkingHours.pcap_ISCX.csv")

In [3]:
data = data.rename(str.strip, axis="columns")
data["Label"].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator'], dtype=object)

In [4]:
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
data = pd.get_dummies(data)

# Remplace les infinis par des NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Supprime ou impute les NaN
data.dropna(inplace=True)  # Option 1 : on les supprime
data.Label.unique()


array([0, 1])

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = data.drop("Label", axis="columns")
y = data['Label']

X_scaled = scaler.fit_transform(X)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


X_train, X_test, y_train, y_test = train_test_split(data.drop("Label", axis="columns"), data["Label"], train_size=0.75)


In [7]:
from sklearn.decomposition import PCA

# -------- PCA --------
pca = PCA(n_components=0.95)  # Garde 95% de la variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Nombre de composantes retenues : {pca.n_components_}")


Nombre de composantes retenues : 4


In [8]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca, y_train)
y_pred_knn = knn.predict(X_test_pca)

print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    107906
           1       0.97      0.98      0.98      3506

    accuracy                           1.00    111412
   macro avg       0.99      0.99      0.99    111412
weighted avg       1.00      1.00      1.00    111412



In [None]:
# -------- SVM linéaire avec PCA --------
import time
from sklearn.svm import SVC


svm_linear_pca = SVC(kernel='linear', C=1)
start_time = time.time()
svm_linear_pca.fit(X_train_pca, y_train)
y_pred_linear_pca = svm_linear_pca.predict(X_test_pca)
linear_time = time.time() - start_time

print("=== SVM linéaire + PCA ===")
print(f"Temps d'exécution : {linear_time:.2f} secondes")
print(classification_report(y_test, y_pred_linear_pca))

# -------- SVM RBF avec PCA --------
svm_rbf_pca = SVC(kernel='rbf', C=1, gamma='scale')
start_time = time.time()
svm_rbf_pca.fit(X_train_pca, y_train)
y_pred_rbf_pca = svm_rbf_pca.predict(X_test_pca)
rbf_time = time.time() - start_time

print("\n=== SVM RBF + PCA ===")
print(f"Temps d'exécution : {rbf_time:.2f} secondes")
print(classification_report(y_test, y_pred_rbf_pca))
