In [6]:
# Importation des librairies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, pairwise_distances, homogeneity_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt


# Étape 1: Chargement du dataset

In [7]:
dataset = pd.read_csv("./Fichiers de données/UNSW_NB15_training-set.csv")

In [8]:
dataset

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


# Étape 2: Prétraitement des données

In [9]:
# Nature des données
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [10]:
# Nettoyage des données
dataset.drop_duplicates(inplace=True)
dataset.dropna(inplace=True)

In [11]:
# Normalisation des données numériques
scaler = StandardScaler()
numeric_features = dataset.select_dtypes(include=['float64', 'int64']).columns
dataset[numeric_features] = scaler.fit_transform(dataset[numeric_features])

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  float64
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  float64
 6   dpkts              82332 non-null  float64
 7   sbytes             82332 non-null  float64
 8   dbytes             82332 non-null  float64
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  float64
 11  dttl               82332 non-null  float64
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  float64
 15  dloss              82332 non-null  float64
 16  sinpkt             823

# Etape 3: Sélection des caractéristiques pertinentes

In [13]:
selected_features = ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean']
X = dataset[selected_features]

# Étape 4: Clustering par densité

In [14]:
dbscan_model = DBSCAN(eps=0.3, min_samples=10)

In [15]:
pred_model = dbscan_model.fit_predict(X)

In [17]:
print("Score de silhouette: ", silhouette_score(X, pred_model))

Score de silhouette:  0.11143487701169497


In [18]:
# Calcul du dendrogramme pour visualiser la structure hiérarchique des clusters
linked = linkage(X, method='complete', metric='euclidean')
plt.figure(figsize=(10, 7))
dendrogram(linked,distance_sort='descending')
plt.title('Dendrogramme des clusters')
plt.xlabel('Échantillons')
plt.show()