In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the network traffic records from the CSV file (path is relative to the working directory).
csv_path = 'network_data.csv'
df = pd.read_csv(csv_path)

In [3]:
# Encode the two categorical columns as integer codes, then zero-fill whatever is still missing.
# Missing tcp_flags get their own 'None' category; missing protocol values end up as code -1,
# which is what pandas assigns to NaN in a categorical.
# NOTE(review): 0 can also be a genuine value (e.g. in icmp_type), so zero-filled rows are
# indistinguishable from real zeros — confirm this is intended.
df = df.assign(
    protocol=lambda frame: frame['protocol'].astype('category').cat.codes,
    tcp_flags=lambda frame: frame['tcp_flags'].fillna('None').astype('category').cat.codes,
).fillna(0)

In [4]:
# Sanity check after preprocessing: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789 entries, 0 to 1788
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   timestamp       1789 non-null   object 
 1   src_ip          1789 non-null   object 
 2   dst_ip          1789 non-null   object 
 3   src_port        1789 non-null   float64
 4   dst_port        1789 non-null   float64
 5   protocol        1789 non-null   int8   
 6   length          1789 non-null   int64  
 7   icmp_type       1789 non-null   float64
 8   tcp_flags       1789 non-null   int8   
 9   entropy_src_ip  1789 non-null   float64
 10  entropy_dst_ip  1789 non-null   float64
 11  window_tx       1789 non-null   int64  
dtypes: float64(5), int64(2), int8(2), object(3)
memory usage: 143.4+ KB


In [5]:
# Standardize the columns fed to the model; the fitted scaler is kept in `scaler`.
feature_columns = ['length', 'src_port', 'dst_port', 'icmp_type', 'tcp_flags']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_columns])

In [6]:
# Train the Isolation Forest on the standardized features.
# The forest is built from random sub-samples and random splits, so random_state is pinned:
# without it the set of flagged rows can change on every Restart & Run All.
clf = IsolationForest(n_estimators=200, contamination='auto', random_state=42)
clf.fit(scaled_features)

In [7]:
# Label every row with the fitted forest and store the labels in the 'anomaly' column.
predictions = clf.predict(scaled_features)
df = df.assign(anomaly=predictions)

In [8]:
# Keep only the rows the model flagged as anomalies (label -1).
is_outlier = df['anomaly'].eq(-1)
anomalies = df.loc[is_outlier]

In [9]:
# Preview the first five flagged rows.
anomalies.head(5)

Unnamed: 0,timestamp,src_ip,dst_ip,src_port,dst_port,protocol,length,icmp_type,tcp_flags,entropy_src_ip,entropy_dst_ip,window_tx,anomaly
252,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1514,0.0,0,3.037211,2.332998,41859,-1
253,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1514,0.0,3,3.051668,2.327602,43373,-1
254,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1230,0.0,3,3.064073,2.322201,44603,-1
255,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1173,0.0,3,3.07485,2.316798,45776,-1
284,2024-03-18 22:55:48,192.168.1.18,52.222.149.65,62007.0,443.0,2,1292,0.0,2,2.991764,2.393403,51148,-1


In [10]:
# Wider preview: up to the first 20 flagged rows.
anomalies.head(20)

Unnamed: 0,timestamp,src_ip,dst_ip,src_port,dst_port,protocol,length,icmp_type,tcp_flags,entropy_src_ip,entropy_dst_ip,window_tx,anomaly
252,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1514,0.0,0,3.037211,2.332998,41859,-1
253,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1514,0.0,3,3.051668,2.327602,43373,-1
254,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1230,0.0,3,3.064073,2.322201,44603,-1
255,2024-03-18 22:55:36,20.120.65.166,192.168.1.18,443.0,55347.0,1,1173,0.0,3,3.07485,2.316798,45776,-1
284,2024-03-18 22:55:48,192.168.1.18,52.222.149.65,62007.0,443.0,2,1292,0.0,2,2.991764,2.393403,51148,-1
285,2024-03-18 22:55:48,52.222.149.65,192.168.1.18,443.0,62007.0,2,1242,0.0,2,3.020177,2.388428,52390,-1
286,2024-03-18 22:55:48,52.222.149.65,192.168.1.18,443.0,62007.0,2,1242,0.0,2,3.03927,2.383455,53632,-1
287,2024-03-18 22:55:48,52.222.149.65,192.168.1.18,443.0,62007.0,2,1242,0.0,2,3.054805,2.378483,54874,-1
288,2024-03-18 22:55:48,52.222.149.65,192.168.1.18,443.0,62007.0,2,1242,0.0,2,3.068021,2.373515,56116,-1
297,2024-03-18 22:55:48,52.222.149.65,192.168.1.18,443.0,62007.0,2,1494,0.0,2,3.04254,2.469049,59734,-1


In [11]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns for the flagged rows.
anomalies.describe()

Unnamed: 0,src_port,dst_port,protocol,length,icmp_type,tcp_flags,entropy_src_ip,entropy_dst_ip,window_tx,anomaly
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,3863.222222,54153.333333,1.333333,1385.166667,0.0,1.666667,2.997378,2.380792,60303.333333,-1.0
std,14510.773959,13742.562553,0.485071,138.489435,0.0,1.283378,0.201664,0.066501,24214.878419,0.0
min,443.0,443.0,1.0,1173.0,0.0,0.0,2.195092,2.229111,8264.0,-1.0
25%,443.0,55348.0,1.0,1242.0,0.0,0.0,3.018625,2.33857,47119.0,-1.0
50%,443.0,55353.0,1.0,1494.0,0.0,2.0,3.047104,2.375999,55495.0,-1.0
75%,443.0,60361.5,2.0,1509.0,0.0,3.0,3.064069,2.442916,65509.5,-1.0
max,62007.0,62007.0,2.0,1514.0,0.0,3.0,3.07485,2.474393,105743.0,-1.0


In [12]:
# (rows, columns) of the full dataset, including the added 'anomaly' column.
df.shape

(1789, 13)

In [13]:
# (rows, columns) of the flagged subset; compare with df.shape to see the share of anomalies.
anomalies.shape

(18, 13)