# Analyse et prétraitement des données

In [1]:
#pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [4]:
# Load the dataset.
# The Windows path is written as a raw string: sequences such as "\L", "\S" or
# "\M" are invalid escapes in a normal string literal (SyntaxWarning on recent
# Python versions). The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r"C:\LP-CIS\S6\Machine Learning & CyberSec\Devoir\CICIDS2017_Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,389,113095465,48,24,9668,10012,403,0,201.416667,203.548293,...,32,203985.5,575837.3,1629110,379,13800000.0,4277541.0,16500000,6737603,BENIGN
1,389,113473706,68,40,11364,12718,403,0,167.117647,171.919413,...,32,178326.875,503426.9,1424245,325,13800000.0,4229413.0,16500000,6945512,BENIGN
2,0,119945515,150,0,0,0,0,0,0.0,0.0,...,0,6909777.333,11700000.0,20400000,6,24400000.0,24300000.0,60100000,5702188,BENIGN
3,443,60261928,9,7,2330,4221,1093,0,258.888889,409.702161,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,53,269,2,2,102,322,51,51,51.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [7]:
# Optimising the dataset's size by downcasting 64-bit numeric columns.
import numpy as np

# Work on a copy so the casts below never write into a frame shared under
# another name.
df = df.copy()

for column in df.columns:
    col_dtype = df[column].dtype

    if col_dtype == np.int64:
        minVal = df[column].min()
        maxVal = df[column].max()
        # The upper thresholds (120 / 32767) are the original heuristic. The
        # lower bounds are checked too, so negative values cannot wrap around,
        # and int32 is only used when the values really fit in 32 bits.
        if maxVal < 120 and minVal >= np.iinfo(np.int8).min:
            df[column] = df[column].astype(np.int8)
        elif maxVal < 32767 and minVal >= np.iinfo(np.int16).min:
            df[column] = df[column].astype(np.int16)
        elif maxVal <= np.iinfo(np.int32).max and minVal >= np.iinfo(np.int32).min:
            df[column] = df[column].astype(np.int32)
        # Otherwise the column needs 64 bits and is left as int64.

    elif col_dtype == np.float64:
        minVal = df[column].min()
        maxVal = df[column].max()
        # Smallest strictly positive value as a scalar (NaN when the column has
        # no positive value). Comparing a whole Series inside `if` would raise
        # "truth value of a Series is ambiguous".
        smallestPositive = df.loc[df[column] > 0, column].min()
        # float16 is only chosen for small-magnitude columns whose smallest
        # positive value is above 0.01; every other case falls back to float32
        # (a NaN comparison is False, so it also ends up in float32).
        if maxVal < 120 and minVal > -120 and smallestPositive > 0.01:
            df[column] = df[column].astype(np.float16)
        else:
            df[column] = df[column].astype(np.float32)

In [8]:
# Inspect the column dtypes after the size optimisation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170366 entries, 0 to 170365
Data columns (total 79 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Destination Port             170366 non-null  int32  
 1   Flow Duration                170366 non-null  int32  
 2   Total Fwd Packets            170366 non-null  int32  
 3   Total Backward Packets       170366 non-null  int32  
 4   Total Length of Fwd Packets  170366 non-null  int32  
 5   Total Length of Bwd Packets  170366 non-null  int32  
 6   Fwd Packet Length Max        170366 non-null  int16  
 7   Fwd Packet Length Min        170366 non-null  int16  
 8   Fwd Packet Length Mean       170366 non-null  float32
 9   Fwd Packet Length Std        170366 non-null  float32
 10  Bwd Packet Length Max        170366 non-null  int16  
 11  Bwd Packet Length Min        170366 non-null  int16  
 12  Bwd Packet Length Mean       170366 non-null  float32
 13 

In [9]:
# Feature selection: keep the 40 flow features listed below plus the target
# column 'Label', in this order; every other column is discarded.
df = df.loc[:, [
    # ports, duration and packet counts / lengths
    'Destination Port', 'Flow Duration',
    'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Bwd Packet Length Mean', 'Bwd Packet Length Std',
    # flow rates and inter-arrival times (IAT)
    'Flow Bytes/s', 'Flow Packets/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    # flag columns
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count',
    'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
    # target
    'Label',
]]


In [10]:
# After feature selection 41 columns remain: 40 features plus the 'Label' target.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170366 entries, 0 to 170365
Data columns (total 41 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Destination Port             170366 non-null  int32  
 1   Flow Duration                170366 non-null  int32  
 2   Total Fwd Packets            170366 non-null  int32  
 3   Total Backward Packets       170366 non-null  int32  
 4   Total Length of Fwd Packets  170366 non-null  int32  
 5   Total Length of Bwd Packets  170366 non-null  int32  
 6   Fwd Packet Length Max        170366 non-null  int16  
 7   Fwd Packet Length Min        170366 non-null  int16  
 8   Fwd Packet Length Mean       170366 non-null  float32
 9   Fwd Packet Length Std        170366 non-null  float32
 10  Bwd Packet Length Max        170366 non-null  int16  
 11  Bwd Packet Length Min        170366 non-null  int16  
 12  Bwd Packet Length Mean       170366 non-null  float32
 13 

In [18]:
# Count how many rows carry each value of the 'Label' column.
df['Label'].value_counts()

BENIGN                        168186
Web Attack � Brute Force        1507
Web Attack � XSS                 652
Web Attack � Sql Injection        21
Name: Label, dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

# Convert the categorical 'Label' column into integer class ids.
# The fitted encoder stays available as `label` for later use.
label = LabelEncoder()
label.fit(df['Label'])
df['Label'] = label.transform(df['Label'])

# Show the distinct encoded values now present in the column.
df['Label'].unique()

array([0, 1, 3, 2])

In [20]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Destination Port                0
Flow Duration                   0
Total Fwd Packets               0
Total Backward Packets          0
Total Length of Fwd Packets     0
Total Length of Bwd Packets     0
Fwd Packet Length Max           0
Fwd Packet Length Min           0
Fwd Packet Length Mean          0
Fwd Packet Length Std           0
Bwd Packet Length Max           0
Bwd Packet Length Min           0
Bwd Packet Length Mean          0
Bwd Packet Length Std           0
Flow Bytes/s                   20
Flow Packets/s                  0
Flow IAT Mean                   0
Flow IAT Std                    0
Flow IAT Max                    0
Flow IAT Min                    0
Fwd IAT Total                   0
Fwd IAT Mean                    0
Fwd IAT Std                     0
Fwd IAT Max                     0
Fwd IAT Min                     0
Bwd IAT Total                   0
Bwd IAT Mean                    0
Bwd IAT Std                     0
Bwd IAT Max                     0
Bwd IAT Min   

In [21]:
# Handle missing data: impute NaNs in 'Flow Bytes/s' with the column mean.
# The mean is computed over finite values only; if the column holds +/-inf
# the plain mean is itself inf (or NaN), which would turn the imputed rows
# into non-finite ones.
flow_bytes = df['Flow Bytes/s']
finite_mean = flow_bytes[np.isfinite(flow_bytes)].mean()

# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated and does not update `df`
# when pandas Copy-on-Write is enabled.
df['Flow Bytes/s'] = flow_bytes.fillna(finite_mean)

In [22]:
# Treat +inf / -inf as missing values, then discard every row that contains
# at least one missing value.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan).dropna()

In [23]:
# Re-check the per-column count of missing values after the cleaning steps.
df.isna().sum()

Destination Port               0
Flow Duration                  0
Total Fwd Packets              0
Total Backward Packets         0
Total Length of Fwd Packets    0
Total Length of Bwd Packets    0
Fwd Packet Length Max          0
Fwd Packet Length Min          0
Fwd Packet Length Mean         0
Fwd Packet Length Std          0
Bwd Packet Length Max          0
Bwd Packet Length Min          0
Bwd Packet Length Mean         0
Bwd Packet Length Std          0
Flow Bytes/s                   0
Flow Packets/s                 0
Flow IAT Mean                  0
Flow IAT Std                   0
Flow IAT Max                   0
Flow IAT Min                   0
Fwd IAT Total                  0
Fwd IAT Mean                   0
Fwd IAT Std                    0
Fwd IAT Max                    0
Fwd IAT Min                    0
Bwd IAT Total                  0
Bwd IAT Mean                   0
Bwd IAT Std                    0
Bwd IAT Max                    0
Bwd IAT Min                    0
Fwd PSH Fl

In [24]:
# Boolean mask: True for every row that repeats an earlier row.
dup_rows = df.duplicated()
print(dup_rows)

# Keep only the rows that are not flagged (the first occurrence of each row).
df = df[~dup_rows]


0         False
1         False
2         False
3         False
4         False
          ...  
170361    False
170362    False
170363    False
170364     True
170365    False
Length: 170231, dtype: bool


In [25]:
# Flag columns whose values repeat an earlier column: transposing turns the
# columns into rows so the row-wise duplicated() check can be applied.
duplicated_cols = df.transpose().duplicated()

# Translate the boolean mask into the matching column labels.
duplicated_col_names = list(df.columns[duplicated_cols])

print("Duplicated column names:", duplicated_col_names)

Duplicated column names: ['Fwd URG Flags', 'Bwd URG Flags', 'SYN Flag Count']


In [31]:
# Remove the two URG-flag columns reported as duplicates of another column.
# NOTE(review): the recorded output of the duplicate-column check also lists
# 'SYN Flag Count', which is not dropped here — confirm keeping it is intended.
df = df.drop(columns=['Fwd URG Flags', 'Bwd URG Flags'])

        Destination Port  Flow Duration  Total Fwd Packets  \
0                    389      113095465                 48   
1                    389      113473706                 68   
2                      0      119945515                150   
3                    443       60261928                  9   
4                     53            269                  2   
...                  ...            ...                ...   
170360               443            181                  3   
170361             55641             49                  1   
170362             45337            217                  2   
170363                22        1387547                 41   
170365             60146             50                  1   

        Total Backward Packets  Total Length of Fwd Packets  \
0                           24                         9668   
1                           40                        11364   
2                            0                            0   
3  

In [33]:
# Dimensions (rows, columns) of the frame after preprocessing.
df.shape

(159742, 39)

In [40]:
# End of the preprocessing step: persist the cleaned frame to disk.
# The row index is not written, so the CSV holds only the data columns.
df.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)