This file contains code for sampling cleaned CIC-IDS-2017 dataset to be used for experiments.

In [1]:
import sys
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.utils import shuffle, resample
from sklearn.utils.random import sample_without_replacement

script_dir = os.path.dirname(os.path.abspath("experiments"))
sys.path.append(os.path.dirname(script_dir))

from experiments.predictions import labels

import warnings
warnings.filterwarnings("ignore")

Model: "Classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Data (InputLayer)           [(None, 31)]              0         
                                                                 
 Classifier (Sequential)     (None, 15)                71951     
                                                                 
Total params: 71,951
Trainable params: 71,951
Non-trainable params: 0
_________________________________________________________________
None


IndexError: invalid index to scalar variable.

In [2]:
clean = pd.read_csv("../data/clean1.csv")

In [3]:
clean.label.value_counts()

BENIGN                      2271320
DoS_Hulk                     230124
PortScan                     158804
DDoS                         128025
DoS_GoldenEye                 10293
FTPPatator                     7935
SSHPatator                     5897
DoS_slowloris                  5796
DoS_Slowhttptest               5499
Bot                            1956
Web_Attack_Brute_Force         1507
Web_Attack_XSS                  652
Infiltration                     36
Web_Attack_Sql_Injection         21
Heartbleed                       11
Name: label, dtype: int64

`preproc_data` function implements data preprocessing steps from `preprocessing.ipynb` file - but as a single function. 

In [9]:
# preproc_data function implements data preprocessing steps from preprocessing.ipynb file - but as a single function.
def preproc_data(dataset, train_sample: float, pca_dim=31):
    
    # Label encode
    le = LabelEncoder()
    dataset['label'] = le.fit_transform(dataset['label'])
    
    print("Original Label -> Encoded Label:")
    for label, encoded_label in zip(le.classes_, le.transform(le.classes_)):
        print(f"{label} -> {encoded_label}")
    
    # Train test split
    x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1], 
                                                        dataset['label'], 
                                                        test_size=1-train_sample, 
                                                        random_state=0)
    # Standard scaling
    ss = StandardScaler().fit(x_train)
    joblib.dump(ss, "ss.pkl")

    x_train = ss.transform(x_train)
    x_test = ss.transform(x_test)
    
    # PCA
    pca = PCA(n_components=31).fit(x_train)
    joblib.dump(pca, "pca.pkl")
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    
    # Normalization
    norm = Normalizer().fit(x_train)
    joblib.dump(norm, "norm.pkl")
    x_train = norm.transform(x_train)
    x_test = norm.transform(x_test)
    
    # Reshaping 
    y_train = y_train.values.reshape(-1,1)
    y_test = y_test.values.reshape(-1,1)
    
    return x_train, x_test, y_train, y_test

In [4]:
low_member_thresh = 2000

low_member_labels = list(clean['label'].value_counts()[clean['label'].value_counts() < low_member_thresh].index)
low_member_labels

['Bot',
 'Web_Attack_Brute_Force',
 'Web_Attack_XSS',
 'Infiltration',
 'Web_Attack_Sql_Injection',
 'Heartbleed']

### 10%

This dataset sample contains only 10% of each class members except those which contain <2000 intances. They are taken in their entirety.

In [5]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]

In [6]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.1, random_state=1)

In [7]:
data = data.append(low_member_data, ignore_index=True)

In [8]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [9]:
np.save("data/preserve10/x_train.npy", x_train)
np.save("data/preserve10/y_train.npy", y_train)
np.save("data/preserve10/x_test.npy", x_test)
np.save("data/preserve10/y_test.npy", y_test)

### 25%

In [12]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]

In [13]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.25, random_state=1)

In [14]:
data = data.append(low_member_data, ignore_index=True)

In [15]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [16]:
np.save("data/preserve25/x_train.npy", x_train)
np.save("data/preserve25/y_train.npy", y_train)
np.save("data/preserve25/x_test.npy", x_test)
np.save("data/preserve25/y_test.npy", y_test)

### 50%

In [8]:
low_member_data = clean[clean.label.str.contains("|".join(low_member_labels))]

In [9]:
data = clean[~clean.label.str.contains("|".join(low_member_labels))].sample(frac=.5, random_state=1)

In [10]:
data = data.append(low_member_data, ignore_index=True)

In [11]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)

In [13]:
np.save("../data/preserve50/x_train.npy", x_train)
np.save("../data/preserve50/y_train.npy", y_train)
np.save("../data/preserve50/x_test.npy", x_test)
np.save("../data/preserve50/y_test.npy", y_test)

### 100%

In [24]:
x_train, x_test, y_train, y_test = preproc_data(clean, train_sample=0.75, pca_dim=31)

In [6]:
np.save("data/preserve100/x_train.npy", x_train)
np.save("data/preserve100/y_train.npy", y_train)
np.save("data/preserve100/x_test.npy", x_test)
np.save("data/preserve100/y_test.npy", y_test)

# PCA 1

In [14]:
def preproc_data_1(dataset, train_sample: float, pca_dim=1):
    
    # Label encode
    le = LabelEncoder()
    dataset['label'] = le.fit_transform(dataset['label'])
    
    # Train test split
    x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:,:-1], 
                                                        dataset['label'], 
                                                        test_size=1-train_sample, 
                                                        random_state=0)
    # Standard scaling
    ss = StandardScaler().fit(x_train)

    x_train = ss.transform(x_train)
    x_test = ss.transform(x_test)
    
    # PCA
    pca = PCA(n_components=1).fit(x_train)

    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    
    # Normalization
    norm = Normalizer().fit(x_train)

    x_train = norm.transform(x_train)
    x_test = norm.transform(x_test)
    
    # Reshaping 
    y_train = y_train.values.reshape(-1,1)
    y_test = y_test.values.reshape(-1,1)
    
    return x_train, x_test, y_train, y_test

In [15]:
x_train, x_test, y_train, y_test = preproc_data_1(clean, train_sample=0.75, pca_dim=1)


In [16]:
np.save("../data/preserve100-pca1/x_train.npy", x_train)
np.save("../data/preserve100-pca1/y_train.npy", y_train)
np.save("../data/preserve100-pca1/x_test.npy", x_test)
np.save("../data/preserve100-pca1/y_test.npy", y_test)

In [17]:
x_train = np.load("../data/preserve100-pca1/x_train.npy")
x_train.shape[1]

1

# 剔除 PortScan DDos

In [13]:
data = clean.copy()
print(data.label.value_counts())
data = data[~data['label'].isin(['PortScan'])]
data = data[~data['label'].isin(['DDoS'])]
print("=========================")
print(data.label.value_counts())
data.head()

BENIGN                      2271320
DoS_Hulk                     230124
PortScan                     158804
DDoS                         128025
DoS_GoldenEye                 10293
FTPPatator                     7935
SSHPatator                     5897
DoS_slowloris                  5796
DoS_Slowhttptest               5499
Bot                            1956
Web_Attack_Brute_Force         1507
Web_Attack_XSS                  652
Infiltration                     36
Web_Attack_Sql_Injection         21
Heartbleed                       11
Name: label, dtype: int64
BENIGN                      2271320
DoS_Hulk                     230124
DoS_GoldenEye                 10293
FTPPatator                     7935
SSHPatator                     5897
DoS_slowloris                  5796
DoS_Slowhttptest               5499
Bot                            1956
Web_Attack_Brute_Force         1507
Web_Attack_XSS                  652
Infiltration                     36
Web_Attack_Sql_Injection         21
He

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [11]:
dataset = clean.copy()
# 创建 LabelEncoder 对象并进行编码
le = LabelEncoder()
dataset['encoded_label'] = le.fit_transform(dataset['label'])

# 打印每一个标签对应的编码值
print("Original Label -> Encoded Label:")
for label, encoded_label in zip(le.classes_, le.transform(le.classes_)):
    print(f"{label} -> {encoded_label}")

Original Label -> Encoded Label:
BENIGN -> 0
Bot -> 1
DDoS -> 2
DoS_GoldenEye -> 3
DoS_Hulk -> 4
DoS_Slowhttptest -> 5
DoS_slowloris -> 6
FTPPatator -> 7
Heartbleed -> 8
Infiltration -> 9
PortScan -> 10
SSHPatator -> 11
Web_Attack_Brute_Force -> 12
Web_Attack_Sql_Injection -> 13
Web_Attack_XSS -> 14


In [14]:
dataset = data.copy()
# 创建 LabelEncoder 对象并进行编码
le = LabelEncoder()
dataset['encoded_label'] = le.fit_transform(dataset['label'])

# 打印每一个标签对应的编码值
print("Original Label -> Encoded Label:")
for label, encoded_label in zip(le.classes_, le.transform(le.classes_)):
    print(f"{label} -> {encoded_label}")

Original Label -> Encoded Label:
BENIGN -> 0
Bot -> 1
DoS_GoldenEye -> 2
DoS_Hulk -> 3
DoS_Slowhttptest -> 4
DoS_slowloris -> 5
FTPPatator -> 6
Heartbleed -> 7
Infiltration -> 8
SSHPatator -> 9
Web_Attack_Brute_Force -> 10
Web_Attack_Sql_Injection -> 11
Web_Attack_XSS -> 12


In [10]:
x_train, x_test, y_train, y_test = preproc_data(data, train_sample=0.75, pca_dim=31)


Original Label -> Encoded Label:
0 -> 0
1 -> 1
2 -> 2
3 -> 3
4 -> 4
5 -> 5
6 -> 6
7 -> 7
8 -> 8
9 -> 9
10 -> 10
11 -> 11
12 -> 12


KeyboardInterrupt: 

In [8]:
np.save("../data/data-test-unkwonattack/x_train_100.npy", x_train)
np.save("../data/data-test-unkwonattack/y_train_100.npy", y_train)
np.save("../data/data-test-unkwonattack/x_test_100.npy", x_test)
np.save("../data/data-test-unkwonattack/y_test_100.npy", y_test)