# Feature Engineering and Data Rebalancing

>[Feature Engineering and Data Rebalancing](#scrollTo=zftwMhm78Nd-)

>>[import](#scrollTo=NISkXWSA8QhI)

>>[google drive](#scrollTo=t7Zwtq0Y_ixE)

>>[Read the CICIDS2017 dataset](#scrollTo=RMWmGOak8qDH)

>>[calculate the sum of importance scores](#scrollTo=GwdTm08-Dmez)

>>[select the important features from top to bottom until the accumulated importance reaches 90%](#scrollTo=iXnHvYbDD5sM)

>>[FCBF_module](#scrollTo=xrudBuAZAz0s)

>>[FCBF](#scrollTo=vto7UgJ8BiVU)

>>[Train-test split after feature selection](#scrollTo=SQpABuq9DPNK)

>>[Solve class-imbalance by SMOTE](#scrollTo=Cr6RhtFkDa8B)

>>[Solve class-imbalance by NearMiss](#scrollTo=5rcvZqWmc5H-)



## import

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import math
import random
import matplotlib.pyplot as plt
import shutil
from sklearn.preprocessing import QuantileTransformer
from PIL import Image
import warnings
import csv
from scipy.stats import entropy
from google.colab import drive
warnings.filterwarnings("ignore")
print("Done")

Done


## google drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files stored there can be read and written.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the input and output CSV files (note the trailing slash).
PROJECT_PATH= '/content/drive/MyDrive/project/data/'
# Dataset tag; no other cell visible in this notebook references it.
DATA='CICIDS2017'

## Read the CICIDS2017 dataset

In [None]:
# Load the full CICIDS2017 table; later cells rely on its 'Label' column holding the class of each row.
df =pd.read_csv(PROJECT_PATH+'data_CICIDS2017.csv')

In [None]:
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Normal
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Normal
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Normal
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Normal
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Normal


In [None]:
df.isnull().values.any()

False

In [None]:
# Hard-coded per-feature importance scores (78 values). They are paired positionally
# with the columns of X further down, so their order must match the column order.
# The code that produced them is not part of this notebook — presumably an earlier
# feature-ranking step; TODO confirm provenance.
importances=np.array([5.00249856e-01, 3.77473419e-01, 2.60627616e-01, 3.07800510e-01,
       4.45050149e-01, 4.97066895e-01, 3.97253673e-01, 1.95457386e-01,
       3.55984552e-01, 2.78330072e-01, 4.55748909e-01, 2.73917675e-01,
       4.85041922e-01, 2.83741954e-01, 4.01468183e-01, 3.61530681e-01,
       3.49958080e-01, 2.68124366e-01, 3.89918149e-01, 1.79293244e-01,
       3.49240175e-01, 3.40737675e-01, 2.42704464e-01, 3.67079813e-01,
       1.94196920e-01, 2.42662170e-01, 2.29430530e-01, 1.65264746e-01,
       2.58972019e-01, 1.83175675e-01, 1.06097882e-02, 2.58120046e-04,
       3.43187687e-04, 0.00000000e+00, 3.49319022e-01, 3.72896752e-01,
       3.62111066e-01, 3.82239509e-01, 1.95649844e-01, 4.15240017e-01,
       5.22351890e-01, 5.49210329e-01, 5.49360262e-01, 2.72183587e-02,
       1.04229346e-02, 1.00074148e-04, 1.14998204e-01, 9.13239804e-02,
       2.77385000e-02, 0.00000000e+00, 1.45660024e-04, 1.90953395e-01,
       5.54254667e-01, 3.55904374e-01, 4.84917658e-01, 3.49502525e-01,
       0.00000000e+00, 0.00000000e+00, 2.60631721e-04, 0.00000000e+00,
       2.62401282e-04, 0.00000000e+00, 2.60750926e-01, 4.45495713e-01,
       3.07550330e-01, 4.96904354e-01, 4.52091660e-01, 4.40042151e-01,
       1.67718030e-01, 1.85228496e-01, 1.82588630e-01, 2.17847077e-02,
       1.80285462e-01, 1.81121986e-01, 1.58744121e-01, 2.97772536e-02,
       1.65679995e-01, 1.60039336e-01])

In [None]:
importances.shape

(78,)

In [None]:
# Separate the class column from the predictor columns.
label = df['Label']
X = df.drop(columns=['Label'])

In [None]:
label.shape


(2298395,)

## calculate the sum of importance scores

In [None]:
# Predictor column names; paired positionally with `importances` in the ranking cells below.
features=X.columns

In [None]:
features

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [None]:
# Pair every importance (rounded to 4 decimals) with its column name and rank the
# pairs from most to least important; equal scores fall back to the column name,
# also in descending order.
f_list = sorted(
    ((round(score, 4), name) for score, name in zip(importances, features)),
    reverse=True,
)
# Total of the rounded scores; the next cell divides by it to normalise the importances.
Sum = sum(score for score, _ in f_list)
fs = [name for _, name in f_list]

## select the important features from top to bottom until the accumulated importance reaches 90%

In [None]:
# Rank the normalised importances (each rounded to 4 decimals) and take features
# from the top until their accumulated share reaches 90 %.
f_list2 = sorted(
    ((round(share, 4), name) for share, name in zip(importances / Sum, features)),
    reverse=True,
)
Sum2 = 0
fs = []
for share, name in f_list2:
    Sum2 += share
    fs.append(name)
    if Sum2 >= 0.9:
        break
# `fs` now lists the selected column names, most important first
# (49 columns for this dataset, per the sample shape printed further down).

In [None]:
fs

[' Average Packet Size',
 ' Packet Length Variance',
 ' Packet Length Std',
 ' Packet Length Mean',
 ' Destination Port',
 ' Total Length of Bwd Packets',
 ' Subflow Bwd Bytes',
 ' Bwd Packet Length Mean',
 ' Avg Bwd Segment Size',
 'Bwd Packet Length Max',
 'Init_Win_bytes_forward',
 'Total Length of Fwd Packets',
 ' Subflow Fwd Bytes',
 ' Init_Win_bytes_backward',
 ' Max Packet Length',
 'Flow Bytes/s',
 ' Fwd Packet Length Max',
 ' Flow IAT Max',
 ' Bwd Packets/s',
 ' Flow Duration',
 ' Bwd Header Length',
 ' Fwd IAT Max',
 'Fwd Packets/s',
 ' Flow Packets/s',
 ' Fwd Packet Length Mean',
 ' Avg Fwd Segment Size',
 'Fwd IAT Total',
 ' Fwd Header Length.1',
 ' Fwd Header Length',
 ' Flow IAT Mean',
 ' Fwd IAT Mean',
 ' Total Backward Packets',
 ' Subflow Bwd Packets',
 ' Bwd Packet Length Std',
 ' Fwd Packet Length Std',
 ' Bwd Packet Length Min',
 ' Flow IAT Std',
 'Subflow Fwd Packets',
 ' Total Fwd Packets',
 ' Bwd IAT Max',
 'Bwd IAT Total',
 ' Fwd IAT Std',
 ' Bwd IAT Mean',
 ' M

## FCBF_module

In [None]:
# -*- coding: utf-8 -*-

import numpy as np


def count_vals(x):
    """Return the number of occurrences of each distinct value in ``x``.

    The counts follow the order of the sorted distinct values, exactly as
    ``np.unique`` reports them.
    """
    return np.unique(x, return_counts=True)[1]


def entropy(x):
    """Shannon entropy, in bits, of the empirical value distribution of ``x``.

    ``x`` must expose ``shape[0]`` (its number of samples). Note that this
    definition shadows ``scipy.stats.entropy`` imported in the notebook's
    first cell.
    """
    counts = count_vals(x)
    # Relative frequency of every distinct value.
    probs = counts / float(x.shape[0])
    return -1 * np.sum(probs * np.log2(probs))


def symmetricalUncertain(x, y):
    """Symmetrical uncertainty between ``x`` and ``y``.

    Computes ``SU = 2 * (H(x) - H(x | y)) / (H(x) + H(y))`` from the
    empirical distributions of the two sample-aligned vectors, where
    ``H(x | y)`` is the entropy of ``x`` within each group of equal ``y``
    values, weighted by the frequency of that ``y`` value.

    When both entropies are zero the division is 0/0 and the result is NaN;
    callers in this module handle that case themselves.
    """
    n_samples = float(y.shape[0])
    groups = np.unique(y)
    h_x = entropy(x)
    h_y = entropy(y)

    # Entropy of x restricted to the samples of each distinct y value.
    h_within = np.array([entropy(x[y == value]) for value in groups],
                        dtype='float64')
    h_within[np.isnan(h_within)] = 0

    # Weight each group's entropy by the relative frequency of its y value.
    p_y = count_vals(y).astype(dtype='float64') / n_samples
    h_x_given_y = np.sum(p_y[p_y > 0] * h_within)

    info_gain = h_x - h_x_given_y
    return 2 * info_gain / (h_x + h_y)


def suGroup(x, n):
    """Mean symmetrical uncertainty of each feature with the others in its group.

    Parameters
    ---------------
        x = 1-D vector holding ``n`` feature columns laid end to end
            (its length must be a multiple of ``n``)
        n = Number of features in the group

    Returns
    ---------------
        Vector of length ``n``; entry ``j`` is the average SU between
        feature ``j`` and the ``n - 1`` other features of the group.
    """
    m = x.shape[0]
    # Floor division: ``m / n`` is a float on Python 3 and np.reshape rejects it.
    x = np.reshape(x, (n, m // n)).T
    m = x.shape[1]
    if m < 2:
        # A single feature has no peers to be redundant with; returning zeros
        # also avoids dividing by ``m - 1 == 0`` below.
        return np.zeros(shape=m)

    SU_matrix = np.zeros(shape=(m, m))
    for j in range(m - 1):
        # SU of feature j against every later feature; the matrix is symmetric,
        # so the same values fill row j and column j.
        temp = np.apply_along_axis(symmetricalUncertain, 0, x[:, j + 1:], x[:, j])
        SU_matrix[j, j + 1:] = temp
        SU_matrix[j + 1:, j] = temp

    return 1 / float(m - 1) * np.sum(SU_matrix, axis=1)


def isprime(a):
    """Return True when the integer ``a`` is a prime number.

    Values below 2 (0, 1 and negatives) are not prime. Trial division only
    needs to go up to the square root of ``a``.
    """
    if a < 2:
        return False
    return all(a % i for i in range(2, int(a ** 0.5) + 1))


"""
get
"""


def get_i(a):
    """Return the divisors of ``a`` lying in ``[2, a)`` as an ascending list.

    When ``a`` is prime (according to ``isprime``) it is first decremented by
    one, so the divisors of ``a - 1`` are returned instead.

    A real list is returned rather than a ``filter`` object, which on
    Python 3 is lazy and can only be consumed once.
    """
    if isprime(a):
        a -= 1
    return [d for d in range(2, a) if a % d == 0]


"""
FCBF - Fast Correlation Based Filter

L. Yu and H. Liu. Feature Selection for High‐Dimensional Data: A Fast Correlation‐Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Leaning (ICML‐03), 856‐863.
Washington, D.C., August 21‐24, 2003.
"""


class FCBF:
    """Fast Correlation-Based Filter (FCBF) feature selector.

    Features are first ranked by their symmetrical uncertainty (SU) with the
    label. Walking down that ranking, every lower-ranked feature whose SU with
    the current feature is at least as large as its own SU with the label is
    discarded as redundant.

    Attributes
    ---------------
        th = SU threshold a feature must exceed to be considered
        idx_sel = Column indexes of the selected features, best first
    """

    # Class-level default so that ``transform`` on an unfitted instance selects
    # no columns; ``fit`` always rebinds a fresh per-instance list.
    idx_sel = []

    def __init__(self, th=0.01):
        '''
        Parameters
        ---------------
            th = The initial threshold
        '''
        self.th = th

    def fit(self, x, y):
        '''
        This function executes FCBF algorithm and saves indexes
        of selected features in self.idx_sel

        Parameters
        ---------------
            x = dataset  [NxM]
            y = label    [Nx1]
        '''
        # First stage: SU of every feature with the response. Keep the features
        # above the threshold, ordered by decreasing SU. The stable sort breaks
        # ties by ascending column index. Ranking only the retained columns
        # guarantees that no column can be picked twice, whatever the threshold.
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
        relevant = np.flatnonzero(SU_vec > self.th)
        order = relevant[np.argsort(-SU_vec[relevant], kind='stable')]

        SU_list = SU_vec[order]
        x_sorted = np.array(x[:, order], dtype='float64')
        self.idx_sel = list(order)

        # Second stage: remove redundancy. Column j is the current predominant
        # feature; the search stops once no column is left after it.
        j = 0
        while j < x_sorted.shape[1] - 1:
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_sorted[:, j + 1:], x_sorted[:, j])

            # A later feature is redundant when it is at least as correlated
            # with feature j as it is with the label.
            to_remove = np.flatnonzero(SU_x >= SU_list[j + 1:]) + j + 1
            if to_remove.size > 0:
                x_sorted = np.delete(x_sorted, to_remove, axis=1)
                SU_list = np.delete(SU_list, to_remove, axis=0)
                # Delete by position, highest first, so the remaining
                # positions stay valid while the list shrinks.
                for r in to_remove[::-1]:
                    del self.idx_sel[r]
            j = j + 1

    def fit_transform(self, x, y):
        '''
        This function fits the feature selection
        algorithm and returns the resulting subset.

        Parameters
        ---------------
            x = dataset  [NxM]
            y = label    [Nx1]
        '''
        self.fit(x, y)
        return x[:, self.idx_sel]

    def transform(self, x):
        '''
        This function applies the selection
        to the vector x.

        Parameters
        ---------------
            x = dataset  [NxM]
        '''
        return x[:, self.idx_sel]


"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS ‘08) 23rd International Symposium on, pages 1‐4.
Istanbul, October 27‐29, 2008.
"""


class FCBFK(FCBF):
    """FCBF# selector: FCBF redundancy removal that stops at ``k`` features.

    Features with a positive symmetrical uncertainty (SU) with the label are
    ranked by decreasing SU. Redundant features are then removed, lowest
    ranked first, only while more than ``k`` features remain. Finally the
    ranking is cut to its first ``k`` entries.

    Attributes
    ---------------
        k = Number of features to include in the subset
        idx_sel = Column indexes of the selected features, best first
    """

    # Class-level default; ``fit`` always rebinds a fresh per-instance list.
    idx_sel = []

    def __init__(self, k=10):
        '''
        Parameters
        ---------------
            k = Number of features to include in the
            subset.
        '''
        self.k = k

    def fit(self, x, y):
        '''
        This function executes FCBFK algorithm and saves indexes
        of selected features in self.idx_sel

        Parameters
        ---------------
            x = dataset  [NxM]
            y = label    [Nx1]
        '''
        # First stage: SU of every feature with the response. Keep the features
        # with SU > 0, ordered by decreasing SU. The stable sort breaks ties by
        # ascending column index.
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
        relevant = np.flatnonzero(SU_vec > 0)
        order = relevant[np.argsort(-SU_vec[relevant], kind='stable')]

        SU_list = SU_vec[order]
        x_sorted = np.array(x[:, order], dtype='float64')
        self.idx_sel = list(order)

        # Second stage: remove redundancy while more than k features remain.
        # Both loop conditions are checked against the *surviving* columns, so
        # the selection never shrinks below k, and an empty ranking or a pivot
        # with nothing left after it ends the search instead of raising an
        # IndexError.
        j = 0
        while x_sorted.shape[1] > self.k and j < x_sorted.shape[1] - 1:
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_sorted[:, j + 1:], x_sorted[:, j])

            # A later feature is redundant when it is at least as correlated
            # with feature j as it is with the label.
            to_remove = np.flatnonzero(SU_x >= SU_list[j + 1:]) + j + 1

            # Only as many features as are in surplus may go; drop the
            # lowest-ranked redundant ones (the tail of to_remove).
            surplus = x_sorted.shape[1] - self.k
            drop = to_remove[max(to_remove.size - surplus, 0):]
            if drop.size > 0:
                x_sorted = np.delete(x_sorted, drop, axis=1)
                SU_list = np.delete(SU_list, drop, axis=0)
                # Delete by position, highest first, so the remaining
                # positions stay valid while the list shrinks.
                for r in drop[::-1]:
                    del self.idx_sel[r]
            j = j + 1

        # More than k non-redundant features survived: keep the k best ranked.
        if len(self.idx_sel) > self.k:
            self.idx_sel = self.idx_sel[:self.k]

"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""


class FCBFiP(FCBF):
    """FCBF "in Pieces": scores features by relevance rank plus redundancy rank.

    ``fit`` stores one integer score per input column in ``self.scores``
    (lower is better) and then calls ``set_k`` to turn the scores into
    ``self.idx_sel``.
    """
    # Class-level default; ``set_k`` rebinds it to a NumPy index array.
    idx_sel = []

    def __init__(self, k=10, npieces=2):
        '''
        Parameters
        ---------------
            k = Number of features to include in the
            subset.
            npieces = Number of pieces to divide the
            feature space.
        '''
        self.k = k
        self.npieces = npieces

    def fit(self, x, y):
        '''
        Scores every feature and selects the k best ones.

        The per-feature scores are saved in self.scores and the
        indexes of the selected features in self.idx_sel (via set_k).

        Parameters
        ---------------
            x = dataset  [NxM]
            y = label    [Nx1]
        '''

        """
        First Stage: Computing the SU for each feature with the response. We sort the
        features. When we have a prime number of features we remove the last one from the
        sorted features list.
        """
        m = x.shape
        # Features per piece, computed from the original column count
        # (i.e. before the prime-count adjustment below).
        nfeaturesPieces = int(m[1] / float(self.npieces))
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)

        x_sorted = np.zeros(shape=m, dtype='float64')
        idx_sorted = np.zeros(shape=m[1], dtype='int64')
        # Repeated argmax: copy the columns in order of decreasing SU with the
        # label; -1 marks a column as already taken.
        for i in range(m[1]):
            ind = np.argmax(SU_vec)
            SU_vec[ind] = -1
            idx_sorted[i] = ind
            x_sorted[:, i] = x[:, ind].copy()

        if isprime(m[1]):
            # Drop the lowest-ranked feature and remember its original index;
            # it receives a fixed score in the scoring step.
            x_sorted = np.delete(x_sorted, m[1] - 1, axis=1)
            ind_prime = idx_sorted[m[1] - 1]
            idx_sorted = np.delete(idx_sorted, m[1] - 1)
            # m = x_sorted.shape
        """
        Second Stage: Identify relationships between features into its vecinity
        to remove redundancy with stopping criteria (features in x_best == k).
        """

        # Each column of x_2d is one piece: nfeaturesPieces consecutive ranked
        # features laid end to end, which is the layout suGroup reshapes back.
        # NOTE(review): the reshape only succeeds when the number of remaining
        # columns equals npieces * nfeaturesPieces — confirm for the feature
        # counts this is used with.
        x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces * m[0])).T

        SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
        SU_x = np.reshape(SU_x.T, (self.npieces * nfeaturesPieces,))
        idx_sorted2 = np.zeros(shape=idx_sorted.shape, dtype='int64')
        # An undefined (NaN) within-piece SU is replaced by 1.
        SU_x[np.isnan(SU_x)] = 1

        # Repeated argmin: order the features by increasing within-piece SU;
        # 10 marks an entry as already taken.
        for i in range(idx_sorted.shape[0]):
            ind = np.argmin(SU_x)
            idx_sorted2[i] = idx_sorted[ind]
            SU_x[ind] = 10

        """
        Scoring step
        """
        self.scores = np.zeros(shape=m[1], dtype='int64')

        # Score = position in the relevance ranking + position in the
        # redundancy ranking, so lower scores are better.
        # NOTE(review): the sum of the two argwhere results is a (1, 1) array
        # stored into a scalar slot — confirm the installed NumPy accepts it.
        for i in range(m[1]):
            if i in idx_sorted:
                self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
        if isprime(m[1]):
            # The feature dropped in the first stage gets the fixed score 2 * M.
            self.scores[ind_prime] = 2 * m[1]
        self.set_k(self.k)

    def set_k(self, k):
        """Store ``k`` and select the ``k`` features with the lowest scores.

        Reads ``self.scores`` (set by ``fit``) and writes the chosen column
        indexes to ``self.idx_sel`` as an int64 array, best score first.
        """
        self.k = k
        # Negate so that argmax returns the feature with the lowest score.
        scores_temp = -1 * self.scores

        self.idx_sel = np.zeros(shape=self.k, dtype='int64')
        for i in range(self.k):
            ind = np.argmax(scores_temp)
            # Large negative sentinel: this feature cannot be picked again.
            scores_temp[ind] = -100000000
            self.idx_sel[i] = ind


In [None]:
# Fixed-seed sample of 250,000 rows, restricted to the importance-selected
# columns in `fs`; FCBF is fitted on this sample instead of the full table.
df_sample = df.sample(n=250_000, random_state=0)
label_sample = df_sample['Label']
X_sample = df_sample.drop(columns=['Label'])
X_fs_sample = X_sample[fs].values

In [None]:
X_fs_sample.shape

(250000, 49)

## FCBF

In [None]:
# FCBF# selector capped at 25 features.
fcbf = FCBFK(k = 25)

# Fitted on the 250k-row sample only — presumably to keep the entropy computations tractable.
fcbf.fit(X_fs_sample,label_sample)

In [None]:
# Positions (within `fs`) of the features FCBF kept, in ascending order; sorting discards FCBF's own ranking.
idx=sorted(fcbf.idx_sel)

In [None]:
# Importance-selected columns of the full dataset as a NumPy array, in `fs` order.
X_fs = df[fs].values

In [None]:
# Keep only the columns FCBF selected.
X_fss=X_fs[:, idx]

In [None]:
X_fss.shape

(2298395, 25)

In [None]:
# Array copy of the name list so it can be indexed with the list of positions `idx`.
fs1 = np.array(fs)

In [None]:
# Names of the FCBF-selected columns, aligned with the columns of X_fss.
features1=fs1[idx]

In [None]:
features1

array([' Average Packet Size', ' Packet Length Variance',
       ' Packet Length Std', ' Destination Port',
       ' Total Length of Bwd Packets', ' Subflow Bwd Bytes',
       ' Bwd Packet Length Mean', ' Avg Bwd Segment Size',
       'Bwd Packet Length Max', 'Init_Win_bytes_forward',
       'Total Length of Fwd Packets', ' Subflow Fwd Bytes',
       ' Init_Win_bytes_backward', ' Fwd Packet Length Max',
       ' Bwd Header Length', ' Fwd Header Length.1', ' Fwd Header Length',
       ' Total Backward Packets', ' Subflow Bwd Packets',
       ' Bwd Packet Length Std', ' Bwd Packet Length Min',
       'Subflow Fwd Packets', ' Total Fwd Packets', ' Down/Up Ratio',
       ' min_seg_size_forward'], dtype='<U28')

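## Train-test split after feature selection

*Note: no train/test split is performed in the cells shown here; the cell below only wraps the selected features in a DataFrame. The resampling that follows is applied to the whole dataset, so a split made afterwards would let synthetic (SMOTE) rows leak into the test set. Split first if the result is used for evaluation.*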

In [None]:
# Selected features of the full (still imbalanced) dataset as a DataFrame with named columns.
X_imbalance = pd.DataFrame(X_fss, columns = features1)

## Solve class-imbalance by SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Sanity check before resampling: features and labels must have the same number of rows.
print(X_imbalance.shape)
print(label.shape)

(2298395,)
(2298395,)


In [None]:
# Oversample only the minority classes: every class with fewer than
# MIN_CLASS_SIZE rows is raised to that size, and every other class keeps its
# current count. Deriving the targets from the data keeps them valid if the
# dataset changes (SMOTE rejects a target below the current class count).
MIN_CLASS_SIZE = 40000
smote_targets = {cls: max(count, MIN_CLASS_SIZE) for cls, count in Counter(label).items()}
# Fixed seed so the synthetic samples are reproducible across runs.
# NOTE(review): `n_jobs` is reported as deprecated for SMOTE in recent
# imbalanced-learn releases — confirm against the installed version.
smote=SMOTE(n_jobs=-1, random_state=0, sampling_strategy=smote_targets)
X_resampled, y_resampled = smote.fit_resample(X_imbalance, label)

In [None]:
# Class distribution before and after SMOTE.
print('Original dataset shape %s' % Counter(label))
print('Resampled dataset shape %s' % Counter(y_resampled))

Resampled dataset shape Counter({'Normal': 1741839, 'Dos/DDos': 379748, 'PortScan': 158804, 'Brute Force': 13832, 'Web Attack': 2180, 'Botnet ARES': 1956, 'Infiltration': 36})
Resampled dataset shape Counter({'Normal': 1741839, 'Dos/DDos': 379748, 'PortScan': 158804, 'Botnet ARES': 40000, 'Infiltration': 40000, 'Web Attack': 40000, 'Brute Force': 40000})


## Solve class-imbalance by NearMiss

In [None]:
from imblearn.under_sampling import NearMiss

# Undersample every class down to at most 40,000 rows. The targets are derived
# from the classes actually present, so the cell keeps working if a class is
# renamed or already holds fewer rows than the cap.
nm_targets = {cls: min(count, 40000) for cls, count in Counter(y_resampled).items()}
nm = NearMiss(n_jobs=-1, sampling_strategy=nm_targets)
X_res, y_res = nm.fit_resample(X_resampled, y_resampled)

In [None]:
# Class distribution going into NearMiss, then coming out of it.
for class_labels in (y_resampled, y_res):
    print('Resampled dataset shape %s' % Counter(class_labels))

Resampled dataset shape Counter({'Normal': 1741839, 'Dos/DDos': 379748, 'PortScan': 158804, 'Botnet ARES': 40000, 'Infiltration': 40000, 'Web Attack': 40000, 'Brute Force': 40000})
Resampled dataset shape Counter({'Botnet ARES': 40000, 'Brute Force': 40000, 'Dos/DDos': 40000, 'Infiltration': 40000, 'Normal': 40000, 'PortScan': 40000, 'Web Attack': 40000})


In [None]:
# Attach the class column so features and labels are saved together (adds the column to X_res in place).
X_res['Label']=y_res

In [None]:
X_res.head()

Unnamed: 0,Average Packet Size,Packet Length Variance,Packet Length Std,Destination Port,Total Length of Bwd Packets,Subflow Bwd Bytes,Bwd Packet Length Mean,Avg Bwd Segment Size,Bwd Packet Length Max,Init_Win_bytes_forward,...,Fwd Header Length,Total Backward Packets,Subflow Bwd Packets,Bwd Packet Length Std,Bwd Packet Length Min,Subflow Fwd Packets,Total Fwd Packets,Down/Up Ratio,min_seg_size_forward,Label
0,0.0,0.0,0.0,8080.0,0.0,0.0,0.0,0.0,0.0,237.0,...,32.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,32.0,Botnet ARES
1,39.111111,4645.511111,68.157986,8080.0,140.0,140.0,35.0,35.0,128.0,8192.0,...,112.0,4.0,4.0,62.064483,0.0,5.0,5.0,0.0,20.0,Botnet ARES
2,286.285714,423807.7143,651.005157,8080.0,134.0,134.0,44.666667,44.666667,128.0,8192.0,...,92.0,3.0,3.0,72.231111,0.0,4.0,4.0,0.0,20.0,Botnet ARES
3,48.714286,5711.696429,75.575766,8080.0,134.0,134.0,44.666667,44.666667,128.0,8192.0,...,92.0,3.0,3.0,72.231111,0.0,4.0,4.0,0.0,20.0,Botnet ARES
4,48.714286,5711.696429,75.575766,8080.0,134.0,134.0,44.666667,44.666667,128.0,8192.0,...,92.0,3.0,3.0,72.231111,0.0,4.0,4.0,0.0,20.0,Botnet ARES


In [None]:
X_res.shape

(280000, 26)

In [None]:
# PROJECT_PATH already ends with '/', so the file name is appended directly (as in the read_csv cell).
# `index` is left at its default, so the row index is written as the first, unnamed column.
X_res.to_csv(PROJECT_PATH+'data25_CICIDS2017.csv')