# Feature Engineering and Data Rebalancing

>[Feature Engineering and Data Rebalancing](#scrollTo=ZoEOqJj8zhL9)

>>[import](#scrollTo=z1DwmXfQzzPJ)

>>[google drive](#scrollTo=vj-4P6eUz4K4)

>>[Read the NSL-KDD dataset](#scrollTo=YNprwXhI0IC5)

>>>[Transform all features of type object to int](#scrollTo=akJY2ZBZGJI_)

>>[FCBF_module](#scrollTo=ZVRcbzdODXBx)

>>[FCBF](#scrollTo=qS__yIjYDikI)

>>[Solve class-imbalance by SMOTE](#scrollTo=JVk_qTqDE7DG)



## import

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import math
import random
import matplotlib.pyplot as plt
import shutil
from sklearn.preprocessing import QuantileTransformer
from PIL import Image
import warnings
import csv
from scipy.stats import entropy
from google.colab import drive
warnings.filterwarnings("ignore")
print("Done")

Done


## google drive

In [None]:
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
PROJECT_PATH= '/content/drive/MyDrive/project/data/'
DATA='NSL-KDD'

## Read the NSL-KDD dataset

In [None]:
df =pd.read_csv(PROJECT_PATH+'all_data_NSL_KDD.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Duration,Protocol,Service,Flag,Src Bytes,Dst Bytes,DoS,Wrong Fragment,Urgent,...,Dst Host Same Srv Rate,Dst Host Diff Srv Rate,Dst Host Same Src Port Rate,Dst Host Srv Diff Host Rate,Dst Host Serror Rate,Dst Host Srv Serror Rate,Dst Host Rerror Rate,Dst Host Srv Rerror Rate,Class,Difficulty Level
0,0,0,tcp,ftp_data,SF,491,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,Normal,20
1,1,0,udp,other,SF,146,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,Normal,15
2,2,0,tcp,private,S0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DoS,19
3,3,0,tcp,http,SF,232,8153,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,Normal,21
4,4,0,tcp,http,SF,199,420,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal,21


In [None]:
df.isnull().values.any()

False

In [None]:
# Drop the stray 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148517 entries, 0 to 148516
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Duration                     148517 non-null  int64  
 1   Protocol                     148517 non-null  object 
 2   Service                      148517 non-null  object 
 3   Flag                         148517 non-null  object 
 4   Src Bytes                    148517 non-null  int64  
 5   Dst Bytes                    148517 non-null  int64  
 6   DoS                          148517 non-null  int64  
 7   Wrong Fragment               148517 non-null  int64  
 8   Urgent                       148517 non-null  int64  
 9   Hot                          148517 non-null  int64  
 10  Num Failed Logins            148517 non-null  int64  
 11  Logged In                    148517 non-null  int64  
 12  Num Compromised              148517 non-null  int64  
 13 

In [None]:
df.shape

(148517, 43)

### Transform all features of type object to int

In [None]:
# Preparation for one-hot encoding of the object-dtype columns:
# build a mapping {column name: array of the distinct values found in that column}.
labels = ['Protocol', 'Service', 'Flag']
dic_features = dict((column, df[column].unique()) for column in labels)
dic_features

{'Protocol': array(['tcp', 'udp', 'icmp'], dtype=object),
 'Service': array(['ftp_data', 'other', 'private', 'http', 'remote_job', 'name',
        'netbios_ns', 'eco_i', 'mtp', 'telnet', 'finger', 'domain_u',
        'supdup', 'uucp_path', 'Z39_50', 'smtp', 'csnet_ns', 'uucp',
        'netbios_dgm', 'urp_i', 'auth', 'domain', 'ftp', 'bgp', 'ldap',
        'ecr_i', 'gopher', 'vmnet', 'systat', 'http_443', 'efs', 'whois',
        'R2L4', 'iso_tsap', 'echo', 'klogin', 'link', 'sunrpc', 'login',
        'kshell', 'sql_net', 'time', 'hostnames', 'exec', 'ntp_u',
        'discard', 'nntp', 'courier', 'ctf', 'ssh', 'daytime', 'shell',
        'netstat', 'pop_3', 'nnsp', 'IRC', 'pop_2', 'printer', 'tim_i',
        'pm_dump', 'red_i', 'netbios_ssn', 'rje', 'X11', 'urh_i',
        'http_8001', 'aol', 'http_2784', 'tftp_u', 'harvest', 'imap4'],
       dtype=object),
 'Flag': array(['SF', 'S0', 'REJ', 'RSTR', 'SH', 'RSTO', 'S1', 'RSTOS0', 'S3',
        'S2', 'OTH'], dtype=object)}

In [None]:
# One-hot encode the categorical columns: for every distinct value add a column
# named after that value, holding 1 where the row has the value and 0 elsewhere.
# The comparison is vectorized (one pass per value in C) instead of calling a
# Python lambda for every row; the resulting columns are the same int64 0/1 flags.
df1 = df.copy()
for k in dic_features.keys():
    for v in dic_features[k]:
        df1[v] = (df[k] == v).astype('int64')

In [None]:
df1.shape

(148517, 128)

In [None]:
df=df1

In [None]:
# Separate the target from the predictors and drop the raw categorical columns,
# which the one-hot indicator columns added above now represent.
# NOTE(review): 'Difficulty Level' is kept as a predictor (it later shows up among
# the selected features); in NSL-KDD it looks like dataset metadata rather than a
# traffic measurement -- confirm it should be a model input.
# NOTE(review): this cell rebinds `df`, so running it a second time raises a
# KeyError on 'Class'.
label=df['Class']
df=df.drop(['Class','Protocol','Service','Flag'],axis=1)
X=df.copy()

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148517 entries, 0 to 148516
Columns: 124 entries, Duration to OTH
dtypes: float64(15), int64(109)
memory usage: 140.5 MB


## FCBF_module

In [None]:
# -*- coding: utf-8 -*-

import numpy as np


def count_vals(x):
    """Return how often each distinct value occurs in x.

    The counts are ordered like the sorted distinct values produced by
    np.unique; the values themselves are discarded.
    """
    return np.unique(x, return_counts=True)[1]


def entropy(x):
    """Shannon entropy, in bits, of the empirical distribution of x.

    x must expose .shape; its first dimension is used as the sample count.
    """
    total = float(x.shape[0])
    probabilities = count_vals(x) / total
    return -1 * np.sum(probabilities * np.log2(probabilities))


def symmetricalUncertain(x, y):
    """Symmetrical uncertainty between x and y: 2 * IG / (H(x) + H(y)).

    IG is H(x) minus the entropy of x conditioned on y. When both entropies
    are zero the division yields NaN, which callers handle themselves.

    Parameters
    ---------------
        x = observations of the first variable (indexable by a boolean mask)
        y = observations of the second variable, same length as x
    """
    total = float(y.shape[0])
    levels, level_counts = np.unique(y, return_counts=True)

    h_x = entropy(x)
    h_y = entropy(y)

    # Entropy of x inside each group of rows sharing one value of y.
    within = np.zeros(shape=(levels.shape[0]))
    for pos, level in enumerate(levels):
        within[pos] = entropy(x[y == level])
    within[np.isnan(within)] = 0

    # Weight every group by its relative frequency to get H(x | y).
    weights = level_counts.astype(dtype='float64') / total
    h_x_given_y = np.sum(weights * within)

    gain = h_x - h_x_given_y
    return 2 * gain / (h_x + h_y)


def suGroup(x, n):
    """Mean pairwise symmetrical uncertainty of each feature within one group.

    Parameters
    ---------------
        x = 1-D array holding n feature columns laid end to end
        n = number of features packed into x

    Returns
    ---------------
        Array of length n: for every feature, the average SU against the
        other features of the group (all zeros for a single-feature group).

    Raises ValueError (from np.reshape) when len(x) is not a multiple of n.
    """
    total = x.shape[0]
    # Integer division: np.reshape rejects the float that `total / n` yields on Python 3.
    x = np.reshape(x, (n, total // n)).T
    m = x.shape[1]
    SU_matrix = np.zeros(shape=(m, m))
    if m < 2:
        # A lone feature has no partner; also avoids dividing by m - 1 == 0 below.
        return np.sum(SU_matrix, axis=1)

    for j in range(m - 1):
        # SU of feature j against every later feature; the matrix is symmetric,
        # so one computation fills both the row and the column.
        temp = np.apply_along_axis(symmetricalUncertain, 0, x[:, j + 1:], x[:, j])
        SU_matrix[j, j + 1:] = temp
        SU_matrix[j + 1:, j] = temp

    return 1 / float(m - 1) * np.sum(SU_matrix, axis=1)


def isprime(a):
    """Return True when the integer a is a prime number, False otherwise.

    Values below 2 (0, 1 and negatives) are not prime. Trial division stops
    at sqrt(a), since any composite number has a divisor no larger than that.
    """
    if a < 2:
        return False
    divisor = 2
    while divisor * divisor <= a:
        if a % divisor == 0:
            return False
        divisor += 1
    return True


"""
get
"""


def get_i(a):
    """Return a lazy filter over the integers in [2, a) that divide a.

    When isprime(a) holds, a is decremented by one first, and both the range
    and the divisibility check use the decremented value.
    """
    target = a - 1 if isprime(a) else a

    def divides_target(candidate):
        return target % candidate == 0

    return filter(divides_target, range(2, target))


"""
FCBF - Fast Correlation Based Filter

L. Yu and H. Liu. Feature Selection for High‐Dimensional Data: A Fast Correlation‐Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Learning (ICML‐03), 856‐863.
Washington, D.C., August 21‐24, 2003.
"""


class FCBF:
    """Fast Correlation Based Filter: keeps relevant, mutually non-redundant features.

    After fit, self.idx_sel lists the selected column indices of the
    training matrix, most relevant first.
    """

    # Class-level fallback kept for backward compatibility; every instance
    # gets its own list in __init__ and again on each fit.
    idx_sel = []

    def __init__(self, th=0.01):
        '''
        Parameters
        ---------------
            th = The initial threshold; only features whose symmetrical
            uncertainty with the label is strictly greater are considered.
        '''
        self.th = th
        self.idx_sel = []

    def fit(self, x, y):
        '''
        This function executes FCBF algorithm and saves indexes
        of selected features in self.idx_sel

        Parameters
        ---------------
            x = dataset  [NxM] (anything np.asarray turns into a 2-D numeric array)
            y = label    [Nx1]
        '''
        x = np.asarray(x)

        # First stage: symmetrical uncertainty of every feature with the label.
        SU_vec = np.asarray(
            np.apply_along_axis(symmetricalUncertain, 0, x, y), dtype='float64')

        # Rank once with a stable sort: ties keep ascending column order and
        # NaN scores sort last, where the threshold test discards them. The
        # ranking therefore always lines up with SU_list, which a repeated
        # argmax-and-zero scan does not guarantee for NaN or non-positive scores.
        order = np.argsort(-SU_vec, kind='stable')
        selected = order[SU_vec[order] > self.th]
        SU_list = SU_vec[selected]
        x_sorted = np.array(x[:, selected], dtype='float64')

        # Second stage: walk the ranking; feature j removes every lower-ranked
        # feature that is at least as correlated with j as with the label.
        j = 0
        while j + 1 < x_sorted.shape[1]:
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_sorted[:, j + 1:], x_sorted[:, j])
            keep = np.ones(x_sorted.shape[1], dtype=bool)
            # NaN comparisons are False, so such features are kept.
            keep[j + 1:] = ~(SU_x >= SU_list[j + 1:])
            if not keep.all():
                # Drop by position in all three aligned structures at once.
                x_sorted = x_sorted[:, keep]
                SU_list = SU_list[keep]
                selected = selected[keep]
            j = j + 1

        self.idx_sel = selected.tolist()

    def fit_transform(self, x, y):
        '''
        This function fits the feature selection
        algorithm and returns the resulting subset.

        Parameters
        ---------------
            x = dataset  [NxM]
            y = label    [Nx1]
        '''
        self.fit(x, y)
        return np.asarray(x)[:, self.idx_sel]

    def transform(self, x):
        '''
        This function applies the selection
        to the vector x.

        Parameters
        ---------------
            x = dataset  [NxM]
        '''
        return np.asarray(x)[:, self.idx_sel]


"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS ‘08) 23rd International Symposium on, pages 1‐4.
Istanbul, October 27‐29, 2008.
"""


class FCBFK(FCBF):
    """FCBF# variant: prunes redundant features until k remain.

    After fit, self.idx_sel lists at most k column indices of the training
    matrix, most relevant first.
    """

    idx_sel = []

    def __init__(self, k=10):
        '''
        Parameters
        ---------------
            k = Number of features to include in the
            subset.
        '''
        self.k = k
        self.idx_sel = []

    def fit(self, x, y):
        '''
        This function executes FCBFK algorithm and saves indexes
        of selected features in self.idx_sel

        Parameters
        ---------------
            x = dataset  [NxM] (anything np.asarray turns into a 2-D numeric array)
            y = label    [Nx1]
        '''
        x = np.asarray(x)

        # First stage: symmetrical uncertainty of every feature with the label.
        SU_vec = np.asarray(
            np.apply_along_axis(symmetricalUncertain, 0, x, y), dtype='float64')

        # Stable descending ranking; NaN scores sort last and fail the > 0
        # test, so they can never displace a genuinely relevant feature.
        order = np.argsort(-SU_vec, kind='stable')
        selected = order[SU_vec[order] > 0]
        SU_list = SU_vec[selected]
        x_sorted = np.array(x[:, selected], dtype='float64')

        # Second stage: remove redundant features, lowest-ranked first, but
        # never below k survivors. The loop condition also covers the cases
        # where no candidate exists or no column is left after position j,
        # both of which would otherwise index past the end of x_sorted.
        j = 0
        while x_sorted.shape[1] > self.k and j + 1 < x_sorted.shape[1]:
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_sorted[:, j + 1:], x_sorted[:, j])
            redundant = np.where(SU_x >= SU_list[j + 1:])[0] + j + 1

            # Only as many removals as there are features above the target k;
            # np.where returns ascending positions, so the tail is the
            # lowest-ranked part.
            surplus = x_sorted.shape[1] - self.k
            drop = redundant[max(redundant.size - surplus, 0):]
            if drop.size > 0:
                x_sorted = np.delete(x_sorted, drop, axis=1)
                SU_list = np.delete(SU_list, drop, axis=0)
                selected = np.delete(selected, drop)
            j = j + 1

        # If too little redundancy was found, keep the k most relevant.
        self.idx_sel = selected[:self.k].tolist()


"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""


class FCBFiP(FCBF):
    """FCBF "in pieces": scores features by relevance rank plus in-piece redundancy rank.

    fit() stores one integer score per input column in self.scores and then
    calls set_k(), which fills self.idx_sel with the k lowest-scoring columns.
    """
    # Class-level default; set_k() replaces it with a per-instance array.
    idx_sel = []

    def __init__(self, k=10, npieces=2):
        '''
        Store the selection size and the number of pieces.

        Parameters
        ---------------
            k = Number of features to include in the
            subset.
            npieces = Number of pieces to divide the
            feature space.
        '''
        self.k = k
        self.npieces = npieces

    def fit(self, x, y):
        '''
        This function executes the FCBFiP scoring and saves indexes
        of selected features in self.idx_sel (through set_k).

        Parameters
        ---------------
            x = dataset  [NxM]; indexed as x[:, ind], so a 2-D array
            y = label    [Nx1]
        '''

        """
        First Stage: Computing the SU for each feature with the response. We sort the
        features. When we have a prime number of features we remove the last one from the
        sorted features list.
        """
        m = x.shape
        # Features per piece, computed from the original column count.
        nfeaturesPieces = int(m[1] / float(self.npieces))
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)

        x_sorted = np.zeros(shape=m, dtype='float64')
        idx_sorted = np.zeros(shape=m[1], dtype='int64')
        # Selection sort by descending SU: take the current maximum, then mark
        # it with -1 so it is not picked again.
        for i in range(m[1]):
            ind = np.argmax(SU_vec)
            SU_vec[ind] = -1
            idx_sorted[i] = ind
            x_sorted[:, i] = x[:, ind].copy()

        if isprime(m[1]):
            # Drop the lowest-ranked column and remember its original index so
            # it can be given a fixed score in the scoring step.
            x_sorted = np.delete(x_sorted, m[1] - 1, axis=1)
            ind_prime = idx_sorted[m[1] - 1]
            idx_sorted = np.delete(idx_sorted, m[1] - 1)
            # m = x_sorted.shape
        """
        Second Stage: Identify relationships between features into its vecinity
        to remove redundancy with stopping criteria (features in x_best == k).
        """

        # Each of the npieces columns of x_2d holds nfeaturesPieces consecutive
        # ranked feature columns laid end to end.
        # NOTE(review): the reshape only succeeds when npieces * nfeaturesPieces
        # equals the number of columns left in x_sorted; nfeaturesPieces is derived
        # from the count before the prime-case column is dropped -- confirm the
        # supported (M, npieces) combinations.
        x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces * m[0])).T

        # Per-feature group SU inside each piece, flattened back to ranking order.
        SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
        SU_x = np.reshape(SU_x.T, (self.npieces * nfeaturesPieces,))
        idx_sorted2 = np.zeros(shape=idx_sorted.shape, dtype='int64')
        # NaN group-SU values are replaced by 1 before ranking.
        SU_x[np.isnan(SU_x)] = 1

        # Second ranking: ascending group SU; 10 marks an entry as already taken.
        for i in range(idx_sorted.shape[0]):
            ind = np.argmin(SU_x)
            idx_sorted2[i] = idx_sorted[ind]
            SU_x[ind] = 10

        """
        Scoring step
        """
        self.scores = np.zeros(shape=m[1], dtype='int64')

        # Score of a column = its position in the relevance ranking plus its
        # position in the group-SU ranking (lower is better).
        # NOTE(review): the right-hand side is a (1, 1) array assigned to a scalar
        # slot; recent NumPy releases appear to deprecate that implicit
        # conversion -- confirm against the NumPy version in use.
        for i in range(m[1]):
            if i in idx_sorted:
                self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
        if isprime(m[1]):
            # The column dropped in the first stage gets the fixed score 2 * M.
            self.scores[ind_prime] = 2 * m[1]
        self.set_k(self.k)

    def set_k(self, k):
        """Select the k columns with the lowest scores into self.idx_sel.

        Requires self.scores, which fit() sets. Updates self.k and replaces
        self.idx_sel with an int64 array of length k.
        """
        self.k = k
        # Negated copy so argmax finds the smallest score; self.scores is untouched.
        scores_temp = -1 * self.scores

        self.idx_sel = np.zeros(shape=self.k, dtype='int64')
        for i in range(self.k):
            ind = np.argmax(scores_temp)
            # Sentinel that removes the picked column from later iterations.
            scores_temp[ind] = -100000000
            self.idx_sel[i] = ind


## FCBF

In [None]:
# Convert the feature frame to a plain ndarray for the FCBF classes, which index with x[:, i].
# np.asarray leaves an existing ndarray untouched, so the cell can be re-run safely
# (calling .to_numpy() a second time would fail because ndarrays have no such method).
X = np.asarray(X)

In [None]:
# Run FCBF# asking for 25 features; the chosen column indices end up in fcbf.idx_sel.
fcbf = FCBFK(k=25)
fcbf.fit(X, label)

In [None]:
idx=sorted(fcbf.idx_sel)

In [None]:
idx

[1,
 2,
 5,
 7,
 10,
 14,
 17,
 18,
 26,
 27,
 32,
 33,
 38,
 45,
 49,
 64,
 95,
 107,
 108,
 109,
 111,
 112,
 113,
 117,
 123]

In [None]:
X_fss=X[:, idx]

In [None]:
X_fss.shape

(148517, 25)

In [None]:
features = np.array(df.columns)

In [None]:
features1=features[idx]

In [None]:
features1

array(['Src Bytes', 'Dst Bytes', 'Urgent', 'Num Failed Logins',
       'Root Shell', 'Num Shells', 'Is Hot Logins', 'Is Guest Login',
       'Diff Srv Rate', 'Srv Diff Host Rate',
       'Dst Host Same Src Port Rate', 'Dst Host Srv Diff Host Rate',
       'Difficulty Level', 'http', 'eco_i', 'ftp', 'pop_3', 'http_8001',
       'aol', 'http_2784', 'harvest', 'imap4', 'SF', 'SH', 'OTH'],
      dtype=object)

In [None]:
X_imbalance = pd.DataFrame(X_fss, columns = features1)

## Solve class-imbalance by SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Class distribution before SMOTE is applied (hence "Original", not "Resampled").
print('Original dataset shape %s' % Counter(label))

Resampled dataset shape Counter({'Normal': 77054, 'DoS': 53385, 'Probe': 13936, 'R2L': 3880, 'U2R': 262})


In [None]:


# Oversample every class that has fewer than MIN_SAMPLES_PER_CLASS rows up to that
# size; larger classes keep their current count. Deriving the targets from the data
# avoids hard-coded counts that SMOTE rejects as soon as the dataset changes
# (for the distribution printed above the targets are the same as before).
MIN_SAMPLES_PER_CLASS = 50000
# Fixed seed so the synthetic rows are identical on every Restart & Run All.
RANDOM_STATE = 42

sampling_strategy = {
    class_name: max(count, MIN_SAMPLES_PER_CLASS)
    for class_name, count in Counter(label).items()
}
# `n_jobs` is intentionally not passed: it is deprecated in imbalanced-learn 0.10+
# and no longer accepted by later releases.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=RANDOM_STATE)
# NOTE(review): resampling is fitted on the full dataset; if a held-out split is
# taken from the saved file later it will contain synthetic rows -- confirm intended.
X_train, y_train = smote.fit_resample(X_imbalance, label)

In [None]:
print('Resampled dataset shape %s' % Counter(y_train))

Resampled dataset shape Counter({'Normal': 77054, 'DoS': 53385, 'R2L': 50000, 'Probe': 50000, 'U2R': 50000})


In [None]:
X_train['Class']=y_train

In [None]:
X_train.head()

Unnamed: 0,Src Bytes,Dst Bytes,Urgent,Num Failed Logins,Root Shell,Num Shells,Is Hot Logins,Is Guest Login,Diff Srv Rate,Srv Diff Host Rate,...,pop_3,http_8001,aol,http_2784,harvest,imap4,SF,SH,OTH,Class
0,491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Normal
1,146.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Normal
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DoS
3,232.0,8153.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Normal
4,199.0,420.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Normal


In [None]:
# Persist the resampled, feature-selected data set next to the input file.
# os.path.join avoids the doubled slash that string concatenation produced,
# because PROJECT_PATH already ends with '/'.
# The row index is still written (it shows up as 'Unnamed: 0' when the file is read back).
X_train.to_csv(os.path.join(PROJECT_PATH, 'data25_NSL_KDD.csv'))