# Feature Engineering and Preprocessing

## Plan of Action

1. Import libraries
2. Import dataset
3. Perform feature engineering
    a. Combine spkts and dpkts
    b. Perform log transform on features with skewed distribution and remove original features
4. Standardization
5. One hot encoding
6. Prepare data prep pipeline
7. Prepare and validate train and test datasets
8. Split train and test datset into X_train and y_train.
9. Baseline modeling

## Importing Libraries

In [19]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from prettytable import PrettyTable

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

dataset_path = 'dataset'
saved_files_path = 'saved_files/'

## Importing Dataset

In [3]:
X_train, y_train = pickle.load(open(saved_files_path+'final_train_complete.pkl', 'rb'))
X_test, y_test = pickle.load(open(saved_files_path+'final_test_complete.pkl', 'rb'))

# Parametrs
saved_parameters = pickle.load(open(saved_files_path+'saved_params.pkl', 'rb'))

In [5]:
print(X_train.shape)
print(X_test.shape)

(2032034, 41)
(508009, 42)


In [6]:
#Drop highly correlated data points
#Code reference: https://chrisalbon.com/code/machine_learning/feature_selection/drop_highly_correlated_features/

print("Shape before dropping highly correlated features: ",X_train.shape)

# Create correlation matrix
feat_corr = X_train.corr(method='pearson').abs()

# Select upper triangle of correlation matrix
upper_traingle = feat_corr.where(np.triu(np.ones(feat_corr.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.98
columns_to_drop = [column for column in upper_traingle.columns if any(upper_traingle[column] > 0.98)]
#Adding attack_categories column because we are building binary classification dataset
#columns_to_drop.append('attack_cat')

# Drop features 
X_train.drop(columns_to_drop, axis=1, inplace=True)

#Dictionary to store parameters
saved_parameters['cols_to_drop'] = columns_to_drop
saved_parameters['columns_in_train'] = X_train.columns

print("Shape after dropping highly correlated features: ",X_train.shape)
print("All the columns present in the dataset are: ",X_train.columns)

Shape before dropping highly correlated features:  (2032034, 41)
Shape after dropping highly correlated features:  (2032034, 38)
All the columns present in the dataset are:  Index(['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss',
       'service', 'sload', 'dload', 'spkts', 'swin', 'stcpb', 'dtcpb', 'smean',
       'dmean', 'trans_depth', 'response_body_len', 'sjit', 'djit', 'sinpkt',
       'dinpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports',
       'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'],
      dtype='object')


In [7]:
#utility  functions
def get_multi_corr(data):
    '''
    function to return the correlation values of each feature with each other
    '''
    return data.corr()

def get_corr_between_cols(data, y, col1, col2='label'):
    '''
    Function to return correlation value with the label feature
    '''
    return round(data[col1].corr(y),4)

def get_corr_with_log1p_transformation_with_label(data, y, col1, col2='label'):
    '''
    Function to return the correlation value with the log1p transformation 
    '''
    return round(data[col1].apply(np.log1p).corr(y), 4)

def get_numeric_data_column_list(data):
    '''
    Function to return the list of numerical data
    '''
    df = list(data.select_dtypes(include='number').columns)
    #df.remove('id')
    df.remove('is_ftp_login')
    df.remove('is_sm_ips_ports')
    #df.remove('label')
    return df

## Feature engineering

### Feature set 1
 1. Total bytes from source and destination -> sbytes+dbytes
 2. Total load from source and destination -> sload+dload

In [8]:
X_train['total_bytes'] = X_train['sbytes']+X_train['dbytes']
X_train['total_load'] = X_train['sload'] + X_train['dload']
#X_train = X_train.drop('Unnamed: 0', axis=1) 
X_train.head()

Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,service,sload,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,total_bytes,total_load
1921074,udp,INT,7e-06,264,0,60,0,0,dns,150857100.0,...,0,22,22,14,14,14,14,22,264,150857100.0
2012474,udp,INT,7e-06,264,0,60,0,0,dns,150857100.0,...,0,19,19,19,19,19,19,19,264,150857100.0
115171,tcp,FIN,1.033946,1684,10168,31,29,3,http,12101.21,...,0,1,1,2,2,1,1,1,11852,86410.7
2164402,udp,CON,0.001007,146,178,31,29,0,dns,579940.4,...,0,4,1,3,4,2,1,2,324,1286991.0
1889220,udp,INT,9e-06,114,0,254,0,0,dns,50666660.0,...,0,36,36,31,31,31,18,36,114,50666660.0


## Feature engineering - technique 2 - applying log1p transform

In [170]:
#For all the numeric columns for which the correlation with label feature increases on applying log1p transformation we will transform it to their log 1p value

#Get the numeric features
numeric_cols = get_numeric_data_column_list(X_train)
abs_difference = {}
for col in numeric_cols:
    print("-"*80)
    print("Correlation for feature: ",col," with label.")
    print("Normal Correlation: ", get_corr_between_cols(X_train, y_train, col))
    print("Log1p Correlation: ",get_corr_with_log1p_transformation_with_label(X_train, y_train, col))
    print("Absolute difference between normal and log1p correlation: ", round(np.abs(np.abs(get_corr_between_cols(X_train, y_train, col)) - np.abs(get_corr_with_log1p_transformation_with_label(X_train, y_train, col))),4))
    abs_difference[col] = round(np.abs(np.abs(get_corr_between_cols(X_train, y_train, col)) - np.abs(get_corr_with_log1p_transformation_with_label(X_train, y_train, col))),4)
    

--------------------------------------------------------------------------------
Correlation for feature:  dur  with label.
Normal Correlation:  -0.0792
Log1p Correlation:  -0.104
Absolute difference between normal and log1p correlation:  0.0248
--------------------------------------------------------------------------------
Correlation for feature:  sbytes  with label.
Normal Correlation:  -0.1694
Log1p Correlation:  -0.3968
Absolute difference between normal and log1p correlation:  0.2274
--------------------------------------------------------------------------------
Correlation for feature:  dbytes  with label.
Normal Correlation:  -0.0943
Log1p Correlation:  -0.5539
Absolute difference between normal and log1p correlation:  0.4596
--------------------------------------------------------------------------------
Correlation for feature:  sttl  with label.
Normal Correlation:  0.9252
Log1p Correlation:  0.9095
Absolute difference between normal and log1p correlation:  0.0157
--------

In [9]:
#Columns for which correlation is high and we would transform them by applying log1p transform
cols_for_log1p = ['dur', 'sbytes', 'dbytes', 'sloss', 'spkts', 'dload', 'dttl', 'sload', 'dload', 'sinpkt', 'dinpkt', 'dmean', 'sjit', 'djit']

#Saving these parameters in saved parameters dictionary
saved_parameters['cols_for_log1p'] = set(set(X_train.columns).intersection(set(cols_for_log1p)))-set(saved_parameters['cols_to_drop'])
#Saving the saved parameters dictionary in a pickle file
pickle.dump(saved_parameters, open(os.path.join(saved_files_path,'saved_parameters.pkl'),'wb'))

In [10]:
def log1p_transform(data):
    '''
    This function transforms the columns stored in the saved parameters dictionary by applying log1p transform and returns the data.
    '''
    #Loading the saved columns
    saved_parameters = pickle.load(open(saved_files_path+'saved_parameters.pkl', 'rb'))
    
    #Getting the names of the columns for which we need to apply log1p 
    log1p_cols = saved_parameters['cols_for_log1p']
    
    #Transforming the log1p functions
    for col in log1p_cols:
        transformed_col = col+'_log1p'
        data[transformed_col] = data[col].apply(np.log1p)
        data = data.drop([col], axis=1)
    
    return data

In [11]:
#Applying the log1p transform on the train data
X_train = log1p_transform(X_train)
print(X_train.shape)
X_train.head()

(2032034, 40)


Unnamed: 0,proto,state,sttl,service,swin,stcpb,dtcpb,smean,trans_depth,response_body_len,...,dload_log1p,sinpkt_log1p,dinpkt_log1p,sbytes_log1p,sload_log1p,sjit_log1p,dur_log1p,spkts_log1p,dmean_log1p,djit_log1p
1921074,udp,INT,60,dns,0,0,0,132,0,0,...,0.0,0.006976,0.0,5.57973,18.831844,0.0,7e-06,1.098612,0.0,0.0
2012474,udp,INT,60,dns,0,0,0,132,0,0,...,0.0,0.006976,0.0,5.57973,18.831844,0.0,7e-06,1.098612,0.0,0.0
115171,tcp,FIN,31,http,255,600762523,601772495,120,1,3924,...,11.216007,4.38833,4.12374,7.429521,9.401144,9.079867,0.709978,2.70805,6.338594,8.946974
2164402,udp,CON,31,dns,0,0,0,73,0,0,...,13.468859,0.003992,0.006976,4.990433,13.270682,0.0,0.001006,1.098612,4.49981,0.0
1889220,udp,INT,254,dns,0,0,0,57,0,0,...,0.0,0.00896,0.0,4.744932,17.740779,0.0,9e-06,1.098612,0.0,0.0


## Data Preprocessing - Standardization

In [12]:
#Fetching all the numeric features to standardize them
result_numeric = list(X_train.select_dtypes(include='number').columns)

#Removing the names of the columns for which standardization is not required
for i in saved_parameters['binary_cols']:
    result_numeric.remove(i)

#Using the standard scaler from sklearn library
scaler = StandardScaler()
scaler.fit(X_train[result_numeric])

#Dumping the scaler object in pickle file
pickle.dump(scaler, open(os.path.join(saved_files_path,'scaler_2.pkl'), 'wb'))

In [13]:
#Data pipeline function
#Standard scaler on data
def scale_data(data):
    '''
    This function standardize the numerical columns
    '''
    #Using Standard Scaler from sklearn preprocessing to scale our numeric features
    scaler = pickle.load(open(saved_files_path+'scaler_2.pkl', 'rb'))

    result_numeric = list(data.select_dtypes(include='number').columns)
    result_numeric.remove('is_sm_ips_ports')
    result_numeric.remove('is_ftp_login')

    #Using numeric columns to scale them using Standard Scaler
    data[result_numeric] = scaler.transform(data[result_numeric])
    
    return data


In [14]:
#Standardizing the train data
X_train = scale_data(X_train)

## Saving some important parameters

In [15]:
#Saving some important parameters
saved_parameters['columns'] = X_train.columns
saved_parameters['num_col'] = result_numeric

#From the features csv file we can see that these are categories that state feature can take
saved_parameters['state_categories'] = ['ACC', 'CLO', 'CON', 'ECO', 'ECR', 'FIN', 'INT', 'MAS', 'PAR', 'REQ', 'RST', 'TST', 'TXD', 'URH', 'URN']

#From the features csv file we can see that these are categories that service feature can take
saved_parameters['service_categories'] =['http', 'ftp', 'smtp', 'ssh', 'dns', 'ftp-data' ,'irc']

#proto one hot encoding categories
saved_parameters['proto_categories'] = X_train['proto'].unique()

#Saving the saved parameters dictionary in a pickle file
pickle.dump(saved_parameters, open(os.path.join(saved_files_path,'saved_parameters.pkl'),'wb'))

## One hot encoding

In [16]:
#Categorical encoding - one hot encoding
def categorical_encoding(data):
    '''
    This function one hot encodes service, proto and state categorical features
    '''
    saved_parameters = pickle.load(open(saved_files_path+'saved_parameters.pkl', 'rb'))
    state_categories = saved_parameters['state_categories']
    service_categories = saved_parameters['service_categories']
    proto_categories = saved_parameters['proto_categories']

    #For state feature
    for i in state_categories:
        data['state_'+i] = [1 if cat==i else 0 for cat in data['state'].values]

    data = data.drop(['state'], axis=1)
    
    #For service feature
    for i in service_categories:
        data['service_'+i] = [1 if cat==i else 0 for cat in data['service'].values]

    data = data.drop(['service'], axis=1)
    
    #For proto feature
    for i in proto_categories:
        data['proto_'+i] = [1 if cat==i else 0 for cat in data['proto'].values]

    data = data.drop(['proto'], axis=1)
    
    return data

In [17]:
#Applying the categorical encoding function on train dataset
X_train = categorical_encoding(X_train)
print("Shape of the train data: ",X_train.shape)
print("Shape of the test data: ",X_test.shape)

Shape of the train data:  (2032034, 194)
Shape of the test data:  (508009, 42)


In [220]:
#Saving mode vals
saved_mode_vals = {}

for i in X_train.columns:
    saved_mode_vals[i] = X_train[i].mode()[0]
    
pickle.dump(saved_mode_vals, open(os.path.join(saved_files_path,'saved_mode_vals.pkl'),'wb'))

In [202]:
def data_cleanup(data, set_name = "data"):
    '''
    This function applies various techniques to clean the dataset.
    '''
    saved_parameters = pickle.load(open(saved_files_path+'saved_parameters.pkl', 'rb'))
    saved_mode_vals = pickle.load(open(saved_files_path+'saved_mode_vals.pkl', 'rb'))
    
    #Cleaning the data
    for col in data.columns:
        mod = saved_mode_vals[col]
        
        #Fixing Binary data columns
        if col in saved_parameters['binary_cols']:
            data[col][data[col] > 1] = mod
        
        #Replacing '-' with "None" for service feature
        data[col] = data[col].replace(to_replace='-', value="None")
        #Filling null values
        data[col] = data[col].fillna(value=mod)
        #Filling empty values
        data[col] = data[col].replace(to_replace=' ', value=mod)

    #Fixing the data types of data
    data_types_to_correct = list(set(data.select_dtypes(exclude='number').columns) - set(saved_parameters['cat_cols']))
    
    for col in data_types_to_correct:
        data[col] = data[col].astype(float)
        
    print("The shape of {} after data cleanup is: {}".format(set_name, data.shape))
    
    return data

In [221]:
X_train = data_cleanup(X_train, 'Training data')

The shape of Training data after data cleanup is: (2032034, 194)


## Now preparing the test dataset

In [222]:
print(X_test.shape)
X_test = X_test.drop('Unnamed: 0', axis=1)
X_test.head()

(508009, 42)


Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,...,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
1444308,udp,CON,0.001221,146,178,31,29,0,0,dns,...,,,,1,2,4,2,1,1,1
2188426,tcp,FIN,0.018467,2438,19186,31,29,7,13,-,...,,,,7,3,3,4,1,1,4
1038228,tcp,FIN,0.03191,7820,15998,31,29,30,32,-,...,0.0,0.0,0.0,13,11,9,10,1,1,7
281265,udp,CON,0.001005,132,164,31,29,0,0,dns,...,0.0,0.0,0.0,1,6,2,6,1,1,1
2383961,tcp,FIN,0.289009,4528,2872,31,29,7,7,-,...,,,,7,7,3,6,1,1,8


In [223]:
def create_test_data(data):
    '''
    This function uses all the pipeline functions to create the test dataset
    '''
    #droping the index
    data = data.reset_index(drop=True)
    
    #Dropping columns with high correlation
    saved_parameters = pickle.load(open(saved_files_path+'saved_parameters.pkl', 'rb'))
    #data = data.drop(saved_parameters['mandatory_to_drop'], axis=1)
    data = data.drop(saved_parameters['cols_to_drop'], axis=1)
    
    #Had inconsistent datatype in train data
    data['ct_ftp_cmd'] = data['ct_ftp_cmd'].replace(to_replace=' ', value=0).astype(float)
    
    #Generating new feature set 1
    data['total_bytes'] = data['sbytes']+data['dbytes']
    data['total_load'] = data['sload'] + data['dload']
    
    #Log1p transform
    data = log1p_transform(data)
    
    #Scale transform
    data = scale_data(data)
    
    #One hot encoding
    data = categorical_encoding(data)
    
    #Cleaning data
    data = data_cleanup(data)
    
    return data

#Applying the entire data pipeline to test dataset
X_test = create_test_data(X_test)

The shape of data after data cleanup is: (508009, 194)


In [224]:
print(X_test.shape)
X_test.head()

(508009, 194)


Unnamed: 0,sttl,swin,stcpb,dtcpb,smean,trans_depth,response_body_len,tcprtt,synack,ackdat,...,proto_esp,proto_stp,proto_wb-expak,proto_compaq-peer,proto_rvd,proto_ip,proto_aris,proto_rtp,proto_igmp,proto_udt
0,-0.425838,-1.196075,-0.88714,-0.88724,-0.337423,-0.237366,-0.089758,-0.135307,-0.128343,-0.121503,...,0,0,0,0,0,0,0,0,0,0
1,-0.425838,0.836077,0.06132,1.586638,-0.396687,-0.237366,-0.089758,-0.121729,-0.109606,-0.11559,...,0,0,0,0,0,0,0,0,0,0
2,-0.425838,0.836077,0.381132,1.89336,-0.396687,-0.237366,-0.089758,-0.119451,-0.106046,-0.115045,...,0,0,0,0,0,0,0,0,0,0
3,-0.425838,-1.196075,-0.88714,-0.88724,-0.383517,-0.237366,-0.089758,-0.135307,-0.128343,-0.121503,...,0,0,0,0,0,0,0,0,0,0
4,-0.425838,0.836077,1.454779,-0.051558,0.327645,-0.237366,-0.089758,-0.120371,-0.107063,-0.115716,...,0,0,0,0,0,0,0,0,0,0


In [225]:
# Matching test data columns with train data columns
all(X_train.columns == X_test.columns)

True

In [226]:
# Saving all the files to disk to use later
pickle.dump((X_train, y_train), open(saved_files_path+'final_train_fe.pkl', 'wb'))
pickle.dump((X_test, y_test), open(saved_files_path+'final_test_fe.pkl', 'wb'))