# Dataset Preparation 

## Importing Libraries

In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

#File path to save files
saved_files_path = 'saved_files/'

#Dictionary to save the important parameters
saved_params = {}

## Loading features dataset 

In [97]:
#Loading features dataset
features = pd.read_csv('dataset/features/UNSW_NB15_features.csv')

#Getting all the non important columns which we would need to drop from the dataset
mandatory_to_drop = ['srcip', 'sport', 'dstip', 'dsport', 'stime', 'ltime', 'attack_cat']

#Saving the impotant parameters in a dictionary
saved_params['mandatory_to_drop'] = mandatory_to_drop

## Loading 4 train dataset and merging them

In [56]:
#Train data 1
train_data1 = pd.read_csv('dataset/UNSW-NB15_1.csv')
train_data1.columns = [i.lower() for i in features['Name'].values] #Converting all the columns to lower case
train_data1.drop(mandatory_to_drop, axis=1, inplace=True) #Dropping  all the not important columns
print(train_data1.shape) #Out put shape of the dataset

(700000, 42)
(700000, 42)


In [57]:
#Train data 2
train_data2 = pd.read_csv('dataset/UNSW-NB15_2.csv')
train_data2.columns = [i.lower() for i in features['Name'].values]#Converting all the columns to lower case
train_data2.drop(mandatory_to_drop, axis=1, inplace=True)#Dropping  all the not important columns
print(train_data2.shape)#Out put shape of the dataset

(700000, 42)


In [58]:
#Train data 3
train_data3 = pd.read_csv('dataset/UNSW-NB15_3.csv')
train_data3.columns = [i.lower() for i in features['Name'].values]#Converting all the columns to lower case
train_data3.drop(mandatory_to_drop, axis=1, inplace=True)#Dropping  all the not important columns
print(train_data3.shape)#Out put shape of the dataset

(700000, 42)


In [59]:
#Train data 4
train_data4 = pd.read_csv('dataset/UNSW-NB15_4.csv')
train_data4.columns = [i.lower() for i in features['Name'].values]#Converting all the columns to lower case
train_data4.drop(mandatory_to_drop, axis=1, inplace=True)#Dropping  all the not important columns
print(train_data4.shape)#Out put shape of the dataset

(440043, 42)


In [60]:
#Merging datasets
train_data_temp1 = train_data1.append(train_data2)
print(train_data_temp1.shape)
train_data_temp1.head()

(1400000, 42)


Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,label
0,udp,CON,0.036133,528,304,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0
1,udp,CON,0.001119,146,178,31,29,0,0,dns,...,0.0,0,12,8,1,2,2,1,1,0
2,udp,CON,0.001209,132,164,31,29,0,0,dns,...,0.0,0,6,9,1,1,1,1,1,0
3,udp,CON,0.001169,146,178,31,29,0,0,dns,...,0.0,0,7,9,1,1,1,1,1,0
4,udp,CON,0.078339,568,312,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0


In [61]:
#Merging datasets
train_data_temp2 = train_data_temp1.append(train_data3)
print(train_data_temp2.shape)
train_data_temp2.head()

(2100000, 42)


Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,label
0,udp,CON,0.036133,528,304,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0
1,udp,CON,0.001119,146,178,31,29,0,0,dns,...,0.0,0,12,8,1,2,2,1,1,0
2,udp,CON,0.001209,132,164,31,29,0,0,dns,...,0.0,0,6,9,1,1,1,1,1,0
3,udp,CON,0.001169,146,178,31,29,0,0,dns,...,0.0,0,7,9,1,1,1,1,1,0
4,udp,CON,0.078339,568,312,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0


In [62]:
#Merging datasets
train_data = train_data_temp2.append(train_data4)
print(train_data.shape)
train_data.head()

(2540043, 42)


Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,label
0,udp,CON,0.036133,528,304,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0
1,udp,CON,0.001119,146,178,31,29,0,0,dns,...,0.0,0,12,8,1,2,2,1,1,0
2,udp,CON,0.001209,132,164,31,29,0,0,dns,...,0.0,0,6,9,1,1,1,1,1,0
3,udp,CON,0.001169,146,178,31,29,0,0,dns,...,0.0,0,7,9,1,1,1,1,1,0
4,udp,CON,0.078339,568,312,31,29,0,0,-,...,0.0,0,2,4,2,3,1,1,2,0


In [63]:
#Saving the merged dataset into a csv file
train_data.to_csv('dataset/NIDS_dataset_final.csv')

In [64]:
#Loading the dataset and splitting in X and y
train_data = pd.read_csv('dataset/NIDS_dataset_final.csv')
print(train_data.shape)
train_data.head()
X, y = train_data.drop('label', axis=1), train_data['label']

(2540043, 43)


In [65]:
#Making a train test split in the dataset
train, test = train_test_split(train_data, test_size=0.20, stratify=y, random_state=42)

In [66]:
#Dropping the extra formed useless feature
train = train.drop('Unnamed: 0', axis=1)
train.head()

Unnamed: 0,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,label
1921074,udp,INT,7e-06,264,0,60,0,0,0,dns,...,,,22,22,14,14,14,14,22,0
2012474,udp,INT,7e-06,264,0,60,0,0,0,dns,...,,,19,19,19,19,19,19,19,0
115171,tcp,FIN,1.033946,1684,10168,31,29,3,5,http,...,0.0,0.0,1,1,2,2,1,1,1,0
2164402,udp,CON,0.001007,146,178,31,29,0,0,dns,...,,,4,1,3,4,2,1,2,0
1889220,udp,INT,9e-06,114,0,254,0,0,0,dns,...,,,36,36,31,31,31,18,36,1


In [67]:
train.isnull().sum() #Getting all the features with number of null values in them

proto                      0
state                      0
dur                        0
sbytes                     0
dbytes                     0
sttl                       0
dttl                       0
sloss                      0
dloss                      0
service                    0
sload                      0
dload                      0
spkts                      0
dpkts                      0
swin                       0
dwin                       0
stcpb                      0
dtcpb                      0
smean                      0
dmean                      0
trans_depth                0
response_body_len          0
sjit                       0
djit                       0
sinpkt                     0
dinpkt                     0
tcprtt                     0
synack                     0
ackdat                     0
is_sm_ips_ports            0
ct_state_ttl               0
ct_flw_http_mthd     1078078
is_ftp_login         1143497
ct_ftp_cmd                 0
ct_srv_src    

In [71]:
#Handle service - we can see many '-' in service - replace them with "None"
train['service'] = train['service'].replace(to_replace='-', value="None")
train['service'].value_counts()

None        997348
dns         625188
http        164955
ftp-data    100585
smtp         65376
ftp          39247
ssh          37708
pop3          1230
dhcp           143
ssl            103
snmp            93
radius          30
irc             28
Name: service, dtype: int64

In [73]:
#Getting the mode value for this feature
train['ct_flw_http_mthd'].value_counts() #0.0 is the mode value. We will use to fill the na values

0.0     789755
1.0     150428
6.0       6421
4.0       5240
3.0        587
2.0        465
5.0        350
9.0        218
14.0       179
12.0       111
30.0        47
8.0         44
16.0        41
36.0        28
10.0        23
25.0        19
Name: ct_flw_http_mthd, dtype: int64

In [74]:
#Filling the na values
train['ct_flw_http_mthd'] = train['ct_flw_http_mthd'].fillna(value=0).astype(int)

In [75]:
#Getting the mode value for this feature
train['is_ftp_login'].value_counts()

0.0    853709
1.0     34679
4.0       126
2.0        23
Name: is_ftp_login, dtype: int64

In [81]:
#Filling the na values
train['is_ftp_login'] = train['is_ftp_login'].fillna(value=0).astype('int')

In [77]:
#Checking if some null values are there now
train.isnull().sum().sum()

0

In [78]:
#Checking the distribution of label in test and train data

train_0_count = train.label.value_counts()[0]/len(train['label'].values)
train_1_count = train.label.value_counts()[1]/len(train['label'].values)

test_0_count = test.label.value_counts()[0]/len(test['label'].values)
test_1_count = test.label.value_counts()[1]/len(test['label'].values)

print("Percentage of label 0 in train data set is: ", round(train_0_count*100, 3),"%.")
print("Percentage of label 1 in train data set is: ", round(train_1_count*100, 3),"%.")
print("Percentage of label 0 in test data set is: ", round(test_0_count*100, 3),"%.")
print("Percentage of label 1 in test data set is: ", round(test_1_count*100, 3),"%.")

Percentage of label 0 in train data set is:  87.351 %.
Percentage of label 1 in train data set is:  12.649 %.
Percentage of label 0 in test data set is:  87.351 %.
Percentage of label 1 in test data set is:  12.649 %.


In [82]:
#Checking the dataset for dtypes now:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2032034 entries, 1921074 to 340529
Data columns (total 42 columns):
 #   Column             Dtype  
---  ------             -----  
 0   proto              object 
 1   state              object 
 2   dur                float64
 3   sbytes             int64  
 4   dbytes             int64  
 5   sttl               int64  
 6   dttl               int64  
 7   sloss              int64  
 8   dloss              int64  
 9   service            object 
 10  sload              float64
 11  dload              float64
 12  spkts              int64  
 13  dpkts              int64  
 14  swin               int64  
 15  dwin               int64  
 16  stcpb              int64  
 17  dtcpb              int64  
 18  smean              int64  
 19  dmean              int64  
 20  trans_depth        int64  
 21  response_body_len  int64  
 22  sjit               float64
 23  djit               float64
 24  sinpkt             float64
 25  dinpkt       

In [83]:
#Checking the number of categories in dataset
train.dtypes.value_counts()

int64      26
float64    10
object      4
int32       2
dtype: int64

In [84]:
#Verifying non numeric data types
train.select_dtypes(exclude='number').columns

Index(['proto', 'state', 'service', 'ct_ftp_cmd'], dtype='object')

In [85]:
#According to the features csv file, ct_ftp_cmd feature is an integer type and not categorical so lets fix that

In [86]:
train['ct_ftp_cmd'].value_counts()
#We can see that we have 667038 null values which we will replace by mode values which is 0 in our case here.

     1143497
0     849050
1      18288
1      13749
0       4583
2        988
4        664
3        576
6        259
5        220
4        126
2         23
8         11
Name: ct_ftp_cmd, dtype: int64

In [87]:
#Replacing empty space with mode value and converting to numerical
train['ct_ftp_cmd'] = train['ct_ftp_cmd'].replace(to_replace=' ', value=0).astype(int)

In [88]:
#Verifying if ct_ftp_cmd has been corrected
train.select_dtypes(exclude='number').columns #Corrected

Index(['proto', 'state', 'service'], dtype='object')

In [89]:
#We also saw that there are 2 binary features from the features.csv file namely 'is_sm_ips_ports', 'is_ftp_login' 
#Lets examine these

In [90]:
train['is_sm_ips_ports'].value_counts() #This is fine

0    2028655
1       3379
Name: is_sm_ips_ports, dtype: int64

In [91]:
train['is_ftp_login'].value_counts() #We need to fix these and replace values greater than 1 with 1

0    1997206
1      34679
4        126
2         23
Name: is_ftp_login, dtype: int64

In [92]:
train['is_ftp_login'][train['is_ftp_login'] > 1] = 1
train['is_ftp_login'].value_counts() #Fixed

0    1997206
1      34828
Name: is_ftp_login, dtype: int64

In [100]:
#Saving some information
saved_params['total_columns'] = train.columns
saved_params['cat_cols'] = train.select_dtypes(exclude='number').columns
saved_params['binary_cols'] = ['is_sm_ips_ports', 'is_ftp_login']
saved_params['numeric_cols'] = train.select_dtypes(include='number').columns
pickle.dump(saved_params, open('saved_files/saved_params.pkl', 'wb'))

In [93]:
#Splitting in X and y of train and test dataset
X_train, y_train = train.drop('label', axis=1), train.label
X_test, y_test = test.drop('label', axis=1), test.label

In [94]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2032034, 41)
(2032034,)
(508009, 42)
(508009,)


In [95]:
# Saving all the files to disk to use later
pickle.dump((X_train, y_train), open('saved_files/final_train_complete.pkl', 'wb'))
pickle.dump((X_test, y_test), open('saved_files/final_test_complete.pkl', 'wb'))