# Imports

In [1]:
# Importing required libraries for the project
import sys # for python library version
import numpy as np # for scientific computing
import pandas as pd # for data anaysis
import matplotlib # for visualization
import seaborn as sns # for visualization
import sklearn # ML Library

In [2]:
print('Python: {}'.format(sys.version))  # Python version
print('numpy: {}'.format(np.__version__))  # Numpy version
print('pandas: {}'.format(pd.__version__))  # Pandas version
print('matplotlib: {}'.format(matplotlib.__version__))  # Matplotlib version
print('seaborn: {}'.format(sns.__version__))  # seaborn version
print('sklearn: {}'.format(sklearn.__version__))  # sklearn version

Python: 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)]
numpy: 1.15.2
pandas: 0.23.4
matplotlib: 3.0.0
seaborn: 0.8.1
sklearn: 0.19.1


In [3]:
# No warning of any kind please!
import warnings
# will ignore any warnings
warnings.filterwarnings("ignore")

# Getting Started

First thing first, we need to import/read the dataset and have a peak at it....

In [4]:
# importing the dataset to a variable
data = pd.read_csv("K:/CIC-2017-dataset/CIC-IDS-2017\MergedML_CSV/MergedML.csv")

# displaying first 3 observations
data.head(3)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


We can see that the data has been imported successfully. Now we need to know the number of observations and features we have.

In [5]:
# dimensions of the data
# where x will be no. of observation
# and y will be features including 1 target variable
x, y = data.shape   # x=445909   y=79

print('We have ', x, ' number of observations and ', y-1, ' features for this dataset to predict type of traffic.')  # removing count of a target variable in 'y'

We have  2542141  number of observations and  78  features for this dataset to predict type of traffic.


Let's look at the datatypes of each feature and see if it needs any processing if the feature is not in its appropriate form

In [6]:
# datatypes of features
data.dtypes

 Destination Port                 int64
 Flow Duration                    int64
 Total Fwd Packets                int64
 Total Backward Packets           int64
Total Length of Fwd Packets       int64
 Total Length of Bwd Packets      int64
 Fwd Packet Length Max            int64
 Fwd Packet Length Min            int64
 Fwd Packet Length Mean         float64
 Fwd Packet Length Std          float64
Bwd Packet Length Max             int64
 Bwd Packet Length Min            int64
 Bwd Packet Length Mean         float64
 Bwd Packet Length Std          float64
Flow Bytes/s                     object
 Flow Packets/s                  object
 Flow IAT Mean                  float64
 Flow IAT Std                   float64
 Flow IAT Max                     int64
 Flow IAT Min                     int64
Fwd IAT Total                     int64
 Fwd IAT Mean                   float64
 Fwd IAT Std                    float64
 Fwd IAT Max                      int64
 Fwd IAT Min                      int64


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542141 entries, 0 to 2542140
Data columns (total 79 columns):
 Destination Port               int64
 Flow Duration                  int64
 Total Fwd Packets              int64
 Total Backward Packets         int64
Total Length of Fwd Packets     int64
 Total Length of Bwd Packets    int64
 Fwd Packet Length Max          int64
 Fwd Packet Length Min          int64
 Fwd Packet Length Mean         float64
 Fwd Packet Length Std          float64
Bwd Packet Length Max           int64
 Bwd Packet Length Min          int64
 Bwd Packet Length Mean         float64
 Bwd Packet Length Std          float64
Flow Bytes/s                    object
 Flow Packets/s                 object
 Flow IAT Mean                  float64
 Flow IAT Std                   float64
 Flow IAT Max                   int64
 Flow IAT Min                   int64
Fwd IAT Total                   int64
 Fwd IAT Mean                   float64
 Fwd IAT Std                    flo

Since there is an object NAN feature columns, like 'Flow Bytes/s' and  ' Flow Packets/s', that can not be scaled later.
So we can drop that kind of features.

In [8]:
data = data.drop(['Flow Bytes/s', ' Flow Packets/s'], axis=1)

In [9]:
data.shape     # (2542141, 77)

(2542141, 77)

Our dataset now has 76 features and 1 target variable ' Label'. From 76 features, 60 are numeric and 18 are catrgorical. From 18 categorical, 10 are the features that always contain only one value 0. So we can delete that kind of redundant catrgorical columns whose values are always zero.


We will split the data in 2 parts. First part will contain all numerical features 'num_fea' and second part will contain all binary or categorical features 'binary_fea' of the data. The target variable ' Label' is excluded.

In [10]:
# Extracting all numerical features from data
num_fea = data[[" Destination Port",
           " Flow Duration",
           " Total Fwd Packets",
           " Total Backward Packets",
           "Total Length of Fwd Packets",
           " Total Length of Bwd Packets",
           " Down/Up Ratio",
           " Fwd Packet Length Max",
           " Fwd Packet Length Min",
           " Fwd Packet Length Mean",
           " Fwd Packet Length Std",
           "Bwd Packet Length Max",
           " Bwd Packet Length Min",
           " Bwd Packet Length Mean",
           " Bwd Packet Length Std",
           " Flow IAT Mean",
           " Flow IAT Std",
           " Flow IAT Max",
           " Flow IAT Min", 
           "Fwd IAT Total",
           " Fwd IAT Mean",
           " Fwd IAT Std",
           " Fwd IAT Max", 
           " Fwd IAT Min",
           "Bwd IAT Total", 
           " Bwd IAT Mean",
           " Bwd IAT Std", 
           " Bwd IAT Max", 
           " Bwd IAT Min", 
           " Fwd Header Length",
           " Bwd Header Length",
           "Fwd Packets/s",
           " Bwd Packets/s", 
           " Min Packet Length",
           " Max Packet Length",
           " Packet Length Mean",
           " Packet Length Std",
           " Packet Length Variance",
           " Avg Bwd Segment Size",
           " Average Packet Size",
           " Avg Fwd Segment Size",
           " Fwd Header Length",
           "Subflow Fwd Packets", 
           " Subflow Fwd Bytes", 
           " Subflow Bwd Packets",
           " Subflow Bwd Bytes",
           "Init_Win_bytes_forward", 
           " Init_Win_bytes_backward", 
           " act_data_pkt_fwd",
           " min_seg_size_forward",
           "Active Mean",
           " Active Std",
           " Active Max",
           " Active Min",
           "Idle Mean",
           " Idle Std",
           " Idle Max",
           " Idle Min"

 ]]

In [11]:
# extracting all binary/ categorical features from data
binary_fea = data[["Fwd PSH Flags",
           " Bwd PSH Flags",  # 0
           " Fwd URG Flags",  # 0
           " Bwd URG Flags",  # 0
           "FIN Flag Count", 
           " SYN Flag Count",
           " RST Flag Count",
           " PSH Flag Count",
           " ACK Flag Count",
           " URG Flag Count",
           " CWE Flag Count",  # 0
           " ECE Flag Count",
           "Fwd Avg Bytes/Bulk",  # 0
           " Fwd Avg Packets/Bulk", # 0
           " Fwd Avg Bulk Rate",  # 0
           " Bwd Avg Bytes/Bulk",  # 0
           " Bwd Avg Packets/Bulk", # 0
           "Bwd Avg Bulk Rate"]]   # 0

In [12]:
binary_fea.describe()

Unnamed: 0,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate
count,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0
mean,0.0476083,0.0,0.0,0.0,0.03743734,0.0476083,0.0002391685,0.2857493,0.3256904,0.09588965,0.0,0.0002403486,0.0,0.0,0.0,0.0,0.0,0.0
std,0.2129361,0.0,0.0,0.0,0.189831,0.2129361,0.01546323,0.4517706,0.4686323,0.2944399,0.0,0.01550132,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
listLabel = list(binary_fea["Bwd Avg Bulk Rate"].drop_duplicates().values)
print('\n The types of all traffics are {}'.format(listLabel))


 The types of all traffics are [0]


So we can delete the redundant feature columns whose values are always zero

In [38]:
binary_fea = binary_fea.drop([" Bwd PSH Flags",  # 0
           " Fwd URG Flags",  # 0
           " Bwd URG Flags",  # 0
           " CWE Flag Count",   # 0
           "Fwd Avg Bytes/Bulk",  # 0
           " Fwd Avg Packets/Bulk", # 0
           " Fwd Avg Bulk Rate",  # 0
           " Bwd Avg Bytes/Bulk",  # 0
           " Bwd Avg Packets/Bulk", # 0
           "Bwd Avg Bulk Rate"], axis=1)  # 0

In [39]:
binary_fea.shape

(2542141, 8)

Test if there any missing values in DataFrame. It turns out there are missing values

In [40]:
data.isnull().values.any()

False

Removing Observation which has any Missing Values in it....

In [41]:
# will delete observation if it has any missing values in any of the features.
data.dropna()

# shape of the data after deleting missing entries
data.shape

(2542141, 77)

NO Missing Values...!! That's great!

!!! Handling Duplicates

In [43]:
# deleting duplicates, except the first observation
data.drop_duplicates(keep='first')

# shape of the data after deleting duplicate entries
data.shape

(2542141, 77)

NO Duplicates too..! Neat!

# Class Distribution:
Let's take a look how each class is distributed..

We have uneven samples of traffic type, where BENIGN has the highest no. of observation. But we do have enough samples to train the model learning different patterns of each traffic types. We will see how models performs with these uneven amount of distributions in Model Evaluation section.

In [44]:
# grouping by forest cover type and calculating total occurance
data.groupby(' Label').size()

 Label
BENIGN                        1984531
Bot                              1966
DDoS                           128027
DoS GoldenEye                   10293
DoS Hulk                       231073
DoS Slowhttptest                 5499
DoS slowloris                    5796
FTP-Patator                      7938
Heartbleed                         11
PortScan                       158930
SSH-Patator                      5897
Web Attack � Brute Force         1507
Web Attack � Sql Injection         21
Web Attack � XSS                  652
dtype: int64

In [57]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [None]:
props = pd.read_csv(r"../input/properties_2016.csv")  #The properties dataset

#train = pd.read_csv(r"../input/train_2016_v2.csv")   # The parcelid's with their outcomes
#samp = pd.read_csv(r"../input/sample_submission.csv")  #The parcelid's for the testset