In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Loading Data

In [2]:
from datapackage import Package

# Open the kddcup99 data package descriptor published on datahub.io.
package = Package('https://datahub.io/machine-learning/kddcup99/datapackage.json')

# print list of all resources:
print(package.resource_names)


['validation_report', 'kddcup99_csv', 'kddcup99_json', 'kddcup99_zip', 'kddcup99_csv_preview', 'kddcup99']


In [3]:
import pandas as pd
# Load the dataset from a CSV file located in the working directory.
# NOTE(review): no visible cell writes kddcup99.csv to disk -- confirm the
# file is downloaded beforehand so the notebook runs on a fresh checkout.
df = pd.read_csv("kddcup99.csv")

In [4]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal


## Import Packages

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Dataset Exploration

### Shape of the data

In [6]:
# Report the dimensions of the loaded frame (rows first, then columns).
rows_count = df.shape[0]
columns_count = df.shape[1]
print(f'Total Number of rows : {rows_count}')
print(f'Total Number of columns : {columns_count}')

Total Number of rows : 494020
Total Number of columns : 42


### Below we can see the features involved in the data

In [7]:
# List every column name present in the frame.
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell',
       'lsu_attempted', 'lnum_root', 'lnum_file_creations', 'lnum_shells',
       'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')

### Describing Data  
The standard deviation of `duration`, `src_bytes` and `dst_bytes` is many times larger than the mean, so these distributions are widely spread out around the mean; we will see this in the visualisation section.

In [8]:
# Summary statistics of the numeric columns, transposed so that each
# feature is a row and each statistic is a column.
df_transpose = df.describe().transpose()
df_transpose

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,494020.0,47.9794,707.747185,0.0,0.0,0.0,0.0,58329.0
src_bytes,494020.0,3025.615744,988219.101225,0.0,45.0,520.0,1032.0,693375640.0
dst_bytes,494020.0,868.530774,33040.034672,0.0,0.0,0.0,0.0,5155468.0
land,494020.0,4.5e-05,0.006673,0.0,0.0,0.0,0.0,1.0
wrong_fragment,494020.0,0.006433,0.134805,0.0,0.0,0.0,0.0,3.0
urgent,494020.0,1.4e-05,0.00551,0.0,0.0,0.0,0.0,3.0
hot,494020.0,0.034519,0.782103,0.0,0.0,0.0,0.0,30.0
num_failed_logins,494020.0,0.000152,0.01552,0.0,0.0,0.0,0.0,5.0
logged_in,494020.0,0.148245,0.355343,0.0,0.0,0.0,0.0,1.0
lnum_compromised,494020.0,0.010212,1.798328,0.0,0.0,0.0,0.0,884.0


###  Five point summary

In [9]:
# Keep only the five-number summary: minimum, quartiles and maximum.
df_transpose.loc[:, ['min', '25%', '50%', '75%', 'max']]

Unnamed: 0,min,25%,50%,75%,max
duration,0.0,0.0,0.0,0.0,58329.0
src_bytes,0.0,45.0,520.0,1032.0,693375640.0
dst_bytes,0.0,0.0,0.0,0.0,5155468.0
land,0.0,0.0,0.0,0.0,1.0
wrong_fragment,0.0,0.0,0.0,0.0,3.0
urgent,0.0,0.0,0.0,0.0,3.0
hot,0.0,0.0,0.0,0.0,30.0
num_failed_logins,0.0,0.0,0.0,0.0,5.0
logged_in,0.0,0.0,0.0,0.0,1.0
lnum_compromised,0.0,0.0,0.0,0.0,884.0


### Let's see if there are any null values
There are no null values in the data.

In [10]:
# Count the missing entries in each column.
df.isna().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
lnum_compromised               0
lroot_shell                    0
lsu_attempted                  0
lnum_root                      0
lnum_file_creations            0
lnum_shells                    0
lnum_access_files              0
lnum_outbound_cmds             0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [11]:
# Single yes/no answer: is there a missing entry anywhere in the frame?
df.isna().to_numpy().any()

False

## Sometimes '?' is used as a placeholder for null values, so let's check for it.

In [12]:
# Count how many entries in each column are the literal '?' placeholder.
# Series.sum() adds up the boolean mask in vectorized code; the builtin sum()
# would iterate over every row in Python, which is slow on a frame this size.
for value in df.columns:
    print(value, ":", (df[value] == '?').sum())

duration : 0
protocol_type : 0
service : 0
flag : 0
src_bytes : 0
dst_bytes : 0
land : 0
wrong_fragment : 0
urgent : 0
hot : 0
num_failed_logins : 0
logged_in : 0
lnum_compromised : 0
lroot_shell : 0
lsu_attempted : 0
lnum_root : 0
lnum_file_creations : 0
lnum_shells : 0
lnum_access_files : 0
lnum_outbound_cmds : 0
is_host_login : 0
is_guest_login : 0
count : 0
srv_count : 0
serror_rate : 0
srv_serror_rate : 0
rerror_rate : 0
srv_rerror_rate : 0
same_srv_rate : 0
diff_srv_rate : 0
srv_diff_host_rate : 0
dst_host_count : 0
dst_host_srv_count : 0
dst_host_same_srv_rate : 0
dst_host_diff_srv_rate : 0
dst_host_same_src_port_rate : 0
dst_host_srv_diff_host_rate : 0
dst_host_serror_rate : 0
dst_host_srv_serror_rate : 0
dst_host_rerror_rate : 0
dst_host_srv_rerror_rate : 0
label : 0


### Data types involved in the data

In [13]:
# Show the dtype pandas inferred for each column.
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
lnum_compromised                 int64
lroot_shell                      int64
lsu_attempted                    int64
lnum_root                        int64
lnum_file_creations              int64
lnum_shells                      int64
lnum_access_files                int64
lnum_outbound_cmds               int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [14]:
# Summarise each column's dtype together with its non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494020 non-null  int64  
 1   protocol_type                494020 non-null  object 
 2   service                      494020 non-null  object 
 3   flag                         494020 non-null  object 
 4   src_bytes                    494020 non-null  int64  
 5   dst_bytes                    494020 non-null  int64  
 6   land                         494020 non-null  int64  
 7   wrong_fragment               494020 non-null  int64  
 8   urgent                       494020 non-null  int64  
 9   hot                          494020 non-null  int64  
 10  num_failed_logins            494020 non-null  int64  
 11  logged_in                    494020 non-null  int64  
 12  lnum_compromised             494020 non-null  int64  
 13 

### Converting Object to category

In [15]:
# Re-encode every object-dtype column as a pandas Categorical.
# Columns that are already converted no longer match the object check, so
# re-running this cell leaves them untouched.
object_features = [feature for feature in df.columns if df[feature].dtype == 'object']
for feature in object_features:
    df[feature] = pd.Categorical(df[feature])

In [16]:
# Confirm that the former object columns now report the category dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   duration                     494020 non-null  int64   
 1   protocol_type                494020 non-null  category
 2   service                      494020 non-null  category
 3   flag                         494020 non-null  category
 4   src_bytes                    494020 non-null  int64   
 5   dst_bytes                    494020 non-null  int64   
 6   land                         494020 non-null  int64   
 7   wrong_fragment               494020 non-null  int64   
 8   urgent                       494020 non-null  int64   
 9   hot                          494020 non-null  int64   
 10  num_failed_logins            494020 non-null  int64   
 11  logged_in                    494020 non-null  int64   
 12  lnum_compromised             494020 non-null

#### This helps us see that there are various factors on which attacks depend.
