#### Business objective: Build network intrusion detection system to detect anamolies and attacks in the network.

#### Statistics problem: 
* Multinomial classification: Activity is normal or DOS or PROBE or R2L or U2R

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.formula.api as smf
import scipy.stats as stats

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.tree import DecisionTreeClassifier

In [2]:
col_n = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land", "wrong_fragment","urgent","hot","num_failed_logins","logged_in", "num_compromised","root_shell","su_attempted","num_root","num_file_creations", "num_shells","num_access_files","num_outbound_cmds","is_host_login", "is_guest_login","count","srv_count","serror_rate", "srv_serror_rate", "rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate", "srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate", "dst_host_diff_srv_rate","dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate", "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]

In [3]:
train = pd.read_csv('train.txt', sep = ",", header = None, names = col_n)
test = pd.read_csv('test.txt', sep = ",", header = None, names = col_n)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22544 entries, 0 to 22543
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     22544 non-null  int64  
 1   protocol_type                22544 non-null  object 
 2   service                      22544 non-null  object 
 3   flag                         22544 non-null  object 
 4   src_bytes                    22544 non-null  int64  
 5   dst_bytes                    22544 non-null  int64  
 6   land                         22544 non-null  int64  
 7   wrong_fragment               22544 non-null  int64  
 8   urgent                       22544 non-null  int64  
 9   hot                          22544 non-null  int64  
 10  num_failed_logins            22544 non-null  int64  
 11  logged_in                    22544 non-null  int64  
 12  num_compromised              22544 non-null  int64  
 13  root_shell      

#### Checking missing values

In [6]:
train.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [7]:
test.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

### No missing values in the data

### Data Preparation

#### As Y variable is not straight forward in a classification problem; defining Y variable

In [8]:
train['Y'] = train['attack']
test['Y'] = test['attack']

In [9]:
train.Y[train['Y'] == 'normal'] = '0'
test.Y[test['Y'] == 'normal'] = '0'

In [10]:
dos_list = ['neptune','smurf','apache2','back','processtable','pod','worm','teardrop','land','udpstorm']

In [11]:
for x in dos_list:
    train.Y[train['Y'] == x] = '1'
    test.Y[test['Y'] == x] = '1'

In [12]:
probe = ['saint','mscan','satan','nmap','ipsweep','portsweep']

In [13]:
for x in probe:
    train.Y[train['Y'] == x] = '2'
    test.Y[test['Y'] == x] = '2'

In [14]:
r2l = ['guess_passwd','warezmaster','snmpgetattack','httptunnel','snmpguess','mailbomb','multihop','named','sendmail','xlock','xsnoop','ftp_write','imap','phf','warezclient','spy']

In [15]:
for x in r2l:
    train.Y[train['Y'] == x] = '3'
    test.Y[test['Y'] == x] = '3'

In [16]:
u2r = ['buffer_overflow','ps','loadmodule','xterm','rootkit','perl','sqlattack']

In [17]:
for x in u2r:
    train.Y[train['Y'] == x] = '4'
    test.Y[test['Y'] == x] = '4'

In [18]:
train.Y.value_counts()

0    67343
1    45927
2    11656
3      995
4       52
Name: Y, dtype: int64

In [19]:
test.Y.value_counts()

0    9711
1    7167
3    3178
2    2421
4      67
Name: Y, dtype: int64

In [20]:
train.drop('attack', axis = 1, inplace = True)
test.drop('attack', axis = 1, inplace = True)

In [21]:
#splitting data into categorical and numerical variables

num_col=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_col=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['object', 'O']]

In [22]:
train_num=train[num_col]
test_num=test[num_col]

In [23]:
train_cat=train[cat_col]
test_cat=test[cat_col]

In [24]:
print('train_num', train_num.shape)
print('train_cat', train_cat.shape)
print('test_num', test_num.shape)
print('test_cat', test_cat.shape)

train_num (125973, 39)
train_cat (125973, 4)
test_num (22544, 39)
test_cat (22544, 4)


#### Outlier capping

In [25]:
#Handling Outliers
def outlier_capping(x):
    x = x.clip(upper=x.quantile(0.99))
    x = x.clip(lower=x.quantile(0.01))
    return x

train_num = train_num.apply(lambda x: outlier_capping(x))
test_num = test_num.apply(lambda x: outlier_capping(x))

In [26]:
# profile_report = pandas_profiling.ProfileReport(train_cat)
# profile_report.to_file('traincat.html')

In [27]:
# removing variables with high cardinality

train_cat.drop('service', axis = 1, inplace = True)
test_cat.drop('service', axis = 1, inplace = True)

In [28]:
train_cat.head()

Unnamed: 0,protocol_type,flag,Y
0,tcp,SF,0
1,udp,SF,0
2,tcp,S0,1
3,tcp,SF,0
4,tcp,SF,0


#### Data Audit Report

In [29]:
def var_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),  x.std(), x.var(), x.min(), x.dropna().quantile(0.01), x.dropna().quantile(0.05),x.dropna().quantile(0.10),x.dropna().quantile(0.25),x.dropna().quantile(0.50),x.dropna().quantile(0.75), x.dropna().quantile(0.90),x.dropna().quantile(0.95), x.dropna().quantile(0.99),x.max()], 
                  index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX'])

num_summary = train_num.apply(lambda x: var_summary(x)).T
num_summary

Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
duration,125973.0,0.0,19111660.0,151.71238,0.0,1095.861922,1200913.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,9590.1568,9590.56
src_bytes,125973.0,0.0,129689000.0,1029.498123,44.0,6137.901578,37673840.0,0.0,0.0,0.0,0.0,0.0,44.0,276.0,848.0,1480.0,54540.0,54540.0
dst_bytes,125973.0,0.0,169070700.0,1342.118549,0.0,3804.294003,14472650.0,0.0,0.0,0.0,0.0,0.0,0.0,516.0,3375.8,8314.0,25519.0,25519.0
land,125973.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,125973.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,125973.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,125973.0,0.0,6238.0,0.049519,0.0,0.351827,0.1237823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
num_failed_logins,125973.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,125973.0,0.0,49852.0,0.395736,0.0,0.48901,0.2391308,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
num_compromised,125973.0,0.0,1286.0,0.010209,0.0,0.100521,0.0101044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


#### Creating Dummy variables

In [30]:
#Creating dummies for nominal categorical variables
def create_dummies( df, colname ):
    col_dummies = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    df = pd.concat([df, col_dummies], axis=1)
    df.drop( colname, axis = 1, inplace = True )
    return df

for c_feature in ['protocol_type' , 'flag']:
    train_cat[c_feature] = train_cat[c_feature].astype('category')
    test_cat[c_feature] = test_cat[c_feature].astype('category')
    train_cat = create_dummies(train_cat , c_feature )
    test_cat = create_dummies(test_cat , c_feature )

In [31]:
#Combining both categorical and Continuous data
train_final = pd.concat([train_num,train_cat],axis=1)
test_final = pd.concat([test_num,test_cat],axis=1)

In [32]:
train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 52 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  float64
 1   src_bytes                    125973 non-null  int64  
 2   dst_bytes                    125973 non-null  int64  
 3   land                         125973 non-null  int64  
 4   wrong_fragment               125973 non-null  int64  
 5   urgent                       125973 non-null  int64  
 6   hot                          125973 non-null  int64  
 7   num_failed_logins            125973 non-null  int64  
 8   logged_in                    125973 non-null  int64  
 9   num_compromised              125973 non-null  int64  
 10  root_shell                   125973 non-null  int64  
 11  su_attempted                 125973 non-null  int64  
 12  num_root                     125973 non-null  int64  
 13 

In [33]:
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22544 entries, 0 to 22543
Data columns (total 52 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     22544 non-null  float64
 1   src_bytes                    22544 non-null  int64  
 2   dst_bytes                    22544 non-null  float64
 3   land                         22544 non-null  int64  
 4   wrong_fragment               22544 non-null  int64  
 5   urgent                       22544 non-null  int64  
 6   hot                          22544 non-null  int64  
 7   num_failed_logins            22544 non-null  int64  
 8   logged_in                    22544 non-null  int64  
 9   num_compromised              22544 non-null  int64  
 10  root_shell                   22544 non-null  int64  
 11  su_attempted                 22544 non-null  int64  
 12  num_root                     22544 non-null  int64  
 13  num_file_creatio

#### Splitting data into Train and test

In [34]:
y = train_final['Y']
x = train_final.columns.difference(['Y'])

In [35]:
#split the data into train & test - for sklearn techniques
train_X, test_X,train_y, test_y = train_test_split(train_final[x], y, test_size = 0.3, random_state=123)

### Model Building

### Random Forest Classifier

In [36]:
para_grid_multi_rf = {'n_estimators' : [70,80,90,100,110],
                      'max_features' : [10,11,12,13,14,15]}

gscv_multi_rf = GridSearchCV(estimator=RandomForestClassifier(),
                       param_grid=para_grid_multi_rf,
                       cv = 10,
                       verbose = 1,
                       n_jobs =-1)

gscv_multi_rf.fit(train_X, train_y)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.5min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

In [37]:
#Score on train dataset
gscv_multi_rf.best_score_

0.9993195761712755

In [38]:
# Get the best parameters
gscv_multi_rf.best_params_

{'max_features': 14, 'n_estimators': 110}

In [39]:
gscv_multi_rf.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=14,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=110,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
# Get the estimator
best_rf = gscv_multi_rf.best_estimator_

In [41]:
train_X['y_pred'] = best_rf.predict(train_X)
test_X['y_pred'] = best_rf.predict(test_X)

In [42]:
print("Train score {:.4f}".format(metrics.accuracy_score(train_y, train_X.y_pred)))
print("Test score {:.4f}".format(metrics.accuracy_score(test_y, test_X.y_pred)))

Train score 1.0000
Test score 0.9993


In [43]:
print(metrics.classification_report(train_y, train_X.y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     47181
           1       1.00      1.00      1.00     32066
           2       1.00      1.00      1.00      8194
           3       1.00      1.00      1.00       700
           4       1.00      1.00      1.00        40

    accuracy                           1.00     88181
   macro avg       1.00      1.00      1.00     88181
weighted avg       1.00      1.00      1.00     88181



In [44]:
print(metrics.classification_report(test_y, test_X.y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20162
           1       1.00      1.00      1.00     13861
           2       1.00      1.00      1.00      3462
           3       0.99      0.98      0.98       295
           4       0.90      0.75      0.82        12

    accuracy                           1.00     37792
   macro avg       0.98      0.94      0.96     37792
weighted avg       1.00      1.00      1.00     37792



#### Decision Tree

In [45]:
param_grid = {'max_depth': np.arange(3, 5),
             'max_features': np.arange(3,5)}

In [46]:
tree = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
tree.fit( train_X, train_y )

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': array([3, 4]),
                         

In [47]:
tree.best_score_

0.8904298176580443

In [48]:
tree.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [49]:
tree.best_params_

{'max_depth': 3, 'max_features': 4}

In [50]:
train_pred = tree.predict(train_X)

In [51]:
test_pred = tree.predict(test_X)

In [54]:
train_X = train_X[train_X.columns.difference(['y_pred'])]

#### Building the final model

In [55]:
clf_tree = DecisionTreeClassifier( max_depth = 4, max_features=4, max_leaf_nodes=5 )
clf_tree.fit( train_X, train_y )

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=4, max_leaf_nodes=5,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [56]:
clf_tree.feature_importances_

array([0.        , 0.16748471, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.74326944, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06696647, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02227938, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [57]:
# summarize the selection of the attributes
import itertools
feature_map = [(i, v) for i, v in itertools.zip_longest(train_X.columns, clf_tree.feature_importances_)]

feature_map

[('count', 0.0),
 ('diff_srv_rate', 0.16748471305722734),
 ('dst_bytes', 0.0),
 ('dst_host_count', 0.0),
 ('dst_host_diff_srv_rate', 0.0),
 ('dst_host_rerror_rate', 0.0),
 ('dst_host_same_src_port_rate', 0.0),
 ('dst_host_same_srv_rate', 0.0),
 ('dst_host_serror_rate', 0.0),
 ('dst_host_srv_count', 0.0),
 ('dst_host_srv_diff_host_rate', 0.0),
 ('dst_host_srv_rerror_rate', 0.0),
 ('dst_host_srv_serror_rate', 0.7432694355434698),
 ('duration', 0.0),
 ('flag_REJ', 0.0),
 ('flag_RSTO', 0.0),
 ('flag_RSTOS0', 0.0),
 ('flag_RSTR', 0.0),
 ('flag_S0', 0.0),
 ('flag_S1', 0.0),
 ('flag_S2', 0.0),
 ('flag_S3', 0.0),
 ('flag_SF', 0.06696647208193003),
 ('flag_SH', 0.0),
 ('hot', 0.0),
 ('is_guest_login', 0.0),
 ('is_host_login', 0.0),
 ('land', 0.0),
 ('last_flag', 0.0),
 ('logged_in', 0.0),
 ('num_access_files', 0.0),
 ('num_compromised', 0.022279379317372908),
 ('num_failed_logins', 0.0),
 ('num_file_creations', 0.0),
 ('num_outbound_cmds', 0.0),
 ('num_root', 0.0),
 ('num_shells', 0.0),
 ('prot

In [58]:
Feature_importance = pd.DataFrame(feature_map, columns=['Feature', 'importance'])
Feature_importance.sort_values('importance', inplace=True, ascending=False)
Feature_importance.head(10)

Unnamed: 0,Feature,importance
12,dst_host_srv_serror_rate,0.743269
1,diff_srv_rate,0.167485
22,flag_SF,0.066966
31,num_compromised,0.022279
0,count,0.0
38,protocol_type_udp,0.0
29,logged_in,0.0
30,num_access_files,0.0
32,num_failed_logins,0.0
33,num_file_creations,0.0


### Results of Random Forest were near perfect, which could mean Model was overfitting. Hence, choosing Decision Tree Model which has a decent score and captures the important features