In [72]:
# import relevant modules
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import imblearn

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf, linewidth=np.nan)
np.set_printoptions(precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

print("pandas : {0}".format(pd.__version__))
print("numpy : {0}".format(np.__version__))
print("matplotlib : {0}".format(matplotlib.__version__))
print("seaborn : {0}".format(sns.__version__))
print("sklearn : {0}".format(sklearn.__version__))
print("imblearn : {0}".format(imblearn.__version__))

pandas : 1.4.4
numpy : 1.21.5
matplotlib : 3.5.2
seaborn : 0.11.2
sklearn : 1.0.2
imblearn : 0.10.0


In [73]:
# Dataset field names
datacols = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]

# Load NSL_KDD train dataset
df1 = pd.read_csv("train.csv") # change path to where the dataset is located.
df2 = pd.read_csv("test.csv")
# Load NSL_KDD test dataset
#x_test = pd.read_csv("test.csv", sep=",", names=datacols)
#y_test = x_test[['src_bytes','dst_host_srv_count','service','dst_bytes','num_outbound_cmds','attack']]

In [74]:
print(df1.attack.unique())

n = len(pd.unique(df1['attack']))
print(n)
print(df1['attack'].min()) 
print(df1['attack'].max())  
 

['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap' 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop' 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land' 'loadmodule' 'spy' 'perl']
23
back
warezmaster


In [75]:
print(df1.src_bytes.unique())

n1 = len(pd.unique(df1['src_bytes']))
print(n1)
print(df1['src_bytes'].min()) 
print(df1['src_bytes'].max())  

[       491        146          0        232        199        287        334        300         18        233        343        253        147        437        227        215        241          8        303         45        105         43        324       1591        290         28        255        302        220         44        641        310        696        221        181         42         12        210        321        329        320        225        247        353       3065        102        259        309        278        313        219        245        328        251       2089        237          1        218        325          9         46       1079        273        262       1032        261        294        284        214        748       1766        266        295        228         30       2078         19        661       1075        223        350        201        271      14416        176        855        944        954        166        306       148

In [76]:
print(df1.dst_host_srv_count.unique())

n2 = len(pd.unique(df1['dst_host_srv_count']))
print(n2)
print(df1['dst_host_srv_count'].min()) 
print(df1['dst_host_srv_count'].max())  

[ 25   1  26 255  19   9  15  23  13  12 219  20   2  16  17  53  59 250   5  63 147  11  80  38   4 245 212  46  48  10 133  67   6  21  58  31  93  83  47   8   7  95 254   3 119 247 182 249 252 152 239 167  49  82 129  64  14  92  18 253  68 195 117  69  34  42 100 142  28  50  52 166 104 240  22 185  24  44 157  71 232 123 187  60 156 111 113  88  43  30 112  61 101  81 154  36 234 164 251  62 171  55 200 242  41  75 226 115 153  39 132 217  79 169  37  70  35 224 143  72 192  65 205 135  40  96 161 141 151  98 225  33  78 214  29 105 178 110  73  90 216  84  66  27 246 177 196 175  76 118 189 158 183  51 114  54 191 116 237  87 243 126 238  97 130  57 109 155 229 139 148 168 199 134 190 170 163  85 106 193 180 165  45 207 248 174 120 194 162 144  77 172  91 235 236 125 223 203  32 131 138 197 231 136 137 146 149 210 127 176 181 244 159 227 173 124 215 218 208 145 206 241 121 198 128 102 221 201 179 122 150  89  99 108  56 107  86 188 202 222 230 184 211 213 228  74 103  94 186 220

In [77]:
print(df1.service.unique())

n3 = len(pd.unique(df1['service']))
print(n3)
print(df1['service'].min()) 
print(df1['service'].max())  

['ftp_data' 'other' 'private' 'http' 'remote_job' 'name' 'netbios_ns' 'eco_i' 'mtp' 'telnet' 'finger' 'domain_u' 'supdup' 'uucp_path' 'Z39_50' 'smtp' 'csnet_ns' 'uucp' 'netbios_dgm' 'urp_i' 'auth' 'domain' 'ftp' 'bgp' 'ldap' 'ecr_i' 'gopher' 'vmnet' 'systat' 'http_443' 'efs' 'whois' 'imap4' 'iso_tsap' 'echo' 'klogin' 'link' 'sunrpc' 'login' 'kshell' 'sql_net' 'time' 'hostnames' 'exec' 'ntp_u' 'discard' 'nntp' 'courier' 'ctf' 'ssh' 'daytime' 'shell' 'netstat' 'pop_3' 'nnsp' 'IRC' 'pop_2' 'printer' 'tim_i' 'pm_dump' 'red_i' 'netbios_ssn' 'rje' 'X11' 'urh_i' 'http_8001' 'aol' 'http_2784' 'tftp_u' 'harvest']
70
IRC
whois


In [78]:
print(df1.dst_bytes.unique())

n4 = len(pd.unique(df1['dst_bytes']))
print(n4)
print(df1['dst_bytes'].min()) 
print(df1['dst_bytes'].max())  

[         0       8153        420       2251      13788        616       1178      11905        105      14421       6588      10499       1400        555         45        147         43       2302        372       3006        861        498       1398         69        133       2794        333       2878         42       2790       5683       3982        379       3762       1416       1483         44        331        102        750        354        306       7227        761       1212       9165       2243        301        335        511        431        145        974       2649        275        138         46        334        139        903       1357       1011       8412       1719        981      10352      16096        283         78       8437        128        135        529        465       6834        329        364        656       1185       2470       1938       4781      11564       2445        426        762        575        330        326      34660       269

9326
0
1309937401


In [79]:
print(df1.logged_in.unique())

n5 = len(pd.unique(df1['logged_in']))
print(n5)
print(df1['logged_in'].min()) 
print(df1['logged_in'].max())  

[0 1]
2
0
1


In [80]:
print(df1.dst_host_serror_rate.unique())

n5 = len(pd.unique(df1['dst_host_serror_rate']))
print(n5)
print(df1['dst_host_serror_rate'].min()) 
print(df1['dst_host_serror_rate'].max())  

[0.   1.   0.03 0.99 0.53 0.07 0.97 0.01 0.09 0.11 0.8  0.02 0.08 0.98 0.93 0.51 0.22 0.25 0.33 0.16 0.12 0.05 0.5  0.92 0.14 0.2  0.67 0.06 0.04 0.1  0.77 0.57 0.48 0.55 0.63 0.17 0.96 0.13 0.34 0.78 0.89 0.68 0.86 0.66 0.79 0.82 0.29 0.18 0.87 0.15 0.94 0.81 0.61 0.39 0.19 0.62 0.4  0.6  0.24 0.73 0.37 0.84 0.46 0.58 0.71 0.47 0.26 0.21 0.28 0.9  0.56 0.69 0.41 0.72 0.95 0.42 0.64 0.59 0.43 0.75 0.88 0.23 0.7  0.74 0.65 0.76 0.44 0.27 0.91 0.31 0.36 0.3  0.49 0.85 0.52 0.45 0.35 0.32 0.83 0.54 0.38]
101
0.0
1.0


In [81]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df1['service']=le.fit_transform(df1['service'])
df1['service']=pd.DataFrame(df1['service'])
df1['attack']=le.fit_transform(df1['attack'])
df1['attack']=pd.DataFrame(df1['attack'])
x_train = df1[['src_bytes','dst_host_srv_count','service','dst_bytes','logged_in','dst_host_serror_rate']] # removes an unwanted extra field
y_train = df1['attack']
df1.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,Unnamed: 42
0,0,tcp,20,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,11,20
1,0,udp,44,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,11,15
2,0,tcp,49,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,9,19
3,0,tcp,24,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,11,21
4,0,tcp,24,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,21


In [82]:

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df2['service']=le.fit_transform(df2['service'])
df2['service']=pd.DataFrame(df2['service'])
df2['attack']=le.fit_transform(df2['attack'])
df2['attack']=pd.DataFrame(df2['attack'])
x_test = df1[['src_bytes','dst_host_srv_count','service','dst_bytes','logged_in','dst_host_serror_rate']] # removes an unwanted extra field
y_test = df1['attack']
df1.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,Unnamed: 42
0,0,tcp,20,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,11,20
1,0,udp,44,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,11,15
2,0,tcp,49,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,9,19
3,0,tcp,24,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,11,21
4,0,tcp,24,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,21


In [83]:
x_train.head()

Unnamed: 0,src_bytes,dst_host_srv_count,service,dst_bytes,logged_in,dst_host_serror_rate
0,491,25,20,0,0,0.0
1,146,1,44,0,0,0.0
2,0,26,49,0,0,1.0
3,232,255,24,8153,1,0.03
4,199,255,24,420,1,0.0


In [84]:
print(df1['attack'].min()) 
print(df1['attack'].max()) 

print(df1['service'].min()) 
print(df1['service'].max()) 

0
22
0
69


In [85]:
from sklearn import tree

            
# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier.fit(x_train, y_train);

In [86]:
ypred=DTC_Classifier.predict(x_test)
ypred

array([11, 11,  9, 11, 11,  9,  9,  9,  9,  9,  9,  9, 11, 21,  9,  9, 11,  5, 11, 11,  9,  9, 11, 11,  9, 11,  9, 11, 11, 11,  5,  9, 11, 15, 11, 11, 11,  9, 11,  9,  9,  9, 11, 11, 11,  9, 20, 11, 21,  9, 11,  9, 20, 11, 11, 11,  5,  9,  9, 11, 11,  9, 17, 11,  9, 11,  9,  9,  9,  9, 11,  9, 11, 11,  9, 11,  9, 11, 11,  9, 11,  9,  9,  5, 11,  9, 11,  9,  9, 11, 11, 11, 11,  9,  9, 11, 11, 11, 11,  9, 11, 11, 11, 11, 20, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15,  9, 11, 11, 11,  9,  9,  9, 11,  9, 11, 11, 11, 11, 11,  9,  9, 11, 11,  9,  9, 11, 11, 18,  9, 11,  9,  9, 11, 11, 11,  9, 11, 21,  9,  9, 11,  9,  9,  9, 11, 11,  9,  5,  9, 11, 11, 11, 11,  9, 15, 11, 11, 11, 11,  9, 11, 18,  9,  9,  9, 11, 17, 17,  9, 11, 11, 11, 11, 20,  9,  9,  9, 11, 11, 21, 11,  9,  9, 11, 11, 11, 11, 15, 11, 11,  9, 11, 11, 11, 11,  9, 11, 11,  9, 17, 14,  9, 11,  9,  9, 11, 11, 11, 11, 11,  9, 21,  9,  5, 11, 11,  9, 11, 11, 11, 11, 11, 20, 11,  9,  9,  9, 11, 11, 11,  5,  9,  9,  9, 11,  9,  5,  

In [87]:
DTC_Classifier.predict([[1,2,10,30,20,30]])

array([9])

In [88]:
test_predict = DTC_Classifier.predict(x_test)

# Prediction on training data
train_predict = DTC_Classifier.predict(x_train)

In [89]:
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,classification_report,confusion_matrix
train_accuracy = accuracy_score(train_predict,y_train)
print("Training accuracy of MLP model is:",train_accuracy*100)
print("Classification report of training:"'\n',classification_report(train_predict,y_train))

Training accuracy of MLP model is: 98.05275733688966
Classification report of training:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       957
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        53
           4       0.82      1.00      0.90         9
           5       0.91      0.78      0.84      4176
           6       0.17      1.00      0.29         3
           7       1.00      1.00      1.00         9
           8       1.00      1.00      1.00         7
           9       0.99      0.99      0.99     41362
          10       0.53      0.82      0.64       957
          11       1.00      1.00      1.00     67235
          12       1.00      1.00      1.00         3
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00       201
          15       0.83      0.77      0.80   

In [90]:
test_accuracy = accuracy_score(test_predict,y_test)
print("Testing accuracy of MLP model is:",test_accuracy*100)
print("Classification report of testing:"'\n',classification_report(test_predict,y_test))

Testing accuracy of MLP model is: 98.05275733688966
Classification report of testing:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       957
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        53
           4       0.82      1.00      0.90         9
           5       0.91      0.78      0.84      4176
           6       0.17      1.00      0.29         3
           7       1.00      1.00      1.00         9
           8       1.00      1.00      1.00         7
           9       0.99      0.99      0.99     41362
          10       0.53      0.82      0.64       957
          11       1.00      1.00      1.00     67235
          12       1.00      1.00      1.00         3
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00       201
          15       0.83      0.77      0.80     

In [91]:
import pickle

file = open('int_det_model.pkl' ,'wb')
pickle.dump(DTC_Classifier,file)