In [1]:
# Import dataset
from sklearn.datasets import fetch_kddcup99
kdd99_data = fetch_kddcup99() #can also do subset='http'
import numpy as np
X_og = kdd99_data['data']
y_og = kdd99_data['target']

In [2]:
# When running entire dataset - Remove categorical data
X_num_sample = np.delete(X_og,[1,2,3],1)
print(X_num_sample.shape)
print(y_og.shape)

(494021, 38)
(494021,)


In [3]:
# Make sure all categorical columns were removed
X_num_sample[0:1]

array([[0, 181, 5450, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,
        8, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9, 9, 1.0, 0.0, 0.11, 0.0,
        0.0, 0.0, 0.0, 0.0]], dtype=object)

In [4]:
# When running entire dataset - Take a sample from the dataset 
sample_indices = np.random.choice(range(len(y_og)), 59000)
X_sample = X_num_sample[sample_indices,:]
y = y_og[sample_indices]
print(X_sample.shape)
print(y.shape)

(59000, 38)
(59000,)


In [None]:
# http subset - Take a sample from the dataset
sample_indices = np.random.choice(range(len(y_og)), 59000)
X_sample = X_og[sample_indices,:]
y = y_og[sample_indices]
print(X_sample.shape)
print(y.shape)

In [5]:
# Assign each attack type to a numerical representation of an attack category
attack_cats = [y]

for i in range(X_sample.shape[0]):
  
    if(y[i]==b'normal.'):
        y[i] = 0
    
# dos
    if(y[i]==b'neptune.'or y[i]==b'smurf.'or y[i]==b'pod.' or y[i]==b'teardrop.' or y[i]==b'land.' or y[i]==b'back.'):
        y[i] = 1

# u2r
    if(y[i]==b'buffer_overflow.' or y[i]==b'loadmodule.' or y[i]==b'perl.' or y[i]==b'rootkit.'):
        y[i] = 2
    
# r2l 
    if(y[i]==b'guess_passwd.' or y[i]==b'ftp_write.' or y[i]==b'imap.' or y[i]==b'phf.' or y[i]==b'multihop.' or y[i]==b'warezmaster.' or y[i]==b'warezclient.' or y[i]==b'spy.'):
        y[i] = 3

# probe
    if(y[i]==b'portsweep.' or y[i]==b'ipsweep.' or y[i]==b'nmap.' or y[i]==b'satan.'):
        y[i] = 4

In [6]:
# y is now an array of integers
y = np.int64(y)
y[0:100]

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1], dtype=int64)

In [7]:
# Split data into testing and training sets
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_sample, y, test_size=0.33, random_state=42)

In [15]:
# Use RF classifier to label attack types as 'normal, dos, u2r, r2l, or probe'
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth = 500, random_state = 42)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [16]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

r = recall_score(y_test, y_pred, average = 'micro')
p = precision_score(y_test, y_pred, average = 'micro')
f = f1_score(y_test, y_pred, average = 'micro')
print('r=', r, '\np=', p, '\nf=', f)


r= 0.9989727786337956 
p= 0.9989727786337956 
f= 0.9989727786337956


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_pred))
print('\n', confusion_matrix(y_test, y_pred))
print('\nAccuracy = ', accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3935
           1       1.00      1.00      1.00     15303
           2       0.00      0.00      0.00         2
           3       0.97      0.82      0.89        44
           4       0.99      0.96      0.98       186

    accuracy                           1.00     19470
   macro avg       0.79      0.75      0.77     19470
weighted avg       1.00      1.00      1.00     19470


 [[ 3933     0     0     1     1]
 [    0 15303     0     0     0]
 [    2     0     0     0     0]
 [    8     0     0    36     0]
 [    7     1     0     0   178]]

Accuracy =  0.9989727786337956


  _warn_prf(average, modifier, msg_start, len(result))
