In [None]:
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import keras
import tensorflow as tf
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential

In [None]:
# Folder holding the KDD dataset files, resolved relative to the current working directory.
dataset_root = os.path.join(os.getcwd(),'Datasets/KDD')

In [None]:
# Full paths of the training and test CSV files inside the dataset folder.
train_file, test_file = (
    os.path.join(dataset_root, csv_name)
    for csv_name in ('KDDTrain+.csv', 'KDDTest+.csv')
)

In [None]:
# Column names applied to the header-less CSV files, in file order:
# 41 feature columns followed by 'attack_type' and 'success_pred'.
header_names = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_type', 'success_pred',
]

In [None]:
# Array form of the header so a list of positions can select several names at once.
col_names = np.array(header_names)

# Positions (within the first 41 columns) of the categorical and the 0/1 features.
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
# Every remaining position among the first 41 columns is treated as numeric.
# sorted() fixes the order explicitly instead of relying on set iteration order.
numeric_idx = sorted(set(range(41)) - set(nominal_idx) - set(binary_idx))

# Column names for each feature group.
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()

In [None]:
# category: attack category -> list of attack names belonging to it.
category = defaultdict(list)
category['benign'].append('normal')

# name.txt holds one "<attack> <category>" pair per line.
name = os.path.join(dataset_root, 'name.txt')
with open(name, 'r') as f:
    for line in f:
        fields = line.split()
        # Skip blank lines (e.g. a trailing newline at end of file); the previous
        # split(' ') raised ValueError on them during tuple unpacking.
        if not fields:
            continue
        attack, cat = fields
        category[cat].append(attack)

# Inverted lookup: attack name -> attack category.
attack_mapping = {
    attack_name: cat_name
    for cat_name, attack_names in category.items()
    for attack_name in attack_names
}

In [None]:
# Display the attack-name -> category lookup built from name.txt.
attack_mapping

In [None]:
# Load both CSV files with the explicit header, derive the coarse
# 'attack_category' label from 'attack_type' (an attack missing from
# attack_mapping raises KeyError), and discard the 'success_pred' column.
DataSetKDDTrain = (
    pd.read_csv(train_file, names=header_names)
    .assign(attack_category=lambda frame: frame['attack_type'].map(lambda label: attack_mapping[label]))
    .drop(columns=['success_pred'])
)

DataSetKDDTest = (
    pd.read_csv(test_file, names=header_names)
    .assign(attack_category=lambda frame: frame['attack_type'].map(lambda label: attack_mapping[label]))
    .drop(columns=['success_pred'])
)

In [None]:
# Pull out the two byte-count columns of each split for separate rescaling.
src_train, dst_train = DataSetKDDTrain['src_bytes'], DataSetKDDTrain['dst_bytes']
src_test, dst_test = DataSetKDDTest['src_bytes'], DataSetKDDTest['dst_bytes']

In [None]:
# Largest byte count in each of the four extracted columns.
src_train.max(), src_test.max(), dst_train.max(), dst_test.max()

In [None]:
# Convert each byte column to an (n, 1) array, the 2-D layout the scaler is given.
src_train, src_test, dst_train, dst_test = (
    np.array(column).reshape(-1, 1)
    for column in (src_train, src_test, dst_train, dst_test)
)

In [None]:
# Scale the byte counts to [0, 1] using statistics learned from the TRAINING
# split only. Previously the scaler was re-fit on each test column, so train and
# test values were mapped with different min/max and test statistics leaked into
# preprocessing. Test values above the training maximum now map above 1.
src_scaler = MinMaxScaler(feature_range=(0, 1))
src_train = src_scaler.fit_transform(src_train)
src_test = src_scaler.transform(src_test)

dst_scaler = MinMaxScaler(feature_range=(0, 1))
dst_train = dst_scaler.fit_transform(dst_train)
dst_test = dst_scaler.transform(dst_test)

In [None]:
# Write the rescaled byte counts back into their original columns.
DataSetKDDTrain['src_bytes'], DataSetKDDTrain['dst_bytes'] = src_train, dst_train
DataSetKDDTest['src_bytes'], DataSetKDDTest['dst_bytes'] = src_test, dst_test

In [None]:
# Integer-encode the nominal columns with ONE mapping shared by train and test.
# The previous per-frame enumeration of set(...) assigned codes in arbitrary
# set-iteration order and independently for each frame, so the same code could
# stand for different categories in the training and test data.
for col in ('protocol_type', 'service', 'flag'):
    # Sorted union => deterministic codes that also cover categories that occur
    # in only one of the two frames.
    categories = sorted(set(DataSetKDDTrain[col]) | set(DataSetKDDTest[col]))
    codes = {value: code for code, value in enumerate(categories)}
    DataSetKDDTrain[col] = DataSetKDDTrain[col].map(codes)
    DataSetKDDTest[col] = DataSetKDDTest[col].map(codes)

In [None]:
# Frequency of every attack type and attack category in the training data.
train_attack_types, train_attack_cats = (
    DataSetKDDTrain[label_col].value_counts()
    for label_col in ('attack_type', 'attack_category')
)

In [None]:
# Frequency of every attack type and attack category in the test data.
test_attack_types, test_attack_cats = (
    DataSetKDDTest[label_col].value_counts()
    for label_col in ('attack_type', 'attack_category')
)

In [None]:
# Horizontal bar chart of attack-type counts in the training data.
train_attack_types.plot.barh(figsize=(20, 10), fontsize=20)

In [None]:
# Horizontal bar chart of attack-category counts in the training data.
train_attack_cats.plot.barh(figsize=(20, 10), fontsize=30)

In [None]:
# Horizontal bar chart of attack-category counts in the test data.
test_attack_cats.plot.barh(figsize=(20, 10), fontsize=30)

In [None]:
# Summary statistics of the binary feature columns, one row per feature.
DataSetKDDTrain[binary_cols].describe().T

In [None]:
# Number of training rows for each distinct 'su_attempted' value.
DataSetKDDTrain.groupby('su_attempted').size()

In [None]:
# Fold the value 2 of 'su_attempted' into 0 in both splits.
# Assign the result back to the column: calling replace(..., inplace=True) on the
# selected column is a chained in-place update, which pandas warns about and
# which no longer modifies the parent DataFrame under Copy-on-Write.
DataSetKDDTrain['su_attempted'] = DataSetKDDTrain['su_attempted'].replace(2, 0)
DataSetKDDTest['su_attempted'] = DataSetKDDTest['su_attempted'].replace(2, 0)
# Re-display the value counts to confirm the replacement.
DataSetKDDTrain.groupby(['su_attempted']).size()

In [None]:
# Number of training rows for each distinct 'num_outbound_cmds' value.
DataSetKDDTrain.groupby('num_outbound_cmds').size()

In [None]:
# Remove 'num_outbound_cmds' from both splits and from the numeric column list.
# errors='ignore' and the membership guard make the cell safe to re-run; the
# in-place version raised KeyError / ValueError on a second execution.
DataSetKDDTrain = DataSetKDDTrain.drop(columns='num_outbound_cmds', errors='ignore')
DataSetKDDTest = DataSetKDDTest.drop(columns='num_outbound_cmds', errors='ignore')
if 'num_outbound_cmds' in numeric_cols:
    numeric_cols.remove('num_outbound_cmds')

In [None]:
# Targets are the attack categories; features are every column except the two label columns.
train_Y, test_Y = DataSetKDDTrain['attack_category'], DataSetKDDTest['attack_category']
train_x_raw = DataSetKDDTrain.drop(columns=['attack_category', 'attack_type'])
test_x_raw = DataSetKDDTest.drop(columns=['attack_category', 'attack_type'])

In [None]:
# One-hot encode the nominal columns on the stacked train+test frame so both
# splits end up with the same dummy columns, then split back by row position.
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)

n_train_rows = len(train_x_raw)
train_x = combined_df.iloc[:n_train_rows]
test_x = combined_df.iloc[n_train_rows:]

# Store dummy variable feature names, in column order (the previous set
# difference produced them in arbitrary order).
dummy_variables = [col for col in train_x.columns if col not in combined_df_raw.columns]

In [None]:
# Summary statistics of the encoded training features, one row per feature.
train_x.describe().transpose()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the 'duration' feature and summarise the result. The scaled values
# are only described here; train_x itself is not modified.
durations = train_x['duration'].values.reshape(-1, 1)
standard_scaler = StandardScaler()
standard_scaler.fit(durations)
scaled_durations = standard_scaler.transform(durations)
pd.Series(scaled_durations.ravel()).describe()

In [None]:
# Distinct attack categories present in the training labels.
set(train_Y)

In [None]:
# Binary targets: 0 for 'benign', 1 for every attack category.
# Compare by value; the previous `x is 'benign'` tested object identity, which is
# only true when the strings happen to be the same interned object.
train_Y_bin = (train_Y != 'benign').astype(int)
test_Y_bin = (test_Y != 'benign').astype(int)

In [None]:
# Decision-tree baseline on the multi-class attack categories.
# Note from earlier experiments: correcting the class imbalance was tried and did
# not change the result significantly.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score

classifier = DecisionTreeClassifier(random_state=17).fit(train_x, train_Y)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)

print(results, error, accuracy_score(pred_y, test_Y), sep='\n')

In [None]:
from sklearn.metrics import confusion_matrix, zero_one_loss, accuracy_score

In [None]:
from sklearn.tree import export_graphviz
import pydotplus
from PIL import Image
import os

# Render the fitted tree to viz_tree.png.
# export_graphviz expects class_names in the ascending order of the fitted
# classes, so take them from classifier.classes_; the previous hand-written list
# was in a different order and attached the wrong names to the tree nodes.
dot_data = export_graphviz(
    classifier,
    feature_names=list(train_x.columns),
    class_names=[str(label) for label in classifier.classes_],
    rounded=True,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)

graph.write_png("viz_tree.png")

In [None]:
# Linear SVM baseline. Other variants could be added here; the author expects a
# kernelised SVM to work better.
from sklearn.svm import LinearSVC

classifier = LinearSVC().fit(train_x, train_Y)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

In [None]:
# Accuracy of whichever predictions are currently stored in pred_y against the test labels.
print(accuracy_score(pred_y,test_Y))

In [None]:
from sklearn.svm import SVC

# SVC with default kernel settings, capped at 1000 solver iterations, with
# verbose solver output enabled.
model = SVC(verbose=True, max_iter=1000).fit(train_x, train_Y)
pred_y = model.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)

print(results, error, accuracy_score(pred_y, test_Y), sep='\n')

In [None]:
# Cast every feature column to float, then show the container type of test_x.
train_x, test_x = train_x.astype(float), test_x.astype(float)
type(test_x)

In [None]:
# Dimensions of the test feature matrix.
test_x.shape

In [None]:
# Record and display the dimensions of the training features.
shape = np.shape(train_x)
shape

In [None]:
# Convert features and labels of both splits to NumPy arrays.
train_x, train_Y, test_x, test_Y = (
    np.array(data) for data in (train_x, train_Y, test_x, test_Y)
)

In [None]:
# Add a trailing axis of size 1 so each sample has shape (features, 1), matching
# the Conv1D input_shape used by the model. Row and feature counts are taken from
# the arrays instead of the hard-coded 125973 / 22544 / 118.
train_x = train_x.reshape(len(train_x), -1, 1)
test_x = test_x.reshape(len(test_x), -1, 1)
# Both splits must expose the same per-sample shape to the network.
assert train_x.shape[1:] == test_x.shape[1:], "train/test feature counts differ"
test_x.shape

In [None]:
# 1-D CNN for the binary benign-vs-attack task: two convolution blocks with
# batch normalisation, then a small dense head ending in one sigmoid unit.
model = Sequential()
# Only the first layer declares the input shape; the feature count comes from the
# data (train_x.shape[1]) rather than the literal 118.
model.add(Conv1D(filters=64, kernel_size=5, activation='tanh', input_shape=(train_x.shape[1], 1)))
model.add(BatchNormalization())
# input_shape was also passed to this layer before, where it is redundant because
# the layer takes its input from the previous one.
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the RMSprop optimizer; accuracy is tracked as the metric.
model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

In [None]:
# Train on the binary labels for one epoch with batches of 16, holding out 20% of
# the training data for validation.
model.fit(train_x, train_Y_bin, epochs=1, batch_size=16, validation_split=0.2)

In [None]:
# Predicted scores for the test split.
pred_y = model.predict(test_x)

# Test accuracy for every decision threshold from 0 up to (not including) 0.5,
# in steps of 0.005; a score strictly above the threshold counts as class 1.
acc = {
    threshold: accuracy_score(test_Y_bin, (pred_y > threshold).astype(int))
    for threshold in np.arange(0, 0.5, 0.005)
}

In [None]:
# Best accuracy across the swept thresholds. acc is a dict (threshold -> accuracy),
# so take the maximum of its values; np.max(acc) does not reduce over a dict.
max(acc.values())

In [None]:
# Average of the values currently stored in pred_y.
pred_y.mean()

In [None]:
# Peek at the first 100 binary training labels.
train_Y_bin[0:100]

In [None]:
# Number of test samples labelled as attack (1). Vectorized count replaces the
# element-by-element Python loop and its label-based indexing.
a = int((test_Y_bin == 1).sum())
a

In [None]:
# Container type of the binary training labels.
type(train_Y_bin)

In [None]:
# 2-D (samples, features) view of the training array for inspection; the feature
# count is inferred instead of the hard-coded 118.
train_x_view = train_x.reshape(len(train_x), -1)

In [None]:
# Largest value anywhere in the training feature array.
train_x.max()

In [None]:
# Display the 2-D view of the training features.
train_x_view