<a href="https://colab.research.google.com/github/Jeff-Rudolph/anomaly-based-intrusion-detection/blob/main/WideDeepIntrusionDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the dataset files read later in
# this notebook live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')



import numpy as np
import os
import scipy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.ensemble as ske
import pandas
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn import metrics
from tensorflow import keras
import tensorflow as tf
import tensorflow.keras.metrics as tfm





In [None]:
# Ask which copy of the dataset to load: 0 -> the 10% file, 1 -> the full file.
# Any other answer is rejected immediately; otherwise `pdata` would be left
# undefined and the notebook would fail later with a confusing NameError.
file_size = int(input("Would you like to use 10% data file or full file?(0 - %, 1 - Full):"))
if file_size == 0:
  pdata = pandas.read_csv('/content/drive/My Drive/data/kddcup10pct.txt',header=None)
elif file_size == 1:
  pdata = pandas.read_csv('/content/drive/My Drive/data/kddcup_data_corrected.txt',header=None)
else:
  raise ValueError("file_size must be 0 (10% file) or 1 (full file), got %r" % file_size)

In [None]:
feature_names_str = '''duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.'''

# Every declaration ends with '.', so splitting on '.' yields one chunk per
# feature plus a trailing empty chunk after the last declaration.
temp = feature_names_str.split('.')

# Strip the newline and the ": continuous" / ": symbolic" type suffix from
# each chunk, leaving just the bare feature name.
feature_names = [
    chunk.replace("\n", "").replace(": continuous", "").replace(": symbolic", "")
    for chunk in temp
]

feature_names.pop()  # discard the empty string produced by the final '.'
feature_names.append('event')  # the label column comes last

# The CSV was read with header=None, so attach the parsed feature names
# (ending with the 'event' label) as the column names.
pdata.columns = feature_names

In [None]:
# Inspect the raw dataset: its dimensions, then each event label's share of
# the rows expressed in percent.
print(pdata.shape)
print(pdata['event'].value_counts().div(len(pdata)).mul(100))

In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# No `subset` is passed, so every column (including 'event') must match for a
# row to count as a duplicate. inplace=True modifies pdata directly.
pdata.drop_duplicates(keep='first', inplace = True) 

In [None]:
# Re-check the size and the per-label percentages after duplicate removal.
print(pdata.shape)
print(pdata['event'].value_counts().div(len(pdata)).mul(100))


In [None]:
#pdata.var(axis=0,numeric_only=True) 
#num outbound commands variance is 0 can drop this col

In [None]:
# Integer-code the three text-valued categorical columns into compact
# "*_dense" columns, as an alternative to one-hot encoding them (which would
# add many columns). These dense columns must be kept out of normalization.
# The constant subtracted from each code roughly balances the codes around 0;
# the service offset differs between the 10% file and the full file.
protocol_list = pandas.factorize(pdata['protocol_type'])[0]
pdata['protocol_type_dense'] = protocol_list - 1

service_list = pandas.factorize(pdata['service'])[0]
if file_size == 0:
  pdata['service_type_dense'] = service_list - 33
elif file_size == 1:
  pdata['service_type_dense'] = service_list - 35
else:
  # No offset is defined for any other file choice; keep the raw codes.
  pdata['service_type_dense'] = service_list

flag_list = pandas.factorize(pdata['flag'])[0]
pdata['flag_type_dense'] = flag_list - 5

In [None]:
# Separate the model inputs (every column except the label) from the label
# values, which are pulled out as a NumPy array.
x_data = pdata.drop(columns=['event'])

y_data = pdata['event'].to_numpy()

In [None]:
# Report how many records are labelled 'normal.'. The count is computed with a
# single vectorized comparison instead of a Python-level loop over every row.
count = int(np.count_nonzero(y_data == 'normal.'))
balance = len(y_data)-count  # number of records that are not 'normal.'
print("Number of normals:",count)
print("Number of data points:",len(y_data))
print("percent normal:",count/(len(y_data)))

In [None]:
# Choose the label scheme: 0 -> binary (normal vs. hack), anything else keeps
# the original multiclass event labels.
classification = int(input("Would you like to run with binary or multiclass Y values?(0 - binary, 1 - multi):"))
if classification == 0:
  # Collapse every non-'normal.' event into a single 'hack' class.
  # np.where builds a new array, so the array previously obtained from
  # pdata['event'] is no longer overwritten element by element; the object
  # dtype is kept so downstream code sees the same kind of array as before.
  y_data = np.where(y_data == 'normal.', 'normal', 'hack').astype(object)



In [None]:
# num_outbound_cmds has zero variance across the entire file, so it is removed.
x_data = x_data.drop(columns=['num_outbound_cmds'])

In [None]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Randomly shuffle the rows, then hold out 30% of them as the test set.
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.3)

In [None]:
x_normalizer = StandardScaler()

# Work on copies: a plain assignment would only alias x_train / x_test, so the
# scaling below would overwrite the raw splits as well.
x_train_norm = x_train.copy()
x_test_norm = x_test.copy()

# Columns that get standardized (the continuous features, minus the dropped
# num_outbound_cmds).
continous_list = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised',
                  'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'count',
                  'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
                  'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                  'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                  'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


# Fit the scaler on the training rows only, then apply the same scaling to the
# test rows.
x_train_norm[continous_list] = x_normalizer.fit_transform(x_train[continous_list])
x_test_norm[continous_list] = x_normalizer.transform(x_test[continous_list])

# One-hot encode the labels. With handle_unknown='ignore', a label that only
# occurs in the test split is encoded as an all-zero row instead of raising.
# The dense matrix is obtained with .toarray() rather than the `sparse=False`
# keyword, because that keyword was renamed in newer scikit-learn releases;
# .toarray() on the default sparse output works on old and new versions alike.
y_one_hot_encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
y_test = y_test.reshape(-1,1)
y_train = y_train.reshape(-1,1)
y_train = y_one_hot_encoder.fit_transform(y_train).toarray()
y_test = y_one_hot_encoder.transform(y_test).toarray()


In [None]:
# One-hot encode protocol / service / flag with CountVectorizer(binary=True):
# each encoder gets a fixed vocabulary made of the distinct values seen in the
# training split, so a value that appears only in the test split yields an
# all-zero row.
# The vocabularies are sorted because iteration order of a set of strings is
# not stable between Python sessions; sorting makes the one-hot column order
# reproducible. All three encoders use lowercase=False so that values are
# matched against the vocabulary exactly as they appear in the data.
protocol_cat = sorted(x_train['protocol_type'].unique())
prot_OH_encoder = CountVectorizer(vocabulary=protocol_cat, binary=True,lowercase=False) #only 0 or 1 in sparse matrix
protocol_train = prot_OH_encoder.fit_transform(x_train['protocol_type'].values)
protocol_test = prot_OH_encoder.transform(x_test['protocol_type'].values)


service_cat = sorted(x_train['service'].unique()) #do same for service
serv_OH_encoder = CountVectorizer(vocabulary=service_cat, binary=True,lowercase=False)
service_train = serv_OH_encoder.fit_transform(x_train['service'].values)
service_test = serv_OH_encoder.transform(x_test['service'].values)


flag_cat = sorted(x_train['flag'].unique()) #same for flag
flag_OH_encoder = CountVectorizer(vocabulary=flag_cat, binary=True,lowercase=False)
flag_train = flag_OH_encoder.fit_transform(x_train['flag'].values)
flag_test = flag_OH_encoder.transform(x_test['flag'].values)



In [None]:
# Convert the sparse one-hot matrices into regular dense 2-D arrays,
# first for the training split and then for the test split.
protocol_train_cols, service_train_cols, flag_train_cols = (
    sparse_block.toarray()
    for sparse_block in (protocol_train, service_train, flag_train)
)

protocol_test_cols, service_test_cols, flag_test_cols = (
    sparse_block.toarray()
    for sparse_block in (protocol_test, service_test, flag_test)
)

In [None]:
# Wrap each dense one-hot array in a DataFrame whose column names are the
# encoder's vocabulary terms, so they can be joined to the feature frames.
# get_feature_names_out() is used because get_feature_names() has been removed
# from scikit-learn; the label-encoder cells of this notebook already call
# get_feature_names_out().
protocol_colnames = prot_OH_encoder.get_feature_names_out()
train_prot_df = pandas.DataFrame(protocol_train_cols, columns=protocol_colnames) 
test_prot_df = pandas.DataFrame(protocol_test_cols, columns=protocol_colnames)

service_colnames = serv_OH_encoder.get_feature_names_out()
train_serv_df = pandas.DataFrame(service_train_cols, columns=service_colnames)
test_serv_df = pandas.DataFrame(service_test_cols, columns=service_colnames)

flag_colnames = flag_OH_encoder.get_feature_names_out()
train_flag_df = pandas.DataFrame(flag_train_cols, columns=flag_colnames)
test_flag_df = pandas.DataFrame(flag_test_cols, columns=flag_colnames)



In [None]:
# Place the protocol, service and flag indicator columns side by side,
# producing one combined one-hot frame per split.
train_OH_features = pandas.concat(
    [train_prot_df, train_serv_df, train_flag_df],
    axis='columns',
)
test_OH_features = pandas.concat(
    [test_prot_df, test_serv_df, test_flag_df],
    axis='columns',
)

In [None]:
# The one-hot frames were built from bare arrays, so they do not carry the
# shuffled row labels of the split frames. Copy those labels over first so
# that the column-wise concat below lines the rows up correctly.
train_OH_features.index, test_OH_features.index = x_train_norm.index, x_test_norm.index

x_train_norm = pandas.concat([x_train_norm, train_OH_features], axis='columns')
x_test_norm = pandas.concat([x_test_norm, test_OH_features], axis='columns')

In [None]:
# Preliminary wide-input column list: the one-hot columns followed by the
# continuous columns. NOTE: `wide_inputs` is assigned again in the next cell,
# where the symbolic features are appended as well.
wide_inputs = list(train_OH_features.columns.values) + continous_list

In [None]:
# Symbolic features other than the three text-valued ones: every name in
# feature_names that is neither continuous, the dropped num_outbound_cmds,
# the label, nor protocol_type / service / flag.
# The list is built by filtering feature_names rather than by set difference,
# so the feature order is the same on every run (set ordering of strings
# varies between Python sessions).
deep_symbolic_features = [
    name for name in feature_names
    if name not in continous_list
    and name not in ('num_outbound_cmds', 'event', 'flag', 'service', 'protocol_type')
]
wide_symbolic_features = list(deep_symbolic_features)
# The deep model additionally receives the integer-coded categorical columns.
deep_symbolic_features = deep_symbolic_features + ['protocol_type_dense', 'service_type_dense', 'flag_type_dense']


# Deep model: continuous + symbolic + dense category codes.
deep_inputs = continous_list + deep_symbolic_features
# Wide model: one-hot category columns + continuous + symbolic.
wide_inputs = list(train_OH_features.columns.values) + continous_list + wide_symbolic_features


In [None]:
# Wide (linear) half of the model. Binary mode uses two sigmoid outputs with
# binary cross-entropy; multiclass mode uses one softmax output per one-hot
# label column with categorical cross-entropy.
out_size = 2 if classification == 0 else len(y_train[0])
linear_func = 'sigmoid' if classification == 0 else 'softmax'

wide_model = keras.experimental.LinearModel(units=out_size, activation=linear_func, use_bias=False)

# Recall is tracked per class: class 1 then class 0 in binary mode, every
# class index in order in multiclass mode.
wide_model.compile(
    loss=(
        keras.losses.BinaryCrossentropy(from_logits=False)
        if classification == 0
        else keras.losses.CategoricalCrossentropy(from_logits=False)
    ),
    optimizer=keras.optimizers.Adam(),
    metrics=[
        tf.keras.metrics.Recall(class_id=class_index)
        for class_index in ((1, 0) if classification == 0 else range(len(y_train[0])))
    ],
)

In [None]:
callbacks = [
    keras.callbacks.EarlyStopping(
        # Stop training when the monitored quantity, the training `loss`,
        # is no longer improving
        monitor="loss",
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 3 epochs"
        patience=3,
        verbose=1,
    ),
]



In [None]:
###########################################################

In [None]:
# Deep half of the model: a small fully connected network that funnels from
# 50 units down towards the output size. The last hidden layer is narrower in
# binary mode than in multiclass mode.
last_hidden = 10 if classification == 0 else 32
last_func = 'sigmoid' if classification == 0 else 'softmax'

dnn_model = keras.Sequential([
    tf.keras.layers.Dense(50, input_dim=x_train_norm[deep_inputs].shape[1], activation='relu'),
    tf.keras.layers.Dense(last_hidden, activation='relu'),
    # The output layer has out_size units, i.e. the width of the one-hot
    # label matrix built from y_train (2 in binary mode).
    tf.keras.layers.Dense(out_size, activation=last_func),
])


In [None]:
# Compile the deep model with the loss that matches the label scheme and with
# per-class recall metrics (class 1 then class 0 in binary mode, every class
# index in order in multiclass mode).
dnn_model.compile(
    loss=(
        keras.losses.BinaryCrossentropy(from_logits=False)
        if classification == 0
        else keras.losses.CategoricalCrossentropy(from_logits=False)
    ),
    optimizer=keras.optimizers.Adam(),
    metrics=[
        tf.keras.metrics.Recall(class_id=class_index)
        for class_index in ((1, 0) if classification == 0 else range(len(y_train[0])))
    ],
)



In [None]:
# Join the wide and deep halves into a single Wide & Deep model.
# NOTE(review): both sub-models already end in a sigmoid/softmax activation,
# and 'relu' is applied once more to the combined output here. This is
# presumably why predictions above 1 have to be clamped after rounding later
# in the notebook; confirm that this final activation is intended.
combined_model = keras.experimental.WideDeepModel(wide_model, dnn_model, activation='relu')

# Same loss selection as for the sub-models; only accuracy is tracked here.
combined_model.compile(
    loss=(
        keras.losses.BinaryCrossentropy(from_logits=False)
        if classification == 0
        else keras.losses.CategoricalCrossentropy(from_logits=False)
    ),
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)


In [None]:
# Train the combined model. The input is a two-element list: the wide-column
# view first, then the deep-column view of the normalized training frame.
combined_model.fit(
    [x_train_norm[wide_inputs], x_train_norm[deep_inputs]],
    y_train,
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
)

In [None]:
# Predict the label scores for the test split, feeding the wide-column view
# and the deep-column view in the same order used for training.
y_pred = combined_model.predict(
    [x_test_norm[wide_inputs], x_test_norm[deep_inputs]]
)

In [None]:
# Turn the raw model outputs into 0/1 indicators: round every score to the
# nearest integer, then cap anything above 1 at 1.
# This is done with two vectorized NumPy calls instead of nested Python loops
# over every row and column, and it builds a new array so the raw scores in
# `y_pred` are no longer overwritten.
predictions = []
probabilities = np.minimum(np.rint(y_pred), 1.0)

In [None]:
# Locate the one-hot output column that represents normal traffic.
# The encoder names it 'x0_normal.' when the original multiclass labels are
# used and 'x0_normal' after the binary relabelling, so both spellings are
# accepted. norm_pos falls back to 1 if neither name is present.
label = y_one_hot_encoder.get_feature_names_out()
norm_pos = 1
for i, column_name in enumerate(label):
  if column_name in ('x0_normal.', 'x0_normal'):
    norm_pos = i

In [None]:
# Replace the class-name array with the integer column positions of the
# classes, leaving out the normal-traffic column so only the attack classes
# remain. np.arange is used so the result has an integer dtype; writing ints
# into the original name array would keep its object dtype, which is not a
# reliable type for an array of column indices.
label = np.delete(np.arange(len(label)), norm_pos)

In [None]:
# Because of the way this dataset was constructed, recall is the only valid
# metric:
# “Trivial detection using the TTL aside, we found that it was still useful to
#  evaluate the true positive performance of a network IDS; however, any false
#  positive results were meaningless” (Brugger, 2007)

# Macro average: the plain mean of the per-class recall over every class
# except the normal class. Recall is TP / (TP + FN). If every case of a class
# is identified, this is TP / (TP + 0) = 1; if a class does not occur in the
# sample, the metric sets its recall to 0 rather than dividing by zero.
# The data's imbalance will therefore cause 0's to appear and drive down the
# macro average.
print(
    sklearn.metrics.recall_score(
        y_true=y_test,
        y_pred=probabilities,
        labels=label,
        average='macro',
    )
)

#print(sklearn.metrics.accuracy_score(y_test,probabilities,normalize=False))
#raw number of correct predictions
#print(len(y_test))
#total number of events

In [None]:
# Per-class recall paired with the encoder's output column names, collected
# in a dictionary purely for easier reading of the results.
test_keys = y_one_hot_encoder.get_feature_names_out()
test_values = sklearn.metrics.recall_score(y_test,probabilities,average=None,zero_division=0)
res = dict(zip(test_keys, test_values))
print(res)