<a href="https://colab.research.google.com/github/Jeff-Rudolph/anomaly-based-intrusion-detection/blob/main/SupportVectorMachineIntrusionDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data files
# read later in this notebook live under '/content/drive/My Drive/data/'.
from google.colab import drive
drive.mount('/content/drive')


import numpy as np
import os
import scipy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.ensemble as ske
import pandas
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn import svm


In [None]:
# Ask which data file to load: 0 -> the 10% file, 1 -> the full file.
data_files = {
  0: '/content/drive/My Drive/data/kddcup10pct.txt',
  1: '/content/drive/My Drive/data/kddcup_data_corrected.txt',
}
file_size = int(input("Would you like to use 10% data file or full file?(0 - %, 1 - Full):"))
if file_size not in data_files:
  # Any other answer used to leave `pdata` undefined, which only surfaced
  # later as a NameError; fail right here with a clear message instead.
  raise ValueError("file_size must be 0 (10% file) or 1 (full file), got %r" % file_size)

# header=None: the first line of the file is data, not column names.
pdata = pandas.read_csv(data_files[file_size], header=None)

In [None]:
# Column declarations, one "<name>: <type>." entry per line.
feature_names_str = '''duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.'''

# Split on the terminating '.', then strip the newline and the ": <type>"
# suffix from every piece so that only the bare name is left. The piece after
# the final '.' is an empty string; the [:-1] slice discards it.
feature_names = [
  piece.replace("\n", "").replace(": continuous", "").replace(": symbolic", "")
  for piece in feature_names_str.split('.')
][:-1]

# 'event' is added as the name of the final column.
feature_names.append('event')

# Name the columns of the header-less data frame with the parsed feature names
# (the last entry of the list is 'event').
pdata.columns = feature_names

In [None]:
# First look at the raw data: its (rows, columns) shape, followed by the share
# of rows carried by each 'event' label, in percent.
print(pdata.shape)
event_counts = pdata['event'].value_counts()
print(event_counts / len(pdata) * 100)

In [None]:
# Remove repeated rows from pdata itself (inplace=True), keeping the first
# occurrence of each. subset=None is the default and compares every column,
# so a row is dropped only when it matches another row in all fields,
# including 'event'.
pdata.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
# Same summary as before, now on the de-duplicated data: shape first, then the
# percentage of the remaining rows that each 'event' label accounts for.
n_rows = len(pdata)
print(pdata.shape)
print(pdata['event'].value_counts() / n_rows * 100)

In [None]:
#pdata.var(axis=0,numeric_only=True)
#The variance of num_outbound_cmds is 0, so that column carries no information and can be dropped.

In [None]:
# Choose how the categorical columns (protocol_type, service, flag) are encoded
# further down: 0 -> integer codes (low dimensional), 1 -> one-hot columns
# (high dimensional).
dimensionality = int(input("Run the model with low or high dimensions?(0-low, 1-high):"))
if dimensionality not in (0, 1):
  # Any other value would skip both encodings and leave text columns in the
  # feature matrix, which only fails much later when the model is fitted.
  raise ValueError("dimensionality must be 0 (low) or 1 (high), got %r" % dimensionality)

In [None]:
def _centered_codes(column):
  """Return integer codes for *column*, shifted so they are spread around 0.

  pandas.factorize numbers the distinct values 0, 1, 2, ... in order of first
  appearance; the codes are then shifted down by half the number of distinct
  values.
  """
  codes, uniques = pandas.factorize(column)
  return codes - len(uniques) // 2


if dimensionality == 0:
  # Replace each categorical text column by integer codes instead of one-hot
  # columns, to avoid the curse-of-dimensionality problems caused by one-hot
  # encoding. These columns must not be normalized afterwards.
  #
  # The shift that balances the codes around 0 is derived from the number of
  # distinct values in the column, so it no longer has to be hard-coded per
  # data file. For columns with 3, 66/70 and 11 distinct values it equals the
  # fixed offsets used before (1, 33/35 and 5).
  # NOTE(review): those category counts are assumed for the 10%/full files --
  # confirm against the actual data.
  for column_name in ('protocol_type', 'service', 'flag'):
    pdata[column_name] = _centered_codes(pdata[column_name])

In [None]:
# Separate the target from the inputs: the 'event' column's underlying array
# becomes the label vector, every other column stays in the feature frame.
y_data = pdata['event'].values
x_data = pdata.drop(columns=['event'])

In [None]:
# Report how much of the data carries the 'normal.' label.
total = len(y_data)
count = sum(1 for label in y_data if label == 'normal.')
balance = total - count  # number of rows with any other label
print("Number of normals:", count)
print("Number of data points:", total)
# The value is printed under the label "percent", so scale the fraction by 100
# (it used to print the raw 0..1 ratio). An empty label vector reports 0.0
# instead of dividing by zero.
print("percent normal:", (count / total * 100) if total else 0.0)

In [None]:

classification = int(input("Would you like to run with binary or multiclass Y values?(0 - binary, 1 - multi):"))
if classification == 0:
  # Convert to a binary problem: 'normal.' becomes 'normal', every other label
  # becomes 'hack'.
  #
  # A new array is built instead of writing into y_data element by element:
  # y_data comes from pdata['event'].values, so in-place writes can alter
  # pdata as well, and such an array may be read-only when pandas
  # Copy-on-Write is active.
  labels = np.asarray(y_data, dtype=object)
  # Also accept the already-converted 'normal' so that re-running this cell
  # does not turn every label into 'hack'.
  is_normal = (labels == 'normal.') | (labels == 'normal')
  y_data = np.where(is_normal, 'normal', 'hack')
  



In [None]:
# 'num_outbound_cmds' has zero variance over the entire file, so it is removed
# from the feature frame.
x_data = x_data.drop(columns=['num_outbound_cmds'])

In [None]:
# Import the splitter explicitly: `import sklearn` on its own does not promise
# that the `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Randomly shuffle the rows, then hold out 30% of them for testing.
# NOTE(review): no random_state is passed, so every run yields a different
# split; pass one if the results need to be reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

In [None]:
def _one_hot_encode(train_values, test_values):
  """One-hot encode a categorical column using the categories seen in training.

  Args:
    train_values: category strings of the column in the training split.
    test_values: category strings of the same column in the test split.

  Returns:
    (categories, encoder, train_matrix, test_matrix). The matrices are sparse
    0/1 matrices with one column per entry of ``categories``; a test value
    that is not in ``categories`` yields an all-zero row.
  """
  # A set removes the duplicates; sorting makes the column order identical on
  # every run (the iteration order of a set of strings is not).
  categories = sorted(set(train_values))
  encoder = CountVectorizer(
    vocabulary=categories,
    binary=True,           # only 0 or 1 in the sparse matrix
    lowercase=False,       # match category names exactly, for all columns
    token_pattern=r'\S+',  # the whole value is one token, even if it is a
                           # single character or contains punctuation
  )
  train_matrix = encoder.fit_transform(train_values)
  test_matrix = encoder.transform(test_values)
  return categories, encoder, train_matrix, test_matrix


if dimensionality == 1:
  # Encode each categorical column separately; the vocabulary always comes
  # from the training split only.
  protocol_cat, prot_OH_encoder, protocol_train, protocol_test = _one_hot_encode(
    x_train['protocol_type'].values, x_test['protocol_type'].values)

  service_cat, serv_OH_encoder, service_train, service_test = _one_hot_encode(
    x_train['service'].values, x_test['service'].values)

  flag_cat, flag_OH_encoder, flag_train, flag_test = _one_hot_encode(
    x_train['flag'].values, x_test['flag'].values)

  # The text columns are now represented by the sparse matrices above, so
  # remove them from the feature frames.
  categorical_columns = ['protocol_type', 'flag', 'service']
  x_train = x_train.drop(labels=categorical_columns, axis='columns')
  x_test = x_test.drop(labels=categorical_columns, axis='columns')

In [None]:
x_normalizer = StandardScaler()

# Work on copies. Plain assignment would only alias x_train / x_test, so the
# scaled values would overwrite the inputs and re-running this cell would
# scale already-scaled data.
x_train_norm = x_train.copy()
x_test_norm = x_test.copy()

# Only the numerical continuous columns get the Z-score (StandardScaler)
# treatment; every column not listed here is left untouched.
continous_list = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised',
                  'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'count',
                  'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
                  'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                  'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                  'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

# The scaler's statistics are fitted on the training split only and then
# reused unchanged for the test split.
x_train_norm[continous_list] = x_normalizer.fit_transform(x_train[continous_list])
x_test_norm[continous_list] = x_normalizer.transform(x_test[continous_list])


In [None]:
if dimensionality == 1:
  # Append the one-hot matrices to the right of the normalized columns, in the
  # order flag, service, protocol, identically for both splits.
  train_blocks = (x_train_norm, flag_train, service_train, protocol_train)
  test_blocks = (x_test_norm, flag_test, service_test, protocol_test)
  x_train_norm = hstack(train_blocks)
  x_test_norm = hstack(test_blocks)





In [None]:
###########################################################

In [None]:
# Support vector classifier configured with the one-vs-one ('ovo') shape for
# its decision function.
svc_options = {'decision_function_shape': 'ovo'}
classifier = svm.SVC(**svc_options)
# Fit the model on the normalized training features and their labels.
classifier.fit(x_train_norm, y_train)

In [None]:
# Predict a label for every row of the normalized test features with the
# fitted classifier.
y_pred = classifier.predict(x_test_norm)

In [None]:
# Import the metric explicitly instead of relying on `sklearn.metrics` having
# been loaded as a side effect of other imports.
from sklearn.metrics import recall_score

# Due to the way this dataset was constructed, recall is the only valid metric:
# “Trivial detection using the TTL aside, we found that it was still useful to
#  evaluate the true positive performance of a network IDS; however, any false
#  positive results were meaningless” (Brugger, 2007)

# Macro recall: the plain average of the per-class recall over every class
# that occurs in y_test or y_pred, the normal class included. Per-class recall
# is TP / (TP + FN); a class without any true sample would be 0/0, which
# zero_division=0 scores as 0 without emitting a warning. Because the data is
# imbalanced, such zeros can appear and pull the macro average down.
print(recall_score(y_test, y_pred, average='macro', zero_division=0))

# Macro recall over the attack classes only. The labels are selected by name,
# so no fixed index of the normal class in the per-class output is assumed.
# Only classes that actually occur in y_test are averaged, since recall is
# undefined for a class with no true samples.
attack_labels = [label for label in sorted(set(y_test))
                 if label not in ('normal', 'normal.')]
if attack_labels:
  print("macro recall (attack classes only):",
        recall_score(y_test, y_pred, labels=attack_labels,
                     average='macro', zero_division=0))

# For raw counts instead of rates, compare
# sklearn.metrics.accuracy_score(y_test, y_pred, normalize=False)
# (number of correct predictions) with len(y_test) (total number of events).

In [None]:
# Per-class recall: average=None returns one score per label instead of a
# single mean. zero_division=0 makes a class whose recall would be 0/0 score 0
# without raising a warning.
sklearn.metrics.recall_score(y_test,y_pred,average=None,zero_division=0)
