<a href="https://colab.research.google.com/github/Lasitha-Jayawardana/IDS/blob/main/K_Mean_NSL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import math
from sklearn.cluster import KMeans
# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Loading the Raw Data into Python

In [None]:
# Read the NSL train/test CSV files (the paths assume Google Drive is mounted
# at /content/drive).
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IDS Project/NSL_Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IDS Project/NSL_Test.csv')

# Report (rows, columns) of each raw frame, before any column is removed.
for raw_frame in (train_data, test_data):
    print(raw_frame.shape)

# The 'id' column is removed from both frames.
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])
# train_data.head()  # uncomment to preview the first 5 rows

(125973, 43)
(22544, 43)


# Data Preprocessing

In [None]:
#train_data.info()

In [None]:
# Cast the string-valued columns of both frames to the categorical dtype.
for frame in (train_data, test_data):
    frame[['protocol_type', 'service', 'flag','class']] = frame[['protocol_type', 'service', 'flag','class']].astype('category')

#train_data.info()
def checknull():
  """Print the per-column count of null values, first for train_data, then for test_data."""
  for frame in (train_data, test_data):
    null_counts = frame.isnull().sum()
    print(null_counts)

In [None]:

# Summary of the categorical columns of the training frame
def showCatColumnDetails():
  """Print details about the categorical columns of ``train_data``.

  First the number of distinct values of every categorical column is printed,
  then the ``describe()`` summary of 'protocol_type', 'service' and 'flag',
  each preceded by a banner line.
  """
  for column in train_data.select_dtypes('category'):
    nr_values = len(np.unique(train_data[column]))
    print(f'{column} :{nr_values}')

  banners = (
      ('protocol_type', ".....................protocol_type.........................\n"),
      ('service', "\n\n.....................service.........................\n"),
      ('flag', "\n\n......................flag........................\n"),
  )
  for column, banner in banners:
    print(banner)
    print(train_data[column].describe())

In [None]:
#train_data.columns

In [None]:
# Count plot of every categorical feature and of the target column
def showCatGraph():
  """Draw one count plot per categorical column of ``train_data``.

  Only the 20 most frequent values of each column are drawn, ordered from the
  most to the least frequent.
  """
  for feature in ('protocol_type', 'service', 'flag', 'class'):
      plt.figure(figsize=(12,4)) # 12 inch wide, 4 inch high
      top_values = train_data[feature].value_counts().iloc[:20].index

      sns.countplot(x = feature, data = train_data, palette = 'Set3', order = top_values)
      plt.xticks(rotation=45)
      plt.show()
    

In [None]:
# Encode the target as integers: normal -> 0, anomaly -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second execution maps the already-encoded values to NaN.
class_codes = {'normal': 0, 'anomaly': 1, 0: 0, 1: 1}
train_data['class'] = train_data['class'].map(class_codes)
test_data['class'] = test_data['class'].map(class_codes)

In [None]:
#train_data['service']

In [None]:
def addOtherLabel():
  """Collapse infrequent ``service`` values into the single label 'Other_service'.

  The set of values to collapse is derived once from ``train_data`` (every
  service ranked after the 30 most frequent ones, plus a fixed list of six
  services) and is then applied unchanged to both ``train_data`` and
  ``test_data``. Previously the test frame used its own frequency ranking, so
  the same service could be kept in one frame and collapsed in the other.

  Both frames are modified in place; the column keeps a categorical dtype.
  """
  otherlabel = 'Other_service'
  always_other = ['pop_3', 'sunrpc', 'link', 'name', 'echo', 'netbios_ns']

  # value_counts() sorts by descending frequency, so index[30:] is the tail.
  rare_services = set(train_data['service'].value_counts().index[30:])
  rare_services.update(always_other)

  for frame in (train_data, test_data):
    # Work on plain objects so the new label can be written without first
    # registering it as a category, then convert back to categorical.
    service = frame['service'].astype(object)
    service = service.where(~service.isin(rare_services), otherlabel)
    frame['service'] = service.astype('category')


In [None]:
#addOtherLabel()

In [None]:
def removeDuplicate():
  """Drop fully duplicated rows from ``train_data`` and ``test_data`` in place.

  For each frame the number of duplicated rows is printed before and after the
  drop (the first occurrence of each duplicated row is kept). Afterwards the
  index of both frames is reset and the old index discarded.
  """
  for caption, frame in (('Duplicates train:', train_data),
                         ('Duplicates test:', test_data)):
    print(caption, frame.duplicated().sum())
    frame.drop_duplicates(keep='first', inplace=True)
    print(frame.duplicated().sum())

  for frame in (train_data, test_data):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Drop duplicated rows from both frames; the counts before/after are printed.
removeDuplicate()

Duplicates train: 9
0
Duplicates test: 3
0


In [None]:
# The last column is taken as the label; all columns before it are features.
limit = train_data.shape[1]-1

X_train, Y_train = train_data.iloc[:, :limit], train_data.iloc[:, limit]
X_test, Y_test = test_data.iloc[:, :limit], test_data.iloc[:, limit]

for caption, part in (('Training X   :', X_train),
                      ('Training_Y   :', Y_train),
                      ('Test_X       :', X_test),
                      ('Test_Y       :', Y_test)):
    print(caption, part.shape)

Training X   : (125964, 41)
Training_Y   : (125964,)
Test_X       : (22541, 41)
Test_Y       : (22541,)


In [None]:
def one_hot_encode(X_train,X_test):
    """One-hot encode the categorical feature columns of both feature frames.

    The encoder is fitted on ``X_train`` only and then applied to both frames,
    so both results share the same encoded columns; categories that occur only
    in ``X_test`` are encoded as all zeros (``handle_unknown='ignore'``).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns 'protocol_type', 'flag' and
        'service'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames: the encoded columns first, followed by the
        remaining original columns. The input frames are not modified.
    """
    categorical_cols = ['protocol_type','flag','service']

    ohe = OneHotEncoder(handle_unknown = 'ignore')
    ohe.fit(X_train[categorical_cols])

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on old releases.
    if hasattr(ohe, 'get_feature_names_out'):
        encoded_names = ohe.get_feature_names_out(categorical_cols)
    else:
        encoded_names = ohe.get_feature_names(categorical_cols)

    def _encode(frame):
        # Replace the categorical columns of `frame` by their one-hot columns.
        encoded = pd.DataFrame(ohe.transform(frame[categorical_cols]).toarray(),
                               index=frame.index, columns=encoded_names)
        return pd.concat([encoded, frame.drop(columns=categorical_cols)], axis=1)

    X_train = _encode(X_train)
    print('X_train shape :',X_train.shape)

    X_test = _encode(X_test)
    print('X_test shape :',X_test.shape)

    return X_train, X_test



X_train, X_test = one_hot_encode(X_train,X_test)


X_train shape : (125964, 122)
X_test shape : (22541, 122)


In [None]:
#X_train

In [None]:
def featureScalling(X_train,X_test):
  """Rescale every feature with a MinMaxScaler fitted on the training data.

  The scaler is fitted on ``X_train`` only and then applied to ``X_test``, so
  test values outside the range seen in training can fall outside the scaled
  training range.

  Returns the scaled ``(X_train, X_test)`` as new DataFrames. Column names and
  the row index of each input are preserved, so rows stay aligned with their
  labels even when the index is not the default 0..n-1 range (the index used
  to be discarded).
  """
  scaler = MinMaxScaler()
  X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
  X_test = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns,index=X_test.index)
  return X_train,X_test

In [None]:
# Scale both feature frames with a scaler fitted on the training frame.
X_train,X_test = featureScalling(X_train,X_test)

In [None]:
#X_train.describe()

In [None]:
# Trying dimensionality reduction (PCA) first, then K-means

def showVariance(X, target_ratio=0.98):
  """Fit a full PCA on ``X`` and print how much variance the leading components explain.

  Parameters
  ----------
  X : array-like of shape (n_samples, n_features)
      Data to decompose; as many components as there are features are fitted.
  target_ratio : float, default 0.98
      Fraction of the total variance that is printed as the target value.

  Returns
  -------
  numpy.ndarray
      ``explained_variance_ratio_`` of the fitted PCA, one entry per component.
  """
  n_components = X.shape[1]

  # Only the fitted variances are needed, so the data itself is not projected.
  pca = PCA(n_components=n_components,random_state=456)
  pca.fit(X)

  # The target used to be printed as 98% while being computed with a 0.99
  # factor; the printed percentage and the factor now both come from
  # target_ratio.
  total_variance = sum(pca.explained_variance_)
  print("Total Variance in our dataset is: ", total_variance)
  var_target = total_variance * target_ratio
  print("The {:g}% variance we want to have is: ".format(target_ratio * 100), var_target)
  print("")

  # Cumulative explained variance for a fixed set of component counts; a count
  # larger than the number of components simply reports the total.
  for count in (30, 35, 40, 41, 50, 53, 55, 60, 65, 70, 75, 80):
    print("Variance explain with {} n_compononets: ".format(count),
          sum(pca.explained_variance_[:count]))
  return pca.explained_variance_ratio_
  

In [None]:
# vr holds the explained-variance ratio of every principal component.
vr = showVariance(X_train)

Total Variance in our dataset is:  4.239846183908404
The 98% variance we want to have is:  4.19744772206932

Variance explain with 30 n_compononets:  4.040085672999156
Variance explain with 35 n_compononets:  4.068236611415323
Variance explain with 40 n_compononets:  4.094027677077478
Variance explain with 41 n_compononets:  4.098895644899186
Variance explain with 50 n_compononets:  4.138733744028214
Variance explain with 53 n_compononets:  4.150632271614262
Variance explain with 55 n_compononets:  4.158222551402169
Variance explain with 60 n_compononets:  4.1762126849703645
Variance explain with 65 n_compononets:  4.1925286811815505
Variance explain with 70 n_compononets:  4.206577570035951
Variance explain with 75 n_compononets:  4.218197561367648
Variance explain with 80 n_compononets:  4.2271203280268


In [None]:
# Plotting the Data
def plotVariance(vr,k):
  """Plot the explained-variance ratio per component and mark component ``k``.

  Parameters
  ----------
  vr : sequence of float
      Explained-variance ratio of each principal component.
  k : int
      Position highlighted with a vertical dotted line and an arrow; must be
      a valid index into ``vr``.
  """
  plt.figure(1, figsize=(14, 8))
  plt.plot(vr, linewidth=2, c="r")
  plt.xlabel('n_components')
  plt.ylabel('explained_ratio_')

  # Plotting line with 98% e.v.
  plt.axvline(k,linestyle=':', label='n_components - 98% explained', c ="blue")
  plt.legend(prop=dict(size=12))

  # The annotation used to say "55 eigenvectors" whatever k was, and read
  # vr[10] even when vr had fewer than 11 entries; both now follow the inputs.
  text_height = vr[min(10, len(vr) - 1)]
  plt.annotate('{} eigenvectors used to explain 98% variance'.format(k), xy=(k, vr[k]),
              xytext=(58, text_height),
              arrowprops=dict(facecolor='blue', shrink=0.05))

  plt.show()

In [None]:
# Number of principal components kept by runPCA.
kPCA=5
#plotVariance(vr,kPCA)

In [None]:
def runPCA(X_train,X_test):
  """Project both feature sets onto the first ``kPCA`` principal components.

  The PCA is fitted on ``X_train`` only; the same fitted projection is then
  applied to ``X_test``. Returns ``(projected_train, projected_test)``.
  """
  reducer = PCA(n_components=kPCA, random_state = 453)
  reducer.fit(X_train)
  return reducer.transform(X_train), reducer.transform(X_test)

In [None]:
# PCA-reduced versions of the scaled train and test features.
X_rtrain,X_rtest = runPCA(X_train,X_test)

# K-means Clustering

In [None]:
# Running K-means with multiple seeds
no_of_clusters = 2
def runKmeans(X_train):
  """Fit K-means once per candidate seed and return the seed with the lowest inertia.

  Every fit uses ``no_of_clusters`` clusters and k-means++ initialisation. The
  inertia of each fit is printed, followed by the lowest inertia and its seed.

  Parameters
  ----------
  X_train : array-like of shape (n_samples, n_features)
      Data to cluster.

  Returns
  -------
  int
      The first seed in the candidate list that reached the lowest inertia.
  """
  seeds = [0,9500,10000,10500,11000,15000,18000 , 20000, 40000, 60000, 80000,90000, 120000]

  best_seed = None
  # Start from +inf rather than 0.0 so that a genuine inertia of 0 is not
  # mistaken for "no value recorded yet".
  min_inertia = float('inf')

  for seed in seeds:
    print('\n seed= {} \n'.format(seed))

    kmeans = KMeans(n_clusters=no_of_clusters, init='k-means++',random_state=seed)
    kmeans = kmeans.fit(X_train)

    inertia = kmeans.inertia_
    # Report the cluster count actually used instead of a hard-coded "2".
    print("The innertia for : {} Clusters is:".format(no_of_clusters), inertia)

    # Keep the seed whose fit has the lowest inertia seen so far; the strict
    # comparison keeps the earliest seed on ties.
    if inertia < min_inertia:
      min_inertia = inertia
      best_seed = seed

  # "\n" (newline) was previously written as "/n" and printed literally.
  print("\nMin inertia : ", min_inertia)
  print("Best Seed : ",best_seed)
  return best_seed
 

In [None]:
# Seed search on the PCA-reduced training data (the commented line runs it on
# the full scaled features instead).
#best_seed = runKmeans(X_train)
best_seed = runKmeans(X_rtrain)


 seed= 0 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 9500 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 10000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 10500 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 11000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 15000 

The innertia for : 2 Clusters is: 214201.7513953969

 seed= 18000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 20000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 40000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 60000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 80000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 90000 

The innertia for : 2 Clusters is: 214201.7514104742

 seed= 120000 

The innertia for : 2 Clusters is: 214201.7514104742
/nMin inertia :  214201.7513953969
Best Seed :  15000


In [None]:
# Choose the data that is clustered below: the PCA-reduced features (the
# commented lines select the full scaled features instead).
#X_trainN = X_train
#X_testN = X_test
X_trainN = X_rtrain
X_testN = X_rtest

In [None]:
# Shape of the training data that will be clustered.
X_trainN.shape

(125964, 5)

In [None]:
# Running K-means on no_of_clusters clusters, seeded with the seed that
# runKmeans found to give the lowest inertia (random_state used to be
# hard-coded to 0, which ignored the result of that search).
kmeans = KMeans(n_clusters=no_of_clusters,init='k-means++', random_state=best_seed)
kmeans = kmeans.fit(X_trainN)

In [None]:
# Both possible cluster-id -> class assignments are kept: the *1 variables use
# the cluster ids as they are, the *0 variables use the flipped ids (1 - id).
label1 = kmeans.labels_
label0 = 1 - label1
test_label1 = kmeans.predict(X_testN)
test_label0 = 1 - test_label1


In [None]:
def accuracyScore():
  """Print the accuracy (in percent) of both label assignments on train and test.

  For each data set the flipped assignment (label0 / test_label0) is reported
  first, then the unflipped one (label1 / test_label1).
  """
  comparisons = (
      ("Accuracy Train: ", Y_train, label0),
      ("Accuracy Train: ", Y_train, label1),
      ("Accuracy Test: ", Y_test, test_label0),
      ("Accuracy Test: ", Y_test, test_label1),
  )
  for caption, truth, assigned in comparisons:
    print(caption, accuracy_score(truth, assigned)*100)

In [None]:
# Print train/test accuracy for both cluster-id -> class assignments.
accuracyScore()

Accuracy Train:  18.952240322631862
Accuracy Train:  81.04775967736813
Accuracy Test:  47.06978394924804
Accuracy Test:  52.930216050751966


## Temp


In [None]:
# NOTE(review): the code below is disabled by being wrapped in a string literal.
# It is the only place in this notebook where `seeds`, `min_inertia`, `X_value`
# and a range-valued `no_of_clusters` are assigned, and later cells read those
# names.
'''# Running K means with multible Ks
best_seed = None
final_centroids = None
final_cluster_assignment = None

min_inertia = []
X_value = X_train
seeds = [0, 20000, 40000, 60000, 80000, 100000, 120000]
for seed in seeds:
  inertia = []
  no_of_clusters = range(2,15)
  print('\n\n seed= {} \n'.format(seed))
  for f in no_of_clusters:
      kmeans = KMeans(n_clusters=f, init='k-means++',random_state=seed)
      kmeans = kmeans.fit(X_value)
      
      u = kmeans.inertia_
      inertia.append(u)
      print("The innertia for :", f, "Clusters is:", u) 
      # if current measurement of heterogeneity is lower than previously seen,
      # update the minimum record of heterogeneity.
  if len(min_inertia) == 0 :
      min_inertia = inertia
      
  if min(inertia,default=0) < min(min_inertia,default=0):
      min_inertia = inertia
      best_seed = seed
      final_centroids = kmeans.cluster_centers_
      final_cluster_assignment = kmeans.labels_

      '''

In [None]:
# Creating the scree plot for inertia - elbow method
# NOTE(review): `seeds` and `min_inertia` are only assigned inside the disabled
# triple-quoted block above, and `no_of_clusters` is the integer 2 unless that
# block has been executed, so this cell cannot run on a fresh kernel.
# NOTE(review): the x positions are derived from len(seeds) while the tick
# labels come from no_of_clusters; confirm which quantity the x axis is meant
# to show (the axis label and the title mention seeds).
fig, (ax1) = plt.subplots(1, figsize=(16,6))
xx = np.arange(len(seeds))
ax1.plot(xx, min_inertia,linewidth=4)
ax1.set_xticks(xx)
ax1.set_xticklabels(no_of_clusters, rotation='vertical')
plt.xlabel('Seed value')
plt.ylabel('Inertia Score')
plt.title('Inertia Plot per 2 cluster for Seed ')

In [None]:
# Inertia curve together with the straight chord joining its two end points.
fig, ax1 = plt.subplots(1, figsize=(16,6))
ax1.plot(no_of_clusters, min_inertia, linewidth=4)
chord_x = [no_of_clusters[0], no_of_clusters[-1]]
chord_y = [min_inertia[0], min_inertia[-1]]
ax1.plot(chord_x, chord_y, 'ro-', linewidth=4)
plt.show()



In [None]:
# Perpendicular distance between a point and a line in 2-D
def calc_distance(x1, y1, a, b, c):
  """Return the distance from the point (x1, y1) to the line a*x + b*y + c = 0."""
  numerator = abs(a * x1 + b * y1 + c)
  denominator = math.sqrt(a * a + b * b)
  return numerator / denominator

In [None]:
# Coefficients of the straight line a*x + b*y + c = 0 through the first and
# the last point of the inertia curve, (no_of_clusters[0], inertia[0]) and
# (no_of_clusters[-1], inertia[-1]).
# NOTE(review): before this cell, `inertia` and an indexable `no_of_clusters`
# are only assigned inside the disabled triple-quoted block under "Temp".
a = inertia[0] - inertia[-1]
b = no_of_clusters[-1] - no_of_clusters[0]
c1 = no_of_clusters[0] * inertia[-1]
c2 = no_of_clusters[-1] * inertia[0]
c = c1 - c2

In [None]:
# r is the last cluster count minus one; the next cell uses it as the number
# of points on the inertia curve.
# NOTE(review): this equals len(no_of_clusters) only when no_of_clusters is a
# contiguous range starting at 2 - confirm.
r = no_of_clusters[-1]-1
r

In [None]:
# Distance of every (cluster count, inertia) point from the chord described by
# a, b and c. The points are paired directly instead of being counted with
# `r`, which matches the number of points only when the cluster counts are a
# contiguous range starting at 2.
distance_of_points_from_line = [
    calc_distance(n_k, inertia_k, a, b, c)
    for n_k, inertia_k in zip(no_of_clusters, inertia)
]

plt.plot(no_of_clusters, distance_of_points_from_line)

In [None]:
# K is the cluster count whose point lies farthest from the chord (the first
# such count when several are tied).
peak_distance = max(distance_of_points_from_line)
K = no_of_clusters[distance_of_points_from_line.index(peak_distance)]
print(f"Best K value : {K}")

In [None]:
# Running K-means on K clusters, seeded with best_seed
# NOTE(review): `X_value` is only assigned inside the disabled triple-quoted
# block under "Temp".

kmeans = KMeans(n_clusters=K,init='k-means++', random_state=best_seed)
kmeans = kmeans.fit(X_value)
# Bare expression in the middle of a cell: the value is neither stored nor
# displayed.
kmeans.inertia_

# Cluster id assigned to every row of the data the model was fitted on.
predictions = kmeans.predict(X_value)




In [None]:
# Number of training rows assigned to each cluster (position i = cluster i)
#unique, counts = np.unique(kmeans.labels_, return_counts=True)

print(np.bincount(kmeans.labels_))


In [None]:
# Running PCA to Visualize the data

X = X_value

y_num = predictions

# Two components are enough for a scatter plot.
pca = PCA(n_components=2, random_state = 453)
X_r = pca.fit(X).transform(X)

# Name and draw only the clusters that actually occur in the predictions. The
# previous fixed list of eight ids silently skipped clusters 8 and above and
# added empty legend entries when fewer than eight clusters existed.
cluster_ids = np.unique(y_num)
target_names = ["Cluster {}".format(i) for i in cluster_ids]

# Percentage of variance explained for each components
print('Explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

# Plotting the data in a single figure (an extra bare plt.figure() call used
# to create an additional, empty figure).
plt.figure(figsize=(12,8))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'black','blue','green','yellow']
lw = 2


for position, (i, target_name) in enumerate(zip(cluster_ids, target_names)):
    # Colours repeat when there are more clusters than palette entries.
    color = colors[position % len(colors)]
    plt.scatter(X_r[y_num == i, 0], X_r[y_num == i, 1], color=color, alpha=.8, lw=lw,label=target_name)

# Legend placed outside the axes; an earlier plt.legend(loc='best') call was
# replaced by this one anyway.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.6)
plt.title('PCA of 2 Items')
plt.show()

In [None]:
# Running PCA again, this time keeping 40 components

pca = PCA(n_components=40, random_state = 453)
X_r = pca.fit(X).transform(X)

inertia = []

# Running K-means on the reduced data once per candidate cluster count and
# recording the inertia of each fit

for f in no_of_clusters:
    kmeans = KMeans(n_clusters=f, random_state=best_seed)
    kmeans = kmeans.fit(X_r)
    u = kmeans.inertia_
    inertia.append(u)
    print("The innertia for :", f, "Clusters is:", u)

# Creating the scree plot for inertia - elbow method
fig, (ax1) = plt.subplots(1, figsize=(16,6))
xx = np.arange(len(no_of_clusters))
ax1.plot(xx, inertia)
ax1.set_xticks(xx)
ax1.set_xticklabels(no_of_clusters, rotation='vertical')
# The x axis shows the number of clusters (see the tick labels), not the PCA
# n_components it used to be labelled with.
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia Score')
plt.title("Inertia Plot per k")
plt.show()

In [None]:
# Inertia curve together with the straight chord joining its two end points.
fig, ax1 = plt.subplots(1, figsize=(16,6))
ax1.plot(no_of_clusters, inertia, linewidth=4)
chord_x = [no_of_clusters[0], no_of_clusters[-1]]
chord_y = [inertia[0], inertia[-1]]
ax1.plot(chord_x, chord_y, 'ro-', linewidth=4)
plt.show()


In [None]:
# Perpendicular distance between a point and a line in 2-D
# (same definition as the calc_distance cell earlier in this notebook)
def calc_distance(x1, y1, a, b, c):
  """Return the distance from the point (x1, y1) to the line a*x + b*y + c = 0."""
  offset = abs(a * x1 + b * y1 + c)
  norm = math.sqrt(a * a + b * b)
  return offset / norm

In [None]:
# Coefficients of the straight line a*x + b*y + c = 0 through the first and
# the last point of the inertia curve, (no_of_clusters[0], inertia[0]) and
# (no_of_clusters[-1], inertia[-1]).
a = inertia[0] - inertia[-1]
b = no_of_clusters[-1] - no_of_clusters[0]
c1 = no_of_clusters[0] * inertia[-1]
c2 = no_of_clusters[-1] * inertia[0]
c = c1 - c2

In [None]:
# r is the last cluster count minus one; the next cell uses it as the number
# of points on the inertia curve.
# NOTE(review): this equals len(no_of_clusters) only when no_of_clusters is a
# contiguous range starting at 2 - confirm.
r = no_of_clusters[-1]-1
r

In [None]:
# Distance of every (cluster count, inertia) point from the chord described by
# a, b and c. The points are paired directly instead of being counted with
# `r`, which matches the number of points only when the cluster counts are a
# contiguous range starting at 2.
distance_of_points_from_line = [
    calc_distance(n_k, inertia_k, a, b, c)
    for n_k, inertia_k in zip(no_of_clusters, inertia)
]

plt.plot(no_of_clusters, distance_of_points_from_line)

In [None]:
# K is the cluster count whose point lies farthest from the chord (the first
# such count when several are tied).
peak_distance = max(distance_of_points_from_line)
K = no_of_clusters[distance_of_points_from_line.index(peak_distance)]
print(f"Best K value : {K}")

In [None]:
# X_r is the PCA projection computed in an earlier cell; no PCA is run here.
# NOTE(review): the original note said "36 PC", but the visible PCA cell uses
# n_components=40 - confirm which is intended.

# Running K-means with a fixed 5 clusters (the K computed above is not used)
kmeans = KMeans(n_clusters=5, random_state=best_seed)
kmeans = kmeans.fit(X_r)

print("Optimized K-mean innertia for cluster 5 :", kmeans.inertia_)

In [None]:
# Running PCA to Visualize the data

# Project the data down to two dimensions for plotting. The projection gets
# its own name so that X_r keeps the data the current K-means model was fitted
# on and this cell can be re-run (X_r used to be overwritten by its own 2-D
# projection).
pca = PCA(n_components=2, random_state = 453)
X_2d = pca.fit(X_r).transform(X_r)

# Colour the points by the assignments of the K-means model fitted on X_r in
# the previous cell. The plot used to read y_num, which holds the predictions
# of an earlier model fitted on X_value.
cluster_labels = kmeans.labels_

target_names = ["Cluster 0","Cluster 1","Cluster 2", "Cluster 3","Cluster 4"]

# Percentage of variance explained for each components
print('Explained variance ratio (PCA 2 components): %s' % str(sum(pca.explained_variance_ratio_)))

# Plotting the data in a single figure (an extra bare plt.figure() call used
# to create an additional, empty figure).
plt.figure(figsize=(12,8))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'black']
lw = 2


for color, i, target_name in zip(colors, [0, 1, 2, 3, 4], target_names):
    in_cluster = cluster_labels == i
    plt.scatter(X_2d[in_cluster, 0], X_2d[in_cluster, 1], color=color, alpha=.8, lw=lw,label=target_name)

# Legend placed outside the axes; an earlier plt.legend(loc='best') call was
# replaced by this one anyway.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.6)
plt.title('PCA of 2 Items')
plt.show()