# Loading and preprocessing the dataset

In [None]:
import pandas as pd
import gzip
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import euclidean, cdist

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
dir = "/content/gdrive/My Drive/Network Anomaly Dataset"

Mounted at /content/gdrive


In [None]:

fileName = dir + '/kddcup.data.gz'

names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(fileName, compression='gzip', names = names)

trueLabels = df['label'].values
#df = df.drop('label', axis=1)


In [None]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0


In [None]:
df.shape

(4898431, 41)

In [None]:

#df.describe()
# df.head()

In [None]:
type(trueLabels)
np.unique(trueLabels)

array(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.',
       'imap.', 'ipsweep.', 'land.', 'loadmodule.', 'multihop.',
       'neptune.', 'nmap.', 'normal.', 'perl.', 'phf.', 'pod.',
       'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.',
       'warezclient.', 'warezmaster.'], dtype=object)

In [None]:
# data_cut.info()

In [None]:
# Select the categorical features to be encoded
cat_features = ['protocol_type', 'service', 'flag']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Apply the LabelEncoder to the categorical features
for feature in cat_features:
    df[feature] = encoder.fit_transform(df[feature])

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,24,9,215,45076,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,24,9,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,1,24,9,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,0,1,24,9,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0
4,0,1,24,9,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0


In [None]:
# data = df.to_numpy()
# del df
# import gc
# gc.collect()

In [None]:
def conditional_entropy(x1, y1):
    # Calculate the joint probability distribution
    joint_prob = np.zeros((len(np.unique(x1)), len(np.unique(y1))))
    for i, x in enumerate(np.unique(x1)):
        for j, y in enumerate(np.unique(y1)):
            joint_prob[i, j] = np.sum((x1 == x) & (y1 == y)) / len(x1)

    # Calculate the marginal probability distribution of X
    marginal_prob_X = np.sum(joint_prob, axis=1)

    # Calculate the conditional probability distribution of Y given X
    conditional_prob_Y_given_X = joint_prob / marginal_prob_X[:, None]

    # Calculate the conditional entropy of Y given X
    conditional_entropy_Y_given_X = 0
    for i in range(len(marginal_prob_X)):
        p_x = marginal_prob_X[i]
        for j in range(len(np.unique(y1))):
            p_y_given_x = conditional_prob_Y_given_X[i, j]
            if p_y_given_x != 0:
                conditional_entropy_Y_given_X -= p_x * p_y_given_x * np.log2(p_y_given_x)

    # Convert the entropy to bits
    conditional_entropy_Y_given_X_bits = conditional_entropy_Y_given_X / np.log2(np.e)

    print(f"Conditional entropy = {conditional_entropy_Y_given_X_bits:.2f} bits.")

# Kmeans

# How Kmeans work ?

### kmeans work by assigning random centroids to the data then loop over the samples and assign each sample to the nearest centroid then update the centroid by taking the average of the new clusters then loop again over the samples and assign the samples to the nearest centroid 
### The algorithm repeats this scenario until no update in the clusters or the update in the centroid is smaller then a small value (epsilon) or for specified numbers of iterations

In [None]:
data = df.to_numpy()
del df
import gc
gc.collect()

44

In [None]:
class myKmeans:
  
    def __init__(self, n_clusters, numOfFeatures, threshold = 0.01, maxIterations = 50):
        self.k = n_clusters
        self.numOfFeatures = numOfFeatures
        self.threshold = threshold
        self.maxIterations = maxIterations

    def randomInitialization(self, data):
      rows = np.random.choice(data.shape[0], self.k)
      self.centroidsRows = rows
      return data[rows]

    def nearestCentroid(self, centroids, data):
      # assign each data point to the nearest centroid
      distances = cdist(data, centroids, 'euclidean')
      labels = np.argmin(distances, axis=1)
      return labels   

    def updateCentroids(self, labels, data):
      #numOfFeatures=1
      print(labels)
      uniqueLabels = np.unique(labels)
      for label in range(self.k):
        if label not in uniqueLabels:
          row = self.centroidsRows[label]
          print(row)
          labels[row] = label

          
      sum = np.zeros((self.k, self.numOfFeatures))
      for label in range(self.k):
        indices = labels == label
        count = np.count_nonzero(indices)
        print(count)
        sum[label] = np.sum(data[indices], axis = 0) / count
      return sum

    def converge(self, prev, current):
      sub = current - prev
      val = np.sum(np.linalg.norm(sub, axis = 1) ** 2)
      if val < self.threshold:
        return True
      return False

    def fit(self, data):
      i = 0
      centroids = self.randomInitialization(data)
      while True:
        prevCentroids = centroids
        labels = self.nearestCentroid(centroids, data)
        centroids = self.updateCentroids(labels, data)
        if self.converge(prevCentroids, centroids) or i == self.maxIterations :
          break
        i += 1
        print(i)
      self.centroids = centroids
      self.labels = labels
      # return centroids, labels

    def predict(self, test):
        self.labels =  self.nearestCentroid(self.centroids, test)
        return self.labels
    
    def get_labels(self):
        return self.labels
    
    def get_cluster_centers(self):
        return self.centroids

In [None]:
def process_labels(labels, trueLabels, k) :
  processedLabels = np.empty(len(labels))#, dtype='U10')
  for label in range(k):
    indices = labels == label
    trueLabelsInCluster = trueLabels[indices]
    #print(trueLabelsInCluster)
    unique, counts = np.unique(trueLabelsInCluster, return_counts=True)
    if len(counts) != 0:
      # Find the index of the maximum count
      max_index = np.argmax(counts)

      # Get the most frequent string
      most_frequent = unique[max_index]

      print("most frequent is :", most_frequent)
      processedLabels[indices] = most_frequent

  return processedLabels

In [None]:
k = 2
numOfFeatures = 41
mykmeans = myKmeans(k, numOfFeatures, 0.1, 30)
mykmeans.fit(data)
# predLabels = mykmeans.predict(test)

[0 0 0 ... 0 0 0]
1979326
2919105
1
[0 0 0 ... 0 0 0]
1977953
2920478
2
[0 0 0 ... 0 0 0]
1973278
2925153
3
[0 0 0 ... 0 0 0]
1972696
2925735
4
[0 0 0 ... 0 0 0]
1972646
2925785


In [None]:
predLabels = mykmeans.get_labels()
print(len(predLabels))
print(np.unique(predLabels))

4898431
[0 1]


Number of samples in each class when 
K = 31


[ 1813   523 13731    67   246  2096 26518    10  6003   373   221    37
  8605 36863  1844    12 32896     1 41525  4808     6  1065    86    12    15 55324 76329]

  
[ 1  2  3  4  5  6  8  9 11 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30]



```
k = 23
precision =  0.9040579836975224
recall score =  0.8458632474785309
f1 score =  0.8187529456202111
Conditional entropy = 0.50 bits

```



```
k = 31
precision =  0.9337409044466478
recall score =  0.9219686910223805
f1 score =  0.893826841409302
Conditional entropy = 0.28 bits.
```



```
k = 45
precision =  0.9426578323026894
recall score =  0.92774307218941
f1 score =  0.9049403160217782
Conditional entropy = 0.24 bits.
```






In [None]:
encoder = LabelEncoder()
trueLabels_coded = encoder.fit_transform(trueLabels)
processed_labels = process_labels(predLabels, trueLabels_coded, k)

most frequent is : 9
most frequent is : 18


In [None]:
print(np.unique(trueLabels_coded))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]


In [None]:
print(len(processed_labels))
print(np.unique(processed_labels))

4898431
[ 9. 18.]


In [None]:
inversed_pred_labels = encoder.inverse_transform([9,18])
inversed_pred_labels

array(['neptune.', 'smurf.'], dtype=object)

# comparision

## There are two classes clustered using kmeans clustering is smurf, neptune

### Number of detected anomalies : 2  

### Characteristics of smurf

In [None]:
df.loc[df['label'] == 'smurf.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,2807886.0,935.773096,200.021429,520.0,1032.0,1032.0,1032.0,1032.0
dst_bytes,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Characteristics of neptune

In [None]:
df.loc[df['label'] == 'neptune.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,1072017.0,1.865642e-06,0.001932,0.0,0.0,0.0,0.0,2.0
src_bytes,1072017.0,0.009994244,10.347866,0.0,0.0,0.0,0.0,10714.0
dst_bytes,1072017.0,0.0008208825,0.849927,0.0,0.0,0.0,0.0,880.0
land,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,1072017.0,9.32821e-07,0.000966,0.0,0.0,0.0,0.0,1.0
num_compromised,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Evaluation of Kmeans**

In [None]:
arr = np.unique(trueLabels_coded)
a = np.isin(processed_labels, arr)

false_count = np.count_nonzero(~a)
print(arr)
print(false_count)
print(np.unique(processed_labels))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
0
[ 9. 18.]


In [None]:
print(len(processed_labels) == len(trueLabels_coded))

print("precision = ",  precision_score(trueLabels_coded, processed_labels ,average='weighted', zero_division=1.0))

print("recall score = ", recall_score(trueLabels_coded, processed_labels,average='weighted', zero_division=1.0))

print("f1 score = ", f1_score(trueLabels_coded, processed_labels,average='weighted', zero_division=1.0))

print("conditional_entropy score = ", conditional_entropy(trueLabels_coded, processed_labels))

# Conditional Entropy

True
precision =  0.876983573571414
recall score =  0.7920703588557234
f1 score =  0.7155466910963125
Conditional entropy = 0.07 bits.
conditional_entropy score =  None


# testing on corrected

In [None]:
fileName = dir + '/corrected.gz'

names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


test = pd.read_csv(fileName, compression='gzip', names = names)

trueLabels_test = test['label'].values
#test = test.drop('label', axis=1)


In [None]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
2,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,0,udp,private,SF,105,146,0,0,0,0,...,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0


In [None]:
trueLabels_test

array(['normal.', 'normal.', 'normal.', ..., 'normal.', 'normal.',
       'normal.'], dtype=object)

In [None]:
# Select the categorical features to be encoded
cat_features = ['protocol_type', 'service', 'flag']

# Initialize the LabelEncoder
encoder_test = LabelEncoder()

# Apply the LabelEncoder to the categorical features
for feature in cat_features:
    test[feature] = encoder_test.fit_transform(test[feature])

test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,2,46,9,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,2,46,9,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,2,46,9,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3,0,2,46,9,105,146,0,0,0,0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,2,46,9,105,146,0,0,0,0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


In [None]:
pred_test_labels = mykmeans.predict(test)
pred_test_labels

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
print(len(pred_test_labels))
print(np.unique(pred_test_labels))

311029
[0 1]


In [None]:
encoder = LabelEncoder()
trueLabels_test_coded = encoder_test.fit_transform(trueLabels_test)
processed_labels = process_labels(pred_test_labels, trueLabels_test_coded, k)

most frequent is : 14
most frequent is : 27


In [None]:
print(np.unique(trueLabels_test_coded))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]


In [None]:
print(len(processed_labels))

print(np.unique(processed_labels))

311029
[14. 27.]


In [None]:
inversed_pred_labels = encoder_test.inverse_transform([14,27])
inversed_pred_labels

array(['neptune.', 'smurf.'], dtype=object)

In [None]:
arr = np.unique(trueLabels_test_coded)
a = np.isin(processed_labels, arr)

false_count = np.count_nonzero(~a)
print(arr)
print(false_count)
print(np.unique(processed_labels))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]
0
[14. 27.]


In [None]:
print(len(processed_labels) == len(trueLabels_test_coded))

print("precision = ",  precision_score(trueLabels_test_coded, processed_labels ,average='weighted', zero_division=1.0))

print("recall score = ", recall_score(trueLabels_test_coded, processed_labels,average='weighted', zero_division=1.0))

print("f1 score = ", f1_score(trueLabels_test_coded, processed_labels,average='weighted', zero_division=1.0))

print("conditional_entropy score = ", conditional_entropy(trueLabels_test_coded, processed_labels))

# Conditional Entropy

True
precision =  0.8607244464525602
recall score =  0.7140556025322398
f1 score =  0.6223313651983369
Conditional entropy = 0.05 bits.
conditional_entropy score =  None


### charactristics of smurf and neptune

In [None]:
test.loc[test['label'] == 'smurf.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,164091.0,860.301516,242.12741,508.0,520.0,1032.0,1032.0,1032.0
dst_bytes,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,164091.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
test.loc[test['label'] == 'neptune.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,58001.0,8.6e-05,0.020761,0.0,0.0,0.0,0.0,5.0
src_bytes,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dst_bytes,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,58001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Normalized Cut


## How spectral clustering work ?
### Spectral clustering work by building a similarity graph and constructing degree matrix of this graph then we use those matrices to get the laplacian matrix where
  la = delta_inverse*(delta-A)
### where delta_inverse is inverse of delta matrix and A is the constructed similarity graph

### then we ger the smallest k eigen vector correspoding to the eigen values
### where k is the number of clusters
### after that we apply k means on those clusters 
### becase the resulted eigen vectors are spherical they can be clustered using kmeans clustering

In [None]:
data_size = 0.0015
random_seed = 42
data_cut, _ = train_test_split(data, test_size = 1 - data_size, random_state=random_seed, stratify=[True]*len(data))
data_cut.shape

(7347, 42)

In [None]:
import gc
#del df_for_Spectral
del data
gc.collect()

0

In [None]:
data_cut = pd.DataFrame(data_cut)

In [None]:
data_cut.head()
data_cut.columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [None]:
true_labels_SC = data_cut['label']
data_cut.drop('label', axis = 1, inplace = True)
data_cut.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,icmp,ecr_i,SF,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,tcp,private,S0,0,0,0,0,0,0,...,255,3,0.01,0.07,0.0,0.0,1.0,1.0,0.0,0.0
2,0,icmp,ecr_i,SF,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,icmp,ecr_i,SF,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,icmp,ecr_i,SF,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
true_labels_SC

0         smurf.
1       neptune.
2         smurf.
3         smurf.
4         smurf.
          ...   
7342      smurf.
7343      smurf.
7344    neptune.
7345    neptune.
7346      smurf.
Name: label, Length: 7347, dtype: object

In [None]:
# Select the categorical features to be encoded
cat_features = ['protocol_type', 'service', 'flag']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Apply the LabelEncoder to the categorical features
for feature in cat_features:
    data_cut[feature] = encoder.fit_transform(data_cut[feature])

labels_encoded_SC = encoder.fit_transform(true_labels_SC)
data_cut.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,11,4,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,1,36,3,0,0,0,0,0,0,...,255,3,0.01,0.07,0.0,0.0,1.0,1.0,0.0,0.0
2,0,0,11,4,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,0,11,4,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,0,11,4,520,0,0,0,0,0,...,255,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:

encoder.inverse_transform([9])

array(['smurf.'], dtype=object)

In [None]:
labels_encoded_SC

array([9, 3, 9, ..., 3, 3, 9])

In [None]:
from sklearn.metrics import pairwise_distances
from scipy.sparse.linalg import eigsh
import cupy as cp

def SpectralClustering(X, n_clusters):

  #obtaining similarity matrix
  D = cp.asarray(pairwise_distances(X))
  sigma = cp.median(D)
  A = cp.exp(-D**2/(2*sigma**2))

  #obtaining degree matrix
  D = cp.diag(np.sum(A, axis=1))

  #laplacian matrix
  L= D-A
  
  #get eigen vectors from L
  #eig_val, eig_vec = np.linalg.eigh(L)
  #eig_vec_chosen = eig_vec[:1]

  #get eigen vectors from La
  La = cp.matmul(cp.linalg.inv(D),L)
  eig_val, eig_vec = cp.linalg.eigh(La)
  eig_vec_chosen = eig_vec[:n_clusters]

  #normalize eigen vectors
  u_normalized = cp.vstack([row/np.linalg.norm(row) for row in eig_vec_chosen.T])

  #u_normalized = u_normalized.reshape()
  print(u_normalized.shape)
  print(u_normalized)
  #apply kmeans
  threshold=0.1
  max_tierations=30
  num_of_features = n_clusters
  kms = myKmeans(n_clusters,num_of_features)
  kms.fit(u_normalized.get())
  labels = kms.get_labels()

  return labels

k=11
labels_pred = SpectralClustering(data_cut, k)

(7347, 11)
[[ 0.3238416   0.26401449  0.32383326 ...  0.26527778  0.26398767
   0.30896793]
 [ 0.          0.78430289 -0.23092268 ...  0.212211    0.18965939
   0.06213798]
 [ 0.          0.01761322 -0.09499339 ...  0.10923526  0.19887635
   0.6661852 ]
 ...
 [-0.29803494 -0.35060207 -0.2982624  ... -0.35273643 -0.35102184
  -0.17378593]
 [ 0.22186828  0.11714338  0.22193988 ...  0.11688699  0.11671247
   0.58981707]
 [-0.22491997 -0.41733875 -0.22514681 ... -0.41873414 -0.41844127
  -0.16529158]]
[4 4 7 ... 2 4 3]
367
720
599
1222
1456
890
551
277
456
465
344
1
[4 4 4 ... 2 4 2]
387
665
585
1181
1412
839
648
348
473
467
342
2
[4 4 4 ... 2 4 6]
408
620
556
1200
1398
784
646
416
480
512
327
3
[4 7 4 ... 2 4 6]
398
582
529
1211
1380
749
647
464
490
578
319
4
[4 7 4 ... 2 4 6]
385
559
517
1212
1378
756
643
466
493
623
315
5
[4 7 4 ... 2 4 6]
368
565
505
1202
1381
787
648
429
502
641
319
6
[4 7 4 ... 2 4 6]
362
582
497
1186
1384
816
652
393
515
639
321
7
[4 7 4 ... 2 4 6]
356
600
496
1170


In [None]:
unique, count = np.unique(labels_pred, return_counts=True)
print(count)
print(unique)

[ 356  600  496 1170 1388  802  653  401  519  638  324]
[ 0  1  2  3  4  5  6  7  8  9 10]


In [None]:
print((labels_encoded_SC))

[9 3 9 ... 3 3 9]


In [None]:
np.unique(labels_encoded_SC)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [None]:
processed_labels_SC = process_labels(labels_pred, labels_encoded_SC, 11)
print(processed_labels_SC)

most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
most frequent is : 9
[9. 9. 9. ... 9. 9. 9.]


In [None]:
processed_labels_SC

array([9., 9., 9., ..., 9., 9., 9.])

In [None]:
np.unique(processed_labels_SC)

array([9.])

In [None]:
sum=0
for i in range(len(processed_labels_SC)):
  #if labels__pred[i] == 1 : labels__pred[i] = 14
  sum += 1 if processed_labels_SC[i] != labels_encoded_SC[i] else 0
  #print("prediction :",processed_labels_SC[i],"actual :", labels_encoded_SC[i])

print(sum)

3157


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix, fowlkes_mallows_score
from sklearn.metrics import f1_score, fbeta_score, v_measure_score, precision_score, recall_score, accuracy_score,homogeneity_score
from sklearn.metrics import homogeneity_score, v_measure_score, completeness_score, adjusted_mutual_info_score, jaccard_score
from scipy.stats import entropy
from sklearn.preprocessing import StandardScaler

#true_labels = [0,0,0,1,1,1,2,2,2]
#pred = [1,1,1,0,0,0,7,7,7]
print("External meaures : ")
print("f1_meausure score is",f1_score(labels_encoded_SC, processed_labels_SC, average='weighted'))
print("accuracy_score  is",accuracy_score(labels_encoded_SC, processed_labels_SC))
print("precision score is",precision_score(labels_encoded_SC, processed_labels_SC, average='weighted', zero_division=1))
print("recall score is",recall_score(labels_encoded_SC, processed_labels_SC, average='weighted', zero_division=1))
print("conditional entropy score is",conditional_entropy(labels_encoded_SC, processed_labels_SC))
print("________________________________________________")

External meaures : 
f1_meausure score is 0.4142429339992611
accuracy_score  is 0.5703008030488634
precision score is 0.754942202909315
recall score is 0.5703008030488634
Conditional entropy = 0.00 bits.
conditional entropy score is None
________________________________________________


In [None]:
df.loc[df['label'] == 'smurf.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,2807886.0,935.773096,200.021429,520.0,1032.0,1032.0,1032.0,1032.0
dst_bytes,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# comparision

## Due to using majority voting for assigning labels to cluster, The only class clustered using spectral clustering was 9 which is smurf

### Number of detected anomalies : 1

### charactristics of smurf :
land,wrong_fragment,urgent,hot	,num_failed_logins,logged_in	,num_compromised	,root_shell	,su_attempted	,num_root	,num_file_creations	,num_shells	,num_access_files	,num_outbound_cmds	,is_host_login	,is_guest_login, duration, dst_bytes, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, diff_srv_rate, srv_diff_host_rate, dst_host_srv_diff_host_rate, dst_host_srv_serror_rate, dst_host_srv_rerror_rate

-> All these features are set to zero

src_bytes	

    mean  935.773096	
    std  200.021429	
    min  520.0	1032.0	
    25%  1032.0	
    50%  1032.0	
    75%  1032.00
    max  1032.00

count	, srv_count

    mean 507.002255	
    std  18.364356	
    min  1.0	
    25%  511.0	
    50%  511.0	
    75%  511.0	
    max  511.00

dst_host_rerror_rate

    mean 0.000028		
    std  0.003203		
    min  0.0
    25%  0.0
    50%  0.0
    75%  0.0
    max  0.66

dst_host_serror_rate

    mean 0.000011	
    std  0.001555	
    min  0.0
    25%  0.0
    50%  0.0
    75%  0.0
    max  0.41

same_srv_rate

    mean 1.000000	
    std  0.000000	
    min  1.0
    25%  1.0
    50%  1.0
    75%  1.0
    max  1.00

dst_host_count

    mean 254.981744	
    std  1.613447	
    min  1.0	
    25%  255.0	
    50%  255.0	
    75%  255.0	
    max  255.00

dst_host_srv_count

    mean 254.907880
    std  3.950032
    min  1.0	
    25%  255.0	
    50%  255.0	
    75%  255.0	
    max  255.00    

dst_host_same_srv_rate

    mean 0.999691	
    std  0.014137	
    min  0.0
    25%  1.0
    50%  1.0
    75%  1.0
    max  1.00

dst_host_diff_srv_rate

    mean 0.000034		
    std  0.002105	
    min  0.0
    25%  0.0
    50%  0.0
    75%  0.0
    max  0.26

dst_host_same_src_port_rate

    mean 0.999691		
    std  0.014137	
    min  0.0	
    25%  1.0	
    50%  1.0	
    75%  1.0	
    max  1.00

dst_host_serror_rate

    mean 2807886.0	
    std  0.000011	
    min  0.001555
    25%  0.0
    50%  0.0
    75%  0.0
    max  0.41

dst_host_rerror_rate

    mean 0.000028
    std  0.003203	
    min  0.0
    25%  0.0
    50%  0.0
    75%  0.0
    max  0.66

  

# New Clustering Algorithm (DBSCAN)

## **How DBSCAN work:**
This is an implementation of the Density-Based Spatial Clustering of Applications with Noise (DBSCAN) algorithm.

The DBSCAN class has two parameters in its constructor: eps and min_samples, which control the density threshold and minimum number of points required for a cluster.

The fit method of the DBSCAN class takes a feature matrix X as input and computes the clusters and noise points in the data. It first initializes some variables such as the number of samples and features, the labels for each sample, and a KDTree for nearest neighbor search.

Then, for each sample in X, the expand_cluster method is called if it hasn't been assigned a label yet (i.e., self.labels[i] == 0). The expand_cluster method finds the set of samples that are density-connected to the current sample by recursively adding neighbors that meet the density requirements until no more neighbors are found. If the number of neighbors is less than min_samples, the sample is labeled as noise (self.labels[i] = -1) and the noise counter is incremented. Otherwise, a new cluster is created (self.cluster_id += 1) and the current and all newly discovered samples are assigned to this cluster (self.labels[j] = self.cluster_id).

In [None]:
data_size = 0.0015
random_seed = 42
X, _ = train_test_split(df, test_size = 1 - data_size, random_state=random_seed, stratify=[True]*len(df))
y = X['label']
X = X.drop('label', axis=1)
#df_temp.describe()
# Select the categorical features to be encoded
cat_features = ['protocol_type', 'service', 'flag']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Apply the LabelEncoder to the categorical features
for feature in cat_features:
    X[feature] = encoder.fit_transform(X[feature])

X = X.values
trueLabels = y.values
labels_encoded = encoder.fit_transform(trueLabels)
trueLabels_coded = LabelEncoder().fit_transform(trueLabels)

In [None]:
class DBSCAN:
    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples
    
    def fit(self, X):
        self.n_samples, self.n_features = X.shape
        self.labels = np.zeros(self.n_samples)
        self.cluster_id = 0
        self.noise = 0
        self.tree = KDTree(X)
        
        for i in range(self.n_samples):
            if self.labels[i] == 0:
                self.expand_cluster(i)

    def expand_cluster(self, i):
        neighbors = self.get_neighbors(i)
        if len(neighbors) < self.min_samples:
            self.labels[i] = -1
            self.noise += 1
        else:      
            self.cluster_id += 1
            self.labels[i] = self.cluster_id
            for j in neighbors:
                if self.labels[j] == 0:                
                    self.labels[j] = self.cluster_id
                    new_neighbors = self.get_neighbors(j)
                    if len(new_neighbors) >= self.min_samples:
                        neighbors += new_neighbors
            

    def get_neighbors(self, i):
        return self.tree.query_radius(X[i].reshape(1,-1), r=self.eps, count_only=False)[0].tolist()

In [None]:
# def process_labels(labels, trueLabels, k) :
#     processedLabels = np.empty(len(labels), dtype=trueLabels.dtype)
#     for label in range(k):
#         indices = labels == label
#         trueLabelsInCluster = trueLabels[indices]
#         if len(trueLabelsInCluster) > 0:
#             processedLabels[indices] = np.bincount(trueLabelsInCluster).argmax()
#     return processedLabels

### Work

In [None]:
# number of clusters
np.unique(dbscan.labels).shape[0] - 1 # minus 1 for anomalies

9

In [None]:
# dbscan = DBSCAN(eps=2.85, min_samples=8)
dbscan = DBSCAN(eps=6.6, min_samples=30)
# dbscan = DBSCAN(eps=5.85, min_samples=29)
dbscan.fit(X)
k = np.unique(dbscan.labels).shape[0]
predLabels = dbscan.labels

processed_labels = process_labels(predLabels, trueLabels_coded, k)

print("precision =",  precision_score(trueLabels_coded, processed_labels, average='weighted', zero_division=1))
print("recall score =", recall_score(trueLabels_coded, processed_labels, average='weighted', zero_division=1))
print("f1 score =", f1_score(trueLabels_coded, processed_labels,average='weighted'))
conditional_entropy(trueLabels_coded, dbscan.labels)

most frequent is : 9
most frequent is : 3
most frequent is : 9
most frequent is : 9
most frequent is : 5
most frequent is : 3
most frequent is : 9
most frequent is : 8
most frequent is : 5
precision = 0.9838470965562044
recall score = 0.768068599428338
f1 score = 0.7985650685297303
Conditional entropy = 0.69 bits.


In [None]:
unique, count = np.unique(predLabels, return_counts=True)
print(count)
print(unique)

[1726  557  610   80 3331   70  737  181   30   25]
[-1.  1.  2.  3.  4.  5.  6.  7.  8.  9.]


In [None]:
print(np.unique(processed_labels))
print(dbscan.noise)

[-1.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
1726


In [None]:
inversed_pred_labels = encoder.inverse_transform([1,  2,  3,  4,  5,  6,  7,  8,  9])
inversed_pred_labels

array(['ipsweep.', 'land.', 'neptune.', 'nmap.', 'normal.', 'pod.',
       'portsweep.', 'satan.', 'smurf.'], dtype=object)

## Comparison
## There are nine classes clustered using DBSCAN clustering is ['ipsweep.', 'land.', 'neptune.', 'nmap.', 'normal.', 'pod.', 'portsweep.', 'satan.', 'smurf.']



### Characteristic of Classes:

In [None]:
df.loc[df['label'] == 'ipsweep.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,12481.0,1.045509,114.063934,0.0,0.0,0.0,0.0,12743.0
src_bytes,12481.0,10.436584,37.094926,0.0,8.0,8.0,18.0,4113.0
dst_bytes,12481.0,4.394359,462.195799,0.0,0.0,0.0,0.0,51633.0
land,12481.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,12481.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,12481.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,12481.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,12481.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,12481.0,0.002243,0.047313,0.0,0.0,0.0,0.0,1.0
num_compromised,12481.0,0.00032,0.035804,0.0,0.0,0.0,0.0,4.0


In [None]:
df.loc[df['label'] == 'land.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dst_bytes,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,21.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
wrong_fragment,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.loc[df['label'] == 'neptune.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,1072017.0,1.865642e-06,0.001932,0.0,0.0,0.0,0.0,2.0
src_bytes,1072017.0,0.009994244,10.347866,0.0,0.0,0.0,0.0,10714.0
dst_bytes,1072017.0,0.0008208825,0.849927,0.0,0.0,0.0,0.0,880.0
land,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,1072017.0,9.32821e-07,0.000966,0.0,0.0,0.0,0.0,1.0
num_compromised,1072017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.loc[df['label'] == 'nmap.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,2316.0,24.424007,60.370525,0.0,0.0,8.0,8.0,215.0
dst_bytes,2316.0,0.132556,4.776124,0.0,0.0,0.0,0.0,207.0
land,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,2316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.loc[df['label'] == 'normal.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,972781.0,217.824724,1351.943873,0.0,0.0,0.0,0.0,58329.0
src_bytes,972781.0,1477.84625,110500.419401,0.0,147.0,231.0,313.0,89581520.0
dst_bytes,972781.0,3234.650111,34231.680611,0.0,135.0,422.0,2131.0,11730594.0
land,972781.0,7e-06,0.002683,0.0,0.0,0.0,0.0,1.0
wrong_fragment,972781.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,972781.0,3.6e-05,0.015999,0.0,0.0,0.0,0.0,14.0
hot,972781.0,0.049535,0.931913,0.0,0.0,0.0,0.0,77.0
num_failed_logins,972781.0,9.9e-05,0.013141,0.0,0.0,0.0,0.0,4.0
logged_in,972781.0,0.719268,0.449357,0.0,0.0,1.0,1.0,1.0
num_compromised,972781.0,0.038389,8.653573,0.0,0.0,0.0,0.0,7479.0


In [None]:
df.loc[df['label'] == 'pod.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,264.0,1462.651515,125.098044,564.0,1480.0,1480.0,1480.0,1480.0
dst_bytes,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,264.0,0.981061,0.13657,0.0,1.0,1.0,1.0,1.0
urgent,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.loc[df['label'] == 'portsweep.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,10413.0,2329.586286,8129.703,0.0,0.0,0.0,0.0,42908.0
src_bytes,10413.0,431708.311822,20383540.0,0.0,0.0,0.0,0.0,1379964000.0
dst_bytes,10413.0,202681.316431,13983600.0,0.0,0.0,0.0,0.0,1309937000.0
land,10413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,10413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,10413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,10413.0,0.000768,0.02770837,0.0,0.0,0.0,0.0,1.0
num_failed_logins,10413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,10413.0,0.000288,0.01697192,0.0,0.0,0.0,0.0,1.0
num_compromised,10413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.loc[df['label'] == 'satan.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,15892.0,0.031462,0.4475,0.0,0.0,0.0,0.0,12.0
src_bytes,15892.0,0.998742,35.927415,0.0,0.0,0.0,0.0,1710.0
dst_bytes,15892.0,2.127486,145.103157,0.0,0.0,0.0,0.0,18056.0
land,15892.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,15892.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,15892.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,15892.0,0.000692,0.0327,0.0,0.0,0.0,0.0,3.0
num_failed_logins,15892.0,0.000252,0.03173,0.0,0.0,0.0,0.0,4.0
logged_in,15892.0,0.003587,0.059784,0.0,0.0,0.0,0.0,1.0
num_compromised,15892.0,0.000189,0.013739,0.0,0.0,0.0,0.0,1.0


In [None]:
df.loc[df['label'] == 'smurf.'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
src_bytes,2807886.0,935.773096,200.021429,520.0,1032.0,1032.0,1032.0,1032.0
dst_bytes,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
land,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_failed_logins,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logged_in,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
num_compromised,2807886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Testing

In [None]:
# fit DBSCAN model
eps_list = [round(i * 0.05, 2) for i in range(151, 201)]
min_samples_list = [i for i in range(1, 11)]
print(eps_list)
print(min_samples_list)

[7.55, 7.6, 7.65, 7.7, 7.75, 7.8, 7.85, 7.9, 7.95, 8.0, 8.05, 8.1, 8.15, 8.2, 8.25, 8.3, 8.35, 8.4, 8.45, 8.5, 8.55, 8.6, 8.65, 8.7, 8.75, 8.8, 8.85, 8.9, 8.95, 9.0, 9.05, 9.1, 9.15, 9.2, 9.25, 9.3, 9.35, 9.4, 9.45, 9.5, 9.55, 9.6, 9.65, 9.7, 9.75, 9.8, 9.85, 9.9, 9.95, 10.0]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
best_eps, best_min_samples, f1, best_f1 = 0.6, 6, 0, 0

for e in eps_list:
    for n in min_samples_list:
        dbscan = DBSCAN(eps=e, min_samples=n)
        dbscan.fit(X)
        k = np.unique(dbscan.labels).shape[0]
        predLabels = dbscan.labels        
        processed_labels = process_labels(predLabels, trueLabels_coded, k)        
        f1 = f1_score(trueLabels_coded, processed_labels, average='weighted')
        if f1 > best_f1:
            best_eps, best_min_samples, best_f1 = e, n, f1
print(best_eps)
print(best_min_samples)
print(best_f1)

# Draft

In [None]:
def get_normalized_cut_Y_vectors_3NN_similarity_measure(df, data_size, random_seed, k_clusters):
    train, _ = train_test_split(df, test_size = 1-data_size, random_state=random_seed)
    dis = 1 * rbf_kernel(train, gamma=1)
    print(df.shape)
    size_of_sim_matrix = int(df.shape[0]*data_size)
    ind = np.argpartition(dis[size_of_sim_matrix-1], 4)[-4:]
  
    #A matrix
    A = np.zeros((size_of_sim_matrix,size_of_sim_matrix))
    for i in range(size_of_sim_matrix) :
        ind = np.argpartition(dis[i], -4)[-4:] 
        for  j in range(size_of_sim_matrix) :
            if j in ind and i!=j :
                A[i][j] = 1
    #print(A)
    
    
    #delta matrix
    delta = np.zeros((size_of_sim_matrix,size_of_sim_matrix))
    for i in range(size_of_sim_matrix) :
        delta[i][i] = np.sum(A[i])   
    #print(delta)
    
    #delta inverse
    delta_inv = np.linalg.inv(delta)
    
    #normalized assymetric laplacian matrix
    La = np.dot(delta_inv, delta-A)
    #print(La)
    
    #eigen vectors and eigen values
    eig_val, eig_vec = np.linalg.eig(La)
    
    #get magnitue of complex number
    eig_val_mag = np.abs(eig_val)
    eig_vec_mag = np.abs(eig_vec)

    #get indecies of smallest k_clusters

    ind_second_smallest_eig_val_mag = np.argpartition(eig_val_mag, k_clusters)[-1*k_clusters:]

    U = pd.DataFrame()
    for i in range(k_clusters) :
      U['u'+str(i)] = eig_vec_mag[ind_second_smallest_eig_val_mag[i]]

    #normalize U vectors to get Y
    for i in range(len(delta)) :
        if U.loc[i].sum() > 0:
            base = 0
            for j in range(k_clusters) :
              base += U['u'+str(j)][i]**2
            base = np.sqrt(base)
            for j in range(k_clusters) :
              U['u'+str(j)][i] = U['u'+str(j)][i] / base
    return U

'''
#3d scatter plot
my_cmap = plt.get_cmap('hsv')
fig = plt.figure(figsize=(10,7))
ax = plt.axes(projection="3d") 
sctt=ax.scatter3D(U['u1'], U['u2'], U['u3'], 
                  label='u1 with u2' ,
                  c = (U['u1']+ U['u2']+ U['u3']),
                  cmap = my_cmap, 
                  marker ='^', 
                  alpha = 0.8)
plt.title("3D scatter for u1 u2 u3")
ax.set_xlabel("u1", fontweight = 'bold')
ax.set_ylabel("u2", fontweight = 'bold')
ax.set_ylabel("u3", fontweight = 'bold')
fig.colorbar(sctt, ax = ax, shrink = 0.5, aspect = 5)
plt.legend()
plt.show()
'''

random_seed=42
data_size=0.01
k=23
    
U = get_normalized_cut_Y_vectors_3NN_similarity_measure(data_cut, data_size,random_seed,k)
labels_encoded_per, _ = train_test_split(labels_encoded, test_size = 1-data_size, random_state=random_seed)

In [None]:
data_cut.shape
#print("rand score is",adjusted_rand_score(labels_encoded_SC, processed_labels_SC))
#print("jaccard score is",jaccard_score(labels_encoded_SC, processed_labels_SC, average='micro'))
#print("homogeneity_score  is",homogeneity_score(labels_encoded_SC, processed_labels_SC))
#print("v_measure_score is",v_measure_score(labels_encoded_SC, processed_labels_SC))
#print("adjusted_mutual_info_score is",adjusted_mutual_info_score(labels_encoded_SC, processed_labels_SC))
#print("completeness_score is",completeness_score(labels_encoded_SC, processed_labels_SC))

#train, _ = train_test_split(data_cut, test_size = 1-data_size, random_state=random_seed, stratify=[True]*len(data_cut))
#labels_encoded_per, _ = train_test_split(labels_encoded, test_size = 1-data_size, random_state=random_seed, stratify=[True]*len(labels_encoded))
#print(train.shape)
#print(train.head())

#kmeans = KMeans(n_clusters).fit(u_normalized)
#print("resulted labels : ")
#print(kmeans.labels_)
#return kmeans.labels_

#random_seed=42
#data_size=0.01

(489843, 41)

In [None]:
labels_encoded_per

In [None]:
U

In [None]:
len(labels_encoded_per)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix, fowlkes_mallows_score
from sklearn.metrics import jaccard_score, f1_score, fbeta_score, v_measure_score, precision_score, recall_score, accuracy_score
from scipy.stats import entropy

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
scaled_U = StandardScaler().fit_transform(U)
scaled_U = pd.DataFrame(scaled_U)

In [None]:
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "random_state": 1
}

k=23
sse = []
kmeans_results_U = []
#df['true_labels'] = labels_encoded_per

print("K way k =",k)
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_U)
sse.append(kmeans.inertia_)
#print(len(np.unique(kmeans.labels_)))
#df['cluster'] = kmeans.labels_

print(len((kmeans.labels_)))

print("External meaures : ")
print("rand score is",adjusted_rand_score(labels_encoded_per, kmeans.labels_))
print("jaccard score is",jaccard_score(labels_encoded_per, kmeans.labels_, average='micro'))
print("f1_meausure score is",f1_score(labels_encoded_per, kmeans.labels_, average='micro'))

print("accuracy score is",accuracy_score(labels_encoded_per, kmeans.labels_))
print("precision score is",precision_score(labels_encoded_per, kmeans.labels_, average='micro'))
print("recall score is",recall_score(labels_encoded_per, kmeans.labels_, average='micro'))
kmeans_results_U.append(kmeans.labels_)
print("________________________________________________")

In [None]:
labels_encoded_per

In [None]:
len(kmeans.labels_)

In [None]:
pip install --upgrade jupyter_http_over_ws>=0.0.7 && jupyter serverextension enable --py jupyter_http_over_ws

In [None]:
jupyter notebook --NotebookApp.allow_origin="https://colab.research.google.com" --port=8888 --NotebookApp.port_retries=0

# 

In [None]:
labels_encoded_per.iloc[0]

In [None]:
labels_encoded_per.iloc[len(labels_encoded_per)-1]

In [None]:
labels_encoded_per

In [None]:
labels

In [None]:
labels_encoded_per

dic_names = {}
used_set = []
for i in range(len(labels)):
  label = labels_encoded_per.iloc[i]
  label_val = labels[i]
  if (dic_names.get(label) == None) and (label_val not in used_set):
      used_set.append(label_val)
      dic_names[label] = label_val

dic_names

In [None]:
final_coded_labels = []
for i in labels_encoded_per :
  if dic_names.get(i) != None :
    final_coded_labels.append(dic_names.get(i))
  else :
    final_coded_labels.append(23)

np.unique(final_coded_labels)

In [None]:
sum=0
for i in range(len(labels)):
  if labels[i] == 17 : labels[i] = 14
  sum += 1 if labels[i] != final_coded_labels[i] else 0
  print("prediction :",labels[i],"actual :", final_coded_labels[i])

print(sum)

In [None]:
# spectral clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import SpectralClustering, KMeans
from matplotlib import pyplot

random_seed=42
data_size=0.01
k=23
train, _ = train_test_split(data_cut, test_size = 1-data_size, random_state=random_seed)
# define the model
model = SpectralClustering(n_clusters=23)
# fit model and predict clusters
yhat = model.fit_predict(train)
# retrieve unique clusters
clusters = unique(yhat)
print(clusters)
# create scatter plot for samples from each cluster

In [None]:
len(labels)

## Draft

In [None]:
def shifted_mean_clustering(X, kernel_bandwidth):
    # Initialize centroids to every data point
    centroids = X.copy()
    # Loop until convergence
    while True:
        # Compute distances between each point and each centroid
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        # Compute weights using Gaussian kernel
        weights = np.exp(-0.5 * (distances / kernel_bandwidth)**2)
        # Normalize weights for each point
        weights /= weights.sum(axis=0)
        # Compute new centroids as weighted means of points
        new_centroids = (weights[:,:,np.newaxis] * X[np.newaxis,:,:]).sum(axis=1)
        # Check for convergence
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
        return centroids

In [None]:
def hierarchical_clustering(data):
    # Compute the pairwise distance matrix
    distance_matrix = np.zeros((len(data), len(data)))
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            distance_matrix[i,j] = distance_matrix[j,i] = euclidean_distance(data[i], data[j])
    
    # Initialize each point as its own cluster
    clusters = [{i} for i in range(len(data))]
    
    # Merge the two closest clusters until there is only one cluster left
    while len(clusters) > 1:
        # Find the two closest clusters
        closest_distance = float('inf')
        for i in range(len(clusters)):
            for j in range(i+1, len(clusters)):
                distance = closest_pair_distance(clusters[i], clusters[j], distance_matrix)
                if distance < closest_distance:
                    closest_distance = distance
                    closest_clusters = (i, j)
        
        # Merge the two closest clusters
        clusters[closest_clusters[0]].update(clusters[closest_clusters[1]])
        del clusters[closest_clusters[1]]
    
    # Convert the final cluster to a list of lists of points
    final_cluster = list(clusters[0])
    return [[data[i] for i in final_cluster]]

def euclidean_distance(a, b):
    return np.sqrt(np.sum((a-b)**2))

def closest_pair_distance(cluster1, cluster2, distance_matrix):
    return min(distance_matrix[i,j] for i in cluster1 for j in cluster2)

