# Network Anomaly Detection using Clustering

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import random
random.seed(42)
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import SpectralClustering
import pandas as pd
import matplotlib.pyplot as plt

# 1. Importing Data and Understanding Format

In [67]:
from ipynb.fs.full.data_preprocessing import preprocess_data_10, preprocess_data, category_map

In [126]:
data_k_means, labels_kmeans = preprocess_data_10()
data_spectral, labels_spectral = preprocess_data()

In [69]:
# from collections import Counter, OrderedDict
# d = Counter(labels_spectral)
# OrderedDict(sorted(d.items()))

# 2.  Clustering Using K-Means and Normalized Cut (Your implementation)


### K-Means algorithm

In [70]:
#It takes two attrs k number of centroids and the whole data set number of samples x features

def kMeans_implemented(k,data):
    centroids=[]
    num_points=data.shape[0]
    num_features=data.shape[1]
    
    #Appending random points to be our centroids according to the number of ks
    for i in range(k):
        centroids.append(data[random.randint(0, num_points)])
    clusters={}
    t=0
    while(True):
        labels=[]
        #Initialize empty clusters
        for i in range (k):
            clusters[i]=[]
            
        #Classify the points according to the closest centroid
        for i in range(num_points):
            distances=[]
            for j in range(k):
                distances.append(np.linalg.norm(data[i]-centroids[j]))
            clusters[distances.index(min(distances))].append(data[i])
            labels.append(distances.index(min(distances)))
        new_centroids=np.zeros((k,num_features))
        
        #Measuring the new centroids
        for i in range(k):
            new_centroids[i]=np.mean(clusters[i],axis=0)
        if(centroids==new_centroids).all():
            break
        else:
            centroids=new_centroids
    return labels

### Spectral Clustering algorithm

In [71]:
from sklearn.cluster import KMeans
def spectral_clustering(A,k):
        
    #--------------computing the degree matrix-------------
    d = np.diag(np.sum(A, axis=1))

    #--------------------computing L-----------------------
    L = d-A

    #---------------------computing La---------------------
    #computing the inverse of the dgree matrix
    inv_degree = np.linalg.inv(d)
    La = np.dot(inv_degree, L)

    #---computing the eigenValues and eigenVectors of La---
    e_val, evec = np.linalg.eig(La)

    #----------sorting the eigenValues ascending----------- 
    idx = np.argsort(eval)
    e_val = e_val[idx]

    #---sorting the eigenVectors according to their corresponding eigenValues---
    evec = evec[:, idx]

    #--slicing the eigenVectors to the desired number of clusters--
    evec_new = evec[:, :k]

    #-------------normalizing the eigenVectors--------------
    system = evec.real / np.sqrt(np.linalg.norm(evec.real))

    kmeans = KMeans(n_clusters=k)
    system_labels = kmeans.fit_predict(system)


    return system, system_labels

## GMM algorithm

In [115]:
import numpy as np
from scipy.stats import multivariate_normal

class GMM:
    def __init__(self, k, max_iter=5):
        self.k = k
        self.max_iter = int(max_iter)

    def initialize(self, X):
        self.shape = X.shape
        self.n, self.m = self.shape

        self.phi = np.full(shape=self.k, fill_value=1/self.k)
        self.weights = np.full( shape=self.shape, fill_value=1/self.k)
        
        random_row = np.random.randint(low=0, high=self.n, size=self.k)
        self.mu = [  X[row_index,:] for row_index in random_row ]
        self.sigma = [ np.cov(X.T) for _ in range(self.k) ]

    def e_step(self, X):
        # E-Step: update weights and phi holding mu and sigma constant
        self.weights = self.predict_proba(X)
        self.phi = self.weights.mean(axis=0)
    
    def m_step(self, X):
        # M-Step: update mu and sigma holding phi and weights constant
        for i in range(self.k):
            weight = self.weights[:, [i]]
            total_weight = weight.sum()
            self.mu[i] = (X * weight).sum(axis=0) / total_weight
            self.sigma[i] = np.cov(X.T, 
                aweights=(weight/total_weight).flatten(), 
                bias=True)

    def fit(self, X):
        self.initialize(X)
        
        for iteration in range(self.max_iter):
            self.e_step(X)
            self.m_step(X)
            
    def predict_proba(self, X):
        likelihood = np.zeros( (self.n, self.k) )
        for i in range(self.k):
            distribution = multivariate_normal(
                mean=self.mu[i], 
                cov=self.sigma[i],
                allow_singular=True
            )
            likelihood[:,i] = distribution.pdf(X)
        
        numerator = likelihood * self.phi
        denominator = numerator.sum(axis=1)[:, np.newaxis]
        weights = numerator / denominator
        return weights
    
    def predict(self, X):
        weights = self.predict_proba(X)
        return np.argmax(weights, axis=1)

In [117]:
from sklearn.mixture import GaussianMixture

class GMM:
    def __init__(self, n_components=1, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.gmm = None

    def fit(self, X):
        self.gmm = GaussianMixture(n_components=self.n_components, max_iter=self.max_iter, tol=self.tol)
        self.gmm.fit(X)

    def predict_proba(self, X):
        return self.gmm.predict_proba(X)

    def predict(self, X):
        return self.gmm.predict(X)


# Testing

## K-Means Testing

In [74]:
labels1 = kMeans_implemented(7,np.array(data_k_means))
labels2 = kMeans_implemented(15,np.array(data_k_means))
labels3 = kMeans_implemented(23,np.array(data_k_means))
labels4 = kMeans_implemented(31,np.array(data_k_means))
labels5 = kMeans_implemented(45,np.array(data_k_means))

In [85]:
#labels = kMeans_implemented(7,np.array(data_k_means))
contingency_matrix = get_contingency(labels1,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.35235793694748563
----------------Purity---------------------------
Per cluster purity: [0.63, 0.93, 0.99, 1.0, 0.98, 0.72, 0.83]
Purity: 0.9272251452749578
--------------Recalls---------------------------
Per cluster Recall: [0.189, 0.0892, 0.3665, 0.8109, 0.362, 0.0454, 0.0839]
--------------F-measure---------------------------
F: 0.37865756371357984


In [86]:
#labels = kMeans_implemented(15,np.array(data_k_means))
contingency_matrix = get_contingency(labels2,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.19353415547078842
----------------Purity---------------------------
Per cluster purity: [0.97, 0.95, 0.97, 0.98, 0.7, 0.99, 0.98, 0.98, 0.91, 0.97, 1.0, 0.53, 1.0, 0.57, 1.0]
Purity: 0.9672770733449645
--------------Recalls---------------------------
Per cluster Recall: [0.0583, 0.0446, 0.0531, 0.0524, 0.7551, 0.0723, 0.2908, 0.294, 0.189, 0.1071, 0.3826, 0.0045, 0.9913, 0.0203, 0.4283]
--------------F-measure---------------------------
F: 0.324018694422339


In [87]:
#labels = kMeans_implemented(23,np.array(data_k_means))
contingency_matrix = get_contingency(labels3,np.array(labels_kmeans))
evaluation(data_k_means,contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.2619780402966085
----------------Purity---------------------------
Per cluster purity: [0.98, 1.0, 1.0, 0.67, 0.93, 0.96, 1.0, 0.98, 0.94, 0.97, 1.0, 1.0, 0.67, 1.0, 1.0, 0.54, 0.95, 0.74, 0.98, 0.7, 0.98, 0.69, 1.0]
Purity: 0.9368002417814901
--------------Recalls---------------------------
Per cluster Recall: [0.252, 0.0557, 0.2481, 0.189, 0.7859, 0.0411, 0.1737, 0.1007, 0.0203, 0.1515, 0.0676, 0.1846, 0.8356, 0.1782, 0.0567, 0.0042, 0.0052, 0.0348, 0.0222, 0.7582, 0.0259, 0.0252, 0.106]
--------------F-measure---------------------------
F: 0.245611790960462


In [88]:
#labels = kMeans_implemented(31,np.array(data_k_means))
contingency_matrix = get_contingency(labels4,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.14174278422676723
----------------Purity---------------------------
Per cluster purity: [1.0, 1.0, 1.0, 0.95, 0.99, 0.65, 1.0, 1.0, 1.0, 0.98, 1.0, 0.98, 1.0, 0.9, 0.95, 0.99, 0.96, 0.87, 0.95, 1.0, 1.0, 0.99, 1.0, 1.0, 0.97, 0.95, 1.0, 0.69, 0.54, 0.93, 0.71]
Purity: 0.9756638687785916
--------------Recalls---------------------------
Per cluster Recall: [0.0024, 0.1489, 0.1019, 0.0565, 0.0789, 0.75, 0.0295, 0.0458, 0.9913, 0.005, 0.1252, 0.2148, 0.1567, 0.0042, 0.0174, 0.0289, 0.037, 0.0137, 0.624, 0.0777, 0.0585, 0.0528, 0.0001, 0.1569, 0.189, 0.7826, 0.3062, 0.8356, 0.0042, 0.0624, 0.0253]
--------------F-measure---------------------------
F: 0.24337134966674756


In [89]:
#labels = kMeans_implemented(45,np.array(data_k_means))
contingency_matrix = get_contingency(labels5,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.1250806575035399
----------------Purity---------------------------
Per cluster purity: [0.96, 0.68, 1.0, 0.95, 1.0, 0.79, 1.0, 0.99, 0.95, 0.9, 1.0, 0.99, 1.0, 0.85, 0.87, 0.98, 0.96, 1.0, 1.0, 1.0, 0.74, 0.54, 0.96, 1.0, 0.47, 1.0, 1.0, 0.98, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 0.92, 1.0, 0.72, 1.0, 1.0, 0.69, 1.0, 0.97, 1.0]
Purity: 0.9765155990273789
--------------Recalls---------------------------
Per cluster Recall: [0.0368, 0.0041, 0.0288, 0.624, 0.0533, 0.8326, 0.7362, 0.226, 0.0348, 0.0042, 0.0859, 0.0234, 0.0422, 0.0049, 0.0136, 0.0151, 0.0497, 0.0987, 0.1554, 0.1554, 0.0151, 0.0042, 0.0969, 0.0921, 0.0004, 0.0562, 0.0244, 0.0127, 0.0585, 0.0401, 0.0668, 0.0295, 0.0946, 0.005, 0.0135, 0.0316, 0.0047, 0.125, 0.0213, 0.0271, 0.9913, 0.2885, 0.0307, 0.016, 0.0997]
--------------F-measure---------------------------
F: 0.1657574004027622


## Spectral Clustering Testing

In [90]:
# Splitting data to use it in spectral clustering
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [91]:
# Explore the classes in training dataset after splitting
for i in y_train.unique():
    print(category_map(i))

neptune.
normal.
teardrop.
ipsweep.
back.
satan.
smurf.
portsweep.
pod.
nmap.
warezclient.


In [92]:
sim_matrix=rbf_kernel(X_train)

In [93]:
system,labels_spectral=spectral_clustering(sim_matrix,11)

In [94]:
dict_y={}
y=list(y_train.unique())
for i in range (len(y)):
    dict_y[y_train.unique()[i]]=i

In [95]:
new_y_train=np.array([dict_y[label] for label in y_train])

In [96]:
contingency_matrix = get_contingency(labels_spectral,new_y_train)
evaluation(X_train,contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.5290454824191261
----------------Purity---------------------------
Per cluster purity: [0.75, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.66, 0.6]
Purity: 0.8262002232973576
--------------Recalls---------------------------
Per cluster Recall: [0.0347, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 0.0015, 0.7128, 0.0861, 0.8596]
--------------F-measure---------------------------
F: 0.4024869886301618


## Comparison between K-means and Spectral Clustering

In [97]:
labels6 = kMeans_implemented(11,np.array(X_train))
contingency_matrix = get_contingency(labels6,new_y_train)
evaluation(X_train,contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.17327577432161273
----------------Purity---------------------------
Per cluster purity: [1.0, 0.52, 1.0, 0.72, 1.0, 1.0, 0.99, 0.98, 1.0, 0.9, 0.55]
Purity: 0.9518049869743208
--------------Recalls---------------------------
Per cluster Recall: [0.351, 0.0482, 0.0081, 0.0135, 0.0827, 0.0047, 0.3281, 0.1329, 0.8629, 0.0281, 0.9474]
--------------F-measure---------------------------
F: 0.29239279534715756


In [98]:
new_labels_spectral,mapped_spectral = map_and_change(y_train,labels_spectral)

{0: 11, 1: 20, 2: 20, 3: 20, 4: 20, 5: 20, 6: 14, 7: 11, 8: 11, 9: 11, 10: 9}


In [99]:
new_labels_spectral=cluster_to_class_name(new_labels_spectral)

In [100]:
new_labels_spectral

{0: 'normal.',
 1: 'teardrop.',
 2: 'teardrop.',
 3: 'teardrop.',
 4: 'teardrop.',
 5: 'teardrop.',
 6: 'pod.',
 7: 'normal.',
 8: 'normal.',
 9: 'normal.',
 10: 'neptune.'}

In [101]:
new_labels_kmeans,mapped_kmeans = map_and_change(y_train,labels6)

{0: 11, 1: 11, 2: 11, 3: 11, 4: 11, 5: 11, 6: 11, 7: 11, 8: 9, 9: 11, 10: 5}


In [102]:
new_labels_kmeans=cluster_to_class_name(new_labels_kmeans)

In [103]:
new_labels_kmeans

{0: 'normal.',
 1: 'normal.',
 2: 'normal.',
 3: 'normal.',
 4: 'normal.',
 5: 'normal.',
 6: 'normal.',
 7: 'normal.',
 8: 'neptune.',
 9: 'normal.',
 10: 'ipsweep.'}

In [104]:
normal_spectral,abnormal_spectral=classify_normality(mapped_spectral)

In [105]:
normal_kmeans,abnormal_kmeans=classify_normality(mapped_kmeans)

In [106]:
print(f"Number of normal samples using spectral clustering= {normal_spectral}")
print(f"Number of abnormal samples using spectral clustering= {abnormal_spectral}")
print(f"Number of normal samples using kmeans clustering= {normal_kmeans}")
print(f"Number of abnormal samples using kmeans clustering= {abnormal_kmeans}")

Number of normal samples using spectral clustering= 3637
Number of abnormal samples using spectral clustering= 1737
Number of normal samples using kmeans clustering= 4296
Number of abnormal samples using kmeans clustering= 1078


## GMM Testing

In [118]:
gmm = GMM(23)
labels = gmm.fit(np.array(data_k_means))

In [119]:
labels = gmm.predict(np.array(data_k_means))

In [120]:
contingency_matrix = get_contingency(labels,labels_kmeans)
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.17864866395510653
----------------Purity---------------------------
Per cluster purity: [0.6, 1.0, 1.0, 1.0, 0.54, 0.76, 0.95, 0.76, 1.0, 1.0, 0.71, 0.87, 0.89, 1.0, 0.5, 1.0, 0.91, 0.91, 0.85, 0.89, 0.45, 0.97, 0.4]
Purity: 0.956383168711277
--------------Recalls---------------------------
Per cluster Recall: [0.0158, 0.7774, 0.0024, 0.1838, 0.0042, 0.0118, 1.0, 0.0346, 0.0, 0.6992, 0.75, 0.0539, 0.066, 0.9913, 0.125, 0.0001, 0.0001, 0.0005, 0.9811, 0.0622, 0.0233, 0.0445, 0.6]
--------------F-measure---------------------------
F: 0.30014664845251915


# Evaluation

In [75]:
def cluster_to_class_name(labels_dict):
    for key in labels_dict.keys():
        value=labels_dict[key]
        labels_dict[key]=category_map(value)
    return labels_dict

In [76]:
def classify_normality(mapped_labels):
    normal=0
    abnormal=0
    for label in mapped_labels:
        if label==11:
            normal+=1
        else:
            abnormal+=1
    return normal,abnormal

In [77]:
# map labels resulting in k-means to true labels in able to do predictions
def map_and_change(y_train, labels):
    mapping = {}
    labels = np.array(list(labels))
    for i in np.unique(labels):
        binary = [int(x) for x in labels == i]
        mapping[i] = np.bincount([value for value, flag in zip(y_train, binary) if flag == 1]).argmax()

    # Map the cluster labels to the true class labels
    mapped_labels = np.array([mapping[label] for label in labels])

    # Print the mapped labels
    print(mapping)
    return mapping, mapped_labels

In [78]:
def map_and_change_test(mapping, labels):
    mapped_labels = np.array([mapping[label] for label in labels])
    return mapped_labels

In [121]:
def get_contingency(labels,true):
    labels = list(labels)
    true = list(true)
    true_len=np.unique(np.array(true)).shape[0]
    cluster_len=np.unique(np.array(labels)).shape[0]
    num_elements = len(labels)
    contingency_matrix = np.zeros((true_len,cluster_len))
    for i in range(num_elements):
        contingency_matrix[true[i],labels[i]] += 1
    return contingency_matrix

In [84]:
def evaluation(data, contingency_matrix):
    n_total = data.shape[0]
    gt_classes=contingency_matrix.shape[0]
    predicted_classes=contingency_matrix.shape[1]
#     TP, TN, FP, FN = 0, 0, 0, 0
#     # True Positive 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if contingency_matrix[i][j] != 1 and contingency_matrix[i][j] != 0:
#                 TP += math.comb(int(contingency_matrix[i][j]),2)

#     # True Negative 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if i != j:
#                 for k in range(predicted_classes):
#                     temp = contingency_matrix[k,i]*(np.sum(contingency_matrix[:,j]) - contingency_matrix[k,j])
#                     TN += temp
#     TN = TN/2

#     # False Positive 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             temp = contingency_matrix[j,i]*(np.sum(contingency_matrix[:,i])-contingency_matrix[j,i])/2
#             FP += temp

#     # False Negative 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if i != j:
#                 for k in range(predicted_classes):
#                     temp = contingency_matrix[k,i]*(contingency_matrix[k,j])
#                     FN += temp
#     FN /= 2
#     print(f"Fake purity= {(TP)/(TP+FP)}")
#     print(f"Fake recall= {(TP)/(TP+FN)}")
#     # Jaccard Index
#     jacc = TP / (TP + FN + FP)

#     # Rand Index
#     rand = (TP + TN)/ (TP + FN + FP + TN)
#     print('---------Confusion Matrix----------------------')
#     print(f"Rand Index: {rand}")
    
#     print(f"Jaccard Index: {jacc}")
#     print(f'TP= {TP},TN= {TN},FN= {FN},FP= {FP}')
    
    ht_c = 0
    for i in range(predicted_classes):
        cluster_elem = np.sum(contingency_matrix[:,i])
        for j in range(gt_classes):  
            temp = contingency_matrix[j][i]/cluster_elem
            if temp != 0:
                ht_c += temp*math.log(temp,2)*(cluster_elem/n_total)
    ht_c = -1*ht_c
    print('---------Conditional Entropy--------------------')
    print(f"Conditional Entropy: {ht_c}")
    
    print('----------------Purity---------------------------')
    purity=0
    purities=[]
    recalls=[]
    for i in range(predicted_classes):
        cluster_sum=np.sum(contingency_matrix[:,i])
        class_max=np.max(contingency_matrix[:,i])
        a=contingency_matrix[:,i]
        max_index=a.argmax()
        purities.append(round((class_max/cluster_sum),2))
        recalls.append(round((class_max/np.sum(contingency_matrix[max_index,:])),4))
        purity+=(class_max/cluster_sum) * (cluster_sum/n_total)
    #purity = np.sum(np.max(contingency_matrix, axis =0))/np.sum(contingency_matrix)
    print(f"Per cluster purity: {purities}")
    print(f"Purity: {purity}")
    print('--------------Recalls---------------------------')
    print(f"Per cluster Recall: {recalls}")
    print('--------------F-measure---------------------------')
    # a row for each cluster, and columns are precision, recall and F-measure respectively
    
    f_measure=0
    for i in range(predicted_classes):
        f_measure+=(2*purities[i]*recalls[i])/(purities[i]+recalls[i])
    f_measure=f_measure/predicted_classes
    print(f"F: {f_measure}")
    
#     print('--------------Max matching------------------------')
#     row_ind, col_ind = linear_sum_assignment(contingency_matrix, maximize=True)
#     contingency_reordered = contingency_matrix[row_ind][:, col_ind]
#     #print(contingency_reordered)
#     max_match = np.sum(np.diag(contingency_reordered))/np.sum(contingency_matrix)
#     print(f"Max Matching: {max_match}")

In [None]:
def contingency_matrix_metrics(contingency_matrix):
    # Compute the number of true labels and the number of cluster labels
    num_true_labels, num_cluster_labels = contingency_matrix.shape

    # Compute the total number of samples
    N = contingency_matrix.sum()

    # Compute the true positives (samples correctly assigned to the same cluster as their true label)
    TP = sum([contingency_matrix[i, j] * (contingency_matrix[i, j] - 1) / 2 for i in range(num_true_labels) for j in range(num_cluster_labels)])

    # Compute the false negatives (samples incorrectly assigned to a different cluster than their true label)
    FN = sum([contingency_matrix[i, j] * (contingency_matrix[i, :].sum() - contingency_matrix[i, j]) for i in range(num_true_labels) for j in range(num_cluster_labels)])

    # Compute the false positives (samples assigned to the same cluster as another true label)
    FP = sum([contingency_matrix[i, j] * (contingency_matrix[:, j].sum() - contingency_matrix[i, j]) for i in range(num_true_labels) for j in range(num_cluster_labels)])

    # Compute the true negatives (samples correctly assigned to a different cluster than any true label)
    TN = N * (N - 1) / 2 - TP - FN - FP

    # Return the TP, FN, FP, and TN
    return TP, FN, FP, TN

# Testing K-means using Test Data set and Mapping Clusters to Classes

In [125]:
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral,test_size=0.8,train_size=0.2,stratify=labels_spectral,random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [1074992, 5374]

In [None]:
kmeans = KMeans(n_clusters=23, random_state=42)

In [None]:
kmeans.fit(X_train)

In [None]:
train_labels = kmeans.labels_

In [None]:
mapping, train_labels = map_and_change(y_train, train_labels)

In [None]:
test_labels = kmeans.predict(X_test)

In [None]:
test_labels = map_and_change_test(mapping,test_labels)

In [None]:
accuracy = accuracy_score(test_labels, y_test)

In [None]:
print(f"Accuracy: {accuracy}")