# Network Anomaly Detection using Clustering

In [294]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import random
random.seed(42)
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import SpectralClustering
import pandas as pd
import matplotlib.pyplot as plt

# 1. Importing Data and Understanding Format

In [295]:
from ipynb.fs.full.data_preprocessing import preprocess_data_10, preprocess_data, category_map

In [296]:
data_k_means, labels_kmeans = preprocess_data_10()
data_spectral, labels_spectral = preprocess_data()

In [6]:
# from collections import Counter, OrderedDict
# d = Counter(labels_spectral)
# OrderedDict(sorted(d.items()))

# 2.  Clustering Using K-Means and Normalized Cut (Your implementation)


### K-Means algorithm

In [10]:
#It takes two attrs k number of centroids and the whole data set number of samples x features

def kMeans_implemented(k,data):
    centroids=[]
    num_points=data.shape[0]
    num_features=data.shape[1]
    
    #Appending random points to be our centroids according to the number of ks
    for i in range(k):
        centroids.append(data[random.randint(0, num_points)])
    clusters={}
    t=0
    while(True):
        labels=[]
        #Initialize empty clusters
        for i in range (k):
            clusters[i]=[]
            
        #Classify the points according to the closest centroid
        for i in range(num_points):
            distances=[]
            for j in range(k):
                distances.append(np.linalg.norm(data[i]-centroids[j]))
            clusters[distances.index(min(distances))].append(data[i])
            labels.append(distances.index(min(distances)))
        new_centroids=np.zeros((k,num_features))
        
        #Measuring the new centroids
        for i in range(k):
            new_centroids[i]=np.mean(clusters[i],axis=0)
        if(centroids==new_centroids).all():
            break
        else:
            centroids=new_centroids
    return labels

### Spectral Clustering algorithm

In [11]:
from sklearn.cluster import KMeans
def spectral_clustering(A,k):
        
    #--------------computing the degree matrix-------------
    d = np.diag(np.sum(A, axis=1))

    #--------------------computing L-----------------------
    L = d-A

    #---------------------computing La---------------------
    #computing the inverse of the dgree matrix
    inv_degree = np.linalg.inv(d)
    La = np.dot(inv_degree, L)

    #---computing the eigenValues and eigenVectors of La---
    e_val, evec = np.linalg.eig(La)

    #----------sorting the eigenValues ascending----------- 
    idx = np.argsort(eval)
    e_val = e_val[idx]

    #---sorting the eigenVectors according to their corresponding eigenValues---
    evec = evec[:, idx]

    #--slicing the eigenVectors to the desired number of clusters--
    evec_new = evec[:, :k]

    #-------------normalizing the eigenVectors--------------
    system = evec.real / np.sqrt(np.linalg.norm(evec.real))

    kmeans = KMeans(n_clusters=k)
    system_labels = kmeans.fit_predict(system)


    return system, system_labels

## GMM algorithm

In [314]:
import numpy as np
from scipy.stats import multivariate_normal

class GMM:
    def __init__(self, k, max_iter=5):
        self.k = k
        self.max_iter = int(max_iter)

    def initialize(self, X):
        self.shape = X.shape
        self.n, self.m = self.shape

        self.phi = np.full(shape=self.k, fill_value=1/self.k)
        self.weights = np.full( shape=self.shape, fill_value=1/self.k)
        
        random_row = np.random.randint(low=0, high=self.n, size=self.k)
        self.mu = [  X[row_index,:] for row_index in random_row ]
        self.sigma = [ np.cov(X.T) for _ in range(self.k) ]

    def e_step(self, X):
        # E-Step: update weights and phi holding mu and sigma constant
        self.weights = self.predict_proba(X)
        self.phi = self.weights.mean(axis=0)
    
    def m_step(self, X):
        # M-Step: update mu and sigma holding phi and weights constant
        for i in range(self.k):
            weight = self.weights[:, [i]]
            total_weight = weight.sum()
            self.mu[i] = (X * weight).sum(axis=0) / total_weight
            self.sigma[i] = np.cov(X.T, 
                aweights=(weight/total_weight).flatten(), 
                bias=True)

    def fit(self, X):
        self.initialize(X)
        
        for iteration in range(self.max_iter):
            self.e_step(X)
            self.m_step(X)
            
    def predict_proba(self, X):
        likelihood = np.zeros( (self.n, self.k) )
        for i in range(self.k):
            distribution = multivariate_normal(
                mean=self.mu[i], 
                cov=self.sigma[i])
            likelihood[:,i] = distribution.pdf(X)
        
        numerator = likelihood * self.phi
        denominator = numerator.sum(axis=1)[:, np.newaxis]
        weights = numerator / denominator
        return weights
    
    def predict(self, X):
        weights = self.predict_proba(X)
        return np.argmax(weights, axis=1)

In [297]:
from sklearn.mixture import GaussianMixture

class GMM:
    def __init__(self, n_components=1, max_iter=100, tol=1e-4):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.gmm = None

    def fit(self, X):
        self.gmm = GaussianMixture(n_components=self.n_components, max_iter=self.max_iter, tol=self.tol)
        self.gmm.fit(X)

    def predict_proba(self, X):
        return self.gmm.predict_proba(X)

    def predict(self, X):
        return self.gmm.predict(X)


# Testing

## K-Means Testing

In [17]:
labels1 = kMeans_implemented(7,np.array(data_k_means))
labels2 = kMeans_implemented(15,np.array(data_k_means))
labels3 = kMeans_implemented(23,np.array(data_k_means))
labels4 = kMeans_implemented(31,np.array(data_k_means))
labels5 = kMeans_implemented(45,np.array(data_k_means))

In [144]:
#labels = kMeans_implemented(7,np.array(data_k_means))
contingency_matrix = get_contingency(labels1,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.35235793694748563
----------------Purity---------------------------
Per cluster purity: [0.63, 0.93, 0.99, 1.0, 0.98, 0.72, 0.83]
Purity: 0.9272251452749578
--------------Recalls---------------------------
Per cluster Recall: [0.189, 0.0892, 0.3665, 0.8109, 0.362, 0.0454, 0.0839]
--------------F-measure---------------------------
F: 0.37865756371357984
--------------Max matching------------------------
Max Matching: 0.5314934128281565


In [39]:
#labels = kMeans_implemented(15,np.array(data_k_means))
contingency_matrix = get_contingency(15,labels2,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.18373219846534355
----------------Purity---------------------------
Purity: 0.9736444438338849
--------------F-measure---------------------------
F: 0.41680386111088813
--------------Max matching------------------------
Max Matching: 0.42480733037517343


In [33]:
#labels = kMeans_implemented(23,np.array(data_k_means))
contingency_matrix = get_contingency(23,labels3,np.array(labels_kmeans))
evaluation(data_k_means,contingency_matrix)

---------Confusion Matrix----------------------
Rand Index: 0.6102798232289484
Jaccard Index: 0.21108268697057267
TP= 1105045771,TN= 5362436707.0,FN= 4097006802.0,FP= 33079625.0
---------Conditional Entropy--------------------
Conditional Entropy: 0.15197479620500717
----------------Purity---------------------------
Purity: 0.9755677056859864
--------------F-measure---------------------------
F: 0.2899171883013299
--------------Max matching------------------------
Max Matching: 0.33036830464467737


In [40]:
#labels = kMeans_implemented(31,np.array(data_k_means))
contingency_matrix = get_contingency(31,labels4,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.1373032330683246
----------------Purity---------------------------
Purity: 0.9748739576607641
--------------F-measure---------------------------
F: 0.2474324622398687
--------------Max matching------------------------
Max Matching: 0.29583888560713256


In [41]:
#labels = kMeans_implemented(45,np.array(data_k_means))
contingency_matrix = get_contingency(45,labels5,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.14802197270368977
----------------Purity---------------------------
Purity: 0.9694476117209071
--------------F-measure---------------------------
F: 0.16965070847395466
--------------Max matching------------------------
Max Matching: 0.21982882969516299


## Spectral Clustering Testing

In [71]:
# Splitting data to use it in spectral clustering
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [72]:
# from collections import Counter, OrderedDict
# d = Counter(y_train)
# OrderedDict(sorted(d.items()))

In [73]:
# from collections import Counter, OrderedDict
# d = Counter(y_test)
# OrderedDict(sorted(d.items()))

In [74]:
# Explore the classes in training dataset after splitting
for i in y_train.unique():
    print(category_map(i))

neptune.
normal.
teardrop.
ipsweep.
back.
satan.
smurf.
portsweep.
pod.
nmap.
warezclient.


In [75]:
sim_matrix=rbf_kernel(X_train)

In [76]:
system,labels_spectral=spectral_clustering(sim_matrix,11)

In [77]:
dict_y={}
y=list(y_train.unique())
for i in range (len(y)):
    dict_y[y_train.unique()[i]]=i

In [78]:
new_y_train=np.array([dict_y[label] for label in y_train])

In [131]:
contingency_matrix = get_contingency(labels_spectral,new_y_train)
evaluation(X_train,contingency_matrix)

---------Confusion Matrix----------------------
Rand Index: 0.6721302414150727
Jaccard Index: 0.5180510746707863
TP= 5088121,TN= 4615592.0,FN= 3897270.0,FP= 836268.0
---------Conditional Entropy--------------------
Conditional Entropy: 0.5290578842897368
----------------Purity---------------------------
Per cluster purity: [0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.75, 0.65, 0.61]
Purity: 0.8286192780052105
--------------Recalls---------------------------
Per cluster Recall: [0.7179, 0.2, 0.2, 0.2, 0.2, 0.2, 1.0, 0.0015, 0.0347, 0.0849, 0.8571]
--------------F-measure---------------------------
F: 0.4028356780262166
--------------Max matching------------------------
Max Matching: 0.7415333085225159


## Comparison between K-means and Spectral Clustering

In [130]:
labels6 = kMeans_implemented(11,np.array(X_train))
contingency_matrix = get_contingency(labels6,new_y_train)
evaluation(X_train,contingency_matrix)

---------Confusion Matrix----------------------
Rand Index: 0.5630365503792931
Jaccard Index: 0.3000355943197324
TP= 2704123,TN= 5424577.0,FN= 6281268.0,FP= 27283.0
---------Conditional Entropy--------------------
Conditional Entropy: 0.11399033527778417
----------------Purity---------------------------
Per cluster purity: [0.74, 0.91, 0.95, 0.98, 1.0, 0.98, 1.0, 0.77, 1.0, 1.0, 0.85]
Purity: 0.9813918868626721
--------------Recalls---------------------------
Per cluster Recall: [0.0042, 0.033, 0.0837, 0.1351, 0.8629, 0.0482, 0.0047, 0.0116, 0.3524, 0.3271, 0.1371]
--------------F-measure---------------------------
F: 0.2512789124016813
--------------Max matching------------------------
Max Matching: 0.4719017491626349


In [121]:
new_labels_spectral,mapped_spectral = map_and_change(y_train,labels_spectral)

{0: 11, 1: 20, 2: 20, 3: 20, 4: 20, 5: 20, 6: 14, 7: 11, 8: 11, 9: 11, 10: 9}


In [122]:
new_labels_spectral=cluster_to_class_name(new_labels_spectral)

In [123]:
new_labels_spectral

{0: 'normal.',
 1: 'teardrop.',
 2: 'teardrop.',
 3: 'teardrop.',
 4: 'teardrop.',
 5: 'teardrop.',
 6: 'pod.',
 7: 'normal.',
 8: 'normal.',
 9: 'normal.',
 10: 'neptune.'}

In [88]:
new_labels_kmeans,mapped_kmeans = map_and_change(y_train,labels6)

{0: 11, 1: 11, 2: 11, 3: 11, 4: 11, 5: 9, 6: 11, 7: 11, 8: 9, 9: 11, 10: 11}


In [125]:
new_labels_kmeans=cluster_to_class_name(new_labels_kmeans)

In [126]:
new_labels_kmeans

{0: 'normal.',
 1: 'normal.',
 2: 'normal.',
 3: 'normal.',
 4: 'normal.',
 5: 'neptune.',
 6: 'normal.',
 7: 'normal.',
 8: 'neptune.',
 9: 'normal.',
 10: 'normal.'}

In [96]:
normal_spectral,abnormal_spectral=classify_normality(mapped_spectral)

In [100]:
normal_kmeans,abnormal_kmeans=classify_normality(mapped_kmeans)

In [128]:
print(f"Number of normal samples using spectral clustering= {normal_spectral}")
print(f"Number of abnormal samples using spectral clustering= {abnormal_spectral}")
print(f"Number of normal samples using kmeans clustering= {normal_kmeans}")
print(f"Number of abnormal samples using kmeans clustering= {abnormal_kmeans}")

Number of normal samples using spectral clustering= 3656
Number of abnormal samples using spectral clustering= 1718
Number of normal samples using kmeans clustering= 4149
Number of abnormal samples using kmeans clustering= 1225


## GMM Testing

In [308]:
gmm = GMM(23)
labels = gmm.fit(np.array(data_k_means))

In [311]:
labels = gmm.predict(np.array(data_k_means))

In [313]:
contingency_matrix = get_contingency(labels,labels_kmeans)
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.24063773251691306
----------------Purity---------------------------
Per cluster purity: [1.0, 1.0, 0.78, 0.99, 1.0, 1.0, 0.39, 0.9, 0.98, 1.0, 0.85, 0.91, 0.95, 0.87, 1.0, 0.5, 0.51, 0.93, 0.81, 0.71, 0.49, 0.85, 0.91]
Purity: 0.9529625101314686
--------------Recalls---------------------------
Per cluster Recall: [0.2508, 0.0683, 0.0191, 0.4894, 0.0001, 0.0, 0.6, 0.0503, 0.8862, 0.0024, 0.0437, 0.0364, 1.0, 0.0546, 0.9913, 0.125, 0.0034, 0.0394, 0.0008, 0.75, 0.0241, 0.9811, 0.0005]
--------------F-measure---------------------------
F: 0.3009557073048685
--------------Max matching------------------------
Max Matching: 0.6357616803813554


# Evaluation

In [298]:
def cluster_to_class_name(labels_dict):
    for key in labels_dict.keys():
        value=labels_dict[key]
        labels_dict[key]=category_map(value)
    return labels_dict

In [299]:
def classify_normality(mapped_labels):
    normal=0
    abnormal=0
    for label in mapped_labels:
        if label==11:
            normal+=1
        else:
            abnormal+=1
    return normal,abnormal

In [300]:
# map labels resulting in k-means to true labels in able to do predictions
def map_and_change(y_train, labels):
    mapping = {}
    labels = np.array(list(labels))
    for i in np.unique(labels):
        binary = [int(x) for x in labels == i]
        mapping[i] = np.bincount([value for value, flag in zip(y_train, binary) if flag == 1]).argmax()

    # Map the cluster labels to the true class labels
    mapped_labels = np.array([mapping[label] for label in labels])

    # Print the mapped labels
    print(mapping)
    return mapping, mapped_labels

In [301]:
def map_and_change_test(mapping, labels):
    mapped_labels = np.array([mapping[label] for label in labels])
    return mapped_labels

In [302]:
new_labels,mapped = map_and_change(y_train,labels)

{9: 9, 11: 11, 20: 20}


In [303]:
def get_contingency(labels,true):
    labels = list(labels)
    true = list(true)
    true_len=np.unique(np.array(true)).shape[0]
    cluster_len=np.unique(np.array(labels)).shape[0]
    num_elements = len(labels)
    contingency_matrix = np.zeros((true_len,cluster_len))
    for i in range(num_elements):
        contingency_matrix[true[i],labels[i]] += 1
    return contingency_matrix

In [304]:
def evaluation(data, contingency_matrix):
    n_total = data.shape[0]
    gt_classes=contingency_matrix.shape[0]
    predicted_classes=contingency_matrix.shape[1]
#     TP, TN, FP, FN = 0, 0, 0, 0
#     # True Positive 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if contingency_matrix[i][j] != 1 and contingency_matrix[i][j] != 0:
#                 TP += math.comb(int(contingency_matrix[i][j]),2)

#     # True Negative 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if i != j:
#                 for k in range(predicted_classes):
#                     temp = contingency_matrix[k,i]*(np.sum(contingency_matrix[:,j]) - contingency_matrix[k,j])
#                     TN += temp
#     TN = TN/2

#     # False Positive 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             temp = contingency_matrix[j,i]*(np.sum(contingency_matrix[:,i])-contingency_matrix[j,i])/2
#             FP += temp

#     # False Negative 
#     for i in range(gt_classes):
#         for j in range(predicted_classes):
#             if i != j:
#                 for k in range(predicted_classes):
#                     temp = contingency_matrix[k,i]*(contingency_matrix[k,j])
#                     FN += temp
#     FN /= 2

#     # Jaccard Index
#     jacc = TP / (TP + FN + FP)

#     # Rand Index
#     rand = (TP + TN)/ (TP + FN + FP + TN)
#     print('---------Confusion Matrix----------------------')
#     print(f"Rand Index: {rand}")
    
#     print(f"Jaccard Index: {jacc}")
#     print(f'TP= {TP},TN= {TN},FN= {FN},FP= {FP}')
    
    ht_c = 0
    for i in range(predicted_classes):
        cluster_elem = np.sum(contingency_matrix[:,i])
        for j in range(gt_classes):  
            temp = contingency_matrix[j][i]/cluster_elem
            if temp != 0:
                ht_c += temp*math.log(temp,2)*(cluster_elem/n_total)
    ht_c = -1*ht_c
    print('---------Conditional Entropy--------------------')
    print(f"Conditional Entropy: {ht_c}")
    
    print('----------------Purity---------------------------')
    purity=0
    purities=[]
    recalls=[]
    for i in range(predicted_classes):
        cluster_sum=np.sum(contingency_matrix[:,i])
        class_max=np.max(contingency_matrix[:,i])
        a=contingency_matrix[:,i]
        max_index=a.argmax()
        purities.append(round((class_max/cluster_sum),2))
        recalls.append(round((class_max/np.sum(contingency_matrix[max_index,:])),4))
        purity+=(class_max/cluster_sum) * (cluster_sum/n_total)
    #purity = np.sum(np.max(contingency_matrix, axis =0))/np.sum(contingency_matrix)
    print(f"Per cluster purity: {purities}")
    print(f"Purity: {purity}")
    print('--------------Recalls---------------------------')
    print(f"Per cluster Recall: {recalls}")
    print('--------------F-measure---------------------------')
    # a row for each cluster, and columns are precision, recall and F-measure respectively
    
    f_measure=0
    for i in range(predicted_classes):
        f_measure+=(2*purities[i]*recalls[i])/(purities[i]+recalls[i])
    f_measure=f_measure/predicted_classes
    print(f"F: {f_measure}")
    
    print('--------------Max matching------------------------')
    row_ind, col_ind = linear_sum_assignment(contingency_matrix, maximize=True)
    contingency_reordered = contingency_matrix[row_ind][:, col_ind]
    #print(contingency_reordered)
    max_match = np.sum(np.diag(contingency_reordered))/np.sum(contingency_matrix)
    print(f"Max Matching: {max_match}")

# Testing K-means using Test Data set and Mapping Clusters to Classes

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [None]:
kmeans = KMeans(n_clusters=23, random_state=42)

In [None]:
kmeans.fit(X_train)

In [None]:
train_labels = kmeans.labels_

In [None]:
mapping, train_labels = map_and_change(y_train, train_labels)

In [None]:
test_labels = kmeans.predict(X_test)

In [None]:
test_labels = map_and_change_test(mapping,test_labels)

In [None]:
accuracy = accuracy_score(test_labels, y_test)

In [None]:
print(f"Accuracy: {accuracy}")