# Network Anomaly Detection using Clustering

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import random
random.seed(42)
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import SpectralClustering
import pandas as pd
import matplotlib.pyplot as plt

# 1. Importing Data and Understanding Format

In [3]:
from ipynb.fs.full.data_preprocessing import preprocess_data_10, preprocess_data

In [4]:
# Load two (features, labels) pairs via the helpers imported from the
# data_preprocessing notebook: one pair feeds the k-means experiments, the
# other feeds the spectral-clustering / GMM experiments.
# NOTE(review): presumably preprocess_data_10 loads a reduced ("10%") subset
# and preprocess_data a different sample -- confirm in data_preprocessing.
data_k_means, labels_kmeans = preprocess_data_10()
data_spectral, labels_spectral = preprocess_data()

In [16]:
from collections import Counter, OrderedDict
# Count how many samples carry each label in labels_spectral and display the
# counts ordered by label id (the cell's last expression is its output).
d = Counter(labels_spectral)
OrderedDict(sorted(d.items()))


OrderedDict([(0, 968),
             (1, 30),
             (2, 8),
             (3, 53),
             (4, 12),
             (5, 3723),
             (6, 19),
             (7, 9),
             (8, 7),
             (9, 242149),
             (10, 1554),
             (11, 812814),
             (12, 3),
             (13, 4),
             (14, 206),
             (15, 3564),
             (16, 10),
             (17, 5019),
             (18, 3007),
             (19, 2),
             (20, 918),
             (21, 893),
             (22, 20)])

# 2.  Clustering Using K-Means and Normalized Cut (Your implementation)


### K-Means algorithm

In [4]:
# kMeans_implemented takes two arguments: k, the number of centroids, and the full data set as an (n_samples x n_features) array.

def kMeans_implemented(k,data):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means algorithm.

    Initial centroids are ``k`` samples drawn (with replacement) using the
    ``random`` module, so seeding ``random`` makes a run reproducible.
    Iteration stops when an update leaves every centroid unchanged.

    Parameters
    ----------
    k : int
        Number of centroids / clusters (must be >= 1).
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.

    Returns
    -------
    list of int
        Cluster index in ``range(k)`` for every sample, in input order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains no samples.
    """
    points = np.asarray(data, dtype=float)
    num_points = points.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1")
    if num_points == 0:
        raise ValueError("data must contain at least one sample")

    # randrange excludes its upper bound; randint(0, num_points) could return
    # num_points itself and index one past the end of the data.
    centroids = np.array([points[random.randrange(num_points)] for _ in range(k)])

    while True:
        # Assign every point to its closest centroid, one centroid at a time
        # so only O(n_samples) extra memory is needed. The strict '<' keeps
        # the lowest centroid index on ties.
        labels = np.zeros(num_points, dtype=int)
        best_distance = np.full(num_points, np.inf)
        for j in range(k):
            distance = np.linalg.norm(points - centroids[j], axis=1)
            closer = distance < best_distance
            labels[closer] = j
            best_distance[closer] = distance[closer]

        # Recompute centroids. A cluster that received no points keeps its
        # previous centroid: the mean of an empty set is NaN, and a NaN
        # centroid never compares equal, so the loop would never terminate.
        new_centroids = centroids.copy()
        for j in range(k):
            members = points[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)

        if np.array_equal(new_centroids, centroids):
            break
        centroids = new_centroids

    return labels.tolist()

### Spectral Clustering algorithm

In [5]:
from sklearn.cluster import KMeans
def spectral_clustering(A,k):
    """Spectral clustering with the asymmetric normalized Laplacian.

    Builds La = D^-1 (D - A) from the similarity matrix, embeds every sample
    using the eigenvectors of the ``k`` smallest eigenvalues, scales each
    embedded row to unit length and clusters the rows with scikit-learn's
    ``KMeans``.

    Parameters
    ----------
    A : array-like of shape (n_samples, n_samples)
        Pairwise similarity (affinity) matrix.
    k : int
        Number of clusters, and number of eigenvectors kept.

    Returns
    -------
    system : numpy.ndarray of shape (n_samples, k)
        Row-normalized spectral embedding that was clustered.
    system_labels : numpy.ndarray of shape (n_samples,)
        Cluster index assigned to each sample by ``KMeans``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a sample has zero total similarity, so the degree matrix cannot
        be inverted.
    """
    A = np.asarray(A, dtype=float)

    #--------------computing the degree matrix-------------
    degrees = np.sum(A, axis=1)
    if np.any(degrees == 0):
        raise np.linalg.LinAlgError("Singular matrix: a sample has zero degree")
    d = np.diag(degrees)

    #--------------------computing L-----------------------
    L = d - A

    #---------------------computing La---------------------
    # The degree matrix is diagonal, so multiplying by its inverse is just a
    # division of every row of L by that row's degree.
    La = L / degrees[:, np.newaxis]

    #---computing the eigenValues and eigenVectors of La---
    e_val, evec = np.linalg.eig(La)

    #----------sorting the eigenValues ascending-----------
    # Sort the eigenvalues themselves (not the builtin ``eval``). La is not
    # symmetric, so ``eig`` may return a complex dtype; order by real part.
    idx = np.argsort(e_val.real)

    #--keeping the eigenVectors of the k smallest eigenValues--
    evec_new = evec[:, idx[:k]].real

    #-------------normalizing the rows of the embedding--------------
    row_norms = np.linalg.norm(evec_new, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0  # leave all-zero rows untouched
    system = evec_new / row_norms

    kmeans = KMeans(n_clusters=k)
    system_labels = kmeans.fit_predict(system)

    return system, system_labels

### GMM algorithm

In [None]:
from sklearn.mixture import GaussianMixture

# NOTE: X_train is produced by the train_test_split cell in the
# "Spectral Clustering Testing" section; that cell must run before this one.

# Gaussian Mixture Model with 23 components, one per class id (0-22) in the
# label histogram above. random_state is fixed because the notebook only
# seeds the `random` module, which scikit-learn does not use.
gmm = GaussianMixture(n_components=23, random_state=42)

# Fit the model to the data
gmm.fit(X_train)

# Predict the cluster (mixture component) index for every training sample
labels = gmm.predict(X_train)


In [None]:
# Relabel each cluster with the most frequent true class among its members.
# map_and_change is defined in the "Evaluation" section further down, so that
# cell has to be executed before this one. Note that `labels` is overwritten
# with the mapped class labels.
mapping, labels = map_and_change(y_train,labels)

# Testing

## K-Means Testing

In [17]:
# Run the hand-written k-means once for every cluster count under study;
# the results are evaluated one by one in the cells below.
labels1, labels2, labels3, labels4, labels5 = [
    kMeans_implemented(n_clusters, np.array(data_k_means))
    for n_clusters in (7, 15, 23, 31, 45)
]

In [38]:
# Score the k=7 run (labels1, computed in the cell above) against the true
# labels. get_contingency and evaluation are defined in the "Evaluation"
# section further down and must be executed first.
contingency_matrix = get_contingency(7,labels1,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.3560348999697181
----------------Purity---------------------------
Purity: 0.9272320140672868
--------------F-measure---------------------------
F: 0.37852397344067573
--------------Max matching------------------------
Max Matching: 0.5359237838803181


In [39]:
# Score the k=15 run (labels2, computed earlier) against the true labels.
contingency_matrix = get_contingency(15,labels2,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.18373219846534355
----------------Purity---------------------------
Purity: 0.9736444438338849
--------------F-measure---------------------------
F: 0.41680386111088813
--------------Max matching------------------------
Max Matching: 0.42480733037517343


In [33]:
# Score the k=23 run (labels3, computed earlier) against the true labels.
# Unlike the sibling cells, data_k_means is passed without np.array();
# evaluation only uses it to obtain the number of samples.
contingency_matrix = get_contingency(23,labels3,np.array(labels_kmeans))
evaluation(data_k_means,contingency_matrix)

---------Confusion Matrix----------------------
Rand Index: 0.6102798232289484
Jaccard Index: 0.21108268697057267
TP= 1105045771,TN= 5362436707.0,FN= 4097006802.0,FP= 33079625.0
---------Conditional Entropy--------------------
Conditional Entropy: 0.15197479620500717
----------------Purity---------------------------
Purity: 0.9755677056859864
--------------F-measure---------------------------
F: 0.2899171883013299
--------------Max matching------------------------
Max Matching: 0.33036830464467737


In [40]:
# Score the k=31 run (labels4, computed earlier) against the true labels.
contingency_matrix = get_contingency(31,labels4,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.1373032330683246
----------------Purity---------------------------
Purity: 0.9748739576607641
--------------F-measure---------------------------
F: 0.2474324622398687
--------------Max matching------------------------
Max Matching: 0.29583888560713256


In [41]:
# Score the k=45 run (labels5, computed earlier) against the true labels.
contingency_matrix = get_contingency(45,labels5,np.array(labels_kmeans))
evaluation(np.array(data_k_means),contingency_matrix)

---------Conditional Entropy--------------------
Conditional Entropy: 0.14802197270368977
----------------Purity---------------------------
Purity: 0.9694476117209071
--------------F-measure---------------------------
F: 0.16965070847395466
--------------Max matching------------------------
Max Matching: 0.21982882969516299


## Spectral Clustering Testing

In [17]:
# Splitting data to use it in spectral clustering.
# Only 0.5% of the samples are kept for training because the RBF similarity
# matrix built from X_train below is n_samples x n_samples. The split is
# stratified on the labels and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [18]:
from collections import Counter, OrderedDict
# Per-class sample counts in the training split, ordered by label id.
# The recorded output shows that only 11 of the 23 classes are present.
d = Counter(y_train)
OrderedDict(sorted(d.items()))


OrderedDict([(0, 5),
             (5, 19),
             (9, 1211),
             (10, 8),
             (11, 4063),
             (14, 1),
             (15, 18),
             (17, 25),
             (18, 15),
             (20, 5),
             (21, 4)])

In [19]:
from collections import Counter, OrderedDict
# Per-class sample counts in the test split, ordered by label id.
d = Counter(y_test)
OrderedDict(sorted(d.items()))


OrderedDict([(0, 963),
             (1, 30),
             (2, 8),
             (3, 53),
             (4, 12),
             (5, 3704),
             (6, 19),
             (7, 9),
             (8, 7),
             (9, 240938),
             (10, 1546),
             (11, 808751),
             (12, 3),
             (13, 4),
             (14, 205),
             (15, 3546),
             (16, 10),
             (17, 4994),
             (18, 2992),
             (19, 2),
             (20, 913),
             (21, 889),
             (22, 20)])

In [None]:
# Pairwise RBF (Gaussian) similarity between all training samples; no gamma
# is passed, so rbf_kernel's default is used.
sim_matrix=rbf_kernel(X_train)

In [None]:
# Run the hand-written spectral clustering on the similarity matrix with
# 23 clusters, one per class id in the full label set.
system,labels=spectral_clustering(sim_matrix,23)

In [None]:
# Score the spectral clustering result against the training labels.
# get_contingency and evaluation are defined in the "Evaluation" section
# further down and must be executed first.
contingency_matrix = get_contingency(23,labels,y_train)
evaluation(X_train,contingency_matrix)

## Comparison between K-means and Spectral Clustering

In [None]:
# Run the hand-written k-means on the same training sample, with the same
# number of clusters, so its scores are comparable with spectral clustering.
# Note that `labels` is overwritten here.
labels = kMeans_implemented(23,np.array(X_train))
contingency_matrix = get_contingency(23,labels,np.array(y_train))
evaluation(X_train,contingency_matrix)

## GMM Testing

In [None]:
# `gmm` is the GaussianMixture instance (23 components) created in the
# "GMM algorithm" section. An estimator object is not callable, so the
# cluster assignments are obtained with fit_predict.
labels = gmm.fit_predict(np.array(X_train))
contingency_matrix = get_contingency(23,labels,np.array(y_train))
evaluation(X_train,contingency_matrix)

# Evaluation

In [6]:
# Map the cluster labels produced by k-means to true class labels so that the clusters can be used for prediction.
def map_and_change(y_train, labels):
    """Relabel clusters with the majority true class of their members.

    Parameters
    ----------
    y_train : iterable of non-negative int
        True class label of every sample.
    labels : iterable
        Cluster id of every sample, in the same order as ``y_train``.

    Returns
    -------
    mapping : dict
        Cluster id -> most frequent true class inside that cluster.
    mapped_labels : numpy.ndarray
        ``labels`` with every cluster id replaced through ``mapping``.

    The mapping is also printed.
    """
    cluster_ids = np.array(list(labels))

    mapping = {}
    for cluster_id in np.unique(cluster_ids):
        # True classes of the samples that fell into this cluster.
        member_classes = [
            true_class
            for true_class, assigned in zip(y_train, cluster_ids)
            if assigned == cluster_id
        ]
        # bincount + argmax picks the most frequent class (lowest id on ties).
        mapping[cluster_id] = np.bincount(member_classes).argmax()

    # Translate every cluster id into its majority class.
    mapped_labels = np.array([mapping[cluster_id] for cluster_id in cluster_ids])

    # Show the cluster -> class mapping.
    print(mapping)
    return mapping, mapped_labels

In [7]:
def map_and_change_test(mapping, labels):
    """Translate cluster ids into class labels with an existing mapping.

    ``mapping`` is a cluster-id -> class-label dict and ``labels`` is an
    iterable of cluster ids. Returns a numpy array holding the mapped label
    of every entry; an id missing from ``mapping`` raises ``KeyError``.
    """
    translated = []
    for cluster_id in labels:
        translated.append(mapping[cluster_id])
    return np.array(translated)

In [8]:
# NOTE(review): map_and_change returns a (mapping, mapped_labels) tuple, so
# new_labels holds that whole tuple rather than just the labels. The recorded
# output is a NameError because y_train had not been created when this cell
# ran; it needs the train_test_split cell to be executed first.
new_labels = map_and_change(y_train,labels)

NameError: name 'y_train' is not defined

In [18]:
def get_contingency(k,labels,true,num_classes=23):
    """Build the class-by-cluster contingency matrix.

    Parameters
    ----------
    k : int
        Number of clusters (columns of the matrix).
    labels : iterable of int
        Cluster id of every sample, each in ``range(k)``.
    true : iterable of int
        True class id of every sample, each in ``range(num_classes)``.
    num_classes : int, optional
        Number of ground-truth classes (rows of the matrix). Defaults to 23,
        the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray of shape (num_classes, k)
        Entry ``[c, j]`` is the number of samples of class ``c`` that were
        assigned to cluster ``j``.

    Raises
    ------
    ValueError
        If ``labels`` and ``true`` do not have the same length.
    IndexError
        If a class or cluster id falls outside the matrix.
    """
    cluster_ids = np.asarray(list(labels), dtype=int)
    class_ids = np.asarray(list(true), dtype=int)
    if cluster_ids.shape != class_ids.shape:
        raise ValueError("labels and true must have the same length")

    contingency_matrix = np.zeros((num_classes, k))
    # np.add.at accumulates correctly when the same (class, cluster) pair
    # occurs many times; plain fancy-index '+=' would count each pair once.
    np.add.at(contingency_matrix, (class_ids, cluster_ids), 1)
    return contingency_matrix

In [36]:
def evaluation(data, contingency_matrix):
    """Print external clustering metrics derived from a contingency matrix.

    Four metrics are printed: conditional entropy of the classes given the
    clusters, purity, the mean per-cluster F-measure, and the maximum
    matching score.

    Clusters that received no samples are skipped. They add nothing to the
    entropy or purity sums and are left out of the F-measure average;
    dividing by their zero size would turn every metric into NaN.

    Pair-counting metrics (Rand / Jaccard index) are not computed here.

    Parameters
    ----------
    data : sized collection
        The clustered samples; only ``len(data)`` (the sample count) is used.
    contingency_matrix : array-like of shape (n_classes, n_clusters)
        Entry ``[c, j]`` counts the samples of class ``c`` in cluster ``j``.

    Returns
    -------
    None
        The metrics are written to standard output.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    contingency_matrix = np.asarray(contingency_matrix, dtype=float)
    n_total = len(data)
    if n_total == 0:
        raise ValueError("evaluation() needs at least one sample in data")

    cluster_sizes = contingency_matrix.sum(axis=0)
    class_sizes = contingency_matrix.sum(axis=1)
    occupied_clusters = np.flatnonzero(cluster_sizes > 0)

    # Conditional entropy H(T|C): entropy of the class distribution inside
    # each cluster, weighted by the cluster's share of the samples.
    ht_c = 0.0
    for i in occupied_clusters:
        class_probs = contingency_matrix[:, i] / cluster_sizes[i]
        class_probs = class_probs[class_probs > 0]  # 0 * log(0) counts as 0
        ht_c -= (cluster_sizes[i] / n_total) * np.sum(class_probs * np.log2(class_probs))
    print('---------Conditional Entropy--------------------')
    print(f"Conditional Entropy: {float(ht_c)}")

    print('----------------Purity---------------------------')
    purity = 0.0
    purities = []
    recalls = []
    for i in occupied_clusters:
        column = contingency_matrix[:, i]
        majority_class = column.argmax()
        class_max = column[majority_class]
        precision = class_max / cluster_sizes[i]
        purities.append(precision)
        # Recall of the cluster with respect to its majority class.
        recalls.append(class_max / class_sizes[majority_class])
        purity += precision * (cluster_sizes[i] / n_total)
    print(f"Purity: {float(purity)}")

    print('--------------F-measure---------------------------')
    # Harmonic mean of precision (purity) and recall per cluster, averaged
    # over the non-empty clusters.
    f_scores = [
        (2 * precision * recall) / (precision + recall)
        for precision, recall in zip(purities, recalls)
    ]
    f_measure = sum(f_scores) / len(f_scores) if f_scores else 0.0
    print(f"F: {float(f_measure)}")

    print('--------------Max matching------------------------')
    # One-to-one class/cluster assignment that maximises the matched count.
    row_ind, col_ind = linear_sum_assignment(contingency_matrix, maximize=True)
    matrix_total = contingency_matrix.sum()
    matched = contingency_matrix[row_ind, col_ind].sum()
    max_match = matched / matrix_total if matrix_total else 0.0
    print(f"Max Matching: {float(max_match)}")

# Testing K-means using Test Data set and Mapping Clusters to Classes

In [None]:
# Same stratified, seeded 0.5% / 99.5% split as in the spectral clustering
# section: the clusters are learned on the small training part and their
# class mapping is then scored on the large held-out part.
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [None]:
# scikit-learn's KMeans is used here (not kMeans_implemented) because
# predict() is needed to assign unseen test samples to the learned clusters.
kmeans = KMeans(n_clusters=23, random_state=42)

In [None]:
# Learn the 23 cluster centres from the training split only.
kmeans.fit(X_train)

In [None]:
# Cluster index assigned to every training sample during fit().
train_labels = kmeans.labels_

In [None]:
# Build the cluster -> majority-class mapping from the training split.
# train_labels is overwritten: it now holds mapped class labels instead of
# raw cluster indices.
mapping, train_labels = map_and_change(y_train, train_labels)

In [None]:
# Assign every held-out sample to its nearest learned cluster centre.
test_labels = kmeans.predict(X_test)

In [None]:
# Convert the test cluster indices into class labels with the mapping
# learned on the training split. test_labels is overwritten.
test_labels = map_and_change_test(mapping,test_labels)

In [None]:
# accuracy_score expects (y_true, y_pred). The value is the same in either
# order, but the documented order keeps the call correct if it is later
# swapped for an asymmetric metric.
accuracy = accuracy_score(y_test, test_labels)

In [None]:
# Fraction of held-out samples whose mapped cluster label equals the true label.
print(f"Accuracy: {accuracy}")