# Network Anomaly Detection using Clustering

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import random
random.seed(42)
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
import numpy as np
import math
from scipy.optimize import linear_sum_assignment

# 1. Importing Data and Understanding Format

In [19]:
from ipynb.fs.full.data_preprocessing import preprocess_data_10, preprocess_data

In [20]:
data_k_means, labels_kmeans = preprocess_data_10()
data_spectral, labels_spectral = preprocess_data()

# 2.  Clustering Using K-Means and Normalized Cut (Your implementation)


We are given a dataset of network traffic data represented as feature vectors of 41 dimensions. Our goal is to cluster this data using K-Means and identify any anomalies present in the data. 

The K-Means algorithm is a popular clustering algorithm that partitions the data into K clusters based on their similarities. It works by iteratively assigning each data point to its nearest cluster centroid and updating the centroids based on the newly assigned data points. This process continues until the centroids no longer move significantly.

To perform K-Means clustering on the network traffic data, we will vary the value of K between 7, 15, 23, 31, and 45 clusters. This will produce different sets of clusters, allowing us to analyze the data at different levels of granularity. We will then evaluate the clusters to identify any anomalies present in the data.



In [42]:
#It takes two attrs k number of centroids and the whole data set number of samples x features

def kMeans_implemented(k,data):
    centroids=[]
    num_points=data.shape[0]
    num_features=data.shape[1]
    
    #Appending random points to be our centroids according to the number of ks
    for i in range(k):
        centroids.append(data[random.randint(0, num_points)])
    clusters={}
    t=0
    while(True):
        labels=[]
        #Initialize empty clusters
        for i in range (k):
            clusters[i]=[]
            
        #Classify the points according to the closest centroid
        for i in range(num_points):
            distances=[]
            for j in range(k):
                distances.append(np.linalg.norm(data[i]-centroids[j]))
            clusters[distances.index(min(distances))].append(data[i])
            labels.append(distances.index(min(distances)))
        new_centroids=np.zeros((k,num_features))
        
        #Measuring the new centroids
        for i in range(k):
            new_centroids[i]=np.mean(clusters[i],axis=0)
        if(centroids==new_centroids).all():
            break
        else:
            centroids=new_centroids
    return labels

# description missing

In [43]:
from sklearn.cluster import KMeans
def spectral_clustering(A,k):
        
    #--------------computing the degree matrix-------------
    d = np.diag(np.sum(A, axis=1))

    #--------------------computing L-----------------------
    L = d-A

    #---------------------computing La---------------------
    #computing the inverse of the dgree matrix
    inv_degree = np.linalg.inv(d)
    La = np.dot(inv_degree, L)

    #---computing the eigenValues and eigenVectors of La---
    e_val, evec = np.linalg.eig(La)

    #----------sorting the eigenValues ascending----------- 
    idx = np.argsort(eval)
    e_val = e_val[idx]

    #---sorting the eigenVectors according to their corresponding eigenValues---
    evec = evec[:, idx]

    #--slicing the eigenVectors to the desired number of clusters--
    evec_new = evec[:, :k]

    #-------------normalizing the eigenVectors--------------
    system = evec.real / np.sqrt(np.linalg.norm(evec.real))

    kmeans = KMeans(n_clusters=k)
    system_labels = kmeans.fit_predict(system)


    return system, system_labels

In [46]:
X_train, X_test, y_train, y_test = train_test_split(data_spectral, labels_spectral, test_size=0.995, train_size=0.005,stratify=labels_spectral,random_state=42)

In [47]:
sim_matrix=rbf_kernel(X_train)

In [48]:
system,labels=spectral_clustering(sim_matrix,23)

# Evaluation

In [58]:
def evaluation(data,contingency_matrix):
    
    n_total = data.shape[0]
    gt_classes=contingency_matrix.shape[0]
    predicted_classes=contingency_matrix.shape[1]
    TP, TN, FP, FN = 0, 0, 0, 0
    # True Positive 
    for i in range(gt_classes):
        for j in range(predicted_classes):
            if contingency_matrix[i][j] != 1 and contingency_matrix[i][j] != 0:
                TP += math.comb(int(contingency_matrix[i][j]),2)

    # True Negative 
    for i in range(gt_classes):
        for j in range(predicted_classes):
            if i != j:
                for k in range(predicted_classes):
                    temp = contingency_matrix[k,i]*(np.sum(contingency_matrix[:,j]) - contingency_matrix[k,j])
                    TN += temp
    TN = TN/2

    # False Positive 
    for i in range(gt_classes):
        for j in range(predicted_classes):
            temp = contingency_matrix[j,i]*(np.sum(contingency_matrix[:,i])-contingency_matrix[j,i])/2
            FP += temp

    # False Negative 
    for i in range(gt_classes):
        for j in range(predicted_classes):
            if i != j:
                for k in range(predicted_classes):
                    temp = contingency_matrix[k,i]*(contingency_matrix[k,j])
                    FN += temp
    FN /= 2

    # Jaccard Index
    jacc = TP / (TP + FN + FP)

    # Rand Index
    rand = (TP + TN)/ (TP + FN + FP + TN)
    print('---------Confusion Matrix----------')
    print(f"Rand Index: {rand}")
    
    print(f"Jaccard Index: {jacc}")
    print(f'TP= {TP},TN= {TN},FN= {FN},FP= {FP}')
    
    ht_c = 0
    for i in range(gt_classes):
        cluster_elem = np.sum(contingency_matrix[:,i])
        for j in range(predicted_classes):  
            temp = contingency_matrix[j][i]/cluster_elem
            if temp != 0:
                ht_c += temp*math.log(temp,2)*cluster_elem/n_total
    ht_c = -1*ht_c
    print('---------Conditional Entropy----------')
    print(f"Conditional Entropy: {ht_c}")
    
    print('----------------Purity----------------')
    purity=0
    purities=[]
    recalls=[]
    for i in range(predicted_classes):
        cluster_sum=np.sum(contingency_matrix[:,i])
        class_max=np.max(contingency_matrix[:,i])
        a=contingency_matrix[:,i]
        max_index=a.argmax()
        purities.append(class_max/cluster_sum)
        recalls.append(class_max/np.sum(contingency_matrix[max_index,:]))
        purity+=(class_max/cluster_sum) * (cluster_sum/n_total)
    #purity = np.sum(np.max(contingency_matrix, axis =0))/np.sum(contingency_matrix)
    print(f"Purity: {purity}")
    print("Purities",purities)
    
    print('--------------F-measure---------------')
    # a row for each cluster, and columns are precision, recall and F-measure respectively
    
    f_measure=0
    for i in range(predicted_classes):
        f_measure+=(2*purities[i]*recalls[i])/(purities[i]+recalls[i])
    f_measure=f_measure/predicted_classes
    print(f"F: {f_measure}")
    
    print('--------------Max matching---------------')
    row_ind, col_ind = linear_sum_assignment(contingency_matrix, maximize=True)
    contingency_reordered = contingency_matrix[row_ind][:, col_ind]
    #print(contingency_reordered)
    max_match = np.sum(np.diag(contingency_reordered))/np.sum(contingency_matrix)
    print(f"Max Matching: {max_match}")

In [59]:
num_classes = 23
num_elements = len(labels)
contingency_matrix = np.zeros((num_classes,num_classes))
for i in range(num_elements):
    contingency_matrix[y_train[i],labels[i]] += 1

In [60]:
evaluation(X_train,contingency_matrix)

---------Confusion Matrix----------
Rand Index: 0.4672537036309752
Jaccard Index: 0.16607228898941917
TP= 1531700,TN= 5214159.0,FN= 7453691.0,FP= 237701.0
---------Conditional Entropy----------
Conditional Entropy: 0.389318100386387
----------------Purity----------------
Purity: 0.8823967249720878
Purities [0.6153846153846154, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.76, 0.5086206896551724, 0.9956204379562044, 1.0, 0.9966923925027563, 0.7777777777777778, 0.9823321554770318, 0.6086956521739131, 0.7872340425531915, 0.8133498145859085, 0.875, 0.9649122807017544, 0.9829192546583851, 0.9987012987012988, 0.9565217391304348]
--------------F-measure---------------
F: 0.27426062013312574
--------------Max matching---------------
Max Matching: 0.2967994045403796
