<a href="https://colab.research.google.com/github/MiguelCarbo/Anomalies-Detection-TFG/blob/main/2_All_Services_Activity_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANOMALIES DETECTION #
*Miguel Ivars Carbó, University of Zaragoza*

This Notebook has been written in the context of Miguel Ivars Carbó's final degree thesis. 
As a first approach towards the final goal, the notebook provides a way to apply anomaly detection algorithms to a Pandas Dataframe.
The dataframe is built from a Zeek log, which is processed with the 'Zeek Analysis Tool'.
The algorithms come from the PyOD library; at the end, the results are evaluated and projected onto a two-dimensional space that illustrates the classification performed.

This notebook is oriented towards seeing the results of clustering in the dataset.

In [None]:
# Reset Variables
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


### 0. NECESSARY IMPORTS ###

In [None]:
# Python Outlier Detection + Zeek Analysis Tool
!pip install pyod
!pip install zat

# PYOD models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.ecod import ECOD
from pyod.models.copod import COPOD
from pyod.models.gmm import GMM
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.lunar import LUNAR
from pyod.models.so_gaal import SO_GAAL

# Zeek Analysis Tool
import zat;
from zat.log_to_dataframe import LogToDataFrame;
from zat.json_log_to_dataframe import JSONLogToDataFrame;
from zat.dataframe_to_matrix import DataFrameToMatrix;

# Pandas
import pandas as pd;

# Numpy
import numpy as np;
from numpy import percentile;

# Scikit-Learn
import sklearn;
from sklearn.ensemble import IsolationForest;
from sklearn.decomposition import PCA;
from sklearn.cluster import KMeans, DBSCAN;
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE

# Plotly
import plotly.express as px
import plotly.graph_objs as go

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# UMAP
!pip install umap-learn
from umap.umap_ import UMAP

# HDBSCAN
!pip install hdbscan
import hdbscan

# SyS
import sys

# Version Printing
# Report the versions of the core libraries, one line per library
for version_template, library in (('zat: {:s}', zat),
                                  ('Pandas: {:s}', pd),
                                  ('Numpy: {:s}', np)):
    print(version_template.format(library.__version__))
print('Scikit Learn Version:', sklearn.__version__)

In [None]:
import warnings
warnings.filterwarnings('ignore')

### 1. DATAFRAME CREATION FROM ZEEK LOG ###

In [None]:
# Build the connections dataframe from the Zeek log file.
# The JSON reader is used here; for the other log format swap in:
# -- log_to_df = LogToDataFrame()
log_to_df = JSONLogToDataFrame()
conn_df = log_to_df.create_dataframe('conn_label.log')

# Report how many connections were loaded
total_rows = len(conn_df)
print('Dataframe total Rows: {:d} '.format(total_rows))

Dataframe total Rows: 1316 


#### 1.1 DATAFRAME FILTERING BASED ON SERVICE ####


*In order to perform a more specific analysis, the user may choose the 'service' they want to keep, thus eliminating traffic that may be of little interest.*

In [None]:
# Keep a handle on the unfiltered dataframe (all services)
all_conn_df = conn_df

# Group by service
grouped = conn_df.groupby('service')

# One independent dataframe (a copy of the group) per service, keyed by service name
service_dfs = {name: group.copy() for name, group in grouped}

# List the services that can be selected in the next step
print('Your services are: ')
for name in service_dfs:
    print(f" -- '{name}' service")

Your services are: 
 -- 'dhcp' service
 -- 'dns' service
 -- 'http' service
 -- 'ntp' service
 -- 'ssl' service


#### 1.2 DATAFRAME SELECTION ####

In [None]:
# Service whose connections will be analysed.
# -- Use one of the keys of `service_dfs` (e.g. 'ssl', 'dns'),
# -- or None to keep the full dataframe (all services).
SELECTED_SERVICE = 'ssl'

if SELECTED_SERVICE is None:
    conn_df = all_conn_df
elif SELECTED_SERVICE in service_dfs:
    conn_df = service_dfs[SELECTED_SERVICE]
else:
    # Fail with the list of valid choices instead of a bare KeyError
    raise KeyError(f"Unknown service '{SELECTED_SERVICE}'. "
                   f"Available services: {sorted(service_dfs)}")

print('Dataframe total Rows: {:d} '.format(len(conn_df)))

Dataframe total Rows: 144 


#### 1.3 IP ADDRESS SHOW ####

*In addition, the user might also want to keep only certain IP Addresses to focus on connections of only specific devices.*

In [None]:
# Distinct IP addresses seen on each side of the connections
source_df_ips = set(conn_df['sourceAddress'].unique())
destination_df_ips = set(conn_df['destinationAddress'].unique())

# Print both listings, separated by a rule
ip_listings = (('Dataset Source IP Addresses:', source_df_ips),
               ('Dataset Destination IP Addresses:', destination_df_ips))
for position, (heading, addresses) in enumerate(ip_listings):
    if position > 0:
        print('---------------------------------')
    print(heading)
    for ip in addresses:
        print(' -- '+str(ip))

Dataset Source IP Addresses:
---------------------------------
Dataset Destination IP Addresses:
 -- 20.223.54.233
 -- 13.107.237.43
 -- 74.125.160.199
 -- 92.122.44.209
 -- 142.250.201.67
 -- 13.37.25.97
 -- 142.250.184.162
 -- 52.112.214.18
 -- 152.199.21.175
 -- 108.157.109.102
 -- 142.250.184.14
 -- 108.157.109.38
 -- 20.54.24.69
 -- 13.107.226.43
 -- 216.239.32.36
 -- 140.82.121.5
 -- 13.107.253.43
 -- 142.250.184.3
 -- 142.250.178.164
 -- 216.58.215.130
 -- 13.69.239.73
 -- 142.250.200.138
 -- 142.250.200.69
 -- 192.229.221.185
 -- 52.112.174.4
 -- 2.17.153.99
 -- 17.253.125.203
 -- 35.241.9.150
 -- 52.109.76.228
 -- 216.58.215.164
 -- 149.81.15.121
 -- 216.58.209.74
 -- 142.250.185.5
 -- 142.250.184.170
 -- 35.155.125.97
 -- 40.101.92.18
 -- 92.123.56.23
 -- 185.43.181.56
 -- 40.65.233.137
 -- 52.142.125.222
 -- 52.112.214.26
 -- 52.142.124.215
 -- 23.51.230.81
 -- 20.190.159.3
 -- 52.114.76.110
 -- 34.210.253.191
 -- 91.235.133.182
 -- 173.194.76.156
 -- 152.199.19.160
 -- 172.

#### 1.4 IP ADDRESSES SELECTION ####

In [None]:
# Addresses of interest: a connection is kept when either endpoint is listed here
selected_traffic_ips = ['192.168.1.56','192.168.1.52']

# Row-wise test: does the source OR the destination address belong to the selection?
endpoint_columns = conn_df[['sourceAddress', 'destinationAddress']]
involves_selected_ip = endpoint_columns.isin(selected_traffic_ips).any(axis=1)

# Keep only the matching connections
conn_df = conn_df.loc[involves_selected_ip]

### 2. FEATURE SELECTION ###

#### 2.1 CONN.LOG FEATURES ####

In [None]:
# Every column kept in the working dataframe (order defines the column order)
full_features = [
    # -- flow endpoints and connection identifier
    'sourceAddress', 'sourcePort', 'destinationAddress', 'destinationPort', 'uid',
    # -- connection description and counters
    'protocol', 'service', 'duration', 'orig_bytes', 'resp_bytes',
    'conn_state', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
    'tunnel_parents',
    # -- name resolution
    'sourceDnsDomain', 'destinationDnsDomain', 'sourceHostName', 'destinationHostName',
    # -- additional statistics columns
    'mediaOrigen', 'mediaResp', 'desvOrigen', 'desvResp',
    'noceroOrigen', 'noceroResp', 'mediaTime', 'desvTime',
    # -- label column
    'label',
]

# Subset of columns the detection models are trained on
features = [
    'duration', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts',
    'mediaOrigen', 'mediaResp', 'desvOrigen', 'desvResp',
    'noceroOrigen', 'noceroResp', 'mediaTime', 'desvTime',
]

### 3. DATA PREPARATION ###

#### 3.1 DATA CLEANSING ####

In [None]:
# Feature Filtering + NaNs removal
# -- We will work only on Benign Connections
benign_mask = (conn_df['label'] == 'BENIGN')
# -- Keep only features of interest and drop rows with missing values.
#    The explicit copy makes conn_df an independent frame, so the column
#    assignments performed by later cells do not target a slice of the
#    previously loaded dataframe.
conn_df = conn_df.loc[benign_mask, full_features].dropna().copy()

# Consider all Connections
# ## conn_df = conn_df[(conn_df['service'] != 'ayiya') & (conn_df['sourcePort'] != 5353) & (conn_df['destinationPort'] != 5353) & (conn_df['destinationAddress'] != '224.0.0.251') & (conn_df['service'] != 'dhcp') & (conn_df['service'] != 'dtls') & (conn_df['service'] != 'http') & (conn_df['sourceAddress'] != '0.0.0.0') & (conn_df['destinationAddress'] != '255.255.255.255')]
# -- Dataset to train only has features of interest.
#    Copied as well: its columns are converted in place by the following cells.
conn_features_df = conn_df[features].copy()

print('Features Dataframe Shape: ' +str(conn_features_df.shape))

In [None]:
# Additional data preprocessing
# -- Add 'ANOMALOUS' as a possible label value.
#    Guarded so that the cell can be re-run (add_categories raises if the
#    category already exists) and so that a non-categorical label column,
#    which has no `.cat` accessor, is simply left as it is.
label_dtype = conn_df['label'].dtype
if label_dtype.name == 'category' and 'ANOMALOUS' not in label_dtype.categories:
    conn_df['label'] = conn_df['label'].cat.add_categories(['ANOMALOUS'])

# Timedelta to numeric time value conversion (seconds, as float)
for feature in features:
    feature_column = conn_features_df[feature]
    if pd.api.types.is_timedelta64_dtype(feature_column):
        conn_features_df[feature] = feature_column.dt.total_seconds().astype(float)

#### 3.2 LABEL ENCODING ####

In [None]:
# Label encoding of categorical training features
le = LabelEncoder()

# Only columns with 'category' dtype need a numeric replacement
# -- Required to work with PYOD algorithms
categorical_features = [feature for feature in features
                        if conn_features_df[feature].dtype.name == 'category']

for feature in categorical_features:
    # Show which feature is being encoded, then replace it by its integer codes
    print(feature)
    conn_features_df[feature] = le.fit_transform(conn_features_df[feature])

#### 3.3 NUMPY ARRAY CREATION ####

In [None]:
# Cast every training column to float
conn_features_df = conn_features_df.astype(float)

# Matrix view of the training data: one row per connection, one column per feature
X = conn_features_df.to_numpy()
array_shape = X.shape
print('NDarray shape: ' +str(array_shape))

(2117, 7)


In [None]:
# Column Normalization
# -- Calculate the highest value in each column
column_max = np.max(X, axis=0)

# A column whose maximum is 0 would be divided by zero (0/0 -> NaN, x/0 -> inf).
# Such columns are divided by 1 instead, i.e. left unscaled.
safe_column_max = np.where(column_max == 0, 1.0, column_max)

# Normalize the matrix by dividing each element by the highest value in its column
X_norm = X / safe_column_max
X = X_norm

### 4. CLASSIFICATION ALGORITHMS ###

In [None]:
# Outlier detection algorithms
# -- Seed of the random state shared by the models below
RANDOM_SEED = 42
# -- Value passed as `contamination` to every model
CONTAMINATION = 0.02

rs = np.random.RandomState(RANDOM_SEED)

# Model name -> model instance. The insertion order is significant: it defines
# the column order of the prediction matrix built when the models are fitted.
clf = {
    'Variational Auto Encoders (VAE)': VAE(encoder_neurons=[16,8,4], decoder_neurons=[4,8,16], contamination=CONTAMINATION),
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=CONTAMINATION, check_estimator=False, random_state=rs, n_clusters=5),
    'Isolation Forest': IForest(contamination=CONTAMINATION, random_state=rs),
    'Gaussian Mixture Model (GMM)': GMM(contamination=CONTAMINATION, random_state=rs),
}

### 5. MODEL FITTING - TRAINING - PREDICTION ###

In [None]:
# Element-wise NaN mask of the training matrix
nan_mask = np.isnan(X)

# (row indices, column indices) of the NaN entries; both arrays are empty when X is clean
nan_indices = np.nonzero(nan_mask)

# Check for NaN values
print(nan_indices)

(array([], dtype=int64), array([], dtype=int64))


In [None]:
# Parameters
clusters_separation = [0]
clf_names = list(clf.keys())
# One column of predictions per classifier (column order follows clf_names);
# the result-analysis cell treats a value of 1 as "outlier".
outliers_pred = np.zeros((X.shape[0], len(clf_names)))

# Fit the models and tag outliers.
# The two loops use distinct index names: previously both were called `i`,
# so the inner loop silently overwrote the outer loop's index.
for run_idx, offset in enumerate(clusters_separation):
    np.random.seed(42)

    for model_idx, (classifier_name, classifier) in enumerate(clf.items()):

        print(model_idx + 1, 'fitting', classifier_name)

        # Fit the data and tag outliers
        classifier.fit(X)

        # Scores prediction (sign flipped; kept for inspection, not used below)
        scores_pred = classifier.decision_function(X) * -1

        # Outliers detection (based on prediction)
        y_pred = classifier.predict(X)
        outliers_pred[:, model_idx] = y_pred

1 fitting Cluster-based Local Outlier Factor (CBLOF)
2 fitting Isolation Forest
3 fitting Gaussian Mixture Model (GMM)


### 6. RESULT ANALYSIS ###

In [None]:
# One dataframe of flagged connections per algorithm (same order as clf_names)
odd_dfs_list = []

# Flagged UIDs (in order of first detection) and IPs, accumulated over all algorithms
odd_uid_total = []
odd_ip_total = set()
# -- dict used as an insertion-ordered set, so de-duplicating UIDs stays linear
odd_uid_seen = {}

# Iteration for each algorithm
for j in range(len(clf_names)):

    # Rows of the original dataframe which this algorithm detected as anomalous.
    # Copied, so the port conversion below does not write into a slice of conn_df.
    outlier_mask = pd.Series(outliers_pred[:, j], index=conn_df.index) == 1
    odd_out_df = conn_df.loc[outlier_mask].copy()

    # DataQuantification adjustment
    odd_out_df['sourcePort'] = np.uint16(odd_out_df['sourcePort'])
    odd_out_df['destinationPort'] = np.uint16(odd_out_df['destinationPort'])

    # Keep the odd dataframe to work on it later
    odd_dfs_list.append(odd_out_df)

    # Labeling original conn_df with Anomalous detected traffic (matched by uid)
    conn_df.loc[conn_df['uid'].isin(odd_out_df['uid']), 'label'] = 'ANOMALOUS'

    # Identifying 'anomalous' uid connection values
    odd_uid_values = odd_out_df['uid'].unique()

    # Identifying 'anomalous' IPs involved in traffic
    odd_ips = set(odd_out_df['sourceAddress'].unique()) | set(odd_out_df['destinationAddress'].unique())

    # Storing new values of IP / UIDS
    odd_uid_seen.update(dict.fromkeys(odd_uid_values))
    odd_ip_total |= odd_ips

odd_uid_total = list(odd_uid_seen)

# Printing Results - UID
print(' --- Results based on UID:')
print('     Your Anomalous UIDs are: ' +str(odd_uid_total))
print(' ')

# Printing Results - IPs
print(' --- Results based on IP addresses: ')
print('     Your odd activity IPs are: ' +str(odd_ip_total))

 --- Results based on UID ---
      Your Anomalous UIDs Have been: ['Cbm2sx4b78sNFZ7NU9', 'CZlMiZ3BMquICJWL4i', 'Cy0ZQJ2Ez055VlzWvc', 'ChaOEh4vORslQDQ2e5', 'CARoI72m3BJZCZOCy', 'C58oRT3DgATRglY8O1', 'CK6zWh3Zi2LVmw7xFg', 'CkXrqU1VAwbCbOPqPa', 'CsFYvn2fYModuIEKC7', 'CPTN7W3vWEhQ60Ueql', 'CWpmGd1bzdJqjm2Ji6', 'CkyGhEPcVAkmUouCk', 'Cp05Oh1a0KLJGIyoy3', 'CYpDck4BkQ2fmgES86', 'C9D4XQ3p4iBbAEeLfk', 'CQGSsh15ZfMUooz0L', 'CN7Fva402c8fmzk8y5', 'Cljvnj25dPlDAbMqY7', 'C2UPHqFGizWxf2rze', 'CdkRDr3MCjQYA7ZUk3', 'C3SbTV1seLBxRs5I1a', 'CDcH35wmY1gCLC292', 'CmaRMs1lR4LL4ug59', 'C6Y5sMR5eQIu5wgeg', 'CXqmmQjzyViKq2nB9', 'C52Te6XqrZeXIeoXb', 'Cbv9w7a8R0nPirFyl', 'CQQS98StOC8Ra10Cj', 'CdfPUg2fotjzpnpEyc', 'C8f1lb3QAkm31FX3t1', 'CcOzoi2LEPqwbjUS73', 'CLDkvR3tmeGR69pXab', 'CbiJSF1dITUlQUNeCj', 'CQckQD3JBVXcyomQRa', 'C0XfNv46VTk8FAYUT4', 'CODOlE2PiOdJsjIqX', 'Ci6N5A2iDlqHdQJNOe', 'CUIUty2gGfouLGKmBk', 'CeRCFe2UVfW0ts6ttc', 'CfWmsb4Un0f2aXdcq7', 'CEQyf21w4pvCq2EExk', 'CeE1VN3UT536q5iCSg', 'CWqLM31RDCMNA7iZLi'

### 7. FULL DATAFRAME ###

#### *7.0 Dataframe Recheck* ####

In [None]:
# Quick look at the labelled dataframe (first rows) ...
print(' conn_df Header: ', conn_df.head(), ' ', sep='\n')

# ... and at the first three rows of the training matrix
print(' X numpy array: ', X[:3,:], sep='\n')

 conn_df Header: 
                    sourceAddress  sourcePort destinationAddress  \
ts                                                                 
2023-03-20 15:26:50  192.168.1.38       63916      5.255.145.241   
2023-03-20 08:49:37  192.168.1.38       50520      52.113.205.21   
2023-03-20 13:05:08  192.168.1.38       55273     51.104.167.255   
2023-03-20 11:20:27  192.168.1.38       51414      104.83.217.59   
2023-03-20 07:56:26  192.168.1.38       50197     142.250.200.69   

                     destinationPort                 uid protocol service  \
ts                                                                          
2023-03-20 15:26:50              443  C2KVa21pkH3KwtttY6      tcp     ssl   
2023-03-20 08:49:37              443  C82ZWy2waJWw8KMzcc      tcp     ssl   
2023-03-20 13:05:08              443   C9ql9PihidwHq51n7      tcp     ssl   
2023-03-20 11:20:27              443  CZmFwY2ceAGgyeYmm1      tcp     ssl   
2023-03-20 07:56:26              443  CH8U1

#### 7.1 FULL DATAFRAME PCA PROJECTION ####

In [None]:
# PCA object with 2 components
pca_object = PCA(n_components=2)

# 2-dimensional PCA representation of X
X_pca = pca_object.fit_transform(X)

# Dictionary mapping labels to colors
colors = {'ANOMALOUS': 'red', 'BENIGN': 'blue'}

# Extract the x and y coordinates from the X_pca matrix
x_coords = X_pca[:, 0]
y_coords = X_pca[:, 1]

# Create a list of traces, one for each label
traces = []
for label in set(conn_df['label']):
    # Positional boolean mask of the rows carrying this label
    mask = (conn_df['label'] == label).to_numpy()
    # The hover text is masked exactly like the coordinates. Passing the whole
    # uid column would pair each point with the uid of an unrelated row.
    trace = go.Scatter(x=x_coords[mask],
                       y=y_coords[mask],
                       mode='markers',
                       marker=dict(color=colors[label]),
                       name=label, text=conn_df['uid'][mask], hoverinfo='text')
    traces.append(trace)

# Create a layout with a legend
layout = go.Layout(title='PCA Projection of Dataframe', legend=dict(title='Label'))

# Create a figure and plot the traces
fig = go.Figure(data=traces, layout=layout)
fig.show()


#### 7.2 FULL DATAFRAME UMAP PROJECTION ####

In [None]:
# UMAP - Data Fitting
fit = UMAP(metric='euclidean', n_neighbors=80, n_components=2)
X_umap = fit.fit_transform(X)

# Dictionary mapping labels to colors
colors = {'ANOMALOUS': 'red', 'BENIGN': 'blue'}

# Extract the x and y coordinates from the X_umap matrix
x_coords = X_umap[:, 0]
y_coords = X_umap[:, 1]

# Create a list of traces, one for each label
traces = []
for label in set(conn_df['label']):
    # Positional boolean mask of the rows carrying this label
    mask = (conn_df['label'] == label).to_numpy()
    # The hover text is masked exactly like the coordinates. Passing the whole
    # uid column would pair each point with the uid of an unrelated row.
    trace = go.Scatter(x=x_coords[mask],
                       y=y_coords[mask],
                       mode='markers',
                       marker=dict(color=colors[label]),
                       name=label, text=conn_df['uid'][mask], hoverinfo='text')
    traces.append(trace)

# Create a layout with a legend
layout = go.Layout(title='Umap Projection of Dataframe',
                   legend=dict(title='Label'))

# Create a figure and plot the traces
fig = go.Figure(data=traces, layout=layout)
fig.show()

#### 7.3 FULL DATAFRAME CLUSTERING ####

In [None]:
# Clustering using HDBSCAN
# -- Imported here so the cell does not depend on what the name `hdbscan`
#    happens to be bound to when it runs.
import hdbscan

# The clusterer gets its own name: binding it to `hdbscan` would replace the
# module by an instance and break any later `hdbscan.HDBSCAN(...)` call.
full_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, cluster_selection_epsilon=0)

# Cluster label of every connection
conn_df['cluster_db'] = full_clusterer.fit_predict(X)

# Now use dataframe group by cluster
cluster_groups_db = conn_df.groupby('cluster_db')

# Cluster Information
cluster_counts = conn_df['cluster_db'].value_counts()

# NOTE: the count below is the number of distinct labels, so it includes
# the label -1 (noise points in HDBSCAN) when present.
print('Number of Clusters: {:d}'.format(conn_df['cluster_db'].nunique()))
for cluster_value, count in cluster_counts.items():
    print(f" -- Cluster {cluster_value} connections: {count}")

Number of Clusters: 28
 -- Cluster -1 connections: 1028
 -- Cluster 4 connections: 131
 -- Cluster 9 connections: 125
 -- Cluster 23 connections: 116
 -- Cluster 21 connections: 83
 -- Cluster 11 connections: 77
 -- Cluster 13 connections: 66
 -- Cluster 1 connections: 50
 -- Cluster 25 connections: 50
 -- Cluster 5 connections: 44
 -- Cluster 0 connections: 37
 -- Cluster 26 connections: 30
 -- Cluster 16 connections: 27
 -- Cluster 2 connections: 23
 -- Cluster 22 connections: 23
 -- Cluster 20 connections: 21
 -- Cluster 7 connections: 20
 -- Cluster 3 connections: 20
 -- Cluster 19 connections: 19
 -- Cluster 24 connections: 18
 -- Cluster 12 connections: 17
 -- Cluster 14 connections: 16
 -- Cluster 18 connections: 15
 -- Cluster 15 connections: 14
 -- Cluster 8 connections: 13
 -- Cluster 6 connections: 12
 -- Cluster 10 connections: 11
 -- Cluster 17 connections: 11


#### 7.3.1 CLUSTERED UMAP PROJECTION ####

In [None]:
umap = UMAP(metric='euclidean', n_neighbors=80, n_components=2)

# Projections
projection = umap.fit_transform(X)
conn_df['x'] = projection[:, 0]
conn_df['y'] = projection[:, 1]

# Data export to CSV
conn_df.to_csv('text.csv', index=True)

# Palette: Tableau colors first, then the CSS4 colors
colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values())

# One color per cluster label actually present. Keys are strings because a
# discrete color map is only applied to a non-numeric color column.
cluster_labels = sorted(conn_df['cluster_db'].unique())
cluster_names = [str(cluster_label) for cluster_label in cluster_labels]
color_dict = dict(zip(cluster_names, colors))

# Plot from a copy whose cluster column is textual: with the integer column the
# clusters would be drawn on a continuous color scale and color_dict ignored.
plot_df = conn_df.assign(cluster_db=conn_df['cluster_db'].astype(str))

# Create a scatter plot using plotly
fig = px.scatter(plot_df, x='x', y='y', color='cluster_db',
                 color_discrete_map=color_dict,
                 category_orders={'cluster_db': cluster_names})
fig.update_layout(title='SSL Dataframe HDBScan Clustering')
fig.show()

#### 7.4 FULL DATAFRAME STATISTICS ####

In [None]:
from contextlib import redirect_stdout

pd.set_option('display.width', 1000)

# Features to extract statistics from
stats_features = ['sourceAddress', 'sourcePort', 'destinationAddress', 'destinationPort', 'protocol', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'sourceDnsDomain', 'destinationDnsDomain', 'sourceHostName',
       'destinationHostName', 'mediaOrigen', 'mediaResp','desvOrigen', 'desvResp', 'noceroOrigen', 'noceroResp', 'mediaTime', 'desvTime','label']

# Export the per-cluster stats to a .txt file.
# redirect_stdout sends every print() below to the file and restores the
# standard output when the block is left, even if an exception is raised.
with open('filename.txt', 'w') as f, redirect_stdout(f):

    for key, group in cluster_groups_db:

        print("*"*40)
        print('GROUP '+str(key))
        print("*"*40)
        print('')
        print('- Number of samples in this group: '+ str(len(group)))

        for feature in stats_features:

            # Work on a local column; the group dataframe itself is not modified
            column = group[feature]
            if pd.api.types.is_timedelta64_dtype(column):
                # Timedelta to numeric time value (seconds)
                column = column.dt.total_seconds().astype(float)

            print(f"Feature: {feature}")
            print("-"*30)

            if conn_df[feature].dtype.name == 'category':
                # Categorical feature
                top_values = column.value_counts().head(5)
                print(f"Top 5 values by {feature}:")
                print(top_values)
            else:
                # Numerical feature
                data = column.astype(float)
                print(f"Mean: {data.mean()}")
                print(f"Std Deviation: {data.std()}")
                print(f"Variance: {data.var()}")
                print(f"Range: {data.max() - data.min()}")
                print(f"First Quartile: {np.percentile(data, 25)}")
            print("\n")

#### 7.5 FULL DATAFRAME FEATURES STATISTICS PLOT ####

In [None]:
# Plot the distribution of each training feature
# -- Values are sorted in ascending order and drawn against their normalised rank

for feature in features:
    # Feature values in ascending order
    values_list = sorted(conn_features_df[feature])
    n_values = len(values_list)

    # X-Axis normalization: rank k of n values is placed at k / (n + 1)
    x_axis = np.arange(1, n_values + 1) / (n_values + 1)

    # Upper limit of the y axis: the value found 90% of the way through the sorted list
    pos_09 = int(n_values * 0.9)
    y_max = values_list[pos_09]

    # Create a bar chart
    bars = go.Bar(x=x_axis, y=values_list, orientation='v', marker=dict(color='red'))
    fig = go.Figure(data=[bars])

    # Set the plot title and axis labels
    fig.update_layout(
        title=str(feature) + ' Distribution',
        xaxis=dict(title='Samples'),
        yaxis=dict(title='Value', gridcolor='rgba(255, 0, 0, 0.2)', range=[0, y_max]),
        plot_bgcolor='white',
        font=dict(family="Calibri, sans-serif", size=16),
    )

    # Show the plot
    fig.show()

### 8. ANOMALOUS CONNECTIONS ###

#### 8.1 ANOMALOUS DATAFRAME PREPARATION ####

In [None]:
# Anomalous Dataset Preparation
# -- Rows currently labelled as anomalous (positional boolean mask)
anomalous_mask = (conn_df['label'] == 'ANOMALOUS').to_numpy()
print('Anomalous Dataframe size: '+str(conn_df[anomalous_mask].shape))

# Keep only relevant features and rows without missing values.
# A single row mask is used for both the dataframe and the matrix, so
# anomalous_conn_df and X_anomalous always have the same rows in the same order
# (a separate dropna() could shorten the dataframe only).
complete_mask = conn_df[full_features].notna().all(axis=1).to_numpy()
anomalous_rows = anomalous_mask & complete_mask
# -- Copied, so the conversions below do not write into a slice of conn_df
anomalous_conn_df = conn_df.loc[anomalous_rows, full_features].copy()

for feature in features:
    # Timedelta to numeric time value conversion
    if pd.api.types.is_timedelta64_dtype(anomalous_conn_df[feature]):
        anomalous_conn_df[feature] = anomalous_conn_df[feature].dt.total_seconds().astype(float)

# Unique Anomalous Connection Identifier
anomalous_conn_uid_values = anomalous_conn_df['uid'].unique()

# Anomalous Connection IPs
anomalous_ips = set(anomalous_conn_df['sourceAddress'].unique()) | set(anomalous_conn_df['destinationAddress'].unique())
all_distinct_ips = set(conn_df['sourceAddress'].unique()) | set(conn_df['destinationAddress'].unique())

# Anomalous NDarray
# -- assumes the rows of X are still positionally aligned with conn_df
anomalous_indices = np.flatnonzero(anomalous_rows)
X_anomalous = X[anomalous_indices]
print(X_anomalous.shape)

Anomalous Dataframe size: (61, 33)
(61, 7)


#### 8.2 ANOMALOUS DATAFRAME CLUSTERING ####

In [None]:
# -- Imported here so the cell does not depend on what the name `hdbscan`
#    happens to be bound to when it runs.
import hdbscan

# UMAP object used for the 2-D projection
umap = UMAP(metric='euclidean', n_neighbors=15, n_components=2)

# Clustering using HDBScan.
# The clusterer gets its own name: binding it to `hdbscan` would replace the
# module by an instance and break any later `hdbscan.HDBSCAN(...)` call.
anomalous_clusterer = hdbscan.HDBSCAN(min_cluster_size = 8)

anomalous_conn_df['cluster_db'] = anomalous_clusterer.fit_predict(X_anomalous)
X_anomalous_umap = umap.fit_transform(X_anomalous)
# NOTE: number of distinct labels, i.e. the label -1 is counted when present
print('Number of Clusters: {:d}'.format(anomalous_conn_df['cluster_db'].nunique()))

# Exporting the anomalous dataframe to CSV (before the projection columns are added)
anomalous_conn_df.to_csv('anomaloustext.csv', index=False)

# Now use dataframe group by cluster
anomalous_cluster_groups_db = anomalous_conn_df.groupby('cluster_db')

# Projections
anomalous_conn_df['x'] = X_anomalous_umap[:, 0]  # Projection X Column
anomalous_conn_df['y'] = X_anomalous_umap[:, 1]  # Projection Y Column

# Palette: Tableau colors first, then the CSS4 colors
colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values())

# One color per cluster label actually present. Keys are strings because a
# discrete color map is only applied to a non-numeric color column.
cluster_labels = sorted(anomalous_conn_df['cluster_db'].unique())
cluster_names = [str(cluster_label) for cluster_label in cluster_labels]
color_dict = dict(zip(cluster_names, colors))

# Plot from a copy whose cluster column is textual: with the integer column the
# clusters would be drawn on a continuous color scale and color_dict ignored.
plot_df = anomalous_conn_df.assign(cluster_db=anomalous_conn_df['cluster_db'].astype(str))

# Create a scatter plot using plotly
fig = px.scatter(plot_df, x='x', y='y', color='cluster_db',
                 color_discrete_map=color_dict,
                 category_orders={'cluster_db': cluster_names})
fig.update_layout(title='Anomalous Connections DBScan Clustering')
fig.show()

Number of Clusters: 3


#### 8.3 ANOMALOUS DATAFRAME STATISTICS ####

In [None]:
# Per-cluster statistics of the anomalous connections
pd.set_option('display.width', 1000)

# Features to extract statistics from
stats_features = ['sourceAddress', 'sourcePort', 'destinationAddress', 'destinationPort', 'protocol', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'sourceDnsDomain', 'destinationDnsDomain', 'sourceHostName',
       'destinationHostName', 'mediaOrigen', 'mediaResp','desvOrigen', 'desvResp', 'noceroOrigen', 'noceroResp', 'mediaTime', 'desvTime','label']

# Statistics results print
for key, group in anomalous_cluster_groups_db:

    # Group banner and size
    banner = "*"*40
    print(banner)
    print('GROUP '+str(key))
    print(banner)
    print('')
    print('- Number of samples in this group: '+ str(len(group)))

    for feature in stats_features:

        # Timedelta columns are summarised as seconds
        column = group[feature]
        if pd.api.types.is_timedelta64_dtype(column):
            column = column.dt.total_seconds().astype(float)

        print(f"Feature: {feature}")
        print("-"*30)

        is_categorical = anomalous_conn_df[feature].dtype.name == 'category'
        if not is_categorical:
            # Numerical feature
            data = column.astype(float)
            print(f"Mean: {data.mean()}")
            print(f"Std Deviation: {data.std()}")
            print(f"Variance: {data.var()}")
            print(f"Range: {data.max() - data.min()}")
            print(f"First Quartile: {np.percentile(data, 25)}")
        else:
            # Categorical feature
            top_values = column.value_counts().head(5)
            print(f"Top 5 values by {feature}:")
            print(top_values)
        print("\n")

****************************************
GROUP -1
****************************************

- Number of samples in this group: 32
Feature: sourceAddress
------------------------------
Top 5 values by sourceAddress:
192.168.1.38    31
10.2.7.101       1
::               0
5.205.18.110     0
5.205.18.41      0
Name: sourceAddress, dtype: int64


Feature: sourcePort
------------------------------
Mean: 53190.5
Std Deviation: 4109.675899630042
Variance: 16889436.0
Range: 14028.0
First Quartile: 50924.75


Feature: destinationAddress
------------------------------
Top 5 values by destinationAddress:
77.111.247.7       4
52.113.205.21      4
20.60.194.10       3
151.101.134.248    2
192.229.220.133    2
Name: destinationAddress, dtype: int64


Feature: destinationPort
------------------------------
Mean: 443.0
Std Deviation: 0.0
Variance: 0.0
Range: 0.0
First Quartile: 443.0


Feature: protocol
------------------------------
Top 5 values by protocol:
tcp     32
icmp     0
udp      0
Name: pr

#### 8.4 ANOMALOUS FEATURES STATISTICS PLOT ####

In [None]:
# Plot the distribution of each feature over the anomalous connections
# -- Values are sorted in ascending order and drawn against their normalised rank

for feature in features:
    # Feature values in ascending order
    values_list = sorted(anomalous_conn_df[feature])
    n_values = len(values_list)

    # X-Axis normalization: rank k of n values is placed at k / (n + 1)
    x_axis = np.arange(1, n_values + 1) / (n_values + 1)

    # Upper limit of the y axis: the value found 90% of the way through the sorted list
    pos_09 = int(n_values * 0.9)
    y_max = values_list[pos_09]

    # Create a bar chart
    bars = go.Bar(x=x_axis, y=values_list, orientation='v', marker=dict(color='red'))
    fig = go.Figure(data=[bars])

    # Set the plot title and axis labels
    fig.update_layout(
        title=str(feature) + ' Anomalous Distribution',
        xaxis=dict(title='Samples'),
        yaxis=dict(title='Value', gridcolor='rgba(255, 0, 0, 0.2)', range=[0, y_max]),
        plot_bgcolor='white',
        font=dict(family="Calibri, sans-serif", size=16),
    )

    # Show the plot
    fig.show()

### 9. NON ANOMALOUS DATAFRAME ###

#### 9.1 NON ANOMALOUS DATAFRAME PREPARATION ####

In [None]:
# NonAnomalous Dataset Preparation
# -- Keep only the connections still labelled as benign (positional boolean mask)
benign_mask = (conn_df['label'] == 'BENIGN').to_numpy()
# -- Copied, so the conversions below do not write into a slice of conn_df
nanomalous_conn_df = conn_df.loc[benign_mask, full_features].copy()

for feature in features:
    # Timedelta to numeric time value conversion
    if pd.api.types.is_timedelta64_dtype(nanomalous_conn_df[feature]):
        nanomalous_conn_df[feature] = nanomalous_conn_df[feature].dt.total_seconds().astype(float)

# NDarray For Clustering
# -- same row mask as the dataframe; assumes the rows of X are still
#    positionally aligned with conn_df
nanomalous_indices = np.flatnonzero(benign_mask)
X_nanomalous = X[nanomalous_indices]
print('Non anomalous dataframe size: '+str(X_nanomalous.shape))

Non anomalous dataframe size: (2056, 7)


#### 9.2 NON ANOMALOUS DF CLUSTERING ####

In [None]:
# -- Imported here so the cell does not depend on what the name `hdbscan`
#    happens to be bound to when it runs.
import hdbscan

# UMAP and HDBSCAN
umap = UMAP(metric='euclidean', n_neighbors=25, n_components=2)

# The clusterer gets its own name: binding it to `hdbscan` would replace the
# module by an instance and break any later `hdbscan.HDBSCAN(...)` call.
nanomalous_clusterer = hdbscan.HDBSCAN(min_cluster_size = 10)
# -- min_cluster_size = 5, cluster_selection_epsilon = 0.02

# Clustering using HDBScan
nanomalous_conn_df['cluster_db'] = nanomalous_clusterer.fit_predict(X_nanomalous)
# NOTE: number of distinct labels, i.e. the label -1 is counted when present
print('Number of Clusters: {:d}'.format(nanomalous_conn_df['cluster_db'].nunique()))

# UMAP Projection
X_nanomalous_umap = umap.fit_transform(X_nanomalous)

# Export Data to CSV (before the projection columns are added)
nanomalous_conn_df.to_csv('nanomalousfile.csv', index=False)

# Now use dataframe group by cluster
nanomalous_cluster_groups_db = nanomalous_conn_df.groupby('cluster_db')

# Projections
nanomalous_conn_df['x'] = X_nanomalous_umap[:, 0]  # Projection X Column
nanomalous_conn_df['y'] = X_nanomalous_umap[:, 1]  # Projection Y Column

# Palette: Tableau colors first, then the CSS4 colors
colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values())

# One color per cluster label actually present. Keys are strings because a
# discrete color map is only applied to a non-numeric color column.
cluster_labels = sorted(nanomalous_conn_df['cluster_db'].unique())
cluster_names = [str(cluster_label) for cluster_label in cluster_labels]
color_dict = dict(zip(cluster_names, colors))

# Plot from a copy whose cluster column is textual: with the integer column the
# clusters would be drawn on a continuous color scale and color_dict ignored.
plot_df = nanomalous_conn_df.assign(cluster_db=nanomalous_conn_df['cluster_db'].astype(str))

# Create a scatter plot using plotly
fig = px.scatter(plot_df, x='x', y='y', color='cluster_db',
                 color_discrete_map=color_dict,
                 category_orders={'cluster_db': cluster_names})
fig.update_layout(title='Non Anomalous Connections DBScan Clustering')
fig.show()

Number of Clusters: 28


#### 9.3 NON ANOMALOUS DF STATISTICS ####

In [None]:
from contextlib import redirect_stdout

# Now print out the details for each cluster
pd.set_option('display.width', 1000)

# Features to extract stats from
stats_features = ['sourceAddress', 'sourcePort', 'destinationAddress', 'destinationPort', 'protocol', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'sourceDnsDomain', 'destinationDnsDomain', 'sourceHostName',
       'destinationHostName', 'mediaOrigen', 'mediaResp','desvOrigen', 'desvResp', 'noceroOrigen', 'noceroResp', 'mediaTime', 'desvTime','label']

# Export to txt file.
# redirect_stdout sends every print() below to the file and restores the
# standard output when the block is left, even if an exception is raised.
with open('nanomalousstatsfile.txt', 'w') as f, redirect_stdout(f):

    for key, group in nanomalous_cluster_groups_db:

        print("*"*40)
        print('GROUP '+str(key))
        print("*"*40)
        print('')

        for feature in stats_features:

            # Work on a local column; the group dataframe itself is not modified
            column = group[feature]
            if pd.api.types.is_timedelta64_dtype(column):
                # Timedelta to numeric time value (seconds)
                column = column.dt.total_seconds().astype(float)

            print(f"Feature: {feature}")
            print("-"*30)

            if nanomalous_conn_df[feature].dtype.name == 'category':
                # Categorical feature
                top_values = column.value_counts().head(5)
                print(f"Top 5 values by {feature}:")
                print(top_values)
            else:
                # Numerical feature
                data = column.astype(float)
                print(f"Mean: {data.mean()}")
                print(f"Median: {data.median()}")
                print(f"Variance: {data.var()}")
                print(f"Range: {data.max() - data.min()}")
                print(f"First Quartile: {np.percentile(data, 25)}")
            print("\n")


#### 9.4 NON ANOMALOUS FEATURES STATISTICS PLOT ####

In [None]:
# Plot the distribution of each feature over the non-anomalous connections
# -- Values are sorted in ascending order and drawn against their normalised rank
for feature in features:
    # Feature values in ascending order
    values_list = sorted(nanomalous_conn_df[feature])

    # X-Axis normalization: rank k of n values is placed at k / (n + 1)
    x_axis = list(range(1, len(values_list) + 1))
    x_axis = np.divide(x_axis,int(len(values_list)+1))

    # No y-axis limit is computed here: this plot does not clamp the y range,
    # and the unused lookup raised IndexError on an empty dataframe.

    # Create a bar chart
    fig = go.Figure(data=[go.Bar(x=x_axis, y=values_list, orientation='v', marker=dict(color='red'))])

    # Set the plot title and axis labels
    fig.update_layout(title= str(feature) + ' SSL DF Distribution', xaxis = dict(title='Samples'),
                      yaxis = dict(title = 'Value', gridcolor = 'rgba(255, 0, 0, 0.2)'),
                      plot_bgcolor='white', font=dict(family="Calibri, sans-serif", size=16))

    # Show the plot
    fig.show()

#### 9.5 ANOMALOUS VS NON ANOMALOUS FEATURES STATISTICS PLOT ####

*These plots allow a direct comparison of the statistics of the anomalous and the non-anomalous connections, giving a picture of the ML classification results.*

In [None]:
# One overlaid bar chart per feature: anomalous (red) against non-anomalous (blue)
for feature in features:
    # Feature values of each population, in ascending order
    anomalous_values_list = sorted(anomalous_conn_df[feature])
    nanomalous_values_list = sorted(nanomalous_conn_df[feature])

    # Normalised rank axes: rank k of n values is placed at k / (n + 1)
    n_anomalous = len(anomalous_values_list)
    n_nanomalous = len(nanomalous_values_list)
    x_axis_anomalous = np.arange(1, n_anomalous + 1) / (n_anomalous + 1)
    x_axis_nanomalous = np.arange(1, n_nanomalous + 1) / (n_nanomalous + 1)

    # Upper limit of the y axis: the anomalous value found 95% of the way through the sorted list
    pos_095 = int(n_anomalous * 0.95)
    y_max = anomalous_values_list[pos_095]

    # Create traces
    anomalous_trace = go.Bar(x=x_axis_anomalous, y=anomalous_values_list,
                             name='Anomalous Trace', orientation='v', marker=dict(color='red'))
    nanomalous_trace = go.Bar(x=x_axis_nanomalous, y=nanomalous_values_list,
                              name='Non Anomalous Trace', orientation='v', marker=dict(color='blue'))

    # Set the plot title and axis labels
    layout = go.Layout(
        title=str(feature) + ' SSL DF Distribution',
        xaxis=dict(title='Samples'),
        yaxis=dict(title='Value', gridcolor='rgba(255, 0, 0, 0.2)', range=[0, y_max]),
        plot_bgcolor='white',
        font=dict(family="Calibri, sans-serif", size=16),
        barmode='overlay',
    )

    # Both traces share one figure
    data = [anomalous_trace, nanomalous_trace]
    fig = go.Figure(data=data, layout=layout)

    fig.show()

### 10. HISTOGRAM ANALYSIS - FULL DATAFRAME ###

In [None]:
# Histogram plot for each feature
# -- ylabel = frequency of the feature
# -- xlabel = value of the feature (ascending order)
# NOTE(review): the title hard-codes 'DNS DF' whatever service was selected - confirm.

for col in conn_features_df.columns:
    fig = px.histogram(conn_features_df, x=col, title=f'Histogram for {col}, DNS DF', marginal="rug", height=300, width=1000)
    fig.update_xaxes(showticklabels=False, title='Value',row=1, col=1)
    fig.update_yaxes(title = 'Frequency of '+str(col), row=1, col=1)
    # Margins are a layout property. update_layout_images only updates images
    # attached to the layout, so the margin was never applied through it.
    fig.update_layout(margin=dict(l=40, r=40, t=40, b=40))
    fig.show()