<a href="https://colab.research.google.com/github/MiguelCarbo/Anomalies-Detection-TFG/blob/main/2_IForest_Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANOMALY EXPLORATION - TFG #
### <i> Miguel Ivars Carbo, 795802@unizar.es </i> ###  
<i> The main idea of this notebook is to provide a simple and understandable script that runs a basic Isolation Forest on a connection log. </i>  
<i> Further exploration of this topic is needed in order to obtain the expected results. </i>

In [None]:
# Reset Variables
%reset

In [None]:
# C:\Users\mivar\Documents\04_MIGUEL\01UNIVERSIDAD\TFG\Zeek\Packages\zat-main\zat-main

# Packet Unzipping
!unzip zat.zip;

# Local Imports
import zat;
from zat.log_to_dataframe import LogToDataFrame;
from zat.dataframe_to_matrix import DataFrameToMatrix;

# Packet Imports
import pandas as pd;
import numpy as np;
from numpy import savetxt;
import sklearn;
from sklearn.ensemble import IsolationForest;
from sklearn.decomposition import PCA;
from sklearn.cluster import KMeans, DBSCAN;

# Report the versions of the main libraries used by this notebook
print(f'zat: {zat.__version__:s}')
print(f'Pandas: {pd.__version__:s}')
print(f'Numpy: {np.__version__:s}')
print(f'Scikit Learn Version: {sklearn.__version__}')

### 1. CHOOSING LOG TO CREATE DATAFRAME FROM ###



In [None]:
# Parse the Zeek log with zat and load it as a pandas DataFrame.
# Alternative input used during development: 'conn.log'
#   conn_df = log_to_df.create_dataframe('conn.log')
log_to_df = LogToDataFrame()
conn_df = log_to_df.create_dataframe('mpli_conn_ts.log')

# Optional subsampling for quicker experiments (disabled):
#   conn_df = conn_df.sample(frac=0.75, random_state=42)
print(f'Read in {len(conn_df):d} Rows...')
print(conn_df.columns)

### 2. FEATURE SELECTION ###

#### 2.A CONN.LOG ####

In [None]:
# Candidate feature columns for a conn.log.
# NOTE: section 2.B assigns `features` again, so when the notebook is run from
# top to bottom the 2.B selection is the one that is actually used.
features = [
    'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
    'conn_state', 'local_orig', 'local_resp', 'missed_bytes',
    'history', 'orig_pkts', 'resp_pkts', 'tunnel_parents',
]
# Reduced alternative:
#   features = ['proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
#               'missed_bytes', 'history', 'orig_pkts', 'resp_pkts']

#### 2.B AMPLI_CONN.LOG ####

In [None]:
# Feature columns used for the ampli_conn.log file.
#
# Full column list of that log, for reference:
#   ['ts', 'sourceAddress', 'sourcePort', 'destinationAddress', 'destinationPort',
#    'service', 'duration', 'orig_bytes', 'resp_bytes', 'history', 'orig_pkts',
#    'resp_pkts', 'mediaOrigen', 'mediaResp', 'desvOrigen', 'desvResp',
#    'mediaTime', 'desvTime']
features = [
    'service', 'duration', 'orig_bytes', 'resp_bytes', 'history',
    'orig_pkts', 'resp_pkts',
    'mediaOrigen', 'mediaResp', 'desvOrigen', 'desvResp',
    'mediaTime', 'desvTime',
]

### 3. DATA PREPARATION ###

#### 3.1 DATA CLEANSING ####

In [None]:
# Keep only the selected feature columns and discard every row that has a
# missing value in any of them
conn_features_df = conn_df[features].dropna()

In [None]:
# Make every selected feature free of missing values and convert timedeltas
# to numbers:
#   * categorical columns -> missing values become the extra category '0'
#   * timedelta columns   -> missing values become 0 s, then whole seconds
#   * any other column    -> missing values become 0
for feature in features:
    column = conn_features_df[feature]
    if column.dtype.name == 'category':
        # Add the placeholder category only when it is absent, but ALWAYS fill
        # with the string '0'. Previously a categorical column that already had
        # '0' (e.g. when this cell was run a second time) fell through to
        # fillna(0), and the integer 0 is not a valid value for that categorical.
        if '0' not in column.cat.categories:
            column = column.cat.add_categories(['0'])
        conn_features_df[feature] = column.fillna('0')
    elif pd.api.types.is_timedelta64_dtype(column):
        seconds = column.fillna(pd.Timedelta(0)).dt.total_seconds()
        # NOTE(review): astype(int) drops the fractional part, so sub-second
        # values become 0 - confirm that this loss of resolution is intended.
        conn_features_df[feature] = seconds.astype(int)
    else:
        conn_features_df[feature] = column.fillna(0)

#### 3.2 CREATION OF MATRIX BASED ON FEATURES ####

In [None]:
# Build the feature matrix with zat's DataFrameToMatrix class (it handles
# categorical data). Only the selected features are part of the matrix.
to_matrix = DataFrameToMatrix()
conn_feature_matrix = to_matrix.fit_transform(conn_features_df, normalize=True)
conn_feature_matrix.shape

### 4. MODEL TRAINING - ISOLATION FOREST ###


In [None]:
# Train/fit the Isolation Forest model on the complete feature matrix.
# contamination=0.05 -> the model flags about 5% of the rows as anomalous.
# random_state pins the random construction of the trees so that the detected
# anomalies (and everything derived from them below) are the same on every run.
odd_clf = IsolationForest(contamination=0.05, random_state=42)
odd_clf.fit(conn_feature_matrix)

In [None]:
# -- Build a new dataframe from the classifier's predictions.
# odd_clf.predict(conn_feature_matrix) returns one label per row of the matrix:
# 1 for rows predicted to be normal and -1 for rows predicted to be anomalous.
# The boolean mask `== -1` selects the anomalous rows of conn_features_df
# (the cleaned feature dataframe the matrix was built from).
#
# .copy() makes odd_df an independent dataframe: later cells add columns to it
# (x, y, cluster, ...), which on a plain slice raises SettingWithCopyWarning
# and may not write where expected.
odd_df = conn_features_df[odd_clf.predict(conn_feature_matrix) == -1].copy()
# print(odd_df.shape)
# odd_df.head()

# Matrix form of the anomalous rows, used by the clustering steps below.
# NOTE: this calls fit_transform again on the same to_matrix object.
odd_matrix = to_matrix.fit_transform(odd_df)

### 5. CLUSTERING THE ODD DATAFRAME ####

#### 5.A CLUSTERING WITH KMEANS + DIM.REDUCTION USING PCA / TSNE ####

##### 5.A.1 SILHOUETTE SCORING #####

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette analysis: cluster the anomalies with K-Means for several values
# of K and record the silhouette score of each partition to help choose K.
scores = []
clusters = range(2,14)

for K in clusters:
    # KMeans partitions the rows of odd_matrix into K clusters based on the
    # similarity of their features. random_state fixes the centroid
    # initialisation so the score curve is reproducible between runs.
    clusterer = KMeans(n_clusters=K, random_state=42)
    # fit_predict() fits the model on odd_matrix and returns one label per
    # row; labels are integers from 0 to K-1 naming the assigned cluster.
    cluster_labels = clusterer.fit_predict(odd_matrix)
    score = silhouette_score(odd_matrix, cluster_labels)
    scores.append(score)

# Plot it out
pd.DataFrame({'Num Clusters':clusters, 'score':scores}).plot(x='Num Clusters', y='score')

In [None]:
# Same step but with fixed number of clusters
# (random_state makes the cluster labels reproducible between runs)
kmeans_labels = KMeans(n_clusters=10, random_state=42).fit_predict(odd_matrix)

# 2D - Projection with PCA / TSNE
# Only the first two principal components are used below, so exactly two are
# requested. The previous value, odd_df.shape[1], depended on how many columns
# odd_df had at that moment, and this very cell adds columns to odd_df, so the
# value changed every time the cell was re-run.
n_components = 2
projection = PCA(n_components=n_components).fit_transform(odd_matrix)
# projection = TSNE().fit_transform(odd_matrix);

# -- PCA performs dimensionality reduction on the anomalous connections: the
# high-dimensional feature vectors are transformed into a lower-dimensional
# space defined by the principal components.

# Now we can put our ML results back onto our dataframe!
#   'x'       -> coordinate on the first principal component
#   'y'       -> coordinate on the second principal component
#   'cluster' -> cluster label assigned by the KMeans algorithm
# These columns are used to visualise the anomalies and the clusters they
# belong to, and to analyse the patterns behind the anomalous behaviour.
odd_df['x'] = projection[:, 0] # Projection X Column
odd_df['y'] = projection[:, 1] # Projection Y Column
odd_df['cluster'] = kmeans_labels
odd_df.head()

##### 5.A.2 PLOTTING DEFAULTS #####

In [None]:
# Plotting Configurations
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14.0
plt.rcParams['figure.figsize'] = 12.0, 6.0

##### 5.A.3 GRAPHICAL CLUSTER VISUALIZATION #####

In [None]:
# Helper method for scatter/beeswarm plot
def jitter(arr):
    """Return ``arr`` with a small amount of Gaussian noise added to each value.

    The standard deviation of the noise is 2% of the value range
    (max - min) of ``arr``, so a constant input is returned without change.

    Parameters
    ----------
    arr : sequence of numbers (pandas Series, NumPy array or list).

    Returns
    -------
    The result of ``arr + noise``; an empty input is returned as is.
    """
    values = np.asarray(arr)
    if values.size == 0:
        # Nothing to spread out; max()/min() of an empty sequence would raise.
        return arr
    # Vectorised reductions instead of the Python builtins max()/min(), which
    # iterate over the data one element at a time.
    stdev = .02 * (values.max() - values.min())
    return arr + np.random.randn(len(arr)) * stdev
    
# Spread the projected points slightly so that instances which coincide in 2D
# remain distinguishable
odd_df['jx'] = jitter(odd_df['x'])
odd_df['jy'] = jitter(odd_df['y'])

# One group of rows per K-Means cluster label
cluster_groups = odd_df.groupby('cluster')

# Colour used for each cluster label (labels 0..11)
colors = dict(enumerate([
    'green', 'blue', 'red', 'orange', 'purple', 'brown',
    'yellow', 'magenta', 'grey', 'cyan', 'pink', 'olive',
]))

# Plot the Machine Learning results: one scatter series per cluster
fig, ax = plt.subplots()
for key, group in cluster_groups:
    group.plot.scatter(x='jx', y='jy', ax=ax, alpha=0.5, s=250,
                       label=f'Cluster: {key:d}', color=colors[key])

##### 5.A.4 CLUSTER DETAIL PRINTING #####

In [None]:
# Show the size of every K-Means cluster followed by its first rows
pd.set_option('display.width', 1000)
for key, group in cluster_groups:
    size = len(group)
    print(f'\nCluster {key:d}: {size:d} observations')
    print(group[features].head())

#### 5.B USING DBSCAN ####

In [None]:
# Now try DBScan
odd_df['cluster_db'] = DBSCAN().fit_predict(odd_matrix)

# DBSCAN gives the label -1 to noise points. That label is not a cluster, so
# it is excluded from the cluster count and reported separately.
db_labels = odd_df['cluster_db']
print('Number of Clusters: {:d}'.format(db_labels[db_labels != -1].nunique()))
print('Noise points (label -1): {:d}'.format(int((db_labels == -1).sum())))

##### 5.B.1 PLOTTING DEFAULTS #####

In [None]:
# Plotting Configurations
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14.0
plt.rcParams['figure.figsize'] = 12.0, 6.0

##### 5.B.2 CLUSTER RESULTS VISUALIZATION #####

In [None]:
from sklearn.manifold import TSNE;

# Projection using the t-SNE algorithm (PCA is the alternative used in 5.A).
# random_state fixes the random part of t-SNE so the layout of the plot is
# the same on every run.
projection = TSNE(random_state=42).fit_transform(odd_matrix)

# These assignments replace the values previously stored in 'x' and 'y'
odd_df['x'] = projection[:, 0] # Projection X Column
odd_df['y'] = projection[:, 1] # Projection Y Column

# Helper method for scatter/beeswarm plot
def jitter(arr):
    """Return ``arr`` with a small amount of Gaussian noise added to each value.

    The standard deviation of the noise is 2% of the value range
    (max - min) of ``arr``, so a constant input is returned without change.

    Parameters
    ----------
    arr : sequence of numbers (pandas Series, NumPy array or list).

    Returns
    -------
    The result of ``arr + noise``; an empty input is returned as is.
    """
    values = np.asarray(arr)
    if values.size == 0:
        # Nothing to spread out; max()/min() of an empty sequence would raise.
        return arr
    # Vectorised reductions instead of the Python builtins max()/min(), which
    # iterate over the data one element at a time.
    stdev = .02 * (values.max() - values.min())
    return arr + np.random.randn(len(arr)) * stdev

# Jitter so we can see instances that are projected coincident in 2D
odd_df['jx'] = jitter(odd_df['x'])
odd_df['jy'] = jitter(odd_df['y'])

# Now use dataframe group by cluster
cluster_groups_db = odd_df.groupby('cluster_db')

# Plot the Machine Learning results
# Fixed colours for the noise label (-1) and the first four clusters.
colors = {-1:'green', 0:'blue', 1:'red', 2:'orange', 3:'purple'}
# DBSCAN chooses the number of clusters itself, so labels above 3 can appear;
# with the fixed dictionary alone colors[key] raised KeyError for them.
# Labels 4, 5, ... take these colours in order and wrap around when exhausted.
extra_colors = ['brown', 'yellow', 'magenta', 'grey', 'cyan', 'pink', 'olive', 'navy', 'aquamarine']

fig, ax = plt.subplots()
for key, group in cluster_groups_db:
    if key in colors:
        color = colors[key]
    else:
        color = extra_colors[(key - 4) % len(extra_colors)]
    group.plot(ax=ax, kind='scatter', x='jx', y='jy', alpha=0.5, s=250,
               label='Cluster_DB: {:d}'.format(key), color=color)

##### 5.B.3 CLUSTER RESULTS PRINTING #####

In [None]:
# Show the size of every DBSCAN group followed by its first rows
pd.set_option('display.width', 1000)
for key, group in cluster_groups_db:
    size = len(group)
    print(f'\nCluster {key:d}: {size:d} observations')
    print(group[features].head())

### 6. HISTOGRAM GENERATION ###  
<i> Remember that this is only valid for numerical data, not for categorical data </i>

In [None]:
# Distribution of the 'orig_bytes' column over all rows of conn_df
conn_df[['orig_bytes']].hist()
# The previous message talked about 'request_body_len' and 'Cluster 2', neither
# of which is related to the column plotted here; describe the plot instead.
print('\nHistogram of orig_bytes for all rows of conn_df')

### 7. RESULT ANALYSIS ###

#### 7.A FROM CSV LABELED DATA ####

##### 7.1 ORIGINAL LABELED DATA IMPORTING #####

In [None]:
# Original Source with Labeled Data
original_df = pd.read_csv('Tuesday-WorkingHours.pcap_ISCX.csv')
print(original_df.columns)

# Drop every row whose ' Label' is 'BENIGN', keeping the remaining flows
malign_original_df = original_df.drop(original_df[original_df[' Label'] == 'BENIGN'].index)

# Connection = SrcIP + DstIP + SrcPort + DstPort
# Rename the CSV columns (their names start with a space) to the names used
# for the connection log. The previous test
#   (' Source IP' and ' Destination IP' and ...) in original_df.columns
# evaluated to `' Destination Port' in original_df.columns`, i.e. it only
# looked at the last name. rename() ignores names that are absent, so the
# presence of all four target columns is verified explicitly afterwards.
flow_column_names = {
    ' Source IP': 'sourceAddress',
    ' Destination IP': 'destinationAddress',
    ' Source Port': 'sourcePort',
    ' Destination Port': 'destinationPort',
}
malign_original_df = malign_original_df.rename(columns=flow_column_names)
missing_columns = [name for name in flow_column_names.values()
                   if name not in malign_original_df.columns]
if missing_columns:
    raise KeyError(f'Labeled CSV does not provide the flow columns: {missing_columns}')

# Unique malign labeled Flows
# NOTE: the four fields are concatenated without a separator, exactly as the
# cell that builds 'flux_id' for the detected odd flows does; both must use the
# same format for the comparison to work.
malign_original_df['flux_id'] = (
    malign_original_df['sourceAddress'].astype(str)
    + malign_original_df['destinationAddress'].astype(str)
    + malign_original_df['sourcePort'].astype(str)
    + malign_original_df['destinationPort'].astype(str)
)
malign_flux_id_values = malign_original_df['flux_id'].unique()


##### 7.2 ODD DATAFRAME PREPARATION #####

In [None]:
# Recover the remaining conn_df columns for the odd rows by joining both
# frames on the feature columns
merged_df = odd_df.merge(conn_df, on=features)

# New Field = IP Src + IP Dest + PSrc + PDest (plain concatenation as text)
source_address = merged_df['sourceAddress'].astype(str)
destination_address = merged_df['destinationAddress'].astype(str)
source_port = merged_df['sourcePort'].astype(str)
destination_port = merged_df['destinationPort'].astype(str)
merged_df['flux_id'] = source_address + destination_address + source_port + destination_port

# Unique detected odd Flows
odd_flux_id_values = merged_df['flux_id'].unique()


##### 7.3 ANOMALIES COMPARISON #####

In [None]:
# Positive Detection
# Work on sets of unique flow identifiers: the intersection gives the flows
# that were both flagged as odd and labeled malign, without scanning the whole
# malign array once per odd flow.
odd_flows = set(odd_flux_id_values)
malign_flows = set(malign_flux_id_values)
detected_positives = len(odd_flows & malign_flows)

# detected_positives counts unique flows, so it is compared with the numbers of
# unique odd / malign flows. The previous code subtracted it from row counts
# (odd_df.shape[0], malign_original_df.shape[0]), which mixes two different
# units whenever a flow appears in more than one row.
print('Number of successes:', detected_positives)
print('False Positives: ', len(odd_flows) - detected_positives)
print('Undetected Malign:', len(malign_flows) - detected_positives)

#### 7.B FROM LABELED CONN.LOG ####

In [None]:
# Dataframe labeled by the custom script
labeled_conn_df = log_to_df.create_dataframe('mpli_conn_label.log')
# .copy() gives an independent dataframe, so adding the 'flux_id' column below
# does not write into a slice of labeled_conn_df (SettingWithCopyWarning).
malign_conn_df = labeled_conn_df[labeled_conn_df['label'] == 'MALIGN'].copy()
malign_conn_df['flux_id'] = (
    malign_conn_df['sourceAddress'].astype(str)
    + malign_conn_df['destinationAddress'].astype(str)
    + malign_conn_df['sourcePort'].astype(str)
    + malign_conn_df['destinationPort'].astype(str)
)

In [None]:
# Rebuild the odd rows with all conn_df columns, joining on the feature
# columns plus 'ts'.
# features_ts is the SAME list object as features (no copy is made), so 'ts'
# is added to features as well and later uses of features include it.
# The membership test keeps this cell idempotent: without it, every re-run
# appended one more 'ts' to the list.
features_ts = features
if 'ts' not in features_ts:
    features_ts.append('ts')
odd_features_df = pd.merge(odd_df, conn_df, on=features_ts)
odd_features_df['flux_id'] = (
    odd_features_df['sourceAddress'].astype(str)
    + odd_features_df['destinationAddress'].astype(str)
    + odd_features_df['sourcePort'].astype(str)
    + odd_features_df['destinationPort'].astype(str)
)


In [None]:
# Odd rows that also appear among the MALIGN-labeled connections, matched in
# two ways: by flow identifier (1) and by the columns listed in features (2)
malign_common_df_1 = odd_features_df.merge(malign_conn_df, on='flux_id')
malign_common_df_2 = odd_features_df.merge(malign_conn_df, on=features)

# Number of matching rows found by each method
positives1, positives2 = len(malign_common_df_1), len(malign_common_df_2)

print(positives1)
print(positives2)