In [None]:
import nest_asyncio
nest_asyncio.apply()
from Kitsune import Kitsune
import numpy as np
import os
import pickle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11,9)})
import ast
from tqdm.notebook import tqdm 
import pandas as pd
import pyshark
from py2neo import Graph, Node, Relationship
import webbrowser
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, precision_score, 
                             average_precision_score, roc_auc_score, roc_curve, RocCurveDisplay, 
                             precision_recall_curve, PrecisionRecallDisplay, mean_squared_error,
                             det_curve, DetCurveDisplay)
%matplotlib inline
%load_ext snakeviz

In [None]:
# Root folder of the recorded captures and the files derived from them.
data_dir = 'data_sdc11073/results'

# Every path below is data_dir joined with one file name; the order of the
# names on the left matches the order of the file names on the right.
(
    pcapng_traintestData_path,
    pcap_traintestData_path,
    csv_traintestData_path,
    csv_traintestData_gt_path,
    tsv_traintestData_path,
    pcap_realData_path,
    pcapng_trainData_path,
    csv_hyperparaOpt_infos,
) = (
    os.path.join(data_dir, file_name)
    for file_name in (
        'capture_OPtable_traintest2.pcapng',
        'capture_OPtable_traintest.pcap',
        'capture_OPtable_traintest2.csv',
        'capture_OPtable_traintest2_gt.csv',
        'capture_OPtable_traintest2.pcapng.tsv',
        'capture_randomRealData.pcap',
        'capture_OPtable_2.pcapng',
        'hyperpara_optimization2_infos.csv',
    )
)

# Anomaly_startIdx: packet index used to annotate the attack start in the plots.
# FMgrace / ADgrace: grace periods passed to Kitsune; FMgrace + ADgrace is used
# further down as the first index of the execution phase.
Anomaly_startIdx, FMgrace, ADgrace = 97790, 5000, 60000

In [None]:
# Folder with the synthetic attack capture (port scan, judging by the file
# names) together with its ground truth and per-packet info export.
id2t_data_dir = 'data_sdc11073/ID2T_synth_attack/OPtable_merged_and_gt'

# The three files share one base name and differ only in their suffix.
synthAnomaly_data, synthAnomaly_gt, synthAnomaly_infos = (
    os.path.join(id2t_data_dir, 'capture_OPtable_PortscanAttack' + suffix)
    for suffix in ('.pcapng', '_gt.csv', '.csv')
)

# Running this cell re-assigns the attack start index and the grace periods
# for this dataset (whatever values they held before are overwritten).
Anomaly_startIdx, FMgrace, ADgrace = 70000, 5000, 55000

In [None]:
# Hyperparameters handed to the Kitsune constructor in the cells below.
# The commented values record the library defaults and earlier tuning rounds.

# No upper bound on the number of packets. `np.inf` is used because the
# capitalised alias `np.Inf` was removed in NumPy 2.0.
packet_limit = np.inf
#maxAE = 10 # default Value
#maxAE = 9 # first tuning
maxAE = 4 # second tuning
#learning_rate = 0.1 # default Value
#learning_rate = 0.0053 # first tuning
learning_rate = 0.1560 # second tuning
#hidden_ratio = 0.75 # default Value
#hidden_ratio = 0.2264 # first tuning
hidden_ratio = 0.5306 # second tuning
#sensitivity = 1 # default Value
#sensitivity = 0.1979 # first tuning
#sensitivity = 0.2121 # second tuning
# Factor applied to NIDS.phi to obtain the anomaly threshold (threshold_phi).
sensitivity = 0.45

## Dataset Visualization

In [None]:
# Load the comma-separated per-packet export (first row is the header) and
# show ten random rows as a quick sanity check.
# Alternative input: csv_traintestData_path
#df = pd.read_csv(csv_traintestData_path, delimiter=',', encoding='utf8', header=0)
df = pd.read_csv(
    synthAnomaly_infos,
    sep=',',
    header=0,
    encoding='utf8',
)
df.sample(n=10)

In [None]:
# Load the tab-separated export of the train/test capture; this replaces the
# `df` loaded in the cell before. Ten random rows are displayed.
df = pd.read_csv(
    tsv_traintestData_path,
    sep='\t',
    header=0,
    encoding='utf8',
)
df.sample(n=10)

In [None]:
# One marker per packet: source port on the x axis, destination port on the y axis.
plt.rcParams.update({'figure.figsize': (11,9)})
plt.plot(
    df['Source Port'],
    df['Destination Port'],
    marker='o',
    linestyle='none',  # markers only, no connecting line
)
#plt.title('Connections between different clients and devices')
plt.gca().set(xlabel='Source Port', ylabel='Destination Port')
#plt.savefig('PortscanDataConnections.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Bar per packet: capture time against the delay to the previously captured frame.
plt.bar(x=df['Time'], height=df['Time delta from previous captured frame'])
#plt.plot(df['Time'], df['Time delta from previous captured frame'], '-')
plt.gca().set(
    title='Time delays between consecutive data frames',
    xlabel='Time [s]',
    ylabel='Delay [s]',
)
plt.show()

In [None]:
# Total of the 'Length' column per source port, converted to MiB (1024 * 1024)
# and drawn as horizontal bars.
df_ = df.groupby('Source Port')['Length'].agg('sum')
df_mb = df_.div(1024 * 1024)
df_mb.plot.barh(title='Sum of packet length [MB] per source port')

In [None]:
# Bar per packet: capture time against packet length.
plt.bar(x=df['Time'], height=df['Length'])
plt.gca().set(
    title='Packet Length over time',
    xlabel='Time [s]',
    ylabel='Packet Length [B]',
)
plt.show()

In [None]:
# Number of packets per protocol.
# The rows of each group are counted directly. The previous version summed the
# frame numbers ('No.') and divided by a constant tied to one specific capture
# length, which only approximates a count when every protocol is spread evenly
# over that capture.
df_ = df.groupby('Protocol').size()
# Name kept for cells/sessions that still refer to it; the values are plain counts.
df_scaled = df_
df_scaled.plot(kind='barh', title='Distribution [count] of different protocols')

### Network Traffic Graph in Neo4j Browser

In [None]:
# Connect to the Neo4j instance with py2neo's default connection settings.
# The password is read from the NEO4J_PASSWORD environment variable so that no
# real credential has to be stored in the notebook; the previous placeholder
# literal is kept as fallback when the variable is not set.
neo4jGraph = Graph(password=os.environ.get('NEO4J_PASSWORD', '******'))
# Address that is opened in the web browser after the graph has been filled.
url = 'http://localhost:7474/db/data/'

In [None]:
# parsing with pyshark: visualize connections between different ports
# Every UDP or TCP packet is written to Neo4j as
#   (Host {name: source port}) -[relation]-> (Host {name: destination port}),
# merged on the label 'Host' and the property 'name'.
# Relation names: 'UDP', 'TCP', or 'HTTP_GET' / 'HTTP_POST' for TCP packets that
# carry an HTTP request with that method. Packets without UDP/TCP are skipped.
packets = pyshark.FileCapture(input_file=pcapng_traintestData_path)
try:
    for packet in packets:
        if hasattr(packet, 'udp'):
            relation_name = 'UDP'
        elif hasattr(packet, 'tcp'):
            relation_name = 'TCP'
            # HTTP is handled inside the TCP case. As a separate `elif` after
            # the TCP test it could never be reached.
            if hasattr(packet, 'http'):
                field_names = packet.http._all_fields
                http_method = {val for key, val in field_names.items() if key == 'http.request.method'}
                if 'GET' in str(http_method):
                    relation_name = 'HTTP_GET'
                elif 'POST' in str(http_method):
                    relation_name = 'HTTP_POST'
                # Any other HTTP packet (e.g. one without a request method)
                # keeps the plain 'TCP' relation instead of reusing a stale
                # or undefined relationship type.
        else:
            continue

        protocol = packet.transport_layer
        srcPort = str(packet[protocol].srcport)
        dstPort = str(packet[protocol].dstport)
        firstNode = Node('Host', name=srcPort)
        secondNode = Node('Host', name=dstPort)
        relation = Relationship.type(relation_name)
        neo4jGraph.merge(relation(firstNode, secondNode), 'Host', 'name')
finally:
    # Release the capture (and the tshark process behind it) even if parsing fails.
    packets.close()

In [None]:
# parsing with pyshark: visualize connections between different IP addresses
# not suitable to visualize traffic from loopback adapter, because only ports change
# Every packet whose layer list mentions TCP, UDP or HTTP and whose first layer
# mentions IPv4 or IPv6 is written to Neo4j as
#   (Host {name: source IP}) -[<transport>_IPv4 | <transport>_IPv6]-> (Host {name: destination IP}).
packets = pyshark.FileCapture(input_file=pcap_realData_path)
try:
    for packet in packets:
        layers_text = str(packet.layers)
        if not any(name in layers_text for name in ('TCP', 'UDP', 'HTTP')):
            continue

        first_layer_text = str(packet.layers[0])
        if 'IPv4' in first_layer_text:
            ip_layer, suffix = packet.ip, '_IPv4'
        elif 'IPv6' in first_layer_text:
            ip_layer, suffix = packet.ipv6, '_IPv6'
        else:
            continue

        proto = packet.transport_layer
        srcIP = ip_layer.src
        dstIP = ip_layer.dst
        firstNode = Node('Host', name=srcIP)
        secondNode = Node('Host', name=dstIP)
        relation = Relationship.type(str(proto) + suffix)
        neo4jGraph.merge(relation(firstNode, secondNode), 'Host', 'name')
finally:
    # Release the capture (and the tshark process behind it) even if parsing fails.
    packets.close()

In [None]:
# Query all nodes labelled 'Host'; the result is stored but not used further here.
test_cypher_command = neo4jGraph.run('MATCH (n:Host) RETURN n')
# Open `url` in the web browser (new=2 asks for a new tab where possible).
webbrowser.open(url, new=2)

## Runtime Analysis

In [None]:
#%%snakeviz
# Offline run: feed the capture to Kitsune packet by packet and collect the
# value returned for each packet in RMSEs. A return value of -1 ends the run.
# Alternative input: pcapng_traintestData_path
#NIDS = Kitsune(pcapng_traintestData_path, packet_limit, maxAE, FMgrace, ADgrace, learning_rate, hidden_ratio, sensitivity)
NIDS = Kitsune(synthAnomaly_data, packet_limit, maxAE, FMgrace, ADgrace, learning_rate, hidden_ratio, sensitivity)

RMSEs = []
packet_count = NIDS.packet_count

for i in tqdm(range(packet_count)):
    rmse = NIDS.proc_next_packet()
    if rmse != -1:
        RMSEs.append(rmse)
    else:
        break

# Keep what the evaluation cells need: the threshold (phi scaled by the
# sensitivity), the anomaly detector, the logs and the feature map.
threshold_phi = NIDS.phi * sensitivity
model, logs, feature_map = NIDS.AnomDetector, NIDS.logs, NIDS.feature_map

In [None]:
%%snakeviz
NIDS = Kitsune(pcapng_trainData_path, packet_limit, maxAE, FMgrace, ADgrace, learning_rate, hidden_ratio, sensitivity)

RMSEs_train = NIDS.proc_packets_train()
RMSEs_exec = NIDS.proc_packets_live(timeout=60)

threshold_phi = NIDS.phi * sensitivity
model = NIDS.AnomDetector

## Hyperparameter Tuning

### Bayesian Optimization

In [None]:
# Load the result of the hyperparameter search: the best parameter set and the
# trials object, stored together in one pickle file.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open('models/hyperpara_27092021.pkl', 'rb') as f:
    best_hyperpara, trials = pickle.load(f)

max_AE = best_hyperpara['max_AE']
# The Kitsune(...) calls in this notebook read `maxAE`, not `max_AE`. Without
# this assignment the tuned autoencoder size would be ignored while the other
# three tuned values (assigned below) are picked up.
# int(): the stored value may be a float; the manually set maxAE values are integers.
maxAE = int(max_AE)
learning_rate = best_hyperpara['learning_rate']
hidden_ratio = best_hyperpara['hidden_ratio']
sensitivity = best_hyperpara['sensitivity']

print(f'optimized size of autoencoders:         {max_AE}')
print(f'optimized learning rate:                {learning_rate}')
print(f'optimized hidden ratio:                 {hidden_ratio}')
print(f'optimized sensitivity (threshold):      {sensitivity}')

In [None]:
# Turn the optimisation log into one numeric table: one row per trial, one
# column per hyperparameter plus 'loss' and 'iteration', best loss first.
df = (
    pd.read_csv(csv_hyperparaOpt_infos, delimiter=',', encoding='utf8', header=0)
    .sort_values('loss', ascending=True)
    .reset_index(drop=True)
)

# The 'params' column holds each trial's parameters as the text of a dict.
param_records = [ast.literal_eval(param_text) for param_text in df['params']]

# Building the frame from the dicts matches values to columns by key, so a
# trial whose dict lists the keys in a different order still lands in the
# right columns. Column order follows the keys of the best trial.
df_hyperparams = pd.DataFrame(
    param_records,
    columns=list(param_records[0].keys()),
    index=df.index,
)
df_hyperparams['loss'] = df['loss']
df_hyperparams['iteration'] = df['iteration']

df_hyperparams = df_hyperparams.astype(float)
df_hyperparams.head()

In [None]:
# One panel per quantity: its value over the optimisation iterations, with a
# regression line drawn by seaborn.
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(25, 5))

tracked = ['max_AE', 'learning_rate', 'hidden_ratio', 'sensitivity', 'loss']
for i, param in enumerate(tracked):
    panel = axs[i]
    sns.regplot(x=df_hyperparams['iteration'], y=df_hyperparams[param], ax=panel)
    panel.set_xlabel('Iteration')
    panel.set_ylabel(f'{param}')
    panel.set_title(f'{param} while Optimization')

plt.tight_layout()

## Evaluation of NIDS (offline)

In [None]:
# Save / load the results of an offline run. Only the last `with` block is
# active: it replaces model, RMSEs, threshold_phi, logs and feature_map with the
# contents of the stored file.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
# NOTE(review): the active file name says "sdcDoS", while the ground truth and
# the figure names used further down refer to the port scan data - confirm that
# model and ground truth belong to the same capture.
#with open('models/model_id2t_SMBScan.pkl', 'wb') as f:
#    pickle.dump([model, RMSEs, threshold_phi, logs, feature_map], f)
#with open('models/model_id2t_Portscan.pkl', 'rb') as f:
#     model, RMSEs, threshold_phi, logs, feature_map = pickle.load(f)
with open('models/model_sdcDoS_optimized.pkl', 'rb') as f:
     model, RMSEs, threshold_phi, logs, feature_map = pickle.load(f)

### RMSEs and Threshold

In [None]:
# Anomaly scores of the execution phase (all packets after FMgrace + ADgrace),
# coloured by score on a logarithmic scale, with the threshold as dashed line.
# The figure is written to 'DoSRMSEexec.pdf'.
# plt.get_cmap is used because matplotlib.cm.get_cmap (reached via plt.cm) was
# removed in Matplotlib 3.9.
cm = plt.get_cmap('RdYlGn_r')
plt.figure(figsize=(10,5))
x = range(FMgrace+ADgrace+1,len(RMSEs))
y = RMSEs[FMgrace+ADgrace+1:]
fig = plt.scatter(x,y,c=y,norm=matplotlib.colors.LogNorm(),s=0.5,cmap=cm)
plt.axhline(y=threshold_phi, color='r', linestyle='--')
plt.yscale('log')
#plt.title('Anomaly Scores from Network IDS - Execution Phase')
plt.ylabel('RMSE (log scaled)')
plt.xlabel('Packet No.')
figbar=plt.colorbar()
#plt.annotate('Start of Scan attack', (Anomaly_startIdx, 0.25), xytext=(Anomaly_startIdx-2500, 0.25+0.3), 
#             arrowprops = dict(arrowstyle='fancy'))
#plt.axvspan(Anomaly_startIdx, len(RMSEs), color='red', alpha=0.1)
#plt.grid(True)
plt.savefig('DoSRMSEexec.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Loss during the training phase of the anomaly detector, with a regression
# line to show the trend.
# The phase is taken as [FMgrace, FMgrace + ADgrace), which matches the start of
# the execution phase used by the other evaluation cells (FMgrace + ADgrace).
# The previous slice stopped at ADgrace and so left out the last FMgrace packets.
plt.figure(figsize=(10,5))
train_start = FMgrace
train_end = min(FMgrace + ADgrace, len(RMSEs))  # never run past the recorded scores
y = np.asarray(RMSEs[train_start:train_end])
# x is derived from y so that both always have the same length.
x = np.arange(train_start, train_start + len(y))
fig = plt.scatter(x,y,s=0.5, alpha=0.3, color='black')
#plt.axhline(y=threshold_phi, color='r', linestyle='--')
plt.yscale('log')
#plt.title('Anomaly Scores from Network IDS - Train & Execution Phase')
plt.ylabel('Loss (RMSE log scaled)')
plt.xlabel('Packet No.')
sns.regplot(x=x, y=y, scatter=False, color='r')
#plt.annotate('Start of Scan attack', (Anomaly_startIdx, 0.25), xytext=(Anomaly_startIdx-16000, 0.25+0.3), 
#             arrowprops = dict(arrowstyle='fancy'))
#plt.grid(True)
#plt.savefig('trainingNIDS.pdf', bbox_inches='tight')
plt.show()

### Metrics 

In [None]:
# Inputs for the metric cells.
# Each log entry is indexed as: [0] packet index, [1] prediction, [2] feature vector.
preds_and_idxs = [entry[:2] for entry in logs]
preds = [entry[1] for entry in logs]

# Ground truth: the 'anomaly' column of the ground-truth CSV, one value per packet.
# Alternative input: csv_traintestData_gt_path
#gt_data = pd.read_csv(csv_traintestData_gt_path, usecols=['anomaly'])
gt_data = pd.read_csv(synthAnomaly_gt, usecols=['anomaly'])
gt = gt_data['anomaly'].tolist()

# Only the execution phase (everything from FMgrace + ADgrace on) is evaluated.
exec_start_idx = FMgrace + ADgrace
gt_exec, scores_exec = gt[exec_start_idx:], RMSEs[exec_start_idx:]

In [None]:
# Threshold-based metrics use the binary predictions (preds); the ranking
# metrics (average precision, ROC AUC) use the raw scores (scores_exec).
#root_mean_square_error = mean_squared_error(squared=False)
print('Precision Score:          {:.4f}'.format(precision_score(gt_exec, preds)))
print('Recall Score:             {:.4f}'.format(recall_score(gt_exec, preds)))
print('F1 Score:                 {:.4f}'.format(f1_score(gt_exec, preds)))
print('Average Precision Score:  {:.4f}'.format(average_precision_score(gt_exec, scores_exec)))
print('ROC AUC Score:            {:.4f}'.format(roc_auc_score(gt_exec, scores_exec)))

In [None]:
# Confusion matrix of the execution phase, the four rates derived from it, and
# the matrix plot, which is written to 'confMatrixTestPortscan.pdf'.
sns.reset_orig()  # back to matplotlib's original style for this figure

cm = confusion_matrix(gt_exec, preds)
tn, fp, fn, tp = cm.ravel()

tpr = tp / (tp + fn)
fnr = fn / (fn + tp)
fpr = fp / (fp + tn)
tnr = tn / (tn + fp)

print('True Positive Rate:       {:.4f}'.format(tpr))
print('False Negative Rate:      {:.4f}'.format(fnr))
print('False Positive Rate:      {:.4f}'.format(fpr))
print('True Negative Rate:       {:.4f}'.format(tnr))

ConfusionMatrixDisplay(cm).plot()
plt.savefig('confMatrixTestPortscan.pdf', bbox_inches='tight')

In [None]:
# ROC curve of the raw scores against the ground truth of the execution phase.
# The seaborn style is switched back on with the notebook's figure size, and
# the plot is written to 'ROCTestPortscan.pdf'.
sns.set(rc={'figure.figsize':(11,9)})
# The third return value (the thresholds) is not needed here.
fpr, tpr, _ = roc_curve(gt_exec, scores_exec)
RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
plt.savefig('ROCTestPortscan.pdf', bbox_inches='tight')

In [None]:
# Detection error trade-off curve (false positive rate against false negative
# rate) of the raw scores; the thresholds returned third are not needed here.
fpr, fnr, _ = det_curve(gt_exec, scores_exec)
DetCurveDisplay(fpr=fpr, fnr=fnr, estimator_name='NIDS offline').plot()

In [None]:
# Precision-recall curve of the raw scores; the thresholds returned third are
# not needed here.
prec, recall, _ = precision_recall_curve(gt_exec, scores_exec)
PrecisionRecallDisplay(precision=prec, recall=recall, estimator_name='NIDS offline').plot()

### Feature Space

In [None]:
# Split the logged feature vectors into named groups for plotting.
# Each log entry is indexed as: [0] packet index, [2] feature vector.
packetIndex = [item[0] for item in logs]
netState_vec = [item[2] for item in logs]
# 2-D array, one row per logged packet; columns 0..99 are sliced below.
state = np.array(netState_vec)

# The column layout below (4 feature groups x 5 time windows) is taken from the
# original author's notes - TODO confirm against netStat.py.
# w1: 100 ms, w2: 500 ms, w3: 1.5 s, w4: 10 s, w5: 60 s
# packet count, 1D statistic, named MIstat in netStat.py (3 columns per window)
pktRate_w1, pktRate_w2, pktRate_w3, pktRate_w4, pktRate_w5 = state[:,0:3], state[:,3:6], state[:,6:9], state[:,9:12], state[:,12:15] 
# packet size, 1D2D statistic, named HHstat in netStat.py (7 columns per window)
bw1_w1, bw1_w2, bw1_w3, bw1_w4, bw1_w5 = state[:,15:22], state[:,22:29], state[:,29:36], state[:,36:43], state[:,43:50]
# packet jitter, 1D statistic, named HHstat_jit in netStat.py (3 columns per window)
pktDelay_w1, pktDelay_w2, pktDelay_w3, pktDelay_w4, pktDelay_w5 = state[:,50:53], state[:,53:56], state[:,56:59], state[:,59:62], state[:,62:65]
# packet size, 1D2D statistic, named HpHpstat in netStat.py (7 columns per window)
bw2_w1, bw2_w2, bw2_w3, bw2_w4, bw2_w5 = state[:,65:72], state[:,72:79], state[:,79:86], state[:,86:93], state[:,93:100]

# same set of 20 features for all different windows 
# Only window w4 is split further, into one single-column array per statistic
# (the `i:i+1` slices keep the column dimension).
weight_pktRate_w4, mean_pktRate_w4, std_pktRate_w4 = pktRate_w4[:,:1], pktRate_w4[:,1:2], pktRate_w4[:,2:3]
weight_bw1_w4, mean_bw1_w4, std_bw1_w4, rad_bw1_w4, magn_bw1_w4, cov_bw1_w4, pcc_bw1_w4 = bw1_w4[:,:1], bw1_w4[:,1:2], bw1_w4[:,2:3], bw1_w4[:,3:4], bw1_w4[:,4:5], bw1_w4[:,5:6], bw1_w4[:,6:7]
weight_pktDelay_w4, mean_pktDelay_w4, std_pktDelay_w4 = pktDelay_w4[:,:1], pktDelay_w4[:,1:2], pktDelay_w4[:,2:3]
weight_bw2_w4, mean_bw2_w4, std_bw2_w4, rad_bw2_w4, magn_bw2_w4, cov_bw2_w4, pcc_bw2_w4 = bw2_w4[:,:1], bw2_w4[:,1:2], bw2_w4[:,2:3], bw2_w4[:,3:4], bw2_w4[:,4:5], bw2_w4[:,5:6], bw2_w4[:,6:7]

# From here on the *_w4 names no longer hold the 2-D slices but lists of the
# single-column arrays, in the order used by the plot labels.
# Re-running this cell still works because the slices are rebuilt from `state`.
pktRate_w4 = [weight_pktRate_w4, mean_pktRate_w4, std_pktRate_w4]
bw1_w4 = [weight_bw1_w4, mean_bw1_w4, std_bw1_w4, rad_bw1_w4, magn_bw1_w4, cov_bw1_w4, pcc_bw1_w4]
pktDelay_w4 = [weight_pktDelay_w4, mean_pktDelay_w4, std_pktDelay_w4]
bw2_w4 = [weight_bw2_w4, mean_bw2_w4, std_bw2_w4, rad_bw2_w4, magn_bw2_w4, cov_bw2_w4, pcc_bw2_w4]

In [None]:
# 2 x 2 overview of the 10 s window features: every statistic of a feature
# group is drawn as its own scatter series, and each panel gets an arrow that
# marks the packet index at which the attack starts.
plt.figure(figsize=(20,10))
colors_1D = ['r', 'g', 'b']
colors_2D = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
labels_1D = ['Weight', 'Mean', 'Std.']
labels_2D = ['Weight', 'Mean', 'Std.', 'Radius', 'Magnitude', 'Covariance', 'Correlation']

# Per panel: subplot position, series, colours, labels, title,
# y of the arrow tip, left shift of the text, upward shift of the text.
panels = [
    (221, pktRate_w4, colors_1D, labels_1D,
     "Features related to packet's rate - time window 10s", 4000, 2000, 15000),
    (222, bw1_w4, colors_2D, labels_2D,
     "Features related to packet's size (outbound) - time window 10s", 7000, 2300, 16000),
    (223, pktDelay_w4, colors_1D, labels_1D,
     "Features related to packet's jitter - time window 10s", 100, 2000, 300),
    (224, bw2_w4, colors_2D, labels_2D,
     "Features related to packet's size (outbound/inbound) - time window 10s", 10000, 2300, 55000),
]

for position, series_list, colors, labels, title, tip_y, shift_left, shift_up in panels:
    plt.subplot(position)
    for i in range(len(colors)):
        plt.scatter(packetIndex, series_list[i], marker='o', color=colors[i], alpha=1, s=1, label=labels[i])
    plt.title(title)
    plt.xlabel('Network packet number')
    plt.ylabel('Statistical features')
    plt.legend(loc = 'upper left')
    plt.annotate('Start of Scan attack', (Anomaly_startIdx, tip_y),
                 xytext=(Anomaly_startIdx-shift_left, tip_y+shift_up),
                 arrowprops = dict(arrowstyle='fancy'))

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
plt.show()

### Feature Clustering 

In [None]:
# Show the feature map taken from the NIDS (or loaded from the model file).
# NOTE(review): presumably the grouping of feature indices per autoencoder - confirm.
print(feature_map)

## Evaluation of NIDS (online)

In [None]:
# Store the results of the live run; the commented block loads them again.
# NOTE(review): `model` and `threshold_phi` are also assigned by the offline
# cells above - make sure they still hold the live run's values when saving.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open('models/model_live.pkl', 'wb') as f:
    pickle.dump([model, RMSEs_train, RMSEs_exec, threshold_phi], f)
#with open('models/model_live.pkl', 'rb') as f:
#     model, RMSEs_train, RMSEs_exec, threshold_phi = pickle.load(f)

### RMSEs and Threshold

In [None]:
# Anomaly scores of the live run's training phase (first entry skipped),
# coloured by score on a linear scale, with the threshold as horizontal line.
# plt.get_cmap is used because matplotlib.cm.get_cmap (reached via plt.cm) was
# removed in Matplotlib 3.9.
cm = plt.get_cmap('RdYlGn_r')
plt.figure(figsize=(10,5))
x = range(1, len(RMSEs_train))
y = RMSEs_train[1:]
fig = plt.scatter(x,y,c=y,norm=matplotlib.colors.Normalize(),s=0.1,cmap=cm)
plt.axhline(y=threshold_phi, color='r', linestyle='-')
#plt.yscale('log')
plt.title('Anomaly Scores from Network IDS - Training Phase')
plt.ylabel('RMSE')
plt.xlabel('Network packet number')
figbar=plt.colorbar()
plt.show()

In [None]:
# Anomaly scores of the live run's execution phase (first entry skipped),
# coloured by score on a logarithmic scale, with the threshold as horizontal line.
# plt.get_cmap is used because matplotlib.cm.get_cmap (reached via plt.cm) was
# removed in Matplotlib 3.9.
cm = plt.get_cmap('RdYlGn_r')
plt.figure(figsize=(10,5))
x = range(1, len(RMSEs_exec))
y = RMSEs_exec[1:]
fig = plt.scatter(x,y,c=y,norm=matplotlib.colors.LogNorm(),s=1,cmap=cm)
plt.axhline(y=threshold_phi, color='r', linestyle='-')
plt.yscale('log')
plt.title('Anomaly Scores from Network IDS - live Execution Phase')
plt.ylabel('RMSE (log scaled)')
plt.xlabel('Network packet number')
figbar=plt.colorbar()
plt.show()