In [1]:

###############################################################################
# Random Forest Classification Model (TensorFlow)                             #
# For LOIC Dataset                                                            #
# Based on the Implementation of:                                             #
# https://www.tensorflow.org/decision_forests/tutorials/beginner_colab        #
###############################################################################


In [2]:
# Installieren der benötigten Python Pakete
!pip install tensorflow_decision_forests
!pip install numpy==1.19.5
!pip install six==1.15.0
!pip install wheel==0.35
!pip install pandas
!pip install wurlitzer
!pip install matplotlib



In [3]:
# Import the required Python packages
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt
import tensorflow as tf

2021-06-30 17:05:21.796349: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-06-30 17:05:21.796384: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
# Load the network traffic data for the LOIC attack from the CSV file
data_LOIC = pd.read_csv(
    filepath_or_buffer='../Data/Optimized/Tuesday-20-02-2018_LOIC-Attack.csv',
)

In [5]:
# Count the NaN values across the whole dataset.
# Note: this cell only reports the count; no values are replaced here.
nan_per_column = data_LOIC.isna().sum()  # one NaN count per column
nan_count = nan_per_column.sum()         # grand total over all columns
print(f"Count of NaN in Dataset: {nan_count}")

Count of NaN in Dataset: 0


In [6]:
# Name of the column inside the dataset that holds the label
label = "label"

In [7]:
# Split the dataset into training and test data
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split a pandas DataFrame into a training and a testing part.

    Every row is assigned independently: a uniform number in [0, 1) is drawn
    per row and the row goes to the testing part when that number is below
    ``test_ratio``. The resulting sizes are therefore only approximately
    ``test_ratio`` of the input.

    Args:
        dataset: The pandas DataFrame to split.
        test_ratio: Probability (between 0 and 1, inclusive) that a row is
            placed in the testing part.
        seed: Optional seed. When given, a dedicated generator seeded with it
            is used so the split is reproducible. When ``None`` (default) the
            numbers come from ``np.random.rand``, i.e. NumPy's global random
            state, exactly as before.

    Returns:
        A tuple ``(training, testing)`` of DataFrames that together contain
        every row of ``dataset`` exactly once.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1].
    """
    # Also rejects NaN, because every comparison with NaN is False.
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(
            f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    row_count = len(dataset)
    if seed is None:
        draws = np.random.rand(row_count)
    else:
        draws = np.random.default_rng(seed).random(row_count)

    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global random state before splitting: split_dataset draws from
# it, so without a fixed seed every re-run would train and evaluate on a
# different partition of the data.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
training_data_LOIC, testing_data_LOIC = split_dataset(data_LOIC)

# The two parts must cover the dataset without losing or duplicating rows.
assert len(training_data_LOIC) + len(testing_data_LOIC) == len(data_LOIC)

print(f"{len(training_data_LOIC)} examples in training, "
      f"{len(testing_data_LOIC)} examples for testing.")

5565312 examples in training, 2383436 examples for testing.


In [8]:
# Convert the pandas DataFrames into TensorFlow datasets (training first, then testing).
# NOTE(review): the training log lists flow_id, timestamp, src_ip and dst_ip among
# the input features; such identifier-like columns may leak the label — confirm
# that they should really be fed to the model.
print("Converting Panda Dataframe into TensorFlow Dataset...")
training_dataset_LOIC, testing_dataset_LOIC = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (training_data_LOIC, testing_data_LOIC)
)

Converting Panda Dataframe into TensorFlow Dataset...


2021-06-30 17:07:04.753042: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-06-30 17:07:04.753101: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-06-30 17:07:04.753143: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (WS000252): /proc/driver/nvidia/version does not exist
2021-06-30 17:07:04.754801: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-06-30 17:07:04.762077: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 44522496 exceeds 1

In [9]:
# Create the Random Forest model (constructed without any arguments) and
# compile it so that accuracy is reported as a metric.
model = tfdf.keras.RandomForestModel()
model.compile(metrics=["accuracy"])

In [10]:
# Train the model on the training dataset.
# sys_pipes() comes from wurlitzer; presumably it forwards the C-level training
# log into the notebook output — TODO confirm.
print("Training the Model...")
with sys_pipes():
    model.fit(x=training_dataset_LOIC)

Training the Model...


2021-06-30 17:07:17.764153: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-06-30 17:07:17.818498: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3000005000 Hz




[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 86958
[INFO kernel.cc:393] Number of examples: 5565312
[INFO data_spec_inference.cc:289] 23592 item(s) have been pruned (i.e. they are considered out of dictionary) for the column dst_ip (2000 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO data_spec_inference.cc:289] 3979151 item(s) have been pruned (i.e. they are considered out of dictionary) for the column flow_id (2000 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO data_spec_inference.cc:289] 26905 item(s) have been pruned (i.e. they are considered out of dictionary) for the column src_ip (2000 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO data_spec_inference.cc:289] 35609 item(s) have been pruned (i.e. they are considered out of dictionary) for the column timestamp (2000 item(s) left) bec

In [11]:
# Evaluate the trained model on the testing dataset and print every
# returned metric with four decimal places.
print("Evaluating the Model...")
evaluation = model.evaluate(testing_dataset_LOIC, return_dict=True)
print()
for metric in evaluation:
    print(f"{metric}: {evaluation[metric]:.4f}")


Evaluating the Model...

loss: 0.0000
accuracy: 1.0000


In [12]:
# Print a summary of the trained model
model.summary()

Model: "random_forest_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (83):
	ack_flag_cnt
	active_max
	active_mean
	active_min
	active_std
	bwd_blk_rate_avg
	bwd_byts/b_avg
	bwd_header_len
	bwd_iat_max
	bwd_iat_mean
	bwd_iat_min
	bwd_iat_std
	bwd_iat_tot
	bwd_pkt_len_max
	bwd_pkt_len_mean
	bwd_pkt_len_min
	bwd_pkt_len_std
	bwd_pkts/b_avg
	bwd_pkts/s
	bwd_psh_flags
	bwd_seg_size_avg
	bwd_urg_flags
	cwe_flag_count
	down/up_ratio
	dst_ip
	dst_port
	ece_flag_cnt
	fin_flag_cnt
	flow_byts/s
	flow_duration
	flow_iat_max
	flow_iat_mean
	flow_iat_min
	flow_iat_std
	flow_id
	flow_pkts/s
	fwd_act_data_pkts
	fwd_blk_rate_avg
	fwd_byts/b_avg
	fwd_header_len
	fwd_iat_max
	fwd_iat_mean
	fwd_iat_min
	fwd_iat_std
	

In [13]:
# Show the feature (variable) importances as reported by the inspector of the
# trained TensorFlow model
model.make_inspector().variable_importances()

{'NUM_AS_ROOT': [("totlen_fwd_pkts" (1; #81), 52.0),
  ("dst_ip" (4; #24), 38.0),
  ("subflow_fwd_byts" (1; #74), 36.0),
  ("fwd_pkt_len_max" (1; #45), 28.0),
  ("fwd_seg_size_avg" (1; #52), 22.0),
  ("fwd_pkt_len_mean" (1; #46), 20.0),
  ("fwd_pkts/s" (1; #50), 18.0),
  ("flow_iat_max" (1; #30), 13.0),
  ("flow_pkts/s" (1; #35), 13.0),
  ("flow_duration" (1; #29), 12.0),
  ("dst_port" (1; #25), 10.0),
  ("flow_iat_mean" (1; #31), 9.0),
  ("flow_iat_min" (1; #32), 7.0),
  ("fwd_iat_min" (1; #42), 6.0),
  ("src_ip" (4; #70), 4.0),
  ("bwd_pkts/s" (1; #18), 2.0),
  ("fwd_iat_max" (1; #40), 2.0),
  ("flow_byts/s" (1; #28), 1.0),
  ("pkt_len_std" (1; #64), 1.0),
  ("bwd_pkt_len_std" (1; #16), 1.0),
  ("pkt_len_min" (1; #63), 1.0),
  ("fwd_pkt_len_std" (1; #48), 1.0),
  ("bwd_seg_size_avg" (1; #20), 1.0),
  ("init_fwd_win_byts" (1; #60), 1.0),
  ("pkt_len_var" (1; #65), 1.0)]}