In [1]:

###############################################################################
# Random Forest Classification Model (TensorFlow)                             #
# For the GoldenEye Dataset                                                   #
# Based on the implementation in:                                             #
# https://www.tensorflow.org/decision_forests/tutorials/beginner_colab        #
###############################################################################


In [2]:
# Installieren aller benötigten Pakete
!pip install pandas
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install matplotlib
!pip install ipython

Collecting ipython
  Using cached ipython-7.25.0-py3-none-any.whl (786 kB)
Collecting jedi>=0.16
  Using cached jedi-0.18.0-py2.py3-none-any.whl (1.4 MB)
Collecting backcall
  Using cached backcall-0.2.0-py2.py3-none-any.whl (11 kB)
Collecting pickleshare
  Using cached pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)
Collecting matplotlib-inline
  Using cached matplotlib_inline-0.1.2-py3-none-any.whl (8.2 kB)
Collecting pexpect>4.3; sys_platform != "win32"
  Using cached pexpect-4.8.0-py2.py3-none-any.whl (59 kB)
Collecting decorator
  Using cached decorator-5.0.9-py3-none-any.whl (8.9 kB)
Collecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0
  Using cached prompt_toolkit-3.0.19-py3-none-any.whl (368 kB)
Collecting traitlets>=4.2
  Using cached traitlets-5.0.5-py3-none-any.whl (100 kB)
Collecting pygments
  Using cached Pygments-2.9.0-py3-none-any.whl (1.0 MB)
Collecting parso<0.9.0,>=0.8.0
  Using cached parso-0.8.2-py2.py3-none-any.whl (94 kB)
Collecting ptyprocess>=0.5
  Using cac

In [3]:
# Laden der benötigten Python Pakete
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt

In [4]:
# Check which TensorFlow Decision Forests version is installed
tfdf_version = tfdf.__version__
print(f"Found TensorFlow Decision Forests v{tfdf_version}")


Found TensorFlow Decision Forests v0.1.7


In [5]:
# Load the network traffic data for the GoldenEye attack
goldeneye_csv_path = '../Data/Optimized/Thursday-15-02-2018_GoldenEye-Attack.csv'
data_GoldenEye = pd.read_csv(goldeneye_csv_path)

In [6]:
# Find and replace NaN values in the dataset
nan_count = data_GoldenEye.isna().sum().sum()
print(f"Initial Count of NaN in Dataset: {nan_count}")

# Treat +/-inf as missing so that it is filled together with the NaN values.
# Note: the "initial" count above is taken before this step and therefore does
# not include infinite values.
data_GoldenEye = data_GoldenEye.replace([np.inf, -np.inf], np.nan)

# Interpolate the numeric columns only. Interpolating object-dtype columns is
# deprecated in recent pandas versions and is not meaningful for
# string columns anyway.
# limit_direction="both" also fills NaN values at the very start of a column,
# which the default (forward-only) interpolation would leave untouched.
numeric_columns = data_GoldenEye.select_dtypes(include=[np.number]).columns
if len(numeric_columns) > 0:
    data_GoldenEye[numeric_columns] = data_GoldenEye[numeric_columns].interpolate(
        limit_direction="both"
    )

nan_count = data_GoldenEye.isna().sum().sum()
print(f"Count of NaN in Dataset after Cleanse: {nan_count}")

Initial Count of NaN in Dataset: 4921
Count of NaN in Dataset after Cleanse: 0


In [7]:
# Name of the target column (the variable to be predicted); passed as `label=`
# when the DataFrames are converted into TensorFlow datasets.
label = 'label'

In [8]:
# Split the dataset into training and test data
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split a pandas DataFrame into a training and a test DataFrame.

    Every row is assigned to the test set independently with probability
    ``test_ratio``; the resulting sizes are therefore only approximately
    ``(1 - test_ratio)`` and ``test_ratio`` of the input size.

    Args:
        dataset: The pandas DataFrame to split.
        test_ratio: Probability with which a row ends up in the test set.
        seed: Optional seed for a reproducible split. If ``None`` (default),
            NumPy's global random state is used, so a preceding
            ``np.random.seed(...)`` call still controls the result.

    Returns:
        A tuple ``(training, testing)`` of DataFrames that together contain
        every row of ``dataset`` exactly once.
    """
    row_count = len(dataset)
    if seed is None:
        # Keep drawing from the global state for backward compatibility.
        draws = np.random.rand(row_count)
    else:
        # A dedicated generator makes the split independent of global state.
        draws = np.random.default_rng(seed).random(row_count)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global random state before splitting: split_dataset draws its
# random numbers from that state, so without a seed every run would produce a
# different train/test partition and different downstream results.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

training_data_GoldenEye, testing_data_GoldenEye = split_dataset(data_GoldenEye)

# Sanity check: the two parts must account for every row of the input.
assert len(training_data_GoldenEye) + len(testing_data_GoldenEye) == len(data_GoldenEye)

print("{} examples in training, {} examples for testing.".format(
    len(training_data_GoldenEye), len(testing_data_GoldenEye)))


726480 examples in training, 311105 examples for testing.


In [9]:
# Convert the pandas DataFrames into TensorFlow datasets
# NOTE(review): every column except `label` is handed over as a feature. The
# recorded training log and variable importances list "timestamp" among the
# features in use - confirm that this is intended and does not leak the label.
print("Converting Panda Dataframe into TensorFlow Dataset...")
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset
training_dataset_GoldenEye = to_tf_dataset(training_data_GoldenEye, label=label)
testing_dataset_GoldenEye = to_tf_dataset(testing_data_GoldenEye, label=label)


Converting Panda Dataframe into TensorFlow Dataset...


In [10]:
# Create the Random Forest model (default hyper-parameters) and register the
# metrics that are reported when the model is evaluated
evaluation_metrics = ["accuracy"]
model = tfdf.keras.RandomForestModel()
model.compile(metrics=evaluation_metrics)


In [11]:
# Train the model on the training dataset.
# wurlitzer's sys_pipes() forwards output written at the C level to Python's
# stdout/stderr, so the training log of the native backend shows up in the
# notebook output.
print("Training the Model: ")
with sys_pipes():
    model.fit(x=training_dataset_GoldenEye)

Training the Model: 
2021-06-28 14:57:03.047266: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-06-28 14:57:03.079364: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2199995000 Hz
[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 11352
[INFO kernel.cc:393] Number of examples: 726480
[INFO data_spec_inference.cc:289] 31744 item(s) have been pruned (i.e. they are considered out of dictionary) for the column timestamp (2000 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO kernel.cc:769] Dataset:
Number of records: 726480
Number of columns: 80

Number of columns by type:
	NUMERICAL: 78 (97.5%)
	CATEGORICAL: 2 (2.5%)

Columns:

NUMERICAL: 78 (97.5%)
	0: "ack_flag_cnt" NUMERICAL mean:0.285002 min:0 max:1 sd:0.451415
	1: "active_max" NUMERICAL mean:148222 min:0

In [12]:
# Evaluate the trained model on the held-out test data
print("Evaluating the Model...")
# return_dict=True yields a {metric name: value} mapping
evaluation = model.evaluate(testing_dataset_GoldenEye, return_dict=True)

print()

# Report every metric with four decimal places
for metric_name in evaluation:
    metric_value = evaluation[metric_name]
    print(f"{metric_name}: {metric_value:.4f}")

Evaluating the Model...

loss: 0.0000
accuracy: 1.0000


In [13]:
# Print a summary of the trained model; as the recorded output shows, this
# includes a ranked list of the input features.
model.summary()

.196333 ################
   10.    "fwd_byts/b_avg"  6.196333 ################
   11.    "fwd_pkts/b_avg"  6.196333 ################
   12.     "fwd_urg_flags"  6.196333 ################
   13.      "rst_flag_cnt"  6.196333 ################
   14.           "__LABEL"  6.196333 ################
   15.   "bwd_pkt_len_min"  6.196174 ###############
   16.     "fwd_psh_flags"  6.195752 ###############
   17.      "syn_flag_cnt"  6.195395 ###############
   18.        "active_std"  6.186666 ###############
   19.          "protocol"  6.185479 ###############
   20.      "urg_flag_cnt"  6.185220 ###############
   21.       "pkt_len_min"  6.163760 ###############
   22.   "fwd_pkt_len_min"  6.161323 ###############
   23.     "down/up_ratio"  6.146889 ###############
   24.       "bwd_iat_max"  6.144748 ###############
   25.       "bwd_iat_tot"  6.132439 ###############
   26.          "idle_std"  6.131485 ###############
   27.          "idle_max"  6.110771 ###############
   28.         "

In [14]:
# Retrieve the feature importances as computed by the TensorFlow model itself.
# The last expression is left bare so the notebook displays the result.
inspector = model.make_inspector()
inspector.variable_importances()

{'NUM_AS_ROOT': [("fwd_seg_size_min" (1; #51), 43.0),
  ("init_fwd_win_byts" (1; #58), 39.0),
  ("flow_pkts/s" (1; #33), 35.0),
  ("flow_iat_mean" (1; #30), 31.0),
  ("fwd_header_len" (1; #37), 21.0),
  ("fwd_pkts/s" (1; #48), 21.0),
  ("fwd_iat_max" (1; #38), 19.0),
  ("flow_iat_max" (1; #29), 18.0),
  ("timestamp" (4; #73), 12.0),
  ("fwd_iat_tot" (1; #42), 11.0),
  ("flow_duration" (1; #28), 11.0),
  ("fwd_iat_mean" (1; #39), 8.0),
  ("dst_port" (1; #24), 5.0),
  ("fwd_pkt_len_std" (1; #46), 5.0),
  ("bwd_pkts/s" (1; #18), 5.0),
  ("bwd_iat_std" (1; #11), 3.0),
  ("pkt_len_var" (1; #63), 3.0),
  ("fwd_pkt_len_max" (1; #43), 2.0),
  ("bwd_iat_mean" (1; #9), 2.0),
  ("fwd_pkt_len_mean" (1; #44), 2.0),
  ("fwd_seg_size_avg" (1; #50), 1.0),
  ("subflow_bwd_byts" (1; #68), 1.0),
  ("bwd_pkt_len_std" (1; #16), 1.0),
  ("bwd_iat_min" (1; #10), 1.0)]}