In [11]:

###############################################################################
# Random Forest Classification Model (TensorFlow)                             #
# For Slowloris Dataset                                                       #
# Based on the Implementation of:                                             #
# https://www.tensorflow.org/decision_forests/tutorials/beginner_colab        #
###############################################################################


In [12]:
# Installieren aller benötigten Pakete
!pip install pandas
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install matplotlib
!pip install ipython



In [13]:
# Load the required Python packages
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt

In [14]:
# Report the installed TensorFlow Decision Forests version
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")


Found TensorFlow Decision Forests v0.1.7


In [15]:
# Load the network traffic data for the Slowloris attack
data_Slowloris = pd.read_csv('../Data/Optimized/Thursday-15-02-2018_Slowloris-Attack.csv')


In [16]:
# Find and replace NaN values in the dataset.
# The first count is taken before infinite values are converted, so it only
# reflects entries that are already NaN in the loaded file.
nan_count = data_Slowloris.isna().values.sum()
print(f"Initial Count of NaN in Dataset: {nan_count}")

# Turn +/-inf into NaN first so that one interpolation pass handles both.
# NOTE(review): interpolate() fills gaps from neighbouring rows - confirm that
# the row order of this file is meaningful for that.
data_Slowloris = (
    data_Slowloris
    .replace([np.inf, -np.inf], np.nan)
    .interpolate()
)

nan_count = data_Slowloris.isna().values.sum()
print(f"Count of NaN in Dataset after Cleanse: {nan_count}")

Initial Count of NaN in Dataset: 4921
Count of NaN in Dataset after Cleanse: 0


In [17]:
# Name of the column that is handed to the dataset conversion as the label
label = "label"

In [18]:
# Split the dataset into training and test data
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split a pandas DataFrame into a training and a testing part.

    Every row is assigned to the testing part independently with probability
    ``test_ratio``; the resulting sizes are therefore only approximately
    proportional to the ratio.

    Args:
        dataset: The pandas DataFrame to split.
        test_ratio: Probability for a single row to end up in the testing part.
        seed: Optional seed for the random assignment. When given, the same
            seed always produces the same split. When ``None`` (the default),
            NumPy's global random state is used, exactly as before this
            parameter existed.

    Returns:
        A tuple ``(training, testing)`` of DataFrames. Together they contain
        every row of ``dataset`` exactly once.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy path: draws from the global NumPy random state.
        draws = np.random.rand(n_rows)
    else:
        # Local generator: reproducible and independent of global state.
        draws = np.random.default_rng(seed).random(n_rows)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Perform the split and report how many rows went into each part
training_data_Slowloris, testing_data_Slowloris = split_dataset(data_Slowloris)

print("{} examples in training, {} examples for testing.".format(
    *map(len, (training_data_Slowloris, testing_data_Slowloris))
))


704941 examples in training, 302126 examples for testing.


In [19]:
# Convert the pandas DataFrames into TensorFlow datasets
print("Converting Panda Dataframe into TensorFlow Dataset...")

# NOTE(review): the recorded training log lists "timestamp" among the ingested
# feature columns. Confirm this is intended - capture time could correlate
# with the attack label and inflate the evaluation score. TODO verify.
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset
training_dataset_Slowloris = to_tf_dataset(training_data_Slowloris, label=label)
testing_dataset_Slowloris = to_tf_dataset(testing_data_Slowloris, label=label)


Converting Panda Dataframe into TensorFlow Dataset...


In [20]:
# Create the Random Forest model
model = tfdf.keras.RandomForestModel()
# Register accuracy as the metric to be reported for the model
model.compile(metrics=["accuracy"])


In [21]:
# Train the model
print("Training the Model: ")
# The fit runs inside wurlitzer's sys_pipes() context; the native training log
# (the "[INFO kernel.cc ...]" lines in the cell output) is emitted during this call.
with sys_pipes():
    model.fit(x=training_dataset_Slowloris)

Training the Model: 
[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 11015
[INFO kernel.cc:393] Number of examples: 704941
[INFO data_spec_inference.cc:289] 31737 item(s) have been pruned (i.e. they are considered out of dictionary) for the column timestamp (2000 item(s) left) because min_value_count=5 and max_number_of_unique_values=2000
[INFO kernel.cc:769] Dataset:
Number of records: 704941
Number of columns: 80

Number of columns by type:
	NUMERICAL: 78 (97.5%)
	CATEGORICAL: 2 (2.5%)

Columns:

NUMERICAL: 78 (97.5%)
	0: "ack_flag_cnt" NUMERICAL mean:0.283114 min:0 max:1 sd:0.450512
	1: "active_max" NUMERICAL mean:182167 min:0 max:1.11992e+08 sd:1.20938e+06
	2: "active_mean" NUMERICAL mean:111171 min:0 max:1.11992e+08 sd:950296
	3: "active_min" NUMERICAL mean:81805.5 min:0 max:1.11992e+08 sd:871158
	4: "active_std" NUMERICAL mean:49027.7 min:0 max:6.01771e+07 sd:379077
	5: "bwd_blk_rate_avg" N

In [22]:
# Evaluate the trained model on the test data
print("Evaluating the Model...")
evaluation = model.evaluate(testing_dataset_Slowloris, return_dict=True)

# Blank separator line before the metric listing
print()

# One "metric: value" line per entry, formatted to four decimal places
for name in evaluation:
    value = evaluation[name]
    print(f"{name}: {value:.4f}")

Evaluating the Model...

loss: 0.0000
accuracy: 1.0000


In [23]:
# Print a summary of the trained model
model.summary()

4.          "protocol"  7.225300 ###############
   15. "fwd_act_data_pkts"  7.219609 ###############
   16.      "urg_flag_cnt"  7.210363 ###############
   17.   "bwd_pkt_len_min"  7.197026 ###############
   18.     "down/up_ratio"  7.194212 ###############
   19.       "pkt_len_min"  7.191280 ###############
   20.      "psh_flag_cnt"  7.180366 ###############
   21.      "tot_bwd_pkts"  7.172341 ###############
   22.  "subflow_fwd_pkts"  7.170572 ###############
   23.   "fwd_pkt_len_std"  7.167852 ###############
   24.   "bwd_pkt_len_std"  7.161721 ###############
   25.  "subflow_bwd_pkts"  7.156462 ###############
   26.      "tot_fwd_pkts"  7.152198 ###############
   27.         "timestamp"  7.150396 ###############
   28.       "bwd_iat_tot"  7.139798 ###############
   29.          "idle_max"  7.101867 ###############
   30.          "idle_min"  7.085014 ##############
   31.      "syn_flag_cnt"  7.080036 ##############
   32.    "bwd_header_len"  7.066946 ##############


In [24]:
# Retrieve the feature importance measures as seen by the TensorFlow model
# (last expression of the cell, so the resulting dict is displayed as output)
model.make_inspector().variable_importances()

{'NUM_AS_ROOT': [("bwd_iat_max" (1; #8), 44.0),
  ("bwd_iat_mean" (1; #9), 33.0),
  ("bwd_iat_min" (1; #10), 32.0),
  ("fwd_seg_size_min" (1; #51), 31.0),
  ("bwd_iat_std" (1; #11), 25.0),
  ("init_fwd_win_byts" (1; #58), 23.0),
  ("flow_iat_std" (1; #32), 20.0),
  ("active_max" (1; #1), 14.0),
  ("active_mean" (1; #2), 14.0),
  ("idle_std" (1; #56), 13.0),
  ("active_min" (1; #3), 11.0),
  ("active_std" (1; #4), 6.0),
  ("flow_iat_mean" (1; #30), 5.0),
  ("fwd_iat_max" (1; #38), 5.0),
  ("idle_mean" (1; #54), 4.0),
  ("flow_pkts/s" (1; #33), 4.0),
  ("fwd_iat_mean" (1; #39), 4.0),
  ("subflow_bwd_byts" (1; #68), 3.0),
  ("idle_max" (1; #53), 2.0),
  ("fwd_header_len" (1; #37), 1.0),
  ("fwd_pkt_len_mean" (1; #44), 1.0),
  ("subflow_fwd_byts" (1; #70), 1.0),
  ("totlen_fwd_pkts" (1; #77), 1.0),
  ("flow_iat_max" (1; #29), 1.0),
  ("bwd_iat_tot" (1; #12), 1.0),
  ("flow_duration" (1; #28), 1.0)]}