In [38]:

###############################################################################
# Random Forest Classification Model (TensorFlow)                             #
# For LOIC Dataset                                                            #
# Based on the Implementation of:                                             #
# https://www.tensorflow.org/decision_forests/tutorials/beginner_colab        #
###############################################################################


In [39]:
# Installieren aller benötigten Pakete
!pip install numpy==1.19.2
!pip install six==1.15.0
!pip install wheel==0.35
!pip install tensorflow_decision_forests
!pip install pandas
!pip install wurlitzer
!pip install matplotlib
!pip install onnxruntime
!pip install keras2onnx



In [40]:
# Laden der benötigten Python Pakete
import os
# os.environ["TF_KERAS"]='1'
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
import tensorflow as tf
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt
import onnx
import keras2onnx as k2o

In [41]:
# Prüfung der installierten TensorFlow Decision Forests Version
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

Found TensorFlow Decision Forests v0.1.7


In [42]:
# Laden der Netzwerk Traffic Daten für den LOIC Angriff
data_LOIC = pd.read_csv('../Data/Tuesday-20-02-2018_LOIC-Attack.csv')
# Umbenennen der Spalten für bessere Kompatibiltät mit TensorFlow
data_LOIC.rename(columns={
    'Bwd Pkt Len Std':'bwd_pkt_len_std',
    'Pkt Size Avg':'pkt_size_avg',
    'Flow Duration':'flow_duration',
    'Flow IAT Std':'flow_iat_std',
    'Label':'label'}, 
    inplace=True)

In [43]:
# Festlegen der Label-Spalte innerhalb des Datasets
label = 'label'

In [44]:
# Aufteilen des Datasets in Training- und Test-Daten
def split_dataset(dataset,  test_ratio=0.30):
    """Splits a panda dataframe in two dataframes."""
    test_indices = np.random.rand(len(dataset)) < test_ratio
    return dataset[~test_indices], dataset[test_indices]

training_data_LOIC, testing_data_LOIC = split_dataset(data_LOIC)

print("{} examples in training, {} examples for testing.".format(
    len(training_data_LOIC), len(testing_data_LOIC)))

5565192 examples in training, 2383556 examples for testing.


In [45]:
# Konvertieren des Panda Dataframes in ein TensorFlow Dataset
print("Converting Panda Dataframe into TensorFlow Dataset...")
training_dataset_LOIC = tfdf.keras.pd_dataframe_to_tf_dataset(training_data_LOIC, label=label)
testing_dataset_LOIC = tfdf.keras.pd_dataframe_to_tf_dataset(testing_data_LOIC, label=label)

Converting Panda Dataframe into TensorFlow Dataset...


In [46]:
# Erstellen des Random Forest Modells
model = tfdf.keras.RandomForestModel()
model.compile(metrics=["accuracy"])

In [47]:
# Trainieren des Modells
print("Training the Model...")
with sys_pipes():
    model.fit(x=training_dataset_LOIC)

Training the Model...
[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 86957
[INFO kernel.cc:393] Number of examples: 5565192
[INFO kernel.cc:769] Dataset:
Number of records: 5565192
Number of columns: 5

Number of columns by type:
	NUMERICAL: 4 (80%)
	CATEGORICAL: 1 (20%)

Columns:

NUMERICAL: 4 (80%)
	0: "bwd_pkt_len_std" NUMERICAL mean:142.4 min:0 max:20469.6 sd:217.746
	1: "flow_duration" NUMERICAL mean:1.35374e+07 min:0 max:1.2e+08 sd:3.24438e+07
	2: "flow_iat_std" NUMERICAL mean:1.13545e+06 min:0 max:8.48081e+07 sd:3.80227e+06
	3: "pkt_size_avg" NUMERICAL mean:94.1061 min:0 max:8225.8 sd:106.205

CATEGORICAL: 1 (20%)
	4: "__LABEL" CATEGORICAL integerized vocab-size:3 no-ood-item

Terminology:
	nas: Number of non-available (i.e. missing) values.
	ood: Out of dictionary.
	manually-defined: Attribute which type is manually defined by the user i.e. the type was not automatically inferred.
	token

In [48]:
# Evaluieren des trainierten Modells
print("Evaluating the Model...")
evaluation = model.evaluate(testing_dataset_LOIC, return_dict=True)
print()
for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")


Evaluating the Model...

loss: 0.0000
accuracy: 0.9971


In [49]:
data_path = "../Data"
model_path = "Models"
onnx_path = "ONNX_Models"
model_name = "loic_model"

# Trainiertes Modell für die spätere Verwendung abspeichern
model.save(os.path.join(data_path,model_path,model_name),overwrite=True)

# Konvertieren in das ONNX Modell
# onnx_model = k2o.convert_keras(model,df_model_name)
# onnx.save_model(onnx_model,os.path.join(data_path,onnx_path,model_name + ".onnx"))



INFO:tensorflow:Assets written to: ../Data/Models/loic_model/assets
INFO:tensorflow:Assets written to: ../Data/Models/loic_model/assets


In [50]:
# Plotten des ersten Baumes innerhalb des Decision Forests
with open('../Data/Models/LOIC_Model_Tree.html', 'w') as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=3))
tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=3)


'\n<script src="https://d3js.org/d3.v6.min.js"></script>\n<div id="tree_plot_ad86c3d8b8ba497b9498b329f4bb4379"></div>\n<script>\n/*\n * Copyright 2021 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     https://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/**\n *  Plotting of decision trees generated by TF-DF.\n *\n *  A tree is a recursive structure of node objects.\n *  A node contains one or more of the following components:\n *\n *    - A value: Representing the output of the node. If the node is not a leaf,\

In [51]:
# Erstellen einer Bilanz für das trainierte Modell
model.summary()

Model: "random_forest_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (4):
	bwd_pkt_len_std
	flow_duration
	flow_iat_std
	pkt_size_avg

No weights

Variable Importance: NUM_NODES:
    1.   "flow_duration" 285691.000000 ################
    2.    "pkt_size_avg" 3430.000000 
    3.    "flow_iat_std" 2718.000000 
    4. "bwd_pkt_len_std" 1145.000000 

Variable Importance: NUM_AS_ROOT:
    1.   "flow_duration" 234.000000 ################
    2. "bwd_pkt_len_std" 66.000000 

Variable Importance: SUM_SCORE:
    1.   "flow_duration" 159002490.749388 ################
    2.    "pkt_size_avg" 137233041.642846 ############
    3.    "flow_iat_std" 69001430.202241 ##
    4. "bwd_pkt_len_std" 55517544.016287

In [52]:
# Erstellen von Grafiken für die Effizienz des Trainings
logs = model.make_inspector().training_logs()
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.plot([log.num_trees for log in logs], [log.evaluation.accuracy for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("Accuracy (out-of-bag)")

plt.subplot(1,2,2)
plt.plot([log.num_trees for log in logs], [log.evaluation.loss for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("Logloss (out-of-bag)")

plt.savefig('../Data/Visualized/LOIC_Model.png')
plt.clf()

<Figure size 864x288 with 0 Axes>