In [1]:

###############################################################################
# Random Forest Classification Model (TensorFlow)                             #
# For LOIC Dataset                                                            #
# Based on the Implementation of:                                             #
# https://www.tensorflow.org/decision_forests/tutorials/beginner_colab        #
###############################################################################


In [2]:
# Installieren der benötigten Python Pakete
!pip install pandas
!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install matplotlib
!pip install ipython



In [3]:
# Import the required Python packages
import pandas as pd
import numpy as np
import tensorflow_decision_forests as tfdf
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt

In [4]:
# Load the network traffic data for the LOIC attack and, in the same step,
# rename the columns for better compatibility with TensorFlow
data_LOIC = pd.read_csv('../Data/Tuesday-20-02-2018_LOIC-Attack.csv').rename(
    columns={
        'Bwd Pkt Len Std': 'bwd_pkt_len_std',
        'Pkt Size Avg': 'pkt_size_avg',
        'Flow Duration': 'flow_duration',
        'Flow IAT Std': 'flow_iat_std',
        'Label': 'label',
    }
)

In [5]:
# Name of the label (target) column within the dataset
label = 'label'

In [6]:
# Split the dataset into training and test data
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split a pandas DataFrame into a training and a test DataFrame.

    Every row is assigned independently to the test set with probability
    ``test_ratio``, so the resulting sizes are only approximately
    ``(1 - test_ratio)`` and ``test_ratio`` of the input.

    Args:
        dataset: The DataFrame to split.
        test_ratio: Probability with which a row ends up in the test set.
        seed: Optional seed for the random assignment. With a seed the split
            is reproducible across runs; with ``None`` (default) the global
            NumPy random state is used, exactly as before.

    Returns:
        A tuple ``(training_rows, test_rows)`` of DataFrames that together
        contain every row of ``dataset`` exactly once.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy behaviour: draw from NumPy's global random state.
        draws = np.random.rand(n_rows)
    else:
        # Dedicated generator so the split does not depend on (or disturb)
        # the global random state.
        draws = np.random.default_rng(seed).random(n_rows)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

training_data_LOIC, testing_data_LOIC = split_dataset(data_LOIC)

# Report how many rows ended up in each partition
n_training_examples = len(training_data_LOIC)
n_testing_examples = len(testing_data_LOIC)
print("{} examples in training, {} examples for testing.".format(
    n_training_examples, n_testing_examples))

5562652 examples in training, 2386096 examples for testing.


In [7]:
# Convert the pandas DataFrames into TensorFlow datasets;
# `label` names the column that is used as the prediction target
print("Converting Panda Dataframe into TensorFlow Dataset...")
training_dataset_LOIC = tfdf.keras.pd_dataframe_to_tf_dataset(training_data_LOIC, label=label)
testing_dataset_LOIC = tfdf.keras.pd_dataframe_to_tf_dataset(testing_data_LOIC, label=label)

Converting Panda Dataframe into TensorFlow Dataset...


In [8]:
# Create the Random Forest model (default hyperparameters)
model = tfdf.keras.RandomForestModel()
# Track accuracy so that it is reported when the model is evaluated
model.compile(metrics=["accuracy"])

In [9]:
# Train the model
print("Training the Model...")
# sys_pipes() (wurlitzer) is used so that the training log written by the
# native training code shows up in the cell output
with sys_pipes():
    model.fit(x=training_dataset_LOIC)

Training the Model...
2021-06-28 10:23:10.569070: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-06-28 10:23:10.586959: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2199995000 Hz
[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 86917
[INFO kernel.cc:393] Number of examples: 5562652
[INFO kernel.cc:769] Dataset:
Number of records: 5562652
Number of columns: 5

Number of columns by type:
	NUMERICAL: 4 (80%)
	CATEGORICAL: 1 (20%)

Columns:

NUMERICAL: 4 (80%)
	0: "bwd_pkt_len_std" NUMERICAL mean:142.283 min:0 max:22448.4 sd:217.872
	1: "flow_duration" NUMERICAL mean:1.35138e+07 min:0 max:1.2e+08 sd:3.24093e+07
	2: "flow_iat_std" NUMERICAL mean:1.13273e+06 min:0 max:8.48081e+07 sd:3.79651e+06
	3: "pkt_size_avg" NUMERICAL mean:94.1014 min:0 max:16801.1 sd:106.464

CATEGORICAL: 1 (20%)
	4: 

In [10]:
# Evaluate the trained model on the held-out test data
print("Evaluating the Model...")
evaluation = model.evaluate(testing_dataset_LOIC, return_dict=True)
print()
# One line per metric, rounded to four decimal places
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating the Model...

loss: 0.0000
accuracy: 0.9972


In [11]:
# Save the model for later use
model.save("../Data/Models/loic_model")

INFO:tensorflow:Assets written to: ../Data/Models/loic_model/assets
INFO:tensorflow:Assets written to: ../Data/Models/loic_model/assets


In [12]:
# Plot the first tree of the decision forest (limited to depth 3).
# The tree is rendered once and the resulting HTML is reused for both the
# file and the cell output.
tree_html = tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=3)
# Store the plot under the LOIC name: this notebook trains the LOIC model, and
# the previous 'GoldenEye_Model_Tree.html' target (a copy-paste leftover)
# would overwrite the tree plot of a different model.
with open('../Data/Models/LOIC_Model_Tree.html', 'w', encoding='utf-8') as f:
    f.write(tree_html)
tree_html


'\n<script src="https://d3js.org/d3.v6.min.js"></script>\n<div id="tree_plot_7da3ef096da540e8b79ac5a4d014a1b8"></div>\n<script>\n/*\n * Copyright 2021 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     https://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/**\n *  Plotting of decision trees generated by TF-DF.\n *\n *  A tree is a recursive structure of node objects.\n *  A node contains one or more of the following components:\n *\n *    - A value: Representing the output of the node. If the node is not a leaf,\

In [13]:
# Print a summary of the trained model (input features, variable importances, ...)
model.summary()

Model: "random_forest_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (4):
	bwd_pkt_len_std
	flow_duration
	flow_iat_std
	pkt_size_avg

No weights

Variable Importance: NUM_NODES:
    1.   "flow_duration" 287401.000000 ################
    2.    "pkt_size_avg" 3367.000000 
    3.    "flow_iat_std" 2832.000000 
    4. "bwd_pkt_len_std" 1121.000000 

Variable Importance: NUM_AS_ROOT:
    1.   "flow_duration" 211.000000 ################
    2. "bwd_pkt_len_std" 89.000000 

Variable Importance: SUM_SCORE:
    1.   "flow_duration" 156924348.584383 ################
    2.    "pkt_size_avg" 131792410.712506 ###########
    3. "bwd_pkt_len_std" 66087031.562824 
    4.    "flow_iat_std" 65541936.751925 

Va

In [15]:
# Create charts showing how the training quality develops with the number of trees
logs = model.make_inspector().training_logs()
tree_counts = [log.num_trees for log in logs]

fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: out-of-bag accuracy
accuracy_ax.plot(tree_counts, [log.evaluation.accuracy for log in logs])
accuracy_ax.set_xlabel("Number of trees")
accuracy_ax.set_ylabel("Accuracy (out-of-bag)")

# Right panel: out-of-bag log loss
loss_ax.plot(tree_counts, [log.evaluation.loss for log in logs])
loss_ax.set_xlabel("Number of trees")
loss_ax.set_ylabel("Logloss (out-of-bag)")

# Write the figure to disk, then clear it
fig.savefig('../Data/Visualized/LOIC_Model.png')
fig.clf()

<Figure size 864x288 with 0 Axes>