In [2]:
#######################################################################
# Installieren der notwendigen Pakete für das Nutzen des 
# TensorFlow Decision Forest:
#######################################################################

!pip install tensorflow_decision_forests
!pip install wurlitzer
!pip install pandas
!pip install matplotlib
!pip install ipython 



In [3]:
# Importieren der notwendigen Pakete
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
from wurlitzer import sys_pipes
import matplotlib.pyplot as plt

from IPython.core.magic import register_line_magic
from IPython.display import display, HTML

In [4]:
# Aussgabe der verwendeten TensorFlow Decision Forests Version
print(f"TensorFlow Decision Forests v{tfdf.__version__}")

TensorFlow Decision Forests v0.1.7


In [5]:
# Laden des Pinguin Datasets
data_Penguins = pd.read_csv('../Data/penguins.csv')

label = 'species'

# Ausgabe der vorhandenen Labels im Dataset
classes = data_Penguins[label].unique().tolist()
print(f"Label classes: {classes}")
# Umwandeln der String Literale als nummerische Werte
data_Penguins[label] = data_Penguins[label].map(classes.index)

Label classes: ['Adelie', 'Gentoo', 'Chinstrap']


In [6]:
# Aufteilen des Datasets in Training und Testdaten
def split_dataset(dataset,  test_ratio=0.30):
    """Splits a panda dataframe in two dataframes."""
    test_indices = np.random.rand(len(dataset)) < test_ratio
    return dataset[~test_indices], dataset[test_indices]

training_data_Penguins, testing_data_Penguins = split_dataset(data_Penguins)

print("{} examples in training, {} examples for testing.".format(
    len(training_data_Penguins), len(testing_data_Penguins)))


237 examples in training, 107 examples for testing.


In [7]:
# Konvertieren des Panda Dataframes in ein TensorFlow Dataset
print("Converting Panda Dataframe into TensorFlow Dataset...")
training_dataset_Penguins = tfdf.keras.pd_dataframe_to_tf_dataset(training_data_Penguins, label=label)
testing_dataset_Penguins = tfdf.keras.pd_dataframe_to_tf_dataset(testing_data_Penguins, label=label)


Converting Panda Dataframe into TensorFlow Dataset...


In [8]:
# Erstellen des Decision Forest Modells
model = tfdf.keras.RandomForestModel()
model.compile(metrics=["accuracy"])

In [9]:
# Trainieren des Modells
print("Training the Model...")
with sys_pipes():
    model.fit(x=training_dataset_Penguins)

Training the Model...
2021-06-26 13:44:17.428382: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-06-26 13:44:17.453214: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2199995000 Hz
[INFO kernel.cc:746] Start Yggdrasil model training
[INFO kernel.cc:747] Collect training examples
[INFO kernel.cc:392] Number of batches: 4
[INFO kernel.cc:393] Number of examples: 237
[INFO kernel.cc:769] Dataset:
Number of records: 237
Number of columns: 8

Number of columns by type:
	NUMERICAL: 5 (62.5%)
	CATEGORICAL: 3 (37.5%)

Columns:

NUMERICAL: 5 (62.5%)
	0: "bill_depth_mm" NUMERICAL mean:17.0025 min:13.1 max:21.2 sd:1.95976
	1: "bill_length_mm" NUMERICAL mean:44.1025 min:32.1 max:59.6 sd:5.42568
	2: "body_mass_g" NUMERICAL mean:4203.59 min:2700 max:6300 sd:799.646
	3: "flipper_length_mm" NUMERICAL mean:201.068 min:172 max:231 sd:14.1527
	6: "year" NUMERICAL mean:2008.03 min:2007 max:2009

In [10]:
# Evaluieren des gewonnenen Modells
print("Evaluating the Model: ")
evaluation = model.evaluate(testing_dataset_Penguins, return_dict=True)

print()

for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")

# Abspeichern des gewonnenen Modells
model.save("../Data/Models/Penguins_model")

Evaluating the Model: 

loss: 0.0000
accuracy: 0.9533
INFO:tensorflow:Assets written to: ./Models/Penguins_model/assets
INFO:tensorflow:Assets written to: ./Models/Penguins_model/assets


In [16]:
# Plotten des ersten Baumes innerhalb des Decision Forests
with open('../Data/Models/Penguin_Model_Tree.html', 'w') as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=3))
tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=3)


'\n<script src="https://d3js.org/d3.v6.min.js"></script>\n<div id="tree_plot_cf46dbc31eb1401592f32d9f8f9a04af"></div>\n<script>\n/*\n * Copyright 2021 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     https://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/**\n *  Plotting of decision trees generated by TF-DF.\n *\n *  A tree is a recursive structure of node objects.\n *  A node contains one or more of the following components:\n *\n *    - A value: Representing the output of the node. If the node is not a leaf,\

In [12]:
# Ausgabe einer Model-Summary 
model.summary()

Model: "random_forest_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (7):
	bill_depth_mm
	bill_length_mm
	body_mass_g
	flipper_length_mm
	island
	sex
	year

No weights

Variable Importance: NUM_NODES:
    1.    "bill_length_mm" 733.000000 ################
    2. "flipper_length_mm" 374.000000 ########
    3.     "bill_depth_mm" 341.000000 #######
    4.       "body_mass_g" 293.000000 ######
    5.            "island" 244.000000 #####
    6.               "sex" 34.000000 
    7.              "year" 14.000000 

Variable Importance: NUM_AS_ROOT:
    1. "flipper_length_mm" 175.000000 ################
    2.    "bill_length_mm" 68.000000 #####
    3.     "bill_depth_mm" 43.000000 ###
    4.            

In [13]:
# Ausgabe der errechneten Feature Importance
model.make_inspector().variable_importances()


{'NUM_AS_ROOT': [("flipper_length_mm" (1; #3), 175.0),
  ("bill_length_mm" (1; #1), 68.0),
  ("bill_depth_mm" (1; #0), 43.0),
  ("island" (4; #4), 10.0),
  ("body_mass_g" (1; #2), 4.0)]}

In [14]:
# Plotten der restlichen Metriken

logs = model.make_inspector().training_logs()
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.plot([log.num_trees for log in logs], [log.evaluation.accuracy for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("Accuracy (out-of-bag)")

plt.subplot(1,2,2)
plt.plot([log.num_trees for log in logs], [log.evaluation.loss for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("Logloss (out-of-bag)")

plt.savefig('../Data/Visualized/Penguins_Model.png')
plt.clf()

<Figure size 864x288 with 0 Axes>