Machine Learning - Exercise 7 

The goal of this exercise is to **code** the **decision tree algorithm**, focusing on finding the optimal split using either the **Gini index** or **entropy**.

**Scikit-learn** documentation [Decision Tree](https://scikit-learn.org/stable/modules/tree.html#tree)


In [13]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os
import certifi
# Point SSL_CERT_FILE at certifi's CA bundle, but only if the variable is not
# already set (setdefault never overwrites an existing value).
# NOTE(review): presumably needed so the HTTPS dataset download further down
# can verify certificates on this machine — confirm.
os.environ.setdefault('SSL_CERT_FILE', certifi.where())

'/Users/stepankudlacek/VSCode/su/.venv/lib/python3.12/site-packages/certifi/cacert.pem'

# Load the Iris dataset

https://archive.ics.uci.edu/dataset/53/iris

- One of the earliest datasets used in the literature on classification methods and widely used in statistics and machine learning.
- The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.

In [14]:
# Download the Iris CSV from the GitHub URL and parse it into a DataFrame.
df = pd.read_csv('https://github.com/lowoncuties/VSB-FEI-Machine-Learning-Exercises/raw/main/datasets/ml_06/iris.csv')
# Bare expression: in a notebook this renders the DataFrame as the cell output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [15]:
# === Data Preparation ===
# We assume `df` already contains the Iris dataset loaded above.
# The last column is the label (species). We'll:
# 1. Separate features (X) and labels (y)
# 2. Encode string labels to integer class indices for easier processing
# 3. Create a simple train/test split
# 4. Keep a mapping from index back to original class name

import numpy as np
import pandas as pd

# The label lives in the final column; every column before it is a feature.
feature_cols = df.columns[:-1]
label_col = df.columns[-1]
X = df[feature_cols].values   # feature matrix, one row per sample
y_raw = df[label_col].values  # species names as strings, e.g. 'setosa'

# Map every distinct label to an integer code 0..K-1.
# classes[code] recovers the original name for a given code.
classes, y = np.unique(y_raw, return_inverse=True)

# Hold-out split: shuffle the row indices with a fixed seed (so the split is
# reproducible), then cut the shuffled order at the 80 % mark.
rng = np.random.default_rng(seed=42)
indices = np.arange(len(X))
rng.shuffle(indices)
train_size = int(0.8 * len(X))
train_idx, test_idx = indices[:train_size], indices[train_size:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
print("Classes:", classes)

Train samples: 120, Test samples: 30
Classes: ['setosa' 'versicolor' 'virginica']


In [None]:
# === Decision Tree Implementation (Simple, Gini only) ===
# Simplified version of the tree:
# - Uses only the Gini index (entropy and the criterion option were removed)
# - Binary splits by threshold (<=) on numeric features
# - Stopping rules: max_depth, a pure node, or too few samples
# - No pruning, no missing values, no categorical variables
# - Heavily commented to aid understanding
# Note: "Gini index" = measure of node impurity. "feature index" = index of the column (feature) we split on.

from dataclasses import dataclass

@dataclass
class TreeNode:
    feature_idx: int = None         # Index sloupce (příznaku), podle kterého se štěpí
    threshold: float = None         # Prahová hodnota pro split
    left: 'TreeNode' = None         # Levá větev (<= threshold)
    right: 'TreeNode' = None        # Pravá větev (> threshold)
    prediction: int = None          # Třída v listu (index do classes)

class SimpleDecisionTree:
    """Small classification tree using binary threshold splits and Gini impurity.

    Features must be numeric and labels must be non-negative integer class
    indices (leaves pick the majority class with ``np.bincount``). There is
    no pruning and no handling of missing or categorical values.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        A node holding fewer samples than this is not split further.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None  # set by fit()

    # --- Gini impurity ---
    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_i^2)`` of the labels ``y``.

        A pure node (single class) yields 0; an empty label set is also
        treated as 0 so that empty children never look attractive.
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1.0 - np.sum(probs ** 2)

    # --- Best split search ---
    def _best_split(self, X, y):
        """Exhaustively search for the split with the largest impurity decrease.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature.

        Returns
        -------
        (feature_idx, threshold, gain)
            ``(None, None, None)`` when the node has fewer than
            ``min_samples_split`` samples, and ``(None, None, 0.0)`` when no
            candidate strictly improves on the parent impurity.
        """
        n_samples, n_features = X.shape
        if n_samples < self.min_samples_split:
            return None, None, None  # too few samples, do not split

        parent_impurity = self._gini(y)
        best_gain = 0.0
        best_feat = None
        best_thresh = None

        for feat in range(n_features):
            values = X[:, feat]
            unique_vals = np.unique(values)
            if len(unique_vals) == 1:
                continue  # constant feature => nothing to split on
            thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2.0
            for t in thresholds:
                left_mask = values <= t
                right_mask = values > t
                y_left = y[left_mask]
                y_right = y[right_mask]
                # Impurity of the children, weighted by their share of samples
                w_left = len(y_left) / n_samples
                w_right = len(y_right) / n_samples
                child_impurity = (
                    w_left * self._gini(y_left) + w_right * self._gini(y_right)
                )
                info_gain = parent_impurity - child_impurity
                # Strict '>' keeps the first best candidate and rejects
                # zero-gain splits.
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_feat = feat
                    best_thresh = t
        return best_feat, best_thresh, best_gain

    def _make_leaf(self, y):
        """Return a leaf predicting the most frequent class in ``y``."""
        return TreeNode(prediction=np.bincount(y).argmax())

    def _build(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``."""
        # max_depth=None disables the depth limit.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(y)) == 1 or len(y) < self.min_samples_split:
            return self._make_leaf(y)

        feat, thresh, gain = self._best_split(X, y)
        if feat is None or gain <= 0:  # no split improves impurity
            return self._make_leaf(y)

        left_mask = X[:, feat] <= thresh
        right_mask = X[:, feat] > thresh
        left_child = self._build(X[left_mask], y[left_mask], depth + 1)
        right_child = self._build(X[right_mask], y[right_mask], depth + 1)
        return TreeNode(feature_idx=feat, threshold=thresh, left=left_child, right=right_child)

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and integer labels ``y``.

        Both arguments may be any array-like; they are converted with
        ``np.asarray`` so that plain lists work as well as NumPy arrays.

        Returns
        -------
        self
            The fitted tree, to allow ``SimpleDecisionTree().fit(X, y).predict(X)``.

        Raises
        ------
        ValueError
            If no training samples are given.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        self.root = self._build(X, y, depth=0)
        return self

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its class."""
        while node.prediction is None:
            if x[node.feature_idx] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.prediction

    def predict(self, X):
        """Return an array with the predicted class index for every row of ``X``."""
        X = np.asarray(X)
        return np.array([self._predict_one(row, self.root) for row in X])

    def print_tree(self, node=None, depth=0, feature_names=None, class_names=None):
        """Print the tree as nested, indented if/else rules.

        Parameters
        ----------
        node : TreeNode, optional
            Subtree to print; defaults to the root.
        depth : int
            Current nesting level, used only for indentation.
        feature_names : sequence, optional
            Name for each feature column. When omitted, the module-level
            ``feature_cols`` is used.
        class_names : sequence, optional
            Name for each class index. When omitted, the module-level
            ``classes`` is used.
        """
        if node is None:
            node = self.root
        indent = "  " * depth
        if node.prediction is not None:
            # Fall back to the notebook-level name list only when it is needed.
            if class_names is None:
                class_names = classes
            print(f"{indent}Leaf: class={class_names[node.prediction]}")
        else:
            if feature_names is None:
                feature_names = feature_cols
            feat_name = feature_names[node.feature_idx]
            print(f"{indent}Node: if {feat_name} <= {node.threshold:.3f}")
            self.print_tree(node.left, depth + 1, feature_names, class_names)
            print(f"{indent}else:")
            self.print_tree(node.right, depth + 1, feature_names, class_names)

# Confirmation message, printed once the definitions above have been executed.
print("Implementation ready (Gini only).")

Implementation ready (Gini only).


In [None]:
from collections import Counter

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (NumPy arrays, lists, tuples); they
    are converted with ``np.asarray`` so the comparison is always
    element-wise.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape (otherwise NumPy
        broadcasting could silently produce a meaningless score).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: y_true has shape {y_true.shape}, y_pred has shape {y_pred.shape}"
        )
    return (y_true == y_pred).mean()

def confusion_matrix(y_true, y_pred, n_classes):
    """Count (true class, predicted class) pairs in an n_classes x n_classes grid.

    Entry ``[t, p]`` of the returned integer array is the number of samples
    whose true class index is ``t`` and whose predicted class index is ``p``.
    """
    counts = np.zeros((n_classes, n_classes), dtype=int)
    # Each (true, predicted) tuple is used directly as a 2-D index.
    for cell in zip(y_true, y_pred):
        counts[cell] += 1
    return counts

# Train the tree on the training split.
clf = SimpleDecisionTree(min_samples_split=2, max_depth=5)
clf.fit(X_train, y_train)

# Predict on the held-out test split and score the result.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, len(classes))
acc = accuracy(y_test, y_pred)

print(f"Test Accuracy: {acc*100:.2f}%")
print("Confusion Matrix (rows=true, cols=pred):", cm, sep="\n")
print("\nTree Structure:")
clf.print_tree()

# Show a handful of individual predictions next to their true labels.
print("\nSample predictions (first 5 test samples):")
for i in range(min(5, len(X_test))):
    original_class, predicted_class = classes[y_test[i]], classes[y_pred[i]]
    print(f"Sample {i}: true={original_class}, pred={predicted_class} -> {predicted_class == original_class}")

Test Accuracy: 96.67%
Confusion Matrix (rows=true, cols=pred):
[[13  0  0]
 [ 0  5  1]
 [ 0  0 11]]

Tree Structure:
Node: if petal_length <= 2.450
  Leaf: class=setosa
else:
  Node: if petal_width <= 1.650
    Node: if petal_length <= 4.950
      Leaf: class=versicolor
    else:
      Node: if sepal_length <= 6.050
        Node: if sepal_width <= 2.450
          Leaf: class=virginica
        else:
          Leaf: class=versicolor
      else:
        Leaf: class=virginica
  else:
    Node: if petal_length <= 4.850
      Node: if sepal_width <= 3.000
        Leaf: class=virginica
      else:
        Leaf: class=versicolor
    else:
      Leaf: class=virginica

Sample predictions (first 5 test samples):
Sample 0: true=virginica, pred=virginica -> True
Sample 1: true=setosa, pred=setosa -> True
Sample 2: true=setosa, pred=setosa -> True
Sample 3: true=virginica, pred=virginica -> True
Sample 4: true=virginica, pred=virginica -> True
