<a href="https://colab.research.google.com/github/LucasCavalherie/DFDT/blob/main/DFDT_River.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# !pip install river

In [7]:
# 1. Imports
import math
import random
import csv
import time
import os
import itertools
from collections import defaultdict
from river import base, metrics, evaluate

def calculate_gini_impurity(class_distribution):
    """Calculates the Gini Impurity for a class distribution."""
    total_count = sum(class_distribution.values())
    if total_count == 0:
        return 0.0

    impurity = 1.0
    for count in class_distribution.values():
        probability = count / total_count
        impurity -= probability ** 2
    return impurity

class NumericAttributeSplitter:
    """
    This class stores statistics for a single numeric attribute
    and finds the best split point for it.
    """
    def __init__(self):
        self._value_class_observations = defaultdict(list)

    def update(self, value, class_label):
        """Updates the splitter with a new value and its class."""
        self._value_class_observations[value].append(class_label)

    def get_best_split(self, parent_impurity):
        """
        Finds the best split point for this attribute.
        It tests all midpoints between unique values as split candidates.
        """
        best_split_value = None
        best_gain = -1.0
        sorted_unique_values = sorted(self._value_class_observations.keys())
        if len(sorted_unique_values) < 2:
            return None, -1.0
        for i in range(len(sorted_unique_values) - 1):
            split_candidate = (sorted_unique_values[i] + sorted_unique_values[i+1]) / 2
            left_branch_dist = defaultdict(int)
            right_branch_dist = defaultdict(int)
            for value, classes in self._value_class_observations.items():
                for c in classes:
                    if value <= split_candidate:
                        left_branch_dist[c] += 1
                    else:
                        right_branch_dist[c] += 1
            left_count = sum(left_branch_dist.values())
            right_count = sum(right_branch_dist.values())
            total_count = left_count + right_count
            if left_count == 0 or right_count == 0:
                continue
            left_impurity = calculate_gini_impurity(left_branch_dist)
            right_impurity = calculate_gini_impurity(right_branch_dist)
            weighted_children_impurity = (left_count / total_count) * left_impurity + \
                                         (right_count / total_count) * right_impurity
            gain = parent_impurity - weighted_children_impurity
            if gain > best_gain:
                best_gain = gain
                best_split_value = split_candidate
        return best_split_value, best_gain

class Node:
    """Represents a node in the decision tree with learning capabilities."""
    def __init__(self, is_leaf=True, parent=None, depth=0):
        self.is_leaf = is_leaf
        self.parent = parent
        self.depth = depth
        self.children = {}
        self.active = True
        self.id = random.randint(0, 1000000)
        self.class_distribution = defaultdict(int)
        self.feature_estimators = defaultdict(NumericAttributeSplitter)
        self.n_l = 0
        self.n_check_l = 0
        self.n_leaf_l = 0
        self.n_tree_l = 0
        self.split_attribute = None
        self.split_value = None

    def __repr__(self):
        return f"Node(id={self.id}, is_leaf={self.is_leaf}, active={self.active}, n_l={self.n_l}, dist={self.class_distribution})"

    def get_prediction(self):
        if not self.class_distribution: return None
        return max(self.class_distribution, key=self.class_distribution.get)

    def update_stats(self, instance_X, instance_y):
        self.class_distribution[instance_y] += 1
        for attr_index, value in instance_X.items():
            self.feature_estimators[attr_index].update(value, instance_y)

class DFDT(base.Classifier):
    """
    Implementation of the Dynamic Fast Decision Tree (DFDT) algorithm,
    made compatible with the River API.
    """
    def __init__(self, delta=0.05, initial_grace_period=100,
                 grow_fast_threshold=2.0, deactivate_threshold=0.2,
                 std_dev_multiplier_c3_c4=1.0, std_dev_multiplier_c5=1.0,
                 r_heuristic_range=0.5):
        self.delta = delta
        self.initial_grace_period = initial_grace_period
        self.grow_fast_threshold = grow_fast_threshold
        self.deactivate_threshold = deactivate_threshold
        self.std_dev_multiplier_c3_c4 = std_dev_multiplier_c3_c4
        self.std_dev_multiplier_c5 = std_dev_multiplier_c5
        self.R_HEURISTIC_RANGE = r_heuristic_range

        self.root = Node(is_leaf=True, depth=0)
        self.root.n_min_grace_period = self.initial_grace_period
        self.leaves = {self.root}
        self.h_stat_values, self.g_stat_values, self.n_stat_values, self.hb_stat_values = [], [], [], []
        self.n_total_instances = 0

    @property
    def _supervised(self):
        return True

    def learn_one(self, x, y):
        """Treina o modelo com uma única instância, conforme a API do River."""
        leaf = self._route_to_leaf(x)
        leaf.update_stats(x, y)
        self.n_total_instances += 1
        leaf.n_l += 1
        self._attempt_growth(leaf, self.n_total_instances)
        return self

    def predict_one(self, x):
        """Prevê o rótulo de uma única instância, conforme a API do River."""
        return self.predict(x)

    def _avg(self, values_list):
        return sum(values_list) / len(values_list) if values_list else 0

    def _std_dev(self, values_list):
        if not values_list or len(values_list) < 2: return 0
        mean = self._avg(values_list)
        variance = sum([(x - mean) ** 2 for x in values_list]) / (len(values_list) - 1)
        return math.sqrt(variance)

    def _route_to_leaf(self, instance_X):
        current_node = self.root
        while not current_node.is_leaf:
            if current_node.split_attribute is None or current_node.split_value is None: break
            instance_value = instance_X.get(current_node.split_attribute, 0)
            branch_index = 0 if instance_value <= current_node.split_value else 1
            if branch_index in current_node.children: current_node = current_node.children[branch_index]
            else: break
        return current_node

    def predict(self, instance_X):
        leaf = self._route_to_leaf(instance_X)
        return leaf.get_prediction()

    def _attempt_growth(self, leaf, n_total_instances):
        if not leaf.active: return
        fraction = ((leaf.n_l - leaf.n_leaf_l) * len(self.leaves)) / (n_total_instances - leaf.n_tree_l) if n_total_instances - leaf.n_tree_l > 0 else 0
        grow_fast_flag = fraction > self.grow_fast_threshold
        if fraction < self.deactivate_threshold:
            leaf.active = False
            return
        if not hasattr(leaf, 'n_min_grace_period'): leaf.n_min_grace_period = self.initial_grace_period
        is_ready_for_check = (leaf.n_l - leaf.n_check_l > leaf.n_min_grace_period)
        if len(leaf.class_distribution) > 1 and is_ready_for_check:
            epsilon = self._calculate_hoeffding_bound(leaf.n_l)
            g_values = self._calculate_g_values(leaf)
            if self._can_split(leaf, grow_fast_flag, g_values, epsilon):
                self._split_leaf(leaf)
            else:
                leaf.n_check_l = leaf.n_l
                self._adapt_grace_period(leaf, g_values, epsilon)

    def _split_leaf(self, leaf):
        leaf.is_leaf = False
        leaf.active = False
        self.leaves.remove(leaf)
        split_attr, split_val = leaf.best_split_info['attribute'], leaf.best_split_info['split_value']
        leaf.split_attribute, leaf.split_value = split_attr, split_val
        left_child, right_child = Node(is_leaf=True, parent=leaf, depth=leaf.depth + 1), Node(is_leaf=True, parent=leaf, depth=leaf.depth + 1)
        for attr, splitter in leaf.feature_estimators.items():
            for val, classes in splitter._value_class_observations.items():
                for cls in classes:
                    target_child = left_child if val <= split_val else right_child
                    target_child.update_stats({attr: val}, cls)
        self.leaves.add(left_child)
        self.leaves.add(right_child)
        leaf.children[0], leaf.children[1] = left_child, right_child

    def _can_split(self, leaf, grow_fast, g_values, epsilon):
        if not g_values: return False
        sorted_g = sorted(g_values.items(), key=lambda item: item[1], reverse=True)
        if not sorted_g: return False
        g_best_attr, g_best_val = sorted_g[0]
        g_second_best_val = sorted_g[1][1] if len(sorted_g) > 1 else 0
        avg_hb_hist = self._avg(self.hb_stat_values) if self.hb_stat_values else float('inf')
        if not ((g_best_val - g_second_best_val >= epsilon) or (epsilon < avg_hb_hist)): return False
        impurity = calculate_gini_impurity(leaf.class_distribution)
        if grow_fast: return True
        current_impurities = [calculate_gini_impurity(n.class_distribution) for n in self.leaves if n.active]
        c3 = impurity >= self._avg(current_impurities) - (self.std_dev_multiplier_c3_c4 * self._std_dev(current_impurities))
        c4 = impurity >= self._avg(self.h_stat_values) - (self.std_dev_multiplier_c3_c4 * self._std_dev(self.h_stat_values))
        c5 = g_best_val >= self._avg(self.g_stat_values) - (self.std_dev_multiplier_c5 * self._std_dev(self.g_stat_values))
        c6 = leaf.n_l >= self._avg(self.n_stat_values)
        if c3 and c4 and c5 and c6:
            self._update_historical_stats(impurity, g_best_val, leaf.n_l, epsilon)
            best_split_val, _ = leaf.feature_estimators[g_best_attr].get_best_split(impurity)
            leaf.best_split_info = {'attribute': g_best_attr, 'split_value': best_split_val}
            return True
        return False

    def _adapt_grace_period(self, leaf, g_values, epsilon):
        sorted_g = sorted(g_values.values(), reverse=True)
        g_best, g_second_best = (sorted_g[0] if sorted_g else 0), (sorted_g[1] if len(sorted_g) > 1 else 0)
        delta_g = g_best - g_second_best
        avg_hb = self._avg(self.hb_stat_values) if self.hb_stat_values else epsilon
        new_n_min = 0
        if delta_g < epsilon and delta_g > avg_hb and delta_g > 1e-9: new_n_min = math.ceil((self.R_HEURISTIC_RANGE**2 * math.log(1 / self.delta)) / (2 * delta_g**2))
        elif delta_g < avg_hb and epsilon > avg_hb and avg_hb > 1e-9: new_n_min = math.ceil((self.R_HEURISTIC_RANGE**2 * math.log(1 / self.delta)) / (2 * avg_hb**2))
        if new_n_min > 0: leaf.n_min_grace_period = max(leaf.n_min_grace_period, new_n_min)

    def _calculate_g_values(self, leaf):
        gains = {}
        parent_impurity = calculate_gini_impurity(leaf.class_distribution)
        for attr_index, splitter in leaf.feature_estimators.items():
            _, gain = splitter.get_best_split(parent_impurity)
            if gain > 0: gains[attr_index] = gain
        return gains

    def _calculate_hoeffding_bound(self, n):
        if n == 0: return float('inf')
        return math.sqrt((self.R_HEURISTIC_RANGE**2 * math.log(1 / self.delta)) / (2 * n))

    def _update_historical_stats(self, impurity, g_best, n_l, epsilon):
        self.h_stat_values.append(impurity); self.g_stat_values.append(g_best); self.n_stat_values.append(n_l); self.hb_stat_values.append(epsilon)

def load_csv_dataset(file_path, class_index=-1):
    """Loads a dataset from a local CSV file into a list."""
    stream = []
    try:
        with open(file_path, 'r', newline='') as f:
            reader = csv.reader(f)
            try:
                next(reader)
            except StopIteration:
                return []
            for row in reader:
                try:
                    if class_index < 0:
                        effective_class_index = len(row) + class_index
                    else:
                        effective_class_index = class_index


                    try:
                        label = int(row[effective_class_index])
                    except ValueError:
                        label = row[effective_class_index]

                    features_list = [float(val) for i, val in enumerate(row) if i != effective_class_index]
                    instance_X_dict = {j: features_list[j] for j in range(len(features_list))}
                    stream.append((instance_X_dict, label))
                except (ValueError, IndexError):
                    continue
        return stream
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'.")
        return None

def set_results_log(filepath, dataset_name, accuracy, exec_time, params=None):
    """Logs the results of an experiment to a file."""
    result_line = f"{dataset_name},{exec_time:.2f},{accuracy*100:.4f},{params if params else 'N/A'}\n"
    write_header = not os.path.exists(filepath)
    with open(filepath, 'a') as f:
        if write_header:
            f.write("Dataset,Execution Time (s),Final Accuracy (%),Parameters\n")
        f.write(result_line)

if __name__ == '__main__':
    print("DFDT Python Implementation - River-based Grid Search Runner")
    print("=" * 40)

    datasets = [
        {'name': 'NOAA', 'path': 'NOAA.csv', 'class_index': 8},
        {'name': 'Keystroke', 'path': 'Keystroke.csv', 'class_index': 10},
        {'name': 'Chess', 'path': 'Chess.csv', 'class_index': 7},
        {'name': 'Luxembourg', 'path': 'Luxembourg.csv', 'class_index': 31},
        {'name': 'Ozone', 'path': 'Ozone.csv', 'class_index': 72},
        {'name': 'SmartMeter', 'path': 'SmartMeter.csv', 'class_index': 96},
        {'name': 'Electricity', 'path': 'Electricity.csv', 'class_index': 8},
        {'name': 'Rialto', 'path': 'Rialto.csv', 'class_index': 27},
        {'name': 'Forest', 'path': 'Forest.csv', 'class_index': 54},
        {'name': 'Posture', 'path': 'Posture.csv', 'class_index': 3},
        {'name': 'PokerHand', 'path': 'PokerHand.csv', 'class_index': 10},
    ]

    param_grid = {
        'delta': [0.05],
        'initial_grace_period': [200, 300, 400],
        'grow_fast_threshold': [1.5],
        'deactivate_threshold': [0.1],
        'std_dev_multiplier_c3_c4': [0.5],
        'std_dev_multiplier_c5': [0.5],
        'r_heuristic_range': [0.45]
    }

    results_filepath = 'dfdt_river_grid_search_results.csv'
    if os.path.exists(results_filepath):
        os.remove(results_filepath)

    # Loop principal para a pesquisa em grelha
    for config in datasets:
        print(f"\n{'='*40}")
        print(f"Processing dataset: {config['name']}")

        data_stream = load_csv_dataset(config['path'], config['class_index'])
        if not data_stream:
            print(f"Could not load or empty dataset: {config['name']}. Skipping.")
            continue

        keys, values = zip(*param_grid.items())
        param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

        print(f"Starting Grid Search with {len(param_combinations)} combinations.")

        for i, params in enumerate(param_combinations):
            model = DFDT(**params)
            metric = metrics.Accuracy()

            start_time = time.time()

            evaluate.progressive_val_score(dataset=data_stream, model=model, metric=metric, print_every=0)

            end_time = time.time()
            execution_time = end_time - start_time

            print(f"  [Combination {i+1}/{len(param_combinations)}] Accuracy: {metric.get()*100:.4f}% | Time: {execution_time:.2f}s")
            set_results_log(results_filepath, config['name'], metric.get(), execution_time, params)

    print(f"\n{'=' * 40}")
    print("Processing complete!")
    print(f"All results saved to '{results_filepath}'.")

DFDT Python Implementation - River-based Grid Search Runner

Processing dataset: Forest
Starting Grid Search with 3 combinations.
  [Combination 1/3] Accuracy: 48.7594% | Time: 41.95s
  [Combination 2/3] Accuracy: 48.7594% | Time: 41.68s
  [Combination 3/3] Accuracy: 63.1086% | Time: 42.00s

Processing dataset: Posture
Starting Grid Search with 3 combinations.
  [Combination 1/3] Accuracy: 33.0458% | Time: 5.97s
  [Combination 2/3] Accuracy: 33.0458% | Time: 3.22s
  [Combination 3/3] Accuracy: 33.0458% | Time: 3.63s

Processing dataset: PokerHand
Starting Grid Search with 3 combinations.
  [Combination 1/3] Accuracy: 49.9642% | Time: 18.33s
  [Combination 2/3] Accuracy: 52.5279% | Time: 19.90s
  [Combination 3/3] Accuracy: 49.9749% | Time: 18.36s

Processing complete!
All results saved to 'dfdt_river_grid_search_results.csv'.
