# Random Forest

## I - Data set up

### 1) Package importation

In [114]:
import pandas as pd
import math

### 2) Data loading

In [115]:
tennis_data = pd.read_csv('data.csv')
print(tennis_data)

     Prévision Température Humidité    Vent Classe
0   Ensoleillé       Chaud   Elevée  Faible    Non
1   Ensoleillé       Chaud   Elevée    Fort    Non
2      Nuageux       Chaud   Elevée  Faible    Oui
3     Pluvieux       Moyen   Elevée  Faible    Oui
4     Pluvieux       Frais  Normale  Faible    Oui
5     Pluvieux       Frais  Normale    Fort    Non
6      Nuageux       Frais  Normale    Fort    Oui
7   Ensoleillé       Moyen   Elevée  Faible    Non
8   Ensoleillé       Frais  Normale  Faible    Oui
9     Pluvieux       Moyen  Normale  Faible    Oui
10  Ensoleillé       Moyen  Normale    Fort    Oui
11     Nuageux       Moyen   Elevée    Fort    Oui
12     Nuageux       Chaud  Normale  Faible    Oui
13    Pluvieux       Moyen   Elevée    Fort    Non


### 3) Entropy calculation

In [116]:
def calculate_entropy(data: pd.DataFrame, target_variable: str):
    total_number = len(data)
    target_counts = data[target_variable].value_counts()
    entropy = 0
    for target_value, count in target_counts.items():
        entropy -= count/total_number * math.log(count/total_number, 2)
    return entropy


class_entropy = calculate_entropy(tennis_data, "Classe")
print(class_entropy)

0.9402859586706309


### 4) Information gain calculation

In [117]:
def calculate_information_gain(data: pd.DataFrame, splitting_column: str, target_variable: str):
    information_gain = calculate_entropy(data, target_variable)
    data_groups = data.groupby(data[splitting_column])
    for column_value, data_group in data_groups:
        information_gain -= len(data_group)/len(data) * calculate_entropy(data_group, target_variable)
    return information_gain


for column in tennis_data.columns:
    print(column)
    print(calculate_information_gain(tennis_data, column, "Classe"))
    print()

Prévision
0.2467498197744391

Température
0.029222565658954647

Humidité
0.15183550136234136

Vent
0.04812703040826932

Classe
0.9402859586706309



### 5) Gini impurity calculation

In [118]:
def calculate_gini_impurity(data: pd.DataFrame, target_variable: str):
    gini_impurity = 1
    total_number = len(data)
    target_counts = data[target_variable].value_counts()
    for target_value, count in target_counts.items():
        gini_impurity -= math.pow(count/total_number, 2)
    return gini_impurity


print(calculate_gini_impurity(tennis_data, "Classe"))
    

0.4591836734693877


### 6) Residual Gini impurity after division

In [119]:
def calculate_residual_gini_impurity(data: pd.DataFrame, splitting_column: str, target_variable: str):
    residual_impurity = 0
    data_groups = data.groupby(data[splitting_column])
    for column_value, data_group in data_groups:
        residual_impurity += len(data_group)/len(data) * calculate_gini_impurity(data_group, target_variable)
    return residual_impurity


for column in tennis_data.columns:
    print(column)
    print(calculate_residual_gini_impurity(tennis_data, column, "Classe"))
    print()

Prévision
0.34285714285714286

Température
0.44047619047619047

Humidité
0.3673469387755103

Vent
0.42857142857142855

Classe
0.0



### 7) Tree class

In [120]:
class TreeNode:
    def __init__(self, decision_variable:str = None):
        self._decision_variable = decision_variable
        self._children = {}

    @property
    def decision_variable(self):
        return self._decision_variable

    @decision_variable.setter
    def decision_variable(self, value):
        self._decision_variable = value

    @property
    def children(self):
        return self._children

    @children.setter
    def children(self, value):
        self._children = value

    def add_child(self, key, value):
        self._children[key] = value
        
    def decide(self, row):
        variable_value = row[self._decision_variable]
        child = self._children[variable_value]
        if isinstance(child, TreeNode):
            return child.decide(row)
        return child
    
    def id3_generate(self, data: pd.DataFrame, target_variable: str):
        information_gains = [(column_name, calculate_information_gain(data, column_name, target_variable)) for column_name in data.columns if column_name != target_variable]
        max_info_gain_column = max(information_gains, key=lambda x: x[1])[0]
        self._decision_variable = max_info_gain_column
        data_groups = data.groupby(data[max_info_gain_column])
        for column_value, group in data_groups:
            if calculate_entropy(group, target_variable) == 0:
                self.add_child(column_value, group.iloc[0][target_variable])
                continue
            child_node = TreeNode()
            child_node.id3_generate(group, target_variable)
            self.add_child(column_value, child_node)
    
    def card_generate(self, data: pd.DataFrame, target_variable: str):
        residual_impurities = [(column_name, calculate_residual_gini_impurity(data, column_name, target_variable)) for column_name in data if column_name != target_variable]
        min_impurity_column = min(residual_impurities, key=lambda x: x[1])[0]
        self._decision_variable = min_impurity_column
        data_groups = data.groupby(data[min_impurity_column])
        for column_value, group in data_groups:
            if calculate_gini_impurity(group, target_variable) == 0:
                self.add_child(column_value, group.iloc[0][target_variable])
                continue
            child_node = TreeNode()
            child_node.card_generate(group, target_variable)
            self.add_child(column_value, child_node)
    
    def print_node(self, node=None, space=''):
        if node is None:
            node = self
        if node._decision_variable:
            print(space + 'Decision Variable:', node._decision_variable)
        for key, value in node._children.items():
            if isinstance(value, TreeNode):
                print(space + 'Key:', key)
                value.print_node(value, space + '  ')
            else:
                print(space + 'Key:', key, 'Value:', value)


class DecisionTree:
    def __init__(self):
        self._root = None
    
    @property
    def root(self):
        return self._root

    @root.setter
    def root(self, value):
        self._root = value
    
    def decide(self, row: pd.Series):
        return self._root.decide(row)
    
    def id3_generate(self, data: pd.DataFrame, target_variable: str):
        self._root = TreeNode()
        self._root.id3_generate(data, target_variable)
        
    def card_generate(self, data: pd.DataFrame, target_variable: str):
        self._root = TreeNode()
        self._root.card_generate(data, target_variable)

#### a) Tree test class

In [121]:
test_tree = DecisionTree()
test_tree.root = TreeNode("Prévision")

humidity_node = TreeNode("Humidité")
humidity_node.add_child("Elevée", "Non")
humidity_node.add_child("Normale", "Oui")

wind_node = TreeNode("Vent")
wind_node.add_child("Fort", "Non")
wind_node.add_child("Faible", "Oui")

test_tree.root.add_child("Nuageux", "Oui")
test_tree.root.add_child(
    "Pluvieux",
    humidity_node
)
test_tree.root.add_child(
    "Ensoleillé",
    wind_node
)

test_tree.root.print_node()
print()

for data_index, data_row in tennis_data.iterrows():
    print(data_index)
    print(data_row)
    print(test_tree.decide(data_row))
    print()

Decision Variable: Prévision
Key: Nuageux Value: Oui
Key: Pluvieux
  Decision Variable: Humidité
  Key: Elevée Value: Non
  Key: Normale Value: Oui
Key: Ensoleillé
  Decision Variable: Vent
  Key: Fort Value: Non
  Key: Faible Value: Oui

0
Prévision      Ensoleillé
Température         Chaud
Humidité           Elevée
Vent               Faible
Classe                Non
Name: 0, dtype: object
Oui

1
Prévision      Ensoleillé
Température         Chaud
Humidité           Elevée
Vent                 Fort
Classe                Non
Name: 1, dtype: object
Non

2
Prévision      Nuageux
Température      Chaud
Humidité        Elevée
Vent            Faible
Classe             Oui
Name: 2, dtype: object
Oui

3
Prévision      Pluvieux
Température       Moyen
Humidité         Elevée
Vent             Faible
Classe              Oui
Name: 3, dtype: object
Non

4
Prévision      Pluvieux
Température       Frais
Humidité        Normale
Vent             Faible
Classe              Oui
Name: 4, dtype: object
O

## II - ID3 Algorithm

In [122]:
id3_tree = DecisionTree()
id3_tree.id3_generate(tennis_data, "Classe")
id3_tree.root.print_node()

Decision Variable: Prévision
Key: Ensoleillé
  Decision Variable: Humidité
  Key: Elevée Value: Non
  Key: Normale Value: Oui
Key: Nuageux Value: Oui
Key: Pluvieux
  Decision Variable: Vent
  Key: Faible Value: Oui
  Key: Fort Value: Non


In [123]:
print(tennis_data.loc[0])
print()
print(id3_tree.decide(tennis_data.loc[0]))

Prévision      Ensoleillé
Température         Chaud
Humidité           Elevée
Vent               Faible
Classe                Non
Name: 0, dtype: object

Non


## III - CARD Algorithm

In [124]:
card_tree = DecisionTree()
card_tree.card_generate(tennis_data, "Classe")
card_tree.root.print_node()

Decision Variable: Prévision
Key: Ensoleillé
  Decision Variable: Humidité
  Key: Elevée Value: Non
  Key: Normale Value: Oui
Key: Nuageux Value: Oui
Key: Pluvieux
  Decision Variable: Vent
  Key: Faible Value: Oui
  Key: Fort Value: Non


In [125]:
print(tennis_data.loc[0])
print()
print(card_tree.decide(tennis_data.loc[0]))

Prévision      Ensoleillé
Température         Chaud
Humidité           Elevée
Vent               Faible
Classe                Non
Name: 0, dtype: object

Non


## IV - Random forest

### 1) Random Forest class

In [126]:
class RandomForest:
    def __init__(self):
        self._trees = []
    
    @property
    def trees(self):
        return self._trees
    
    def add_tree(self, tree: DecisionTree):
        self._trees.append(tree)
    
    def decide(self, row: pd.Series):
        answers = {}
        for i in range(len(self._trees)):
            try:
                decision = self._trees[i].decide(row)
                if decision in answers:
                    answers[decision] += 1
                else:
                    answers[decision] = 1
            except KeyError:
                print(f"Tree {i} couldn't decide")
        return max(answers, key=answers.get)
    
    def id3_generate(self, data: pd.DataFrame, target_variable: str, sample_percentage: float, tree_number: int):
        for _ in range(tree_number):
            data_sample = data.sample(frac=sample_percentage/100)
            new_tree = DecisionTree()
            new_tree.id3_generate(data_sample, target_variable)
            self.add_tree(new_tree)
    
    def card_generate(self, data: pd.DataFrame, target_variable: str, sample_percentage: float, tree_number: int):
        for _ in range(tree_number):
            data_sample = data.sample(frac=sample_percentage/100)
            new_tree = DecisionTree()
            new_tree.card_generate(data_sample, target_variable)
            self.add_tree(new_tree)

### 2) ID3 random forest

In [127]:
id3_forest = RandomForest()
id3_forest.id3_generate(tennis_data, "Classe", 75, 10)

# Print the forest
# for i in range(len(id3_forest.trees)):
#     print("Tree", i)
#     id3_forest.trees[i].root.print_node()
#     print()

print(tennis_data.loc[2])
print()
print(f"ID3 Forest decision: {id3_forest.decide(tennis_data.loc[2])}")

Prévision      Nuageux
Température      Chaud
Humidité        Elevée
Vent            Faible
Classe             Oui
Name: 2, dtype: object

Tree 0 couldn't decide
ID3 Forest decision: Oui


### 3) CARD random forest

In [128]:
card_forest = RandomForest()
card_forest.id3_generate(tennis_data, "Classe", 75, 10)

# Print the forest
# for i in range(len(card_forest.trees)):
#     print("Tree", i)
#     card_forest.trees[i].root.print_node()
#     print()

print(tennis_data.loc[2])
print()
print(f"CARD Forest decision: {card_forest.decide(tennis_data.loc[2])}")

Prévision      Nuageux
Température      Chaud
Humidité        Elevée
Vent            Faible
Classe             Oui
Name: 2, dtype: object

Tree 9 couldn't decide
CARD Forest decision: Oui
