<a href="https://colab.research.google.com/github/IslamJenishbekov/CustomDecisionTreeRegressor/blob/main/Custom_Regressor_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error

In [15]:
# Draw a 1,000-row random subset of seaborn's diamonds dataset.
# The fixed random_state makes the subset, and every number derived from it,
# identical on each Restart & Run All.
data = sns.load_dataset('diamonds').sample(1000, random_state=42)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 32683 to 14741
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float64 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float64 
 5   table    1000 non-null   float64 
 6   price    1000 non-null   int64   
 7   x        1000 non-null   float64 
 8   y        1000 non-null   float64 
 9   z        1000 non-null   float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 66.3 KB


In [16]:
# Predictors: two numeric columns plus the categorical clarity grade.
X = data.loc[:, ["table", "carat", "clarity"]]
# Regression target: the diamond price.
y = data.loc[:, "price"]

In [17]:
# One-hot encode the categorical `clarity` column into boolean indicator
# columns; the numeric `table` and `carat` columns pass through unchanged.
X = pd.get_dummies(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 32683 to 14741
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   table         1000 non-null   float64
 1   carat         1000 non-null   float64
 2   clarity_IF    1000 non-null   bool   
 3   clarity_VVS1  1000 non-null   bool   
 4   clarity_VVS2  1000 non-null   bool   
 5   clarity_VS1   1000 non-null   bool   
 6   clarity_VS2   1000 non-null   bool   
 7   clarity_SI1   1000 non-null   bool   
 8   clarity_SI2   1000 non-null   bool   
 9   clarity_I1    1000 non-null   bool   
dtypes: bool(8), float64(2)
memory usage: 31.2 KB


In [18]:
# A single node of the regression tree.
class Node:
    """Container for one tree node.

    Attributes set from the constructor arguments:
        feature_name: column the node splits on (None when it has no split).
        threshold: split value compared against ``feature_name``.
        current_depth: depth of the node in the tree.
        data: the rows associated with this node.
        mse: error value computed for ``data``.

    Attributes initialised to None and filled in by the tree builder:
        left, right: child nodes.
        answer: value the node predicts.
        status: set to ``'leaf'`` for terminal nodes.
    """

    def __init__(self, feature_name, threshold, current_depth, data, mse):
        # Split rule and bookkeeping handed in by the caller.
        self.feature_name, self.threshold = feature_name, threshold
        self.current_depth = current_depth
        self.data, self.mse = data, mse
        # Populated after construction; None until then.
        self.left = self.right = None
        self.answer = self.status = None

# The regression tree itself.
class RegressorTree:
    """Binary regression tree grown greedily by reduction of target variance.

    Every node predicts the mean target of the training rows that reach it.
    A split sends rows with ``feature <= threshold`` to the left child and
    rows with ``feature > threshold`` to the right child.

    Parameters
    ----------
    max_depth : int
        Largest depth a node may have; the root has depth 0. Nodes at
        ``max_depth`` are leaves, so ``max_depth <= 0`` yields a single leaf.

    Notes
    -----
    Feature columns must support ``<=`` / ``>`` comparison and averaging of
    adjacent unique values (numeric or boolean columns). A NaN feature value
    satisfies neither comparison, so such a row is passed to neither child
    while fitting.
    """

    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.columns = None  # feature column names, set by fit()
        self.root = None     # root Node, set by fit()

    def fit(self, X, y):
        """Grow the tree from a feature frame and its target values.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns. The frame is copied, never modified.
        y : pandas.Series or array-like
            Target values. They are assigned as a column of the copy, so a
            Series is aligned on the index of ``X``.

        Raises
        ------
        ValueError
            If ``X`` already contains a column named ``'target'``, the name
            used internally for the target column.
        """
        if 'target' in X.columns:
            # Otherwise the feature would be overwritten by y and then used
            # as a predictor of itself.
            raise ValueError(
                "X must not contain a column named 'target'; "
                "that name is reserved for the target values"
            )

        data = X.copy()
        self.columns = data.columns.tolist()
        data['target'] = y

        self.root = self._make_node(data, 0)
        # A root without a usable split (constant target, single row,
        # max_depth <= 0) is itself the whole tree.
        if self.root.status != 'leaf':
            self.setting_nodes(self.root)

    def setting_nodes(self, root):
        """Attach children to ``root`` and recursively grow the subtree.

        The rows of ``root.data`` are partitioned by ``root.feature_name`` and
        ``root.threshold``. A node without a split rule is marked as a leaf
        and left without children.
        """
        column_name, threshold = root.feature_name, root.threshold
        if column_name is None:
            root.status = 'leaf'
            return

        feature_values = root.data[column_name]
        child_depth = root.current_depth + 1
        root.left = self._grow_child(root.data[feature_values <= threshold], child_depth)
        root.right = self._grow_child(root.data[feature_values > threshold], child_depth)

    def _grow_child(self, data, depth):
        """Build the subtree for ``data`` at ``depth``; None when ``data`` has no rows."""
        if data.shape[0] == 0:
            return None
        node = self._make_node(data, depth)
        if node.status != 'leaf':
            self.setting_nodes(node)
        return node

    def _make_node(self, data, depth):
        """Create a node for ``data``, marking it a leaf when it cannot be split."""
        if depth < self.max_depth:
            column, threshold, node_mse = self.split_best(data)
        else:
            # Nodes at the depth limit are never split, so the split search
            # is skipped; only the node error is needed.
            column, threshold, node_mse = None, None, self._calculate_mse(data)

        node = Node(column, threshold, depth, data, node_mse)
        node.answer = data['target'].mean()
        if column is None:
            node.status = 'leaf'
        return node

    def split_best(self, data):
        """Find the split of ``data`` with the largest positive error reduction.

        Candidate thresholds for each feature are the midpoints between
        consecutive distinct values of that feature in ``data``.

        Returns
        -------
        tuple
            ``(column, threshold, node_mse)``. ``column`` and ``threshold``
            are None when no candidate gives a strictly positive gain.
            ``node_mse`` is the error of ``data`` before splitting.
        """
        column = None
        threshold = None
        node_mse = self._calculate_mse(data)
        best_gain = 0  # only strictly positive gains qualify

        for feature in self.columns:
            feature_values = data[feature]
            distinct = np.unique(feature_values).tolist()
            candidates = [(low + high) / 2 for low, high in zip(distinct, distinct[1:])]
            for t in candidates:
                left = data[feature_values <= t]
                right = data[feature_values > t]

                if len(left) == 0 or len(right) == 0:
                    continue

                left_mse = self._calculate_mse(left)
                right_mse = self._calculate_mse(right)
                # Size-weighted error of the two children.
                total_mse = (left_mse * len(left) + right_mse * len(right)) / len(data)
                gain = node_mse - total_mse
                if gain > best_gain:
                    best_gain = gain
                    column = feature
                    threshold = t

        return column, threshold, node_mse

    def _calculate_mse(self, data):
        """Mean squared deviation of ``data['target']`` from its own mean."""
        target_mean = data['target'].mean()
        return ((data['target'] - target_mean) ** 2).mean()

    def show_tree(self):
        """Print the fitted tree, one node per line, in pre-order."""
        print("ROOT: ", end="\t")
        self._show_node(self.root)

    def _show_node(self, node):
        """Print ``node`` followed by its left and right subtrees."""
        if node.status == "leaf":
            print(f"DEPTH: {node.current_depth} \t return: {node.answer}")
        else:
            print(f"DEPTH: {node.current_depth} \t {node.feature_name} <= {node.threshold} : {node.answer}")
        if node.left is not None:
            print("Left:", end="\t")
            self._show_node(node.left)
        if node.right is not None:
            print("Right:", end="\t")
            self._show_node(node.right)

    def predict(self, X):
        """Return a NumPy array with one prediction per row of ``X``."""
        return np.array([self.predict_single(row) for _, row in X.iterrows()])

    def predict_single(self, row):
        """Walk ``row`` down the tree and return the mean stored at the node reached."""
        node = self.root
        # Descend only while the node has both children; otherwise its own
        # mean is the prediction.
        while node.left is not None and node.right is not None:
            if row[node.feature_name] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.answer



In [19]:
# Build a tree with max_depth=3 and fit it on the one-hot encoded features.
tree = RegressorTree(max_depth=3)
tree.fit(X, y)

In [20]:
# Predict for the same rows the tree was fitted on, i.e. in-sample predictions.
y_pred = tree.predict(X)

In [21]:
# Mean absolute percentage error of the predictions. y_pred was computed from
# the same X used for fitting, so this is a training-set score rather than an
# estimate of performance on unseen data.
mean_absolute_percentage_error(y, y_pred)

0.21753500070789691