### Задача кредитного скоринга
Необходимо построить модель предсказывающую количество дней просрочки

**Основные этапы:**
1. Выгрузить данные
2.  

-----------------------
1. Запрос в ClickHouse:
```sql
select  age,
        income,
        dependents,
        has_property,
        has_car,
        credit_score,
        job_tenure,
        has_education,
        loan_amount,
        date_diff(day,loan_start,loan_deadline) as loan_period,
        if (date_diff(day, loan_deadline, loan_payed) > 0, date_diff(day,loan_deadline,loan_payed), 0) as delay_days
from default.loan_delay_days
```
-----------------------

* Информационный критерий — это метод оценки качества разбиения данных на две или более группы в решающем дереве. <br>
* Для задач регрессии в качестве критерия разбиения обычно используется MSE (Mean Squared Error). Мы считаем MSE до разбиения и после. То разбиение, которое даст наибольшее снижение ошибки – оптимальное.

In [15]:
import numpy as np
import pandas as pd

In [12]:
def mse(y: np.ndarray) -> float:
    """Compute the mean squared error of a vector."""
    return np.square(y - np.mean(y)).mean()

def weighted_mse(y_left: np.ndarray, y_right: np.ndarray) -> float:
    """Compute the weighted mean squared error of two vectors."""
    mse_left = mse(y_left)
    mse_right = mse(y_right)
    n_left = len(y_left)
    n_right = len(y_right)
    return (mse_left*n_left + mse_right*n_right) / (n_left+n_right)

**Сплит по одному признаку** <br>
(чтобы средневзвешенный MSE был минимальным.)

In [39]:
def split(X: np.ndarray, y: np.ndarray, feature: int) -> float:
    """Find the best split for a node (one feature)"""
    feature_list = X[:, feature]
    best_metric = mse(y)
    f_size = y.shape[0]
    for j in range(f_size-1):
        metric = weighted_mse(y[0:j+1], y[j+1:f_size])
        if metric < best_metric:
            best_metric = metric
            threshold = j
    return feature_list[threshold]

**Сплит по всем признакам**

In [73]:
from __future__ import annotations

import numpy as np


def best_split(X: np.ndarray, y: np.ndarray) -> tuple[int, float]:
    """Find the best split for a node (one feature)"""
    best_metric = mse(y)
    f_size = y.shape[0]
    for i in range(X.shape[1]):
        feature_list = X[:, i]
        for j in range(f_size-1):
            metric = weighted_mse(y[0:j+1], y[j+1:f_size])
            if metric < best_metric:
                best_metric = metric
                threshold = j
                best_feature = i
    return best_feature, X[threshold, best_feature]

In [74]:
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18], [19, 20, 21], [22, 23, 24], [25, 26, 27], [28, 29, 30]])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [75]:
best_split(X, y)

(0, 13)

**Реализуем класс Node**<br>
Датакласс: https://docs.python.org/3/library/dataclasses.html 

In [488]:
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class Node:
    """Decision tree node."""
    feature: int = field(default=None)
    threshold: float = field(default=None)
    n_samples: int = field(default=None)
    value:int = field(default=None)
    mse: float = field(default=None)  
    left: Node = None
    right: Node = None

### Построение дерева 
* Используется рекурсивный алгоритм
* Корневая нода делится на две дочерние ноды
* Деление продолжается, пока не будет достигнут критерий остановки<br>

**Критерии остановки:**
- max_depth (максимальная глубина дерева)
- min_samples_split (минимальное число объектов в ноде для дальнейшего деления)

In [489]:
from __future__ import annotations

from dataclasses import dataclass

import numpy as np
import json


@dataclass
class DecisionTreeRegressor:
    """Decision tree regressor."""
    max_depth: int
    min_samples_split: int = 2

    def fit(self, X: np.ndarray, y: np.ndarray) -> DecisionTreeRegressor:
        """Build a decision tree regressor from the training set (X, y)."""
        self.n_features_ = X.shape[1]
        self.tree_ = self._split_node(X, y)
        return self

    def _mse(self, y: np.ndarray) -> float:
        """Compute the mse criterion for a given set of target values."""
        return np.square(y - np.mean(y)).mean()

    def _weighted_mse(self, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Compute the weithed mse criterion for a two given sets of target values"""
        mse_left = self._mse(y_left)
        mse_right = self._mse(y_right)
        n_left = len(y_left)
        n_right = len(y_right)
        return (mse_left*n_left + mse_right*n_right) / (n_left+n_right)

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> tuple[int, float]:
        """Find the best split for a node."""
        best_idx = None
        best_thr = None
        best_metric = None
        f_size = y.shape[0]
        for i in range(X.shape[1]):
            feature_list = X[:, i]
            for feat in np.unique(feature_list):
                c = feature_list<=feat
                left = y[c]
                right = y[~c]
                if left.shape[0] < self.min_samples_split or right.shape[0] < self.min_samples_split:
                    continue
                metric = self._weighted_mse(left, right)
                if best_metric is None or metric < best_metric:
                    best_metric = metric
                    best_thr = feat
                    best_idx = i
        return best_idx, best_thr

    def _split_node(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> Node:
        """Split a node and return the resulting left and right child nodes."""
        if depth > self.max_depth:
            return Node(
                        n_samples = X.shape[0],
                        value = round(y.mean()),
                        mse = self._mse(y)
                    )
        
        feature, threshold = self._best_split(X, y)
        
        if feature is None:
            return Node(n_samples = X.shape[0],
                        value = round(y.mean()),
                        mse = self._mse(y))
        
        c = X[:, feature] <= threshold
            
        return Node(feature,
                    threshold,
                    X.shape[0],
                    round(y.mean()),
                    self._mse(y),
                    self._split_node(X[c], y[c], depth=depth+1),
                    self._split_node(X[~c], y[~c], depth=depth+1))
    
    def as_json(self) -> str:
        """Return the decision tree as a JSON string."""
        return self._as_json(self.tree_)

    def _as_json(self, node: Node) -> str:
        """Return the decision tree as a JSON string. Execute recursively."""
        json_str = ''
        
        if node.left and node.feature:
            json_str += '{'
            json_str += f'"feature": {node.feature}'
            json_str += ','
            json_str += f'"threshold": {node.threshold}'
            json_str += ','
            json_str += f'"n_samples": {node.n_samples}'
            json_str += ','
            json_str += f'"mse": {round(node.mse,2)}' 
        else:
            json_str += '{'
            json_str += f'"value": {node.value}'
            json_str += ','
            json_str += f'"n_samples": {node.n_samples}'
            json_str += ','
            json_str += f'"mse": {round(node.mse,2)}'
            
        
        if node.left:
            json_str += ','
            json_str += f'"left": {self._as_json(node.left)}'
        if node.right:
            json_str += ','
            json_str += f'"right": {self._as_json(node.right)}'
            json_str += '}'
        else:
            json_str += '}'     

        return json_str
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict regression target for X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : array of shape (n_samples,)
            The predicted values.
        """
        y = []
        for features in X:
            y.append(self._predict_one_sample(features))
        return y


    def _predict_one_sample(self, features: np.ndarray) -> int:
        """Predict the target value of a single sample."""
        node = self.tree_
        i = 0
        while (node.left or node.right):
            if (features[node.feature] <= node.threshold):
                node = node.left
            else:
                node = node.right 

        return node.value

In [490]:
df = pd.read_csv('data.csv')

In [491]:
X = df.drop("delay_days", axis=1).to_numpy()
y = df["delay_days"].to_numpy()

In [492]:
model = DecisionTreeRegressor(max_depth=3, min_samples_split=2)
model.fit(X, y)

DecisionTreeRegressor(max_depth=3, min_samples_split=2)

In [493]:
print(model.tree_)

Node(feature=1, threshold=44443, n_samples=1000, value=14, mse=408.523319, left=Node(feature=8, threshold=370968, n_samples=406, value=27, mse=600.1995013225268, left=Node(feature=9, threshold=960, n_samples=320, value=20, mse=267.88374999999996, left=Node(feature=1, threshold=26241, n_samples=181, value=10, mse=77.6077653307286, left=Node(feature=None, threshold=None, n_samples=62, value=17, mse=102.91467221644119, left=None, right=None), right=Node(feature=None, threshold=None, n_samples=119, value=6, mse=22.3991243556246, left=None, right=None)), right=Node(feature=1, threshold=22315, n_samples=139, value=32, mse=245.38719527974743, left=Node(feature=None, threshold=None, n_samples=29, value=50, mse=195.90725326991677, left=None, right=None), right=Node(feature=None, threshold=None, n_samples=110, value=27, mse=152.94380165289255, left=None, right=None))), right=Node(feature=9, threshold=720, n_samples=86, value=55, mse=874.1548134126556, left=Node(feature=1, threshold=19511, n_samp

### Визуализация дерева
```
Сформировать_json (Node):
    Если это лист, формируем json для листа.
        Останавливаемся.
    Иначе
        Формируем json для текущей ноды
        Сформировать_json (Left)
        Сформировать_json (Right)
```        

In [467]:
def as_json(self) -> str:
    """Return the decision tree as a JSON string."""
    return json.dumps(self._as_json(self.tree_))

def _as_json(self, node: Node) -> str:
        """Return the decision tree as a JSON string. Execute recursively."""
        dict_json = {}
        
        if node.threshold and node.feature:
            dict_json['feature'] = node.feature
            dict_json['threshold'] = node.threshold
        else:
            dict_json['value'] = node.value
        
        dict_json['n_samples']: node.n_samples 
        dict_json['mse']: round(node.mse,2)    
            
        
        if node.left:
            dict_json['left'] = self._as_json(node.left)
        if node.right:
            dict_json['right'] = self._as_json(node.right) 


        return dict_json

In [487]:
model.as_json()

'{"feature": 1,"threshold": 44443,"n_samples": 1000,"mse": 408.52,"left": {"feature": 8,"threshold": 370968,"n_samples": 406,"mse": 600.2,"left": {"feature": 9,"threshold": 960,"n_samples": 320,"mse": 267.88,"left": {"feature": 1,"threshold": 26241,"n_samples": 181,"mse": 77.61,"left": {"value": 17,"n_samples": 62,"mse": 102.91},"right": {"value": 6,"n_samples": 119,"mse": 22.4}},"right": {"feature": 1,"threshold": 22315,"n_samples": 139,"mse": 245.39,"left": {"value": 50,"n_samples": 29,"mse": 195.91},"right": {"value": 27,"n_samples": 110,"mse": 152.94}}},"right": {"feature": 9,"threshold": 720,"n_samples": 86,"mse": 874.15,"left": {"feature": 1,"threshold": 19511,"n_samples": 35,"mse": 277.97,"left": {"value": 49,"n_samples": 6,"mse": 107.14},"right": {"value": 26,"n_samples": 29,"mse": 221.86}},"right": {"feature": 8,"threshold": 702002,"n_samples": 51,"mse": 587.65,"left": {"value": 66,"n_samples": 46,"mse": 356.16},"right": {"value": 116,"n_samples": 5,"mse": 469.44}}}},"right": 

### Predict

In [469]:
def predict(self, X: np.ndarray) -> np.ndarray:
    """
    Predict regression target for X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    y : array of shape (n_samples,)
        The predicted values.
    """
    y = np.array()
    for features in X:
        #y.append(_predict_one_sample(features))
        print(_predict_one_sample(features))
    return y


def _predict_one_sample(self, features: np.ndarray) -> int:
    """Predict the target value of a single sample."""
    node = self.tree_
    while (node.left | node.right):
        if features[node.feature] <= node.threshold:
            node = node.left
        else:
            node = node.right
            
    return node.value

In [481]:
model.predict(X)

Node(feature=8, threshold=483916, n_samples=25, value=23, mse=367.54240000000004, left=Node(feature=9, threshold=990, n_samples=22, value=19, mse=207.17355371900823, left=Node(feature=1, threshold=34143, n_samples=12, value=12, mse=49.576388888888886, left=Node(feature=None, threshold=None, n_samples=7, value=15, mse=42.816326530612244, left=None, right=None), right=Node(feature=None, threshold=None, n_samples=5, value=6, mse=9.36, left=None, right=None)), right=Node(feature=1, threshold=19974, n_samples=10, value=28, mse=247.48999999999995, left=Node(feature=None, threshold=None, n_samples=2, value=52, mse=72.25, left=None, right=None), right=Node(feature=None, threshold=None, n_samples=8, value=22, mse=120.1875, left=None, right=None))), right=Node(feature=None, threshold=None, n_samples=3, value=54, mse=491.5555555555556, left=None, right=None)) 0
Node(feature=9, threshold=990, n_samples=22, value=19, mse=207.17355371900823, left=Node(feature=1, threshold=34143, n_samples=12, value=

[22,
 8,
 0,
 0,
 52,
 0,
 6,
 1,
 0,
 6,
 15,
 22,
 22,
 1,
 0,
 11,
 0,
 22,
 15,
 1,
 22,
 1,
 22,
 6,
 0,
 0,
 54,
 15,
 15,
 22,
 1,
 6,
 15,
 15,
 8,
 22,
 0,
 1,
 11,
 0,
 54,
 0,
 52,
 1,
 6,
 0,
 1,
 0,
 54,
 15]