# Regression Tree Implementation

## Target value should be continuous

### Input: dataset D

### Algorithm Steps:

### (1) choose optimal feature and value to split feature space:
#### $\min_{j,s}[\min_{c_1} \sum_{x_i \in R_1(j,s)} {(y_i-c_1)^2} + \min_{c_2} \sum_{x_i \in R_2(j,s)} {(y_i-c_2)^2}]$

### (2) set the optimal $(j,s)$ 

### (3) compute output in each region according to $(j,s)$:
#### $R_1(j,s)=\{x|x^j \le s\},R_2(j,s)=\{x|x^j>s\},\hat{c_m}=\frac{1}{N_m}\sum_{x_i \in R_m(j,s)}{y_i},x\in R_m, m =1,2$

### (4) repeat (1)-(3) on each resulting region until the stop condition is satisfied

### (5) generate regression tree:

#### $f(x)=\sum_{m=1}^{M} \hat{c_m}I(x\in R_m)$
 

In [1]:
import numpy as np
import pandas as pd

In [2]:
class RegTree():

    def __init__(self):
        self.__tree = []

    def _dataProcessing(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        x = data[:, :-1]
        y = data[:, -1]
        return x, y

    def train(self, x, y, features=None):
        if features == None:
            features = np.arange(x.shape[1])
        if len(features) == 0 or len(y) == 0:
            return self.__tree
        featureL = []
        for feature in features:
            values = x[:, feature]
            valSL = []
            for value in values:
                c1Index = np.nonzero(value >= values)
                c2Index = np.nonzero(value < values)
                # get region c1
                c1 = y[c1Index].mean()  # all values <= value
                # get region c2
                c2 = y[c2Index].mean()  # all values >  value
                # compute cost
                cond = np.sum((y[c1Index] - c1) ** 2) + np.sum((y[c2Index] - c2) ** 2)
                valSL.append((value, c1, c2, cond))
            mv = min(valSL, key=lambda v: v[3])
            fvc = (feature, *mv)
            # (feature,value,c1,c2,cond)
            featureL.append(fvc)
        #(feature,value,c1,c2)
        mfv = min(featureL, key=lambda v: v[4])[:-1]
        self.__tree.append(mfv)
        # remove the feature in feature space
        left = np.nonzero(features != mfv[0])
        leftFeatures = features[left]
        # partition space into R1 and R2
        r1Index = np.nonzero(x[:, mfv[0]] <= mfv[1])
        r2Index = np.nonzero(x[:, mfv[0]] > mfv[1])
        r1 = x[r1Index, :]
        y1 = y[r1Index]
        r2 = x[r2Index, :]
        y2 = y[r2Index, :]
        treeR1 = self.train(r1, y1, leftFeatures)
        treeR2 = self.train(r2, y2, leftFeatures)
        # note: treeR2 is the final tree structure
        return treeR1, treeR2

    # tree structure = [(feature,value,c1,c2),(),(),...()]

    def predict(self, x):
        predictL = []
        for row in x:
            matchedL = []
            for node in self.__tree:



