# Regression Tree Implement 

## Target value should be continual

### Input: dataset D

### Algorithm Steps:

### (1) choose optimal feature and value to split feature space:
#### $min_{j,s}[min_{c_1} \sum_{x_i \in R_1(j,s)} {(y_i-c_1)^2} + min_{c_2} \sum_{x_i \in R_1(j,s)} {(y_i-c_2)^2}]$

### (2) set the optimal $(j,s)$ 

### (3) compute output in each region accroding to $(j,s)$:
#### $R_1(j,s)=\{x|x^j<=s\},R_2(j,s)=\{x|x^j>s\},\hat{c_m}=\frac{1}{N}\sum_{x_i \in R_m(j,s)}{y_i},x\in R_m, m =1,2$

### (4) perform (1) and (2) until satisfying stop condition

### (5) generate regression tree:

#### $f(x)=\sum_{m=1}^{M} \hat{c_m}I(x\in R_m)$
 

In [1]:
import numpy as np
import pandas as pd

In [2]:
class RegTree:

    def __init__(self,maxDepth=5):
        self.__maxDepth=maxDepth
        self.__tree = []

    def _dataProcessing(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        x = data[:, :-1]
        y = data[:, -1]
        return x, y

    def train(self, x, y, depth=0,features=None):
        if np.any(features) == None:
            features = np.arange(x.shape[1])
            c0=np.mean(y)
            self.__tree.append((depth,'Root Node',None,None,c0))
        if len(features) == 0 or len(y) == 0 or depth>=self.__maxDepth:
            return self.__tree
        elif len(y)==1:
            c0=y[0]
            self.__tree.append((depth,'single Node',None,None,c0))
            return  self.__tree
        elif len(y)==2:
            c1=y.min()
            c2=y.max()
            self.__tree.append((depth,'R1',None,None,c1))
            self.__tree.append((depth,'R2',None,None,c2))
            return self.__tree
        else:
            # start building tree
            depth+=1
            # only a data point
            featureL = []
            for feature in features:
                values = x[:, feature]
                valSL = []
                for value in values:
                    if value==min(values):
                        c1=value
                        c2=y[values!=value].mean()
                        # cond = np.sum((value - c1) ** 2) + np.sum((y[values!=value] - c2) ** 2)
                        # cond =np.sum((y[values!=value] - c2) ** 2)
                        cond = np.sum((y[values!=value] - c2) ** 2)
                    elif value==max(values):
                        c1=y[values!=value].mean()
                        c2=value
                        cond = np.sum((y[values!=value] - c1) ** 2)
                    else:
                        c1Index = np.flatnonzero(value >= values)
                        c2Index = np.flatnonzero(value < values)
                        # get region c1
                        c1 = y[c1Index].mean()  # all values <= value
                        # get region c2
                        c2 = y[c2Index].mean()  # all values >  value
                        # compute cost
                        cond = np.sum((y[c1Index] - c1) ** 2) + np.sum((y[c2Index] - c2) ** 2)
                    valSL.append((value, c1, c2, cond))
                mv = min(valSL, key=lambda v: v[3])
                fvc = (feature, *mv)
                # (feature,value,c1,c2,cond)
                featureL.append(fvc)
            #(feature,value,c1,c2)
            mfv = min(featureL, key=lambda v: v[4])[:-1]
            #R1=(feature,value,c1)  # all values <= value
            R1=mfv[:-1]
            #R2=(feature,value,c2)  # all values >  value
            R2=np.array(mfv)[[0,1,3]]
            self.__tree.append((depth,'R1',*R1))
            self.__tree.append((depth,'R2',*R2))
            # remove the feature in feature space
            left = np.flatnonzero(features != mfv[0])
            leftFeatures = features[left]
            # partition space into R1 and R2
            r1Index = np.flatnonzero(x[:, mfv[0]] <= mfv[1])
            r2Index = np.flatnonzero(x[:, mfv[0]] > mfv[1])
            r1 = x[r1Index]
            y1 = y[r1Index]
            r2 = x[r2Index]
            y2 = y[r2Index]
            treeR1 = self.train(r1, y1, depth,leftFeatures)
            treeR2 = self.train(r2, y2, depth,leftFeatures)
            # note: treeR2 is the final tree structure
            # tree structure = [ Root Node: (depth,feature,value,c1) R1|(),(),.|  R2|..(depth,feature,value,c2)|]
            return treeR2

In [3]:
# generate history data
np.random.seed(42)
hx = np.random.randint(5, size=(10, 5))
hy = np.random.choice(np.arange(5), size=10)

In [4]:
dt=RegTree()
tree=dt.train(hx,hy)
print(tree)

[(0, 'Root Node', None, None, 2.3), (1, 'R1', 2, 4, 2.3333333333333335), (1, 'R2', 2.0, 4.0, 4.0), (2, 'R1', 3, 0, 0), (2, 'R2', 3.0, 0.0, 2.1666666666666665), (3, 'R1', 4, 1, 1.5), (3, 'R2', 4.0, 1.0, 3.5), (3, 'R1', None, None, 1), (3, 'R2', None, None, 2), (3, 'R1', None, None, 3), (3, 'R2', None, None, 4), (3, 'R1', 0, 3, 1.3333333333333333), (3, 'R2', 0.0, 3.0, 3.0), (4, 'R1', 4, 4, 1.75), (4, 'R2', 4.0, 4.0, 4.0), (5, 'R1', 1, 4, 2.0), (5, 'R2', 1.0, 4.0, 4.0)]
