In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [48]:
# Labels applied to the six CSV columns: five inputs (X0..X4) and the target Y.
col_names = ["X0", "X1", "X2", "X3", "X4", "Y"]

# The first line of the file is skipped (presumably its own header row) and
# the labels above are used as column names instead.
dataset = pd.read_csv(
    "../Downloads/AirfoilSelfNoise_dataset/AirfoilSelfNoise.csv",
    names=col_names,
    header=None,
    skiprows=1,
)
dataset.head(7)

Unnamed: 0,X0,X1,X2,X3,X4,Y
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
5,2500,0.0,0.3048,71.3,0.002663,125.571
6,3150,0.0,0.3048,71.3,0.002663,125.201


In [57]:
class Node():
    """One node of the regression tree.

    A decision node carries ``feature_index``, ``threshold``, ``left``,
    ``right`` and ``var_red``; a leaf node carries only ``value``.
    Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        # Leaf payload (stays None on decision nodes).
        self.value = value

        # Decision-node payload (stays None on leaves).
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.var_red = var_red

In [84]:
class DecisionTreeRegressor():
    """Regression tree grown greedily by maximising variance reduction.

    Parameters
    ----------
    min_samples_split : int
        A node is only considered for splitting when it holds at least this
        many rows.
    max_depth : int
        A node at depth ``d`` (root is depth 0) is considered for splitting
        while ``d <= max_depth``, so the tree can contain up to
        ``max_depth + 1`` levels of decision nodes.
    """

    def __init__(self,min_samples_split=2, max_depth=2):
        self.root=None
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth

    def build_tree(self,df,curr_depth=0):
        """Recursively build the (sub)tree for ``df`` and return its root Node.

        ``df`` is a 2-D array whose last column is the target and whose
        remaining columns are the features.
        """
        X,Y= df[:,:-1],df[:,-1]
        num_samples,num_features=np.shape(X)

        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            best_split=self.get_best_split(df,num_samples,num_features)
            # get_best_split returns {} when no threshold separates the rows
            # (e.g. every feature column is constant); treat that as "no gain"
            # so the node becomes a leaf instead of raising KeyError.
            if best_split.get('var_red',0)>0:
                left_tree=self.build_tree(best_split['df_left'],curr_depth+1)
                right_tree=self.build_tree(best_split['df_right'],curr_depth+1)
                return Node(best_split['feature_index'],best_split['threshold'],
                            left_tree,right_tree,best_split['var_red'])

        leaf_value=self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self,df,num_samples,num_features):
        """Search every (feature, unique value) pair for the best split.

        Returns a dict with keys ``feature_index``, ``threshold``,
        ``df_left``, ``df_right`` and ``var_red`` describing the split with
        the highest variance reduction (first one wins on ties), or an empty
        dict when no candidate produces two non-empty partitions.
        """
        best_split={}
        max_var_red=-float("inf")
        # The parent targets are the same for every candidate split.
        y=df[:,-1]
        for feature_index in range(num_features):
            possible_thresholds=np.unique(df[:,feature_index])
            for threshold in possible_thresholds:
                df_left,df_right=self.split(df,feature_index,threshold)
                if len(df_left)>0 and len(df_right)>0:
                    # calculate variance reduction for this split
                    curr_var_red=self.VarianceReduction(y,df_left[:,-1],df_right[:,-1])
                    if curr_var_red>max_var_red:
                        best_split['feature_index']=feature_index
                        best_split['threshold']=threshold
                        best_split['df_left']=df_left
                        best_split['df_right']=df_right
                        best_split['var_red']=curr_var_red
                        max_var_red=curr_var_red

        return best_split

    def split(self,df,feature_index,threshold):
        """Partition the rows of ``df`` on ``df[:, feature_index] <= threshold``.

        Returns ``(df_left, df_right)``: rows whose feature value is
        ``<= threshold`` and rows whose feature value is ``> threshold``.
        """
        df=np.asarray(df)
        column=df[:,feature_index]
        # Two explicit masks (rather than mask / ~mask) so that rows whose
        # feature is NaN land in neither side, as with row-wise comparison.
        df_left=df[column<=threshold]
        df_right=df[column>threshold]
        return df_left,df_right

    def VarianceReduction(self,parent, l_child, r_child):
        """Return var(parent) minus the size-weighted variance of the children."""
        weight_l=len(l_child)/len(parent)
        weight_r=len(r_child)/len(parent)
        var_red=np.var(parent)-(weight_l*np.var(l_child)+weight_r*np.var(r_child))
        return var_red

    def print_tree(self,tree=None,indent=" "):
        """Print the tree rooted at ``tree`` (defaults to ``self.root``).

        Leaves print their value; decision nodes print the test and variance
        reduction, then their children. The indent string is doubled at each
        level of recursion.
        """
        if tree is None:
            tree=self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X"+str(tree.feature_index), "<=",tree.threshold, "?", "VR:"+str(tree.var_red))
            print("%sleft: "%(indent), end=" ")
            self.print_tree(tree.left, indent+indent)
            print("%sright: "%(indent), end=" ")
            self.print_tree(tree.right, indent+indent)

    def calculate_leaf_value(self,Y):
        """Return the prediction stored in a leaf: the mean of its targets."""
        return np.mean(Y)

    def fit(self,X,Y):
        """Build the tree from features ``X`` and targets ``Y``.

        ``X`` and ``Y`` are column-stacked so that the target becomes the
        last column of the working array.
        """
        df=np.c_[X,Y]
        self.root=self.build_tree(df)

    def predict(self,X):
        """Return a list with one prediction per row of ``X``."""
        predictions=[self.make_prediction(x, self.root) for x in X]
        return predictions

    def make_prediction(self,x,tree):
        """Walk from ``tree`` down to a leaf for sample ``x`` and return its value."""
        # Identity check: a leaf value of 0.0 is still a valid prediction.
        if tree.value is not None:
            return tree.value
        feature_value=x[tree.feature_index]
        if feature_value<=tree.threshold:
            return self.make_prediction(x,tree.left)
        else:
            return self.make_prediction(x,tree.right)

    

<h3>Train-Test split</h3>

In [85]:
# data
# Every column except the last is a feature; the last column is the target,
# reshaped into a single-column 2-D array.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable. train_test_split comes from the notebook's import cell.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=45)

In [86]:
# Grow a tree on the training rows, then print its structure.
regressor = DecisionTreeRegressor(max_depth=3, min_samples_split=3)
regressor.fit(X_train, Y_train)
regressor.print_tree()


X0 <= 3150.0 ? VR:8.630878895189205
 left:  X4 <= 0.0150478 ? VR:4.172028706038219
  left:  X2 <= 0.1016 ? VR:2.5625021268268284
    left:  X0 <= 630.0 ? VR:1.721826397649771
        left:  127.45626153846153
        right:  130.54485714285713
    right:  X0 <= 1600.0 ? VR:5.1350653158285215
        left:  128.01269803921568
        right:  122.9858910891089
  right:  X4 <= 0.0483159 ? VR:5.826555047106439
    left:  X0 <= 1250.0 ? VR:3.902209097480153
        left:  125.46159259259258
        right:  121.09173846153845
    right:  X0 <= 1250.0 ? VR:14.46083182679063
        left:  118.03387500000001
        right:  109.49533333333335
 right:  X4 <= 0.0015988 ? VR:26.671275429341037
  left:  X0 <= 8000.0 ? VR:11.996849775654265
    left:  X4 <= 0.000930789 ? VR:11.311113686938834
        left:  134.03506249999998
        right:  127.19027272727271
    right:  X4 <= 0.00121072 ? VR:9.832823537777779
        left:  126.01444000000001
        right:  118.90677777777779
  right:  X4 <= 0.0

In [87]:
# Score the fitted tree on the held-out rows: root of the mean squared error.
# `mse` (sklearn.metrics.mean_squared_error) comes from the notebook's import cell.
pred_test = regressor.predict(X_test)
mse_reg = mse(Y_test, pred_test)
rmse_reg = np.sqrt(mse_reg)
print(rmse_reg)

4.837506510477529
