In [23]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

cali = fetch_california_housing()
data, labels, colNames = cali.data, cali.target, cali.feature_names
data = data[:1000]
labels = labels[:1000]
data = pd.DataFrame(data, columns=colNames)
labels = pd.DataFrame(labels)

xtrain, xtest, ytrain, ytest = train_test_split(data, labels, test_size=0.2, random_state=42)

In [29]:
#finds the best split in the current set of data
def bestSplit(data: pd.DataFrame, labels: pd.Series) -> (str, float):
    colName, splitVal, minMSE = '', 0, float('inf')
    
    for col in data.columns:
        uniqueVals = np.sort(data[col].unique())
        splits = (uniqueVals[1:] + uniqueVals[:-1]) / 2
        
        for split in splits:
            #split into left and right group
            mask = data[col] <= split
            left = labels[mask]
            right = labels[~mask]

            #find mean for right and left side
            leftMean = float(left.mean().iloc[0])
            rightMean = float(right.mean().iloc[0])

            #sse = summation of (y - group mean)^2 for each group
            sse = ((left - leftMean) ** 2).sum() + ((right - rightMean) ** 2).sum()
            # mse = sse / (n-k)
            mse = sse / (len(labels) - 2)
            mse = float(mse.iloc[0])
            
            if mse < minMSE:
                colName = col
                splitVal = split
                minMSE = mse
            
    return (colName, splitVal)

# recursively build decision tree
def buildTree(data: pd.DataFrame, labels: pd.Series, min_split: int):
    if len(labels) < min_split:
        return {'isLeaf': True,
                'pred': labels.mean()} #return mean of the labels in the leaf

    (col, split) = bestSplit(data, labels)
    leftData = data[data[col] <= split]
    leftLabels = labels[data[col] <= split]
    
    rightData = data[data[col] > split]
    rightLabels = labels[data[col] > split]

    left = buildTree(leftData, leftLabels, min_split)
    right = buildTree(rightData, rightLabels, min_split)

    return {'column': col,
            'split': split,
            'left': left,
            'right': right, 
            'isLeaf': False}

tree = buildTree(xtrain, ytrain, 50)

In [35]:
def predict(tree, data: pd.DataFrame):
    if tree['isLeaf']:
        return pd.Series([tree['pred']] * len(data), index=data.index) 
    
    col = tree['column']
    split = tree['split']
    
    left_data = data[data[col] <= split]
    right_data = data[data[col] > split]
    
    pred = pd.Series(index=data.index)

    pred[data[col] <= split] = predict(tree['left'], left_data)
    pred[data[col] > split] = predict(tree['right'], right_data)

    return pred


pred = np.array(predict(tree, xtest))
mse = mean_squared_error(pred, ytest)
print("mse test:", mse)

pred = np.array(predict(tree, xtrain))
mse = mean_squared_error(pred, ytrain)
print("mse train:", mse)

  array = numpy.asarray(array, order=order, dtype=dtype)


mse test: 0.20979883975793684
mse train: 0.16709217642822682


  array = numpy.asarray(array, order=order, dtype=dtype)


In [36]:
#tests results with scikit-learn library
libraryTree = DecisionTreeRegressor(criterion="squared_error",min_samples_split=70)
libraryTree.fit(xtrain, ytrain)

pred = libraryTree.predict(xtest)
mse = mean_squared_error(pred, ytest)
print("mse test:", mse)
pred = libraryTree.predict(xtrain)
mse = mean_squared_error(pred, ytrain)
print("mse train:", mse)

mse test: 0.2053516029023301
mse train: 0.19983847626648782
