In [13]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Location of the Iowa housing CSV, relative to the notebook
iowa_file_path = './data/iowa_house_data.csv'

home_data = pd.read_csv(iowa_file_path)

# Target variable ('y' by convention): the sale price column
y = home_data['SalePrice']

# Columns used as model inputs
feature_names = [
    'LotArea', 'YearBuilt',
    '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
]

# Feature matrix ('X' by convention): the columns the model uses to predict
# the target
X = home_data[feature_names]

# Handy while exploring: home_data.columns, X.describe(), X.head()

# A fixed random_state makes the fitted model reproducible between runs
iowa_model = DecisionTreeRegressor(random_state=1)

# fit() ingests the data and learns the relationship between X and y
iowa_model.fit(X, y)

predictions = iowa_model.predict(X)

# Absolute gap between the average prediction and the average sale price.
# NOTE(review): this compares only the two averages on the very rows the
# model was fitted to, so a value of 0.0 says nothing about per-house error.
mean_prediction = sum(predictions) / len(predictions)
mean_sale_price = sum(y) / len(y)
print(abs(mean_prediction - mean_sale_price))


0.0


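So the above trains on the whole dataset, which leaves us nothing to evaluate the model on: any score computed on the same rows the model was fitted to is overly optimistic. This is effectively "training on the test set" and is a big no-no. So let's split the data into a training set and a validation set.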

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold out part of the data for validation; random_state=1 keeps the split
# reproducible
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Re-fit the existing tree using only the training portion
iowa_model.fit(X_train, y_train)

# Predict prices for the held-out rows
val_predictions = iowa_model.predict(X_valid)

# One line per validation row: predicted price next to the actual price
for predicted, actual in zip(val_predictions, y_valid):
    print(f"Predicted: {predicted}, Actual: {actual}")

# Summarise the per-row gaps as a single number
mae = mean_absolute_error(y_valid, val_predictions)
print(f"Mean Absolute Error: {mae}")

Predicted: 186500.0, Actual: 231500
Predicted: 184000.0, Actual: 179500
Predicted: 130000.0, Actual: 122000
Predicted: 92000.0, Actual: 84500
Predicted: 164500.0, Actual: 142000
Predicted: 220000.0, Actual: 325624
Predicted: 335000.0, Actual: 285000
Predicted: 144152.0, Actual: 151000
Predicted: 215000.0, Actual: 195000
Predicted: 262000.0, Actual: 275000
Predicted: 180000.0, Actual: 175000
Predicted: 121000.0, Actual: 61000
Predicted: 175900.0, Actual: 174000
Predicted: 210000.0, Actual: 385000
Predicted: 248900.0, Actual: 230000
Predicted: 131000.0, Actual: 87000
Predicted: 100000.0, Actual: 125000
Predicted: 149350.0, Actual: 98600
Predicted: 235000.0, Actual: 260000
Predicted: 156000.0, Actual: 143000
Predicted: 149900.0, Actual: 124000
Predicted: 265979.0, Actual: 122500
Predicted: 193500.0, Actual: 236500
Predicted: 377500.0, Actual: 337500
Predicted: 100000.0, Actual: 76000
Predicted: 162900.0, Actual: 187000
Predicted: 145000.0, Actual: 128000
Predicted: 180000.0, Actual: 17900

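Now that we have a way to evaluate the model, let's try to improve it. We can control the size of the decision tree, here through `max_leaf_nodes` (the maximum number of leaves, which also limits how deep the tree can grow). Too large a tree and we overfit, too small and we underfit. Let's try a few sizes: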

In [15]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for one tree size.

    Fits a DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (with
    random_state=0) on ``train_X``/``train_y``, predicts ``val_X`` and scores
    those predictions against ``val_y`` with mean_absolute_error.
    """
    # fit() returns the fitted estimator, so construction and fitting chain
    tree = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=0,
    ).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))


# Tree sizes (maximum number of leaves) to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation MAE keyed by the max_leaf_nodes value that produced it
depth_and_mae = {
    leaf_count: get_mae(leaf_count, X_train, X_valid, y_train, y_valid)
    for leaf_count in candidate_max_leaf_nodes
}

# Key with the lowest MAE; on a tie min() keeps the first one it encounters
best_tree_size = min(depth_and_mae, key=depth_and_mae.get)

print(best_tree_size)

100


Now that we have the best tree size, let's fit a model on the whole dataset and make some predictions.

In [16]:
# Train on every row with the leaf limit stored in best_tree_size;
# fit() returns the fitted estimator, so it can be assigned directly
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=1,
).fit(X, y)

# Predict prices for the same rows the model was trained on
final_predictions = final_model.predict(X)
print(final_predictions)


[209133.65384615 146415.0075188  209133.65384615 ... 238763.63636364
 130629.         146415.0075188 ]
