In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("Sample_Files/houses_for_rent_madrid (1).xlsx")

In [3]:
data.dtypes

Id                int64
District         object
Address          object
Number           object
Area             object
Rent              int64
Bedrooms        float64
Sq.Mt             int64
Floor           float64
Outer           float64
Elevator        float64
Penthouse         int64
Cottage           int64
Duplex            int64
Semidetached      int64
dtype: object

In [4]:
# Data preparation: Remove Number, Address, Id, drop rows with missing values,
# convert, get one-hot (dummy) encoding for the categoricals.
# Split to 80%-20% train-test.
from sklearn.model_selection import train_test_split
data_prepared = data.drop(columns=["Number", "Address", "Id"])
data_prepared = data_prepared.dropna(axis=0)
data_prepared["District"] = data["District"].astype("category")
data_prepared["Area"] = data["Area"].astype("category")
data_prepared = pd.get_dummies(data_prepared)
data_train, data_test = train_test_split(data_prepared, train_size = 0.8)

In [5]:
# How many features do we have now?
data_train.shape

(1452, 170)

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

## Your job
Train a regressor for "Rent" using decision trees.  Start with no restriction on the tree growth, and then try to fiddle with the "max_depth" parameter
(to control the depth of the tree) and the "min_samples_split", which prevents splitting nodes that have too little training data in them.  Use a real number, for example, 0.05 means that the algorithm won't split nodes with less than 0.05*n training samples, where n is the size of the training set.

Either using a loop or manually, find the based choice of min_samples_split and max_depth on the test set (which is here used as a validation set only).

In [7]:
# Documentation for DecisionTreeRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
clf = DecisionTreeRegressor()

In [9]:
from sklearn.metrics import mean_squared_error

X_train = data_train.drop(columns=['Rent'])
y_train = data_train['Rent']
X_test = data_test.drop(columns=['Rent'])
y_test = data_test['Rent']

max_depth_values = range(1, 21)
min_samples_split_values = [0.05, 0.1, 0.15, 0.2]

best_model = None
best_mse = float('inf')

for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        regressor = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        if mse < best_mse:
            best_mse = mse
            best_model = regressor

print("Best Parameters:")
print("max_depth:", best_model.max_depth)
print("min_samples_split:", best_model.min_samples_split)
print("MSE on Test Set:", best_mse)

Best Parameters:
max_depth: 17
min_samples_split: 0.05
MSE on Test Set: 457798.4467931264
