### import libraries, load house dataset

In [1]:
import pandas as pd

In [18]:
houses_path = './datasets/melb_data.csv'
houses_data = pd.read_csv(houses_path)
houses_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [19]:
houses_data.shape

(13580, 21)

### Data exploration

In [3]:
houses_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

#### Find columns with most null values, drop them, then drop remaining rows with nulls

In [4]:
worst_columns = houses_data.isna().sum().sort_values(ascending=False)[:3]
worst_columns.index

Index(['BuildingArea', 'YearBuilt', 'CouncilArea'], dtype='object')

In [5]:
houses_best_columns = houses_data.drop(labels=worst_columns.index, axis=1)

In [6]:
houses_best_columns.isna().sum()
houses_clean = houses_best_columns.dropna()
houses_clean.shape

(13518, 18)

#### Also drop non-numeric columns

In [7]:
houses_dtypes = houses_clean.dtypes
houses_numeric = houses_clean.select_dtypes(exclude=['object'])

In [8]:
houses_numeric.dtypes

Rooms              int64
Price            float64
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
Lattitude        float64
Longtitude       float64
Propertycount    float64
dtype: object

#### Select X and y, then split into train-test

In [9]:
y = houses_clean.Price

In [10]:
X = houses_numeric
X.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
count,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0
mean,2.939784,1074796.0,10.157827,3105.227401,2.916408,1.53536,1.610075,558.110593,-37.809191,144.995306,7455.482986
std,0.956438,639858.6,5.861593,90.724572,0.966692,0.69231,0.962634,3998.19456,0.079366,0.104003,4381.437721
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.2,3044.0,2.0,1.0,1.0,178.0,-37.857,144.929425,4380.0
50%,3.0,901000.0,9.2,3084.0,3.0,1.0,2.0,442.5,-37.8023,145.0003,6567.0
75%,3.0,1328000.0,13.0,3148.0,3.0,2.0,2.0,651.0,-37.756203,145.0586,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,-37.40853,145.52635,21650.0


In [11]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state = 0)

#### Train with trees

In [12]:
from sklearn.tree import DecisionTreeRegressor

houses_model = DecisionTreeRegressor(random_state=1)

houses_model.fit(train_X,train_y)

In [13]:
# print(X.head())
# print(houses_model.predict(X.head()))

In [14]:
from sklearn.metrics import mean_absolute_error as mae

predicted_prices = houses_model.predict(test_X)
mae(test_y, predicted_prices)

798.2056213017752

### Tree parameter tuning

In [16]:
def get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y):
    model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds = model.predict(test_X)
    mae_val = mae(test_y, preds)
    return mae_val 

In [17]:
for max_leaf_nodes in [5, 50, 500, 5000]:
    mae_val = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print(f'max leaft nodes: {max_leaf_nodes}, mae: {mae_val}')

max leaft nodes: 5, mae: 166639.13605130874
max leaft nodes: 50, mae: 16964.547527943258
max leaft nodes: 500, mae: 1070.3369694282303
max leaft nodes: 5000, mae: 513.6896449704142
