Predicting Housing Prices in Melbourne: Random Forest

In [2]:
import pandas as pd
import os

# Directory holding the housing dataset, relative to the notebook's working directory.
HOUSING_DATA_PATH = os.path.join("datasets", "housing")

# Load the housing data. The file name is joined with os.path.join as well, so the
# whole path is assembled with the platform's separator instead of a hard-coded "/".
melb_data = pd.read_csv(os.path.join(HOUSING_DATA_PATH, "melb_housing_data.csv"))

In [3]:
# Target: the sale price column.
y = melb_data["Price"]

# Features: every remaining column (the target is removed so it cannot leak into X).
X = melb_data.drop(columns=["Price"])

In [4]:
#split data into training and test data
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and 20% to validation; the fixed random_state
# makes the split identical on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)


In [12]:
# Select numerical columns.
# 'number' matches every numeric dtype (the int64/float64 columns selected before,
# plus any other numeric width), so a numeric feature is not dropped just because
# it was stored with a different precision.
numerical_columns = list(
    X_train_full.select_dtypes(include='number')
    .columns
)

# Select categorical columns: text columns with at most 15 distinct values, so the
# one-hot encoding stays small.
# - The dtype test comes first so nunique() is only computed for text columns.
# - pd.StringDtype is accepted next to 'object' because pandas can store text in a
#   dedicated string dtype (e.g. when string inference is enabled); checking only
#   for 'object' would then silently select no categorical columns at all.
categorical_columns = [
    col for col in X_train_full.columns
    if (
        X_train_full[col].dtype == 'object'
        or isinstance(X_train_full[col].dtype, pd.StringDtype)
    )
    and X_train_full[col].nunique() <= 15
]

# Keep the selected columns only; .copy() gives independent frames rather than
# views of the full splits.
selected_columns = categorical_columns + numerical_columns
X_train = X_train_full[selected_columns].copy()
X_valid = X_valid_full[selected_columns].copy()

In [14]:
#processing data
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical features: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode (handle_unknown='ignore' is set on the encoder).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num_transformer', numerical_transformer, numerical_columns),
    ('categorical_transfomer', categorical_transformer, categorical_columns),
])


In [15]:
#define model
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed random_state makes training repeatable.
model = RandomForestRegressor(random_state=0, n_estimators=100)

In [16]:
from sklearn.metrics import mean_absolute_error
# Chain preprocessing and the regressor so they are fitted and applied as one unit.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split (fit returns the pipeline itself), then preprocess and
# predict the validation split.
predictions = pipeline.fit(X_train, y_train).predict(X_valid)

# Evaluate: mean absolute error between the actual and predicted validation prices.
score = mean_absolute_error(y_valid, predictions)

print('Mean Absolute Error: ', score)

Mean Absolute Error:  160811.53551195737
