In [None]:
# Import necessary libraries
import joblib
import pandas as pd  # add pandas to the libraries
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
import sys

sys.path.append("../../src/data")
# Import the DataTransformer class from the 'updated_wrangle_module.py' file
from updated_wrangle_module import DataTransformer

In [None]:
# ----------------------------------------------------------------------------------------------
# 1. Use the DataTransformer class to prepare the data
# ----------------------------------------------------------------------------------------------

# Instantiate the project's DataTransformer helper (imported from updated_wrangle_module)
DT = DataTransformer()

# Run the 'updated_wrangle' method on the raw Melbourne housing CSV.
# NOTE(review): the cleaning/filtering steps live in updated_wrangle_module and are not
# visible in this notebook; the result is used below as a DataFrame that contains a
# 'Price_USD' target column.
df = DT.updated_wrangle("../../data/raw/Melbourne_housing_FULL.csv")
# Quick sanity check on the number of rows and columns returned
print("df shape:", df.shape)

df shape: (7091, 5)


In [None]:
df.head()

Unnamed: 0,Suburb,BuildingArea,Latitude,Longitude,Price_USD
0,Abbotsford,79.0,-37.8079,144.9934,776250.0
1,Abbotsford,150.0,-37.8093,144.9944,1098750.0
2,Abbotsford,142.0,-37.8072,144.9941,1200000.0
3,Abbotsford,210.0,-37.8024,144.9993,1407000.0
4,Abbotsford,107.0,-37.806,144.9954,1227000.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7091 entries, 0 to 7090
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        7091 non-null   object 
 1   BuildingArea  7091 non-null   float64
 2   Latitude      7086 non-null   float64
 3   Longitude     7086 non-null   float64
 4   Price_USD     7091 non-null   float64
dtypes: float64(4), object(1)
memory usage: 277.1+ KB


In [None]:
# ----------------------------------------------------------------------------------------------
# 2. Split the data into features and target
# ----------------------------------------------------------------------------------------------

# Separate the predictors from the target column 'Price_USD'
X = df.drop(columns="Price_USD")
y = df["Price_USD"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every metric below) reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the resulting shapes to confirm the split sizes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5672, 4), (1419, 4), (5672,), (1419,))

In [None]:
# ----------------------------------------------------------------------------------------------
# 3. Build Model Baseline
# ----------------------------------------------------------------------------------------------

y_mean = y_train.mean()
y_mean

746603.1101022566

In [None]:
# Naive baseline: predict the training-set mean price for every training row
y_pred_baseline = [y_mean] * len(y_train)
# Peek at the first few (identical) baseline predictions
y_pred_baseline[:5]

[746603.1101022566,
 746603.1101022566,
 746603.1101022566,
 746603.1101022566,
 746603.1101022566]

In [None]:
len(y_pred_baseline) == len(y_train)

True

In [None]:
# ----------------------------------------------------------------------------------------------
# Calculate the baseline mean absolute error: how far off we would be, on average,
# if we ignored every feature and always predicted the mean training price
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
print("Mean house price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
# Interpretation (figures from the recorded run):
# if a house ('Type' 'h') is always predicted at the mean price of $746,603.11,
# the predictions would be off by an average of $276,482.90. The model therefore
# needs a mean absolute error below $276,482.90 in order to be useful.

Mean house price 746603.11
Baseline MAE: 276482.9


"\nIf an apartment 'Type' 'h' is always predicted at price $746,603.11\nthe the predictions would be off by an average of $276,482.9. It also means that\nthe model needs to have mean absolute error below $276,482.9 in order to be useful.\n"

In [None]:
# ----------------------------------------------------------------------------------------------
# 4. Iterate on the model
# ----------------------------------------------------------------------------------------------

# Instantiate the model as a three-step pipeline
model = make_pipeline(
    # One-hot encode the categorical feature(s); use_cat_names=True names the
    # generated columns after the category values
    OneHotEncoder(use_cat_names=True),
    # Fill missing values (Latitude/Longitude contain a few NaNs) using the
    # imputer's default strategy
    SimpleImputer(),
    # Linear regression with L2 regularisation at its default strength
    Ridge()
    )

In [None]:
# Fit the model
model.fit(X_train, y_train)

In [None]:
# Check that your model is fitted
check_is_fitted(model)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5672, 4), (1419, 4), (5672,), (1419,))

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5672 entries, 3011 to 860
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        5672 non-null   object 
 1   BuildingArea  5672 non-null   float64
 2   Latitude      5668 non-null   float64
 3   Longitude     5668 non-null   float64
dtypes: float64(3), object(1)
memory usage: 221.6+ KB


In [None]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 5672 entries, 3011 to 860
Series name: Price_USD
Non-Null Count  Dtype  
--------------  -----  
5672 non-null   float64
dtypes: float64(1)
memory usage: 88.6 KB


In [None]:
len(df) * 0.8

5672.8

In [None]:
len(df) * 0.2

1418.2

In [None]:
# ----------------------------------------------------------------------------------------------
# 6. Evaluate the model performance on the training set
# ----------------------------------------------------------------------------------------------

# Make predictions on the training set
y_pred_training = model.predict(X_train)
y_pred_training[:5]

array([860358.99623861, 508968.22627001, 785001.30640336, 587448.84864524,
       621542.87709701])

In [None]:
# Calculate the MAE for predictions in y_pred_training against the actual values in y_train
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

Training MAE: 157843.17


In [None]:
# ----------------------------------------------------------------------------------------------
# 5. Make predictions on the test set and evaluate the model performance on the test set
# ----------------------------------------------------------------------------------------------

# Make predictions on the test set. The Series reuses X_test's row labels so that
# y_pred lines up with y_test in any index-aligned pandas operation
# (e.g. residuals = y_test - y_pred); a default RangeIndex would silently misalign.
y_pred = pd.Series(model.predict(X_test), index=X_test.index)
y_pred[:5]

0    8.057193e+05
1    7.891424e+05
2    1.191324e+06
3    7.326924e+05
4    1.021281e+06
dtype: float64

In [None]:
# Calculate the MAE for predictions in y_pred against the actual values in y_test
mae_testing = mean_absolute_error(y_test, y_pred)
print("Testing MAE:", round(mae_testing, 2))

Testing MAE: 159148.28


In [None]:
# ----------------------------------------------------------------------------------------------
# 7.  Communicate the results
# ----------------------------------------------------------------------------------------------


# Create a function 'price_predictor' that returns the model's prediction for a house price.
def price_predictor(suburb, area, latitude, longitude):
    """
    Predict the price of a house based on its suburb, building area, latitude and longitude.

    Uses the notebook-level fitted pipeline ``model`` to score a single observation.

    Parameters:
    suburb (str): The suburb where the house is located.
    area (float): The surface area of the building in square meters.
    latitude (float): The latitude coordinate of the house.
    longitude (float): The longitude coordinate of the house.

    Returns:
    str: A string of the form "Predicted House Price: $<amount>", where the amount
    is always shown with exactly two decimal places.
    """
    # Build a one-row frame whose column names match the training features.
    # Named 'features' (not 'df') so it does not shadow the notebook-level DataFrame.
    features = pd.DataFrame(
        {
            "Suburb": suburb,
            "BuildingArea": area,
            "Latitude": latitude,
            "Longitude": longitude,
        },
        index=[0],
    )
    prediction = float(model.predict(features)[0])
    # ':.2f' rounds AND pads, so a value such as 855716.7 is rendered as 855716.70
    return f"Predicted House Price: ${prediction:.2f}"

In [None]:
# ----------------------------------------------------------------------------------------------

# Test the 'price_predictor' function for 'Fitzroy' suburb in 'Melbourne'
predicted_price = price_predictor(
    suburb="Fitzroy",
    area=120.5,
    latitude=-37.7981,
    longitude=144.9789,
)

print(predicted_price)

Predicted House Price: $855716.71


In [None]:
# ----------------------------------------------------------------------------------------------

# Interactive widget for house price prediction.
# Every control's range and default value is derived from the training data.
interact(
    price_predictor,
    suburb=Dropdown(options=sorted(X_train["Suburb"].unique()), description="Suburb:"),
    # IntSlider holds whole numbers, so the float statistics are converted explicitly
    # instead of relying on the widget's implicit coercion
    area=IntSlider(
        min=int(X_train["BuildingArea"].min()),
        max=int(X_train["BuildingArea"].max()),
        value=int(X_train["BuildingArea"].mean()),
        description="BuildingArea:",
    ),
    latitude=FloatSlider(
        min=X_train["Latitude"].min(),
        max=X_train["Latitude"].max(),
        step=0.01,
        value=X_train["Latitude"].mean(),
        description="Latitude:",
    ),
    longitude=FloatSlider(
        min=X_train["Longitude"].min(),
        max=X_train["Longitude"].max(),
        step=0.01,
        value=X_train["Longitude"].mean(),
        description="Longitude:",
    ),
);  # trailing ';' stops the notebook from echoing the returned function's repr

interactive(children=(Dropdown(description='Suburb:', options=('Abbotsford', 'Aberfeldie', 'Airport West', 'Al…

<function __main__.price_predictor(suburb, area, latitude, longitude)>

In [None]:
# ----------------------------------------------------------------------------------------------
# 8. Save and export the X_train, X_test features and the model
# ----------------------------------------------------------------------------------------------

# Export X_train to the 'data' folder as a CSV file.
# index=False leaves the DataFrame's row labels out of the written file.
X_train.to_csv("../../data/processed/X_train.csv", index=False)

In [None]:
# Export X_test to the 'data' folder as a CSV file.
# index=False leaves the DataFrame's row labels out of the written file.
X_test.to_csv("../../data/processed/X_test.csv", index=False)

In [None]:
# Save the model to 'models' folder using joblib
joblib.dump(model, "../../models/price_by_sub_area_lat_lon_model.pkl")

['../../models/price_by_sub_area_lat_lon_model.pkl']

In [None]:
# Load the model back from the 'models' folder; this rebinds `model` to the
# deserialised copy of the file written by joblib.dump above.
# NOTE(review): joblib.load unpickles the file, so only load artefacts from trusted sources.
model = joblib.load("../../models/price_by_sub_area_lat_lon_model.pkl")

In [None]:
# Check the model
model