In [None]:
# SET UP LIBRARIES

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error


# LOAD DATA

df_raw = pd.read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv")

# Keep the street addresses aside: "Address" is removed from the model
# features below, but it is re-attached to the output table later on.
address_series = df_raw["Address"].copy() if "Address" in df_raw.columns else None


# DROP COLUMNS (MODEL-SIDE)

# Columns excluded from the feature matrix. Names that are absent from the CSV
# are skipped silently, so a misspelled entry never raises -- the column simply
# stays in the data and becomes a feature. The public Melbourne housing files
# spell the coordinate columns "Lattitude" / "Longtitude" (sic), so both
# spellings of longitude are listed.
# NOTE(review): confirm against the header of the CSV actually in use.
cols_to_drop = [
    "Address",
    "Method", "SellerG", "Date", "Postcode",
    "Lattitude", "Longitude", "Longtitude", "Regionname", "Propertycount"
]
df = df_raw.drop(columns=[c for c in cols_to_drop if c in df_raw.columns])


# CLEAN DATA

# Discard every row that has a missing value in any remaining column.
df = df.dropna()

# Re-align the saved addresses with the rows that survived dropna().
if address_series is not None:
    address_series = address_series.loc[df.index]


# ONE-HOT ENCODE

# Expand whichever of the categorical columns are present into indicator
# columns; every level keeps its own column (drop_first=False).
cat_cols = [name for name in ("Suburb", "Type", "CouncilArea") if name in df.columns]
df = pd.get_dummies(df, columns=cat_cols, drop_first=False)


# FEATURES / TARGET

# Stop early with a readable message when the target column is missing.
if "Price" not in df.columns:
    raise ValueError("Column 'Price' not found. Check your CSV columns.")

# Target is the sale price; every other remaining column is a feature.
y = df["Price"]
X = df.drop(columns=["Price"])


# TRAIN / TEST SPLIT

# Shuffled split with 30 % of the rows held out; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)


# ALGORITHM

# Gradient-boosted regression trees trained with the Huber loss. The seed is
# fixed so repeated runs build the same ensemble. fit() returns the estimator
# itself, so `model` is the fitted regressor.
model = ensemble.GradientBoostingRegressor(
    loss="huber",
    n_estimators=300,
    learning_rate=0.05,
    max_features=0.6,
    min_samples_leaf=6,
    min_samples_split=4,
    random_state=42,
).fit(X_train, y_train)


# EVALUATE

# Mean absolute error on the held-out test split, printed in whole dollars.
test_predictions = model.predict(X_test)
mae_test = mean_absolute_error(y_test, test_predictions)
print(f"Test MAE: ${mae_test:,.0f}")


# PREDICT ALL ROWS + OUTPUT TABLE

# Score every cleaned row, not only the test split. Rows that were used for
# fitting are in-sample, so their estimates are closer to the actual price than
# the test MAE suggests; the "Split" column added below marks which rows are
# which so the saved table is not mistaken for held-out performance.
ml_pred = model.predict(X)

final_output = pd.DataFrame(index=df.index)
if address_series is not None:
    final_output["Address"] = address_series

# `y` is aligned on the row index; `ml_pred` is positional and has one value
# per row of X, which shares df's index.
final_output["Price"] = y
final_output["ML_Price_Estimate"] = ml_pred

# Human-readable dollar strings next to the raw numeric columns.
final_output["Price_$"] = final_output["Price"].map(lambda v: f"${v:,.0f}")
final_output["ML_Price_Estimate_$"] = final_output["ML_Price_Estimate"].map(lambda v: f"${v:,.0f}")

# "test" = row was held out by train_test_split, "train" = row was fitted on.
final_output["Split"] = [
    "test" if held_out else "train"
    for held_out in final_output.index.isin(X_test.index)
]

# The printed sample keeps its original columns; "Split" is written to the CSV.
cols_to_show = []
if "Address" in final_output.columns:
    cols_to_show.append("Address")
cols_to_show += ["Price_$", "ML_Price_Estimate_$"]

print("\nSample Predictions:")
print(final_output[cols_to_show].head(10))

final_output.to_csv("melbourne_property_price_comparison.csv", index=False)
print("\nSaved: melbourne_property_price_comparison.csv")


Test MAE: $196,976

Sample Predictions:
               Address     Price_$ ML_Price_Estimate_$
0        49 Lithgow St  $1,490,000          $1,407,173
1        59A Turner St  $1,220,000          $1,407,173
2        119B Yarra St  $1,420,000          $1,407,173
3           68 Vida St  $1,515,000          $1,173,234
4     92 Clydesdale Rd    $670,000            $849,078
5         4/32 Earl St    $530,000            $681,267
6       3/74 Hawker St    $540,000            $522,073
7    1/26 Highridge Cr    $715,000            $923,508
9          18 Mills St  $1,925,000          $1,767,220
10  3/15 Drummartin St    $515,000            $556,697

Saved: melbourne_property_price_comparison.csv


In [2]:
# ===== CELL 2: Laptop-safe accuracy upgrade (log target + fast model + early stopping) =====

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Assumes Cell 1 already created:
# X_train, X_test, y_train, y_test, X, y, df, and (optionally) address_series

# 1) Fit on log1p(price) instead of the raw price to compress the price scale
y_train_log = np.log1p(y_train)

# 2) Histogram-based gradient boosting with early stopping: max_iter is only a
#    ceiling, and part of the training data (validation_fraction) is set aside
#    to decide when further iterations have stopped helping.
model_safe = HistGradientBoostingRegressor(
    loss="absolute_error",        # optimise absolute error (on the log target)
    learning_rate=0.08,
    max_iter=150,                 # upper bound on boosting iterations
    max_depth=6,                  # lower this for a lighter, faster fit
    early_stopping=True,
    n_iter_no_change=15,
    validation_fraction=0.15,
    random_state=42,
).fit(X_train, y_train_log)

# 3) Evaluate in dollars: expm1 inverts the log1p applied to the target.
#    Note that this rebinds `mae_test`, which the first cell also assigns.
pred_test = model_safe.predict(X_test)
pred_test = np.expm1(pred_test)
mae_test = mean_absolute_error(y_test, pred_test)
print(f"Improved (Laptop-safe) Test MAE: ${mae_test:,.0f}")

# 4) Predict all rows and build final output (same layout as the first cell).
#    Rows that were used for fitting are in-sample, so their estimates are
#    closer to the actual price than the test MAE suggests; the "Split" column
#    added below marks which rows are which.
pred_all = np.expm1(model_safe.predict(X))

final_output2 = pd.DataFrame(index=df.index)

# address_series may be missing (first cell not run) or None (no Address column).
if "address_series" in globals() and address_series is not None:
    final_output2["Address"] = address_series

# `y` is aligned on the row index; `pred_all` is positional and has one value
# per row of X, which shares df's index.
final_output2["Price"] = y
final_output2["ML_Price_Estimate"] = pred_all

# Human-readable dollar strings next to the raw numeric columns.
final_output2["Price_$"] = final_output2["Price"].map(lambda v: f"${v:,.0f}")
final_output2["ML_Price_Estimate_$"] = final_output2["ML_Price_Estimate"].map(lambda v: f"${v:,.0f}")

# "test" = row was held out by train_test_split, "train" = row was fitted on.
final_output2["Split"] = [
    "test" if held_out else "train"
    for held_out in final_output2.index.isin(X_test.index)
]

# The printed sample keeps its original columns; "Split" is written to the CSV.
cols_to_show = []
if "Address" in final_output2.columns:
    cols_to_show.append("Address")
cols_to_show += ["Price_$", "ML_Price_Estimate_$"]

print("\nSample Improved Predictions:")
print(final_output2[cols_to_show].head(10))

final_output2.to_csv("melbourne_property_price_comparison_laptop_safe.csv", index=False)
print("\nSaved: melbourne_property_price_comparison_laptop_safe.csv")


Improved (Laptop-safe) Test MAE: $186,402

Sample Improved Predictions:
               Address     Price_$ ML_Price_Estimate_$
0        49 Lithgow St  $1,490,000          $1,438,514
1        59A Turner St  $1,220,000          $1,438,514
2        119B Yarra St  $1,420,000          $1,438,514
3           68 Vida St  $1,515,000          $1,127,122
4     92 Clydesdale Rd    $670,000            $762,217
5         4/32 Earl St    $530,000            $605,504
6       3/74 Hawker St    $540,000            $528,780
7    1/26 Highridge Cr    $715,000            $863,573
9          18 Mills St  $1,925,000          $1,898,749
10  3/15 Drummartin St    $515,000            $557,605

Saved: melbourne_property_price_comparison_laptop_safe.csv
