<a href="https://colab.research.google.com/github/KTH-Sys/MLSN_Davis-House-Price-Prediction/blob/main/MLSN_XGboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from google.colab import files

# Open Colab's upload dialog; the result is used as a mapping keyed by file name.
uploaded = files.upload()
if not uploaded:
    # An empty mapping (e.g. the dialog was dismissed without choosing a file)
    # used to surface as an opaque "IndexError: list index out of range".
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
filename = next(iter(uploaded))

# Read the CSV by name and show which columns it provides.
data = pd.read_csv(filename)
print("Columns:", data.columns.tolist())
data.head()


Saving davis_housing_clean_2.csv to davis_housing_clean_2.csv
Columns: ['price', 'bed', 'bath', 'acre_lot', 'house_size']


Unnamed: 0,price,bed,bath,acre_lot,house_size
0,399900.0,1.0,1.0,0.04,712.0
1,568000.0,4.0,2.0,0.04,1568.0
2,629000.0,3.0,2.0,0.08,1062.0
3,975000.0,4.0,2.0,0.24,1985.0
4,419000.0,1.0,1.0,0.02,896.0


In [2]:
# Work on a copy so the frame loaded from the CSV (`data`) is never mutated
clean = data.copy()

# Every column the model relies on; each must hold a usable number
required_cols = ['price', 'bed', 'bath', 'acre_lot', 'house_size']

# Coerce to numeric first: entries that cannot be parsed as numbers become NaN,
# so the dropna below removes them together with genuinely missing values.
# (dropna alone would keep non-numeric strings in the data.)
clean[required_cols] = clean[required_cols].apply(pd.to_numeric, errors='coerce')
clean = clean.dropna(subset=required_cols)
print(clean.dtypes)
print(clean.head())

# Features
X = clean[['acre_lot', 'house_size', 'bed', 'bath']]

# Target
y = clean['price']


price         float64
bed           float64
bath          float64
acre_lot      float64
house_size    float64
dtype: object
      price  bed  bath  acre_lot  house_size
0  399900.0  1.0   1.0      0.04       712.0
1  568000.0  4.0   2.0      0.04      1568.0
2  629000.0  3.0   2.0      0.08      1062.0
3  975000.0  4.0   2.0      0.24      1985.0
4  419000.0  1.0   1.0      0.02       896.0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% train); the fixed seed makes the
# split reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [4]:
# If not installed:
# !pip install xgboost

from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

# ---- Target convention ----
# False: the model is fit on raw prices and values are scored as-is.
# True:  y is assumed to already be np.log1p(price); predictions AND y_test are
#        mapped back to dollars with expm1 before scoring. Nothing in this cell
#        applies the log transform itself, so the flag must match how y was built.
TARGET_IS_LOG = False

# ---- Fit the regressor ----
xgb_settings = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_settings)
xgb.fit(X_train, y_train)

# ---- Score on the held-out split ----
y_pred = xgb.predict(X_test)

# Pick the conversion once: expm1 for a log target, pass-through otherwise.
to_dollars = np.expm1 if TARGET_IS_LOG else (lambda values: values)
y_pred_eval = to_dollars(y_pred)
y_test_eval = to_dollars(y_test)

r2 = r2_score(y_test_eval, y_pred_eval)
mae = mean_absolute_error(y_test_eval, y_pred_eval)

print("XGBoost R^2:", r2)
print("XGBoost MAE:", mae)


XGBoost R^2: 0.8037062541831193
XGBoost MAE: 100128.67513020833


In [5]:
import numpy as np
import pandas as pd

# Prerequisites from earlier cells: xgb (fitted), X_train, y_train, X_test, y_test.

# True only when the model was fit on y = np.log1p(price); predictions are then
# converted back to dollars with expm1. False means the model predicts raw price.
TARGET_IS_LOG = False

# Feature names in the exact order used for training,
# e.g. ['acre_lot', 'house_size', 'bed', 'bath'].
FEATURES = [column for column in X_train.columns]

def predict_price_xgb(bed, bath, house_size, acre_lot):
    """Predict the price of a single house with the fitted ``xgb`` model.

    Each argument is converted with ``float`` and placed in a one-row frame
    whose columns are arranged in ``FEATURES`` order before prediction.

    Args:
        bed: Number of bedrooms.
        bath: Number of bathrooms.
        house_size: Value for the ``house_size`` feature.
        acre_lot: Value for the ``acre_lot`` feature.

    Returns:
        The prediction as a plain ``float``; when ``TARGET_IS_LOG`` is true the
        model output is passed through ``np.expm1`` first.
    """
    raw_inputs = {
        'acre_lot': acre_lot,
        'house_size': house_size,
        'bed': bed,
        'bath': bath,
    }
    record = {name: float(value) for name, value in raw_inputs.items()}

    # Selecting by FEATURES puts the columns in the order the model was fit on.
    features_frame = pd.DataFrame([record])[FEATURES]

    estimate = xgb.predict(features_frame)[0]
    # A log-target model predicts log1p(price); undo that to get dollars.
    return float(np.expm1(estimate) if TARGET_IS_LOG else estimate)

# --- Example: one hypothetical listing ---
example_house = {"bed": 3, "bath": 2, "house_size": 1800, "acre_lot": 0.12}
price = predict_price_xgb(**example_house)
print(f"Predicted price: ${price:,.0f}")


Predicted price: $827,784


In [7]:
# If not installed, uncomment:
# !pip install xgboost

import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# ---- Settings ----
# True only when y was built as np.log1p(price); False for raw prices.
TARGET_IS_LOG = False

# Column names in the same order the model is trained on,
# e.g. ['acre_lot', 'house_size', 'bed', 'bath'].
FEATURES = [column for column in X_train.columns]

# ---- Fit a fresh XGBoost model on the training split ----
xgb_settings = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)
xgb = XGBRegressor(**xgb_settings)
xgb.fit(X_train, y_train)

# ---- Report metrics on the test split ----
y_pred_test = xgb.predict(X_test)
# For a log target both sides are mapped back to dollars before scoring.
y_pred_eval = np.expm1(y_pred_test) if TARGET_IS_LOG else y_pred_test
y_test_eval = np.expm1(y_test) if TARGET_IS_LOG else y_test

for label, metric in (
    ("XGBoost R^2 (test):", r2_score),
    ("XGBoost MAE (test):", mean_absolute_error),
):
    print(label, metric(y_test_eval, y_pred_eval))

# ---- Helper: single prediction from user input ----
def predict_price_from_inputs(bed, bath, house_size, acre_lot):
    """Return the model's price prediction for one house as a float.

    The four values are converted with ``float``, assembled into a single-row
    frame ordered by ``FEATURES`` and passed to ``xgb.predict``. When
    ``TARGET_IS_LOG`` is true the prediction is converted with ``np.expm1``.
    """
    raw_inputs = {
        'acre_lot': acre_lot,
        'house_size': house_size,
        'bed': bed,
        'bath': bath,
    }
    record = {name: float(value) for name, value in raw_inputs.items()}
    # Column order must match the order used when the model was fit.
    features_frame = pd.DataFrame([record])[FEATURES]
    estimate = xgb.predict(features_frame)[0]
    # Log-target models predict log1p(price); expm1 brings it back to dollars.
    return float(np.expm1(estimate) if TARGET_IS_LOG else estimate)

# ---- Prompt user for info (loop until blank) ----
def get_float(prompt):
    """Prompt until the user enters a finite number or a blank line.

    Args:
        prompt: Text passed to ``input`` each time the user is asked.

    Returns:
        The entered value as a ``float``, or ``None`` when the user submits an
        empty (or whitespace-only) line to signal that they want to stop.
    """
    import math  # local import so this cell does not depend on another cell's imports

    while True:
        s = input(prompt).strip()
        if s == "":
            return None
        try:
            value = float(s)
        except ValueError:
            value = None
        # float() also parses "nan", "inf" and "infinity"; those are not usable
        # house measurements and would otherwise be handed to the model, so
        # they are treated like any other invalid entry.
        if value is not None and math.isfinite(value):
            return value
        print("Please enter a number (or leave blank to quit).")

# Keep asking for houses until the user leaves any field blank.
print("\nEnter house details (leave any field blank to stop):")
while True:
    # get_float returns None for a blank entry, which ends the session.
    bed = get_float("Bedrooms: ")
    if bed is None: break
    bath = get_float("Bathrooms: ")
    if bath is None: break
    house_size = get_float("House size (sqft): ")
    if house_size is None: break
    acre_lot = get_float("Lot size (acres): ")
    if acre_lot is None: break

    price = predict_price_from_inputs(bed, bath, house_size, acre_lot)
    print(f"Estimated price: ${price:,.0f}\n")
    # No unconditional break here: a trailing break made the loop stop after
    # the first estimate, contradicting the "leave any field blank to stop"
    # prompt. The loop now repeats until a blank entry is given.


XGBoost R^2 (test): 0.8037062541831193
XGBoost MAE (test): 100128.67513020833

Enter house details (leave any field blank to stop):
Bedrooms: 3
Bathrooms: 3
House size (sqft): 3222
Lot size (acres): 3
Estimated price: $2,093,798

