## 06 PyCaret Model

**Project:** UK Housing Price Paid Records

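**Purpose:** Use PyCaret to compare candidate regression models (LightGBM, XGBoost, Random Forest, Extra Trees, Decision Tree) for predicting sale price, using a 5% sample of the model-ready Price Paid dataset.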

**Team Member(s):** Tymo Verhaegen

**Date Last Run:** 07/11/2025

---

In [1]:
# Install PyCaret into the kernel's environment; it provides the
# pycaret.regression API (setup, compare_models) used in the cells below.
# NOTE(review): neither package is version-pinned, so a fresh run may resolve
# different releases than the ones that produced the saved outputs -- consider pinning.
%pip install pycaret
# Install XGBoost so that the 'xgboost' estimator can be requested in the
# model comparison further down.
%pip install xgboost





[notice] A new release of pip is available: 23.3.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/72.0 MB 1.3 MB/s eta 0:00:55
   ---------------------------------------- 0.5/72.0 MB 4.7 MB/s eta 0:00:16
    --------------------------------------- 1.0/72.0 MB 7.2 MB/s eta 0:00:10
    --------------------------------------- 1.5/72.0 MB 7.6 MB/s eta 0:00:10
   - -------------------------------------- 2.1/72.0 MB 8.8 MB/s eta 0:00:08
   - -------------------------------------- 2.5/72.0 MB 9.5 MB/s eta 0:00:08
   - -------------------------------------- 3.0/72.0 MB 9.6 MB/s eta 0:00:08
   - -------------------------------------- 3.4/72.0 MB 9.4 MB/s eta 0:00:08
   - ----------------------


[notice] A new release of pip is available: 23.3.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from pycaret.regression import *
import pandas as pd

# Load the model-ready Price Paid table (parquet) from the processed-data
# folder, addressed relative to this notebook's location.
parquet_path = './../../data/housing/processed/price_paid_model_ready.parquet'
data = pd.read_parquet(parquet_path)

# Report the (rows, columns) shape as a quick sanity check on the load.
print("Data loaded succesfully. Shape: ", data.shape)

Data loaded succesfully. Shape:  (22489256, 10)


In [3]:
# ------------------------------------------------------------------
# Feature engineering: month of sale (seasonality signal)
# ------------------------------------------------------------------
# 'sale_date' is a datetime column; expose its calendar month as a separate
# numeric feature. The membership check keeps the cell safe to re-run:
# the column is only derived when it is not already present.
month_already_present = 'sale_month' in data.columns
if not month_already_present:
    data['sale_month'] = data['sale_date'].dt.month

# Sanity checks: column dtypes and the number of missing target values.
print(data.dtypes)
missing_target_count = data['price'].isna().sum()
print("NaNs in target:", missing_target_count)

price                        int32
sale_date           datetime64[ns]
property_type             category
old/new                   category
duration                  category
town/city                 category
district                  category
county                    category
ppdcategory_type          category
sale_year                    int16
sale_month                   int32
dtype: object
NaNs in target: 0


In [4]:
# ------------------------------------------------------------------
# Normalise column names
# ------------------------------------------------------------------
# Replace the '/' in 'town/city' and 'old/new' with '_' (slashes in feature
# names can cause errors in some ML tools) and shorten 'ppdcategory_type'.
# DataFrame.rename ignores mapping keys that are not present by default, so
# re-running this cell after the columns were already renamed is a no-op.
data = data.rename(columns={
    'town/city': 'town_city',
    'old/new': 'old_new',
    'ppdcategory_type': 'ppd_category',
})

# 'sale_month' is derived once, in the seasonality cell that runs before this
# one; the verbatim copy of that guard which used to sit here could never
# fire when the notebook is executed top to bottom, so it has been removed.

In [5]:
# ------------------------------------------------------------------
# Down-sample for model comparison
# ------------------------------------------------------------------
# Comparing several models on the full table would take hours or exhaust
# memory, so the comparison runs on a reproducible 5% random sample.
sample_fraction = 0.05
sample_seed = 42  # fixed seed so the same rows are drawn on every run

data_subset = data.sample(frac=sample_fraction, random_state=sample_seed)
print(f"Using subset for model comparison. Shape: {data_subset.shape}")

Using subset for model comparison. Shape: (1124463, 11)


In [6]:
# ------------------------------------------------------------------
# PyCaret experiment setup
# ------------------------------------------------------------------
# Every non-numeric predictor is declared as categorical explicitly, both
# the low-cardinality ones (e.g. old_new) and the high-cardinality ones
# (e.g. town_city).
categorical_columns = [
    'property_type', 'old_new', 'duration', 'ppd_category',
    'town_city', 'district', 'county',
]

s = setup(
    data=data_subset,
    target='price',
    session_id=42,   # seed for the experiment
    html=False,      # turn off PyCaret's HTML display

    # The raw datetime is excluded; sale_year / sale_month carry the date signal.
    ignore_features=['sale_date'],

    categorical_features=categorical_columns,

    # Categorical columns with more than 20 distinct values are not one-hot
    # encoded; PyCaret falls back to its default encoder for them instead.
    # NOTE(review): that default is a target-style encoder in PyCaret 3 --
    # confirm against the installed version.
    max_encoding_ohe=20,

    # Cleaning: drop the 5% most outlying rows and scale the features.
    # The setup summary shows only the training split shrinking.
    remove_outliers=True,
    outliers_threshold=0.05,
    normalize=True,
)

                    Description             Value
0                    Session id                42
1                        Target             price
2                   Target type        Regression
3           Original data shape     (1124463, 11)
4        Transformed data shape     (1085106, 14)
5   Transformed train set shape      (747767, 14)
6    Transformed test set shape      (337339, 14)
7               Ignore features                 1
8              Numeric features                 2
9          Categorical features                 7
10     Rows with missing values              0.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                20
16              Encoding method              None
17              Remove outliers              True
18           Outliers threshold              0.05


In [7]:
# ------------------------------------------------------------------
# Model comparison on the sampled data
# ------------------------------------------------------------------
# Tree-based candidates only. The forest-type models ('rf', 'et') are the
# slowest of the set and are affordable here because of the 5% sample.
candidate_models = ['lightgbm', 'xgboost', 'rf', 'et', 'dt']

# 3-fold cross-validation keeps the run short; the results are sorted by RMSE
# so the winner is the model with the lowest price error rather than the
# highest R2.
best_model = compare_models(
    include=candidate_models,
    sort='RMSE',
    fold=3,
)

print(best_model)

                                                            

                                    Model         MAE           MSE  \
lightgbm  Light Gradient Boosting Machine  58924.7763  1.524787e+11   
xgboost         Extreme Gradient Boosting  58853.0120  1.553980e+11   
rf                Random Forest Regressor  64407.2007  1.559610e+11   
et                  Extra Trees Regressor  67357.5803  1.595467e+11   
dt                Decision Tree Regressor  76348.2913  1.707070e+11   

                 RMSE      R2   RMSLE    MAPE  TT (Sec)  
lightgbm  390360.8941  0.1599  0.4231  0.4267   22.5067  
xgboost   394110.7978  0.1436  0.4182  0.4135   19.5367  
rf        394808.9712  0.1406  0.4489  0.4486  161.2867  
et        399337.4061  0.1207  0.4726  0.4612  173.1633  
dt        413133.1612  0.0585  0.5275  0.5110   10.8367  
LGBMRegressor(n_jobs=-1, random_state=42)


