<h1>Data preprocessing</h1>

In [1]:
import pandas as pd 
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, QuantileTransformer
# Seed NumPy's legacy global RNG so anything that draws from it is repeatable.
np.random.seed(42)
# Never truncate columns when a wide frame is displayed.
pd.options.display.max_columns = None
# Raw dataset; every later step in this cell derives from this frame.
df = pd.read_csv('data/dataset v1.csv')

#Convert remaining_lease into numerical remaining_lease_years
def convert_lease_to_years(lease_str):
    """Convert a remaining-lease description into fractional years.

    The string is searched for an integer followed by 'year' and an integer
    followed by 'month' (e.g. '61 years 04 months' -> 61 + 4/12).

    Parameters
    ----------
    lease_str : object
        Lease description. Anything that is not a ``str`` (including NaN)
        is treated as missing.

    Returns
    -------
    float
        Years plus months / 12, or ``np.nan`` when the input is not a string
        or contains neither a year nor a month component.
    """
    if not isinstance(lease_str, str):
        return np.nan

    # Use regex to find numbers associated with 'year' and 'month'
    year_match = re.search(r'(\d+)\s*year', lease_str)
    month_match = re.search(r'(\d+)\s*month', lease_str)

    # "Missing" means neither component was found. Checking the matches
    # (rather than the parsed values) keeps a genuine '0 years 0 months'
    # from being reported as NaN.
    if year_match is None and month_match is None:
        return np.nan

    years = int(year_match.group(1)) if year_match else 0
    months = int(month_match.group(1)) if month_match else 0
    return years + months / 12.0

# Parse every remaining_lease entry into fractional years (NaN where it cannot be parsed).
df['remaining_lease_years'] = df['remaining_lease'].map(convert_lease_to_years)

# Ordinal encoding for storey_range: each three-storey band maps to its middle
# floor (e.g. '01 TO 03' -> 2). Bands start at floors 1, 4, ..., 49, which covers
# '01 TO 03' through '49 TO 51'.
floor_map = {
    f'{lowest:02d} TO {lowest + 2:02d}': lowest + 1
    for lowest in range(1, 50, 3)
}

# Storey bands that are not keys of floor_map become NaN.
df['storey_ordinal'] = df['storey_range'].map(floor_map)

# Create train/test splits
# First discard the columns that are not used as model inputs (the two source
# columns already re-encoded above, plus identifier/location text columns).
df = df.drop(columns=[
    'Unnamed: 0', 'storey_range', 'street_name', 'remaining_lease', 'latitude', 'longitude',
    'nearest_bus_stop', 'nearest_pei', 'nearest_jc', 'nearest_kindergarten',
    'nearest_primary_school', 'nearest_secondary_school', 'nearest_poly',
    'nearest_library', 'nearest_hospital', 'nearest_mall', 'nearest_mrt_station',
    'nearest_sports_facility', 'nearest_hawker_centre',
])

# Target is the resale price; everything else that survived the drop is a feature.
y = df['resale_price']
X = df.drop(columns=['resale_price'])

# 75% train / 25% test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


# Sampling using sample weights
# Each training row is labelled with its town/flat_type/flat_model combination.
# The label lives in its own Series, so X_train keeps exactly its original columns.
combined_feature = (
    X_train['town'].astype(str)
    + '_'
    + X_train['flat_type'].astype(str)
    + '_'
    + X_train['flat_model'].astype(str)
)

# 'balanced' weighting: rows from rare combinations get larger weights.
sample_weights = compute_sample_weight(class_weight='balanced', y=combined_feature)

# Same weights keyed as '<step>__sample_weight'. The modelling cells visible in
# this notebook pass `sample_weights` directly and do not read this dict.
fit_params = {'regressor__sample_weight': sample_weights}

# One-hot encode the categorical columns and standardise the numerical ones.
# The transformer is fitted on the training data only and then applied to the test data.
# Each list keeps only the names that are actually present in X_train.
categorical_columns = [
    name
    for name in ('town', 'flat_type', 'flat_model')
    if name in X_train.columns
]
numerical_columns = [
    name
    for name in (
        'floor_area_sqm', 'lease_commence_date',
        'dist_bus_stop_m', 'dist_pei_m', 'dist_jc_m', 'dist_kindergarten_m',
        'dist_primary_school_m', 'dist_secondary_school_m', 'dist_poly_m',
        'dist_library_m', 'dist_mall_m', 'dist_hospital_m', 'dist_mrt_station_m',
        'dist_sports_facility_m', 'dist_hawker_centre_m',
        'remaining_lease_years', 'storey_ordinal',
    )
    if name in X_train.columns
]

# Columns in neither list are discarded (remainder='drop'). Categories seen only
# at transform time are encoded as all zeros (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ],
    remainder='drop',
)

# From here on X_train / X_test hold the transformed arrays, not the original frames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Scale the training target only: resale prices are mapped onto a normal
# distribution. y_test is left in original price units; the modelling cells
# inverse-transform their predictions before comparing against it.
# No random_state is passed to the transformer here.
scaler = QuantileTransformer(output_distribution='normal')
# The transformer expects a 2-D input, so the target becomes a single column.
y_train_reshaped = y_train.to_numpy().reshape(-1, 1)
y_train_scaled = scaler.fit_transform(y_train_reshaped)

<h1>Modelling</h1>

In [2]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
# Widen pandas' display limits so the printed grid-search tables are not cut off.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

<h2>Decision Tree Regressor</h2>

In [3]:
print("--- Tuning Decision Tree Regressor ---")

# 4 x 3 x 3 = 36 candidate settings, each scored with 3-fold cross-validation.
param_grid_tree = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
tree_model = DecisionTreeRegressor(random_state=42)
grid_search_tree = GridSearchCV(
    estimator=tree_model,
    param_grid=param_grid_tree,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=0,
)

# The search is fitted against the quantile-transformed target, with the
# per-row sample weights passed through fit().
grid_search_tree.fit(X_train, y_train_scaled, sample_weight=sample_weights)

# mean_test_score is a negated MSE, so flip the sign before taking the root.
# This RMSE is therefore in transformed-target units, not in price units.
cv_results_tree = pd.DataFrame(grid_search_tree.cv_results_).assign(
    rmse=lambda frame: np.sqrt(-frame['mean_test_score'])
)
print("Displaying results for each hyperparameter combination:")
results_df_tree = cv_results_tree.loc[
    :,
    ['param_max_depth', 'param_min_samples_split', 'param_min_samples_leaf', 'rmse', 'rank_test_score'],
]
print(results_df_tree.sort_values('rank_test_score'))

best_tree_model = grid_search_tree.best_estimator_
print(f"\nBest Decision Tree Params: {grid_search_tree.best_params_}")

# Predictions are mapped back to price units before being scored against y_test.
y_pred_tree = scaler.inverse_transform(best_tree_model.predict(X_test).reshape(-1, 1))
rmse_tree = np.sqrt(mean_squared_error(y_test, y_pred_tree))
mae_tree = mean_absolute_error(y_test, y_pred_tree)
print(f"Tuned Decision Tree RMSE: {rmse_tree:.4f}")
print(f"Tuned Decision Tree MAE:  {mae_tree:.4f}\n")

--- Tuning Decision Tree Regressor ---
Displaying results for each hyperparameter combination:
   param_max_depth  param_min_samples_split  param_min_samples_leaf      rmse  rank_test_score
4             None                        5                       2  0.280880                1
31              30                        5                       2  0.281161                2
29              30                       10                       1  0.281333                3
5             None                       10                       2  0.281411                4
7             None                        5                       4  0.281475                5
6             None                        2                       4  0.281475                5
32              30                       10                       2  0.281484                7
2             None                       10                       1  0.281503                8
33              30                        2       

<h2>XGBoost</h2>

In [4]:
print("--- Tuning XGBoost Regressor ---")

# 2 x 3 x 2 x 2 = 24 candidate settings, each scored with 3-fold cross-validation.
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 1.0],
}
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=0,
)

# The search is fitted against the quantile-transformed target, with the
# per-row sample weights passed through fit().
grid_search_xgb.fit(X_train, y_train_scaled, sample_weight=sample_weights)

# mean_test_score is a negated MSE, so flip the sign before taking the root.
# This RMSE is therefore in transformed-target units, not in price units.
cv_results_xgb = pd.DataFrame(grid_search_xgb.cv_results_).assign(
    rmse=lambda frame: np.sqrt(-frame['mean_test_score'])
)
print("Displaying results for each hyperparameter combination:")
results_df_xgb = cv_results_xgb.loc[
    :,
    ['param_n_estimators', 'param_max_depth', 'param_learning_rate', 'param_subsample', 'rmse', 'rank_test_score'],
]
print(results_df_xgb.sort_values('rank_test_score'))


best_xgb_model = grid_search_xgb.best_estimator_
print(f"\nBest XGBoost Params: {grid_search_xgb.best_params_}")

# Predictions are mapped back to price units before being scored against y_test.
y_pred_xgb = scaler.inverse_transform(best_xgb_model.predict(X_test).reshape(-1, 1))
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"Tuned XGBoost RMSE: {rmse_xgb:.4f}")
print(f"Tuned XGBoost MAE:  {mae_xgb:.4f}\n")

--- Tuning XGBoost Regressor ---
Displaying results for each hyperparameter combination:
    param_n_estimators  param_max_depth  param_learning_rate  param_subsample      rmse  rank_test_score
22                 200                7                 0.10              0.7  0.220834                1
23                 200                7                 0.10              1.0  0.223157                2
10                 200                7                 0.05              0.7  0.263937                3
20                 100                7                 0.10              0.7  0.267209                4
11                 200                7                 0.05              1.0  0.267485                5
21                 100                7                 0.10              1.0  0.267640                6
18                 200                5                 0.10              0.7  0.272529                7
19                 200                5                 0.10           

<h2>Ridge Regression</h2>

In [5]:
print("--- Tuning Ridge Regressor ---")

# Five regularisation strengths, each scored with 3-fold cross-validation.
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 50.0, 100.0]}
ridge_model = Ridge(random_state=42)
grid_search_ridge = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid_ridge,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=0,
)

# The search is fitted against the quantile-transformed target, with the
# per-row sample weights passed through fit().
grid_search_ridge.fit(X_train, y_train_scaled, sample_weight=sample_weights)

# mean_test_score is a negated MSE, so flip the sign before taking the root.
# This RMSE is therefore in transformed-target units, not in price units.
cv_results_ridge = pd.DataFrame(grid_search_ridge.cv_results_).assign(
    rmse=lambda frame: np.sqrt(-frame['mean_test_score'])
)
print("Displaying results for each hyperparameter combination:")
results_df_ridge = cv_results_ridge.loc[:, ['param_alpha', 'rmse', 'rank_test_score']]
print(results_df_ridge.sort_values('rank_test_score'))


best_ridge_model = grid_search_ridge.best_estimator_
print(f"\nBest Ridge Params: {grid_search_ridge.best_params_}")

# Predictions are mapped back to price units before being scored against y_test.
y_pred_ridge = scaler.inverse_transform(best_ridge_model.predict(X_test).reshape(-1, 1))
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
print(f"Tuned Ridge Regression RMSE: {rmse_ridge:.4f}")
print(f"Tuned Ridge Regression MAE:  {mae_ridge:.4f}\n")

--- Tuning Ridge Regressor ---
Displaying results for each hyperparameter combination:
   param_alpha      rmse  rank_test_score
0          0.1  0.322375                1
1          1.0  0.322375                2
2         10.0  0.322422                3
3         50.0  0.323364                4
4        100.0  0.325697                5

Best Ridge Params: {'alpha': 0.1}
Tuned Ridge Regression RMSE: 52930.0944
Tuned Ridge Regression MAE:  40473.0117

