In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("data/listings_clean.csv")

In [4]:
import re

# 1. Map number-words to integers
word_to_num = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# Built once instead of on every call. Matches digits or a number-word that
# directly precedes 'bedroom' / 'bedrooms', optionally separated by whitespace
# or hyphens (e.g. "2 bedroom", "two-bedroom"). The text is lower-cased before
# matching, so no case-insensitive flag is needed.
_BEDROOM_PATTERN = re.compile(
    r'\b(\d+|' + '|'.join(word_to_num) + r')\b[\s-]*bedrooms?'
)

# 2. Define the extractor
def extract_bedroom_count(title: str) -> int:
    """Infer the number of bedrooms from a listing title.

    Rules, applied in order:
      1. A missing / non-string title (e.g. NaN from pandas) yields the default 1.
      2. An explicit "<n> bedroom(s)" mention, with <n> as digits or a word
         from ``word_to_num``, yields that number.
      3. Otherwise a title containing "studio" yields 0.
      4. Otherwise the default 1 is returned.

    The explicit count is checked before "studio" so that a title such as
    "2 bedroom flat by Abbey Road Studios" is not classified as a studio.

    Args:
        title: Listing title text.

    Returns:
        The parsed bedroom count, 0 for a studio, or 1 when nothing is stated.
    """
    # Missing titles arrive as float NaN / None and have no .lower()
    if not isinstance(title, str):
        return 1
    txt = title.lower()
    match = _BEDROOM_PATTERN.search(txt)
    if match:
        val = match.group(1)
        return int(val) if val.isdigit() else word_to_num[val]
    # Studio → 0 bedrooms (only when no explicit count was given)
    if "studio" in txt:
        return 0
    # Default if no explicit mention
    return 1

# 3. Derive a `bedroom_count` column by running the extractor over every
#    listing title in `data['name']`.
data['bedroom_count'] = data['name'].apply(extract_bedroom_count)

# 4. Quick check: first rows of the titles next to the parsed count
print(data[['name', 'bedroom_count']].head())

                                              name  bedroom_count
0                      Huge Four Bedroom Apartment              4
1                            One Bedroom Apartment              1
2          Two Bedroom Newly Refurbished Apartment              2
3                Refurbished Two Bedroom Apartment              2
4  Spacious refurbished 2 bedroom apt with balcony              2


In [5]:
# log1p(x) = log(1 + x), so zero values map to 0 rather than -inf.
# Each entry is: new log-scaled column -> raw column it is computed from.
log_feature_sources = {
    "log_price": "price",
    "log_reviews_per_month": "reviews_per_month",
    "log_reviews_ltm": "number_of_reviews_ltm",
    "log_days_since_review": "days_since_review",
    "log_host_listings": "calculated_host_listings_count",
}
for log_col, raw_col in log_feature_sources.items():
    data[log_col] = np.log1p(data[raw_col])

In [6]:
# Keep the eight most frequent neighbourhoods as their own categories and
# collapse every other value into a single "Other" bucket.
neighbourhood_counts = data["neighbourhood"].value_counts()
top8 = neighbourhood_counts.nlargest(8).index
in_top8 = data["neighbourhood"].isin(top8)
data["borough_grouped"] = data["neighbourhood"].where(in_top8, "Other")

In [7]:
# Remove the columns that are not used as model features: identifiers, free
# text, and raw values (several of which now have a log_* / grouped version).
columns_to_drop = [
    "Unnamed: 0", "id", "name", "host_id", "host_name",
    "price", "number_of_reviews", "last_review",
    "reviews_per_month", "calculated_host_listings_count",
    "number_of_reviews_ltm", "days_since_review", "neighbourhood",
]
# `columns=` already targets the column axis, so no separate `axis` is needed.
data = data.drop(columns=columns_to_drop)

In [8]:
data

Unnamed: 0,latitude,longitude,room_type,minimum_nights,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,log_host_listings,borough_grouped
0,51.443060,-0.019480,Entire home/apt,3,293,True,4,5.697093,0.412110,2.564949,4.204693,2.484907,Other
1,51.442840,-0.019970,Entire home/apt,3,318,True,1,4.595120,0.198851,1.609438,5.459586,2.484907,Other
2,51.443590,-0.022750,Entire home/apt,3,302,True,2,5.003946,0.357674,1.945910,4.543295,2.484907,Other
3,51.443550,-0.023090,Entire home/apt,3,328,True,2,4.976734,0.262364,2.079442,4.744932,2.484907,Other
4,51.443330,-0.023070,Entire home/apt,3,255,True,2,5.062595,0.300105,1.609438,5.379897,2.484907,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62679,51.525360,-0.072550,Entire home/apt,1,365,False,1,7.241366,0.000000,0.000000,,1.386294,Tower Hamlets
62680,51.543040,-0.076190,Entire home/apt,90,361,False,0,4.564348,0.000000,0.000000,,1.609438,Hackney
62681,51.518096,-0.168024,Entire home/apt,2,359,False,1,5.703782,0.000000,0.000000,,1.386294,Westminster
62682,51.496230,-0.133060,Entire home/apt,1,365,False,1,8.006701,0.000000,0.000000,,3.912023,Westminster


In [9]:
# One-hot encode the two categorical columns; each key is the source column
# and each value the prefix used for its dummy columns. drop_first=True omits
# the first level of every encoded column.
categorical_prefixes = {"borough_grouped": "boro", "room_type": "room"}
data = pd.get_dummies(
    data,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
    drop_first=True,
)

In [10]:
data.columns

Index(['latitude', 'longitude', 'minimum_nights', 'availability_365',
       'has_review', 'bedroom_count', 'log_price', 'log_reviews_per_month',
       'log_reviews_ltm', 'log_days_since_review', 'log_host_listings',
       'boro_Hackney', 'boro_Kensington and Chelsea', 'boro_Lambeth',
       'boro_Other', 'boro_Southwark', 'boro_Tower Hamlets', 'boro_Wandsworth',
       'boro_Westminster', 'room_Hotel room', 'room_Private room',
       'room_Shared room'],
      dtype='object')

In [11]:
# Replace every remaining NaN, in any column, with 0.
# NOTE(review): in the frame preview shown earlier the NaNs sit in
# `log_days_since_review` for rows whose `has_review` is False, so those
# listings end up with 0 there — confirm that this encoding is intended.
data = data.fillna(0)

In [12]:
# Cast every boolean column to 0/1 integers; other columns are left as they are.
bool_cols = data.select_dtypes(include="bool").columns
data = data.astype({col: int for col in bool_cols})

In [13]:
data

Unnamed: 0,latitude,longitude,minimum_nights,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,...,boro_Kensington and Chelsea,boro_Lambeth,boro_Other,boro_Southwark,boro_Tower Hamlets,boro_Wandsworth,boro_Westminster,room_Hotel room,room_Private room,room_Shared room
0,51.443060,-0.019480,3,293,1,4,5.697093,0.412110,2.564949,4.204693,...,0,0,1,0,0,0,0,0,0,0
1,51.442840,-0.019970,3,318,1,1,4.595120,0.198851,1.609438,5.459586,...,0,0,1,0,0,0,0,0,0,0
2,51.443590,-0.022750,3,302,1,2,5.003946,0.357674,1.945910,4.543295,...,0,0,1,0,0,0,0,0,0,0
3,51.443550,-0.023090,3,328,1,2,4.976734,0.262364,2.079442,4.744932,...,0,0,1,0,0,0,0,0,0,0
4,51.443330,-0.023070,3,255,1,2,5.062595,0.300105,1.609438,5.379897,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62679,51.525360,-0.072550,1,365,0,1,7.241366,0.000000,0.000000,0.000000,...,0,0,0,0,1,0,0,0,0,0
62680,51.543040,-0.076190,90,361,0,0,4.564348,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
62681,51.518096,-0.168024,2,359,0,1,5.703782,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,1,0,0,0
62682,51.496230,-0.133060,1,365,0,1,8.006701,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# index=False: without it the row index is written as an unnamed leading
# column, which comes back as "Unnamed: 0" when the file is re-read with
# pd.read_csv (the kind of column this notebook had to drop from its input).
data.to_csv("data/ml_ready_listings.csv", index=False)

In [15]:
RAND_SEED = 42

In [16]:
from sklearn.model_selection import train_test_split

# `log_price` is the regression target; every other column is a predictor.
X = data.drop(columns="log_price")
y = data["log_price"]

# Hold out 20% of the rows for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RAND_SEED,
)

In [17]:
from sklearn.linear_model import Ridge
# Baseline model: L2-regularised linear regression fitted on the training split.
# NOTE(review): no feature-scaling step is visible before this fit. The ridge
# penalty depends on feature scale, so columns on different scales are not
# shrunk equally and coefficient magnitudes are not directly comparable —
# confirm this is acceptable.
ridge_model = Ridge(alpha=1.0, random_state=RAND_SEED)
ridge_model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [18]:
y_pred_ridge = ridge_model.predict(X_test)

In [19]:
def RMSE_to_pct_error(RMSE: float):
    """Express a log-scale RMSE as a percentage string.

    Computes ``exp(RMSE) - 1`` and renders it as a percentage with two
    decimal places, e.g. an RMSE of 0 gives ``'0.00%'``.

    Args:
        RMSE: Root mean squared error measured on the log-transformed target.

    Returns:
        The formatted percentage as a string.
    """
    relative_error = np.exp(RMSE) - 1
    return format(relative_error, ".2%")

In [20]:
from sklearn.metrics import mean_squared_error, r2_score

ridge_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
ridge_r2 = r2_score(y_test, y_pred_ridge)

print(f"Ridge || RMSE: {ridge_RMSE:.3f} | R²: {ridge_r2:.3f}")

Ridge || RMSE: 0.559 | R²: 0.522


In [21]:
RMSE_to_pct_error(ridge_RMSE)

'74.85%'

In [22]:
ridge_coef_series = pd.Series(ridge_model.coef_, index=X_train.columns)
ridge_top10 = ridge_coef_series.abs().sort_values(ascending=False).head(10)
ridge_top10

room_Shared room               1.524520
room_Private room              0.929574
has_review                     0.551736
boro_Other                     0.350597
room_Hotel room                0.344038
boro_Hackney                   0.249941
boro_Westminster               0.227553
latitude                       0.217895
boro_Tower Hamlets             0.212755
boro_Kensington and Chelsea    0.210430
dtype: float64

In [23]:
ridge_coef_series.loc[ridge_top10.index]

room_Shared room              -1.524520
room_Private room             -0.929574
has_review                    -0.551736
boro_Other                    -0.350597
room_Hotel room               -0.344038
boro_Hackney                  -0.249941
boro_Westminster               0.227553
latitude                       0.217895
boro_Tower Hamlets            -0.212755
boro_Kensington and Chelsea    0.210430
dtype: float64

In [24]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=5,
    random_state=RAND_SEED,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
y_pred_rf = rf_model.predict(X_test)

In [26]:
rf_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

print(f"Random Forest Regressor || RMSE: {rf_RMSE:.3f} | R²: {rf_r2:.3f}")

Random Forest Regressor || RMSE: 0.476 | R²: 0.653


In [27]:
RMSE_to_pct_error(rf_RMSE)

'60.97%'

In [28]:
# Feature importances of the fitted forest, indexed by feature name.
# NOTE: despite the `coef` in the variable name these are importances, not
# signed coefficients. scikit-learn reports them as non-negative values, so
# the `.abs()` below does not change them.
rf_coef_series = pd.Series(rf_model.feature_importances_, index=X_train.columns)
# Ten most important features, largest first.
rf_top10 = rf_coef_series.abs().sort_values(ascending=False).head(10)
rf_top10

room_Private room              0.440404
longitude                      0.100907
latitude                       0.095732
bedroom_count                  0.060127
availability_365               0.057191
log_host_listings              0.047830
boro_Westminster               0.036547
log_reviews_per_month          0.034189
log_days_since_review          0.030566
boro_Kensington and Chelsea    0.027567
dtype: float64



In [29]:
# Hyperparameter Tuning
from sklearn.linear_model import RidgeCV

ridgecv_model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
                        scoring="neg_root_mean_squared_error",
                        cv=5,)

ridgecv_model.fit(X_train, y_train)

0,1,2
,alphas,"[0.001, 0.01, ...]"
,fit_intercept,True
,scoring,'neg_root_mean_squared_error'
,cv,5
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [30]:
ridgecv_model.alpha_

np.float64(0.1)

In [31]:
ridgecv_y_pred = ridgecv_model.predict(X_test)

In [32]:
ridgecv_RMSE = np.sqrt(mean_squared_error(y_test, ridgecv_y_pred))
ridgecv_r2 = r2_score(y_test, ridgecv_y_pred)

In [33]:
print(f"RidgeCV || Best alpha: {ridgecv_model.alpha_} | RMSE: {ridgecv_RMSE:.3f} | R²: {ridgecv_r2:.3f}")

RidgeCV || Best alpha: 0.1 | RMSE: 0.559 | R²: 0.522


In [34]:
from sklearn.model_selection import RandomizedSearchCV

# Discrete candidate values for each random-forest hyperparameter.
param_dist = {
    "n_estimators": list(range(1, 1000)),
    "max_depth": [None, *range(1, 50)],
    "min_samples_leaf": list(range(1, 20)),
    "max_features": ["sqrt", "log2", 0.5, 0.8],
}

rf2 = RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1)

# Randomised search: 20 sampled configurations, each scored with 3-fold CV
# on negated RMSE (so that a higher score is better).
search = RandomizedSearchCV(
    rf2,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=RAND_SEED,
)

In [35]:
search.fit(X_train, y_train)

0,1,2
,estimator,RandomForestR...ndom_state=42)
,param_distributions,"{'max_depth': [None, 1, ...], 'max_features': ['sqrt', 'log2', ...], 'min_samples_leaf': [1, 2, ...], 'n_estimators': [1, 2, ...]}"
,n_iter,20
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,143
,criterion,'squared_error'
,max_depth,40
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
search.best_params_

{'n_estimators': 143,
 'min_samples_leaf': 4,
 'max_features': 0.8,
 'max_depth': 40}

In [37]:
best_rf2 = search.best_estimator_
rf2_y_pred = best_rf2.predict(X_test)
rf2_RMSE = np.sqrt(mean_squared_error(y_test, rf2_y_pred))
rf2_r2 = r2_score(y_test, rf2_y_pred)

print(f"Random Forest Regressor CV || RMSE: {rf2_RMSE:.3f} | R²: {rf2_r2:.3f}")

Random Forest Regressor CV || RMSE: 0.473 | R²: 0.657


In [38]:
RMSE_to_pct_error(rf2_RMSE)

'60.52%'

In [39]:
from sklearn.ensemble import HistGradientBoostingRegressor

gb = HistGradientBoostingRegressor(
    max_iter=200,
    learning_rate=0.1,
    max_depth=10,
    min_samples_leaf=20,
    max_bins=255,
    random_state=RAND_SEED
)

In [40]:
gb.fit(X_train, y_train)

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,200
,max_leaf_nodes,31
,max_depth,10
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [41]:
gb_y_pred = gb.predict(X_test)
gb_RMSE = np.sqrt(mean_squared_error(y_test, gb_y_pred))
gb_r2 = r2_score(y_test, gb_y_pred)

print(f"Gradient Boosting || RMSE: {gb_RMSE:.3f} | R²: {gb_r2:.3f}")

Gradient Boosting || RMSE: 0.476 | R²: 0.653


In [42]:
RMSE_to_pct_error(gb_RMSE)

'61.02%'

In [43]:
# Candidate values for the gradient-boosting search. This rebinds the name
# `param_dist`. The np.arange-based lists hold NumPy scalars, which is why the
# selected values are displayed as np.float64 / np.int64.
param_dist = {
    "learning_rate": list(np.arange(0.01, 1.0, 0.01)),
    "max_iter": list(np.arange(100, 1100, 100)),
    "max_depth": list(range(1, 21)),
    "min_samples_leaf": list(range(1, 51)),
}

gb2 = HistGradientBoostingRegressor(random_state=RAND_SEED)

# Randomised search: 20 sampled configurations, each scored with 3-fold CV
# on negated RMSE (so that a higher score is better).
gb_search = RandomizedSearchCV(
    gb2,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=RAND_SEED,
)

In [44]:
gb_search.fit(X_train, y_train)

0,1,2
,estimator,HistGradientB...ndom_state=42)
,param_distributions,"{'learning_rate': [np.float64(0.01), np.float64(0.02), ...], 'max_depth': [1, 2, ...], 'max_iter': [np.int64(100), np.int64(200), ...], 'min_samples_leaf': [1, 2, ...]}"
,n_iter,20
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,np.float64(0.14)
,max_iter,np.int64(700)
,max_leaf_nodes,31
,max_depth,15
,min_samples_leaf,38
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [45]:
gb_search.best_params_

{'min_samples_leaf': 38,
 'max_iter': np.int64(700),
 'max_depth': 15,
 'learning_rate': np.float64(0.14)}

In [46]:
# Evaluate the best gradient-boosting configuration found by the randomised
# search on the held-out test split.
best_gb = gb_search.best_estimator_
gb2_y_pred = best_gb.predict(X_test)
gb2_RMSE = np.sqrt(mean_squared_error(y_test, gb2_y_pred))
gb2_r2 = r2_score(y_test, gb2_y_pred)

# Labelled "CV" so this line is distinguishable from the untuned
# "Gradient Boosting" result, matching the "Random Forest Regressor CV" label.
print(f"Gradient Boosting CV || RMSE: {gb2_RMSE:.3f} | R²: {gb2_r2:.3f}")

Gradient Boosting || RMSE: 0.471 | R²: 0.661


In [47]:
RMSE_to_pct_error(gb2_RMSE)

'60.16%'

In [48]:
y_rf = best_rf2.predict(X_test)
y_gb = best_gb.predict(X_test)

y_ensemble = (y_rf + y_gb) / 2

ens_RMSE = np.sqrt(mean_squared_error(y_test, y_ensemble))
ens_r2 = r2_score(y_test, y_ensemble)

print(f"Ensemble || RMSE: {ens_RMSE:.3f} | R²: {ens_r2:.3f}")

Ensemble || RMSE: 0.466 | R²: 0.668


In [49]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

voting = VotingRegressor(
    estimators=[
        ("rf", best_rf2),
        ("gb", best_gb)
    ],
    n_jobs=-1
)

voting.fit(X_train, y_train)

0,1,2
,estimators,"[('rf', ...), ('gb', ...)]"
,weights,
,n_jobs,-1
,verbose,False

0,1,2
,n_estimators,143
,criterion,'squared_error'
,max_depth,40
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,np.float64(0.14)
,max_iter,np.int64(700)
,max_leaf_nodes,31
,max_depth,15
,min_samples_leaf,38
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [50]:
y_v = voting.predict(X_test)

In [51]:
v_RMSE = np.sqrt(mean_squared_error(y_test, y_v))
v_r2 = r2_score(y_test, y_v)

print(f"Voting || RMSE: {v_RMSE:.3f} | R²: {v_r2:.3f}")

Voting || RMSE: 0.466 | R²: 0.668


In [52]:
stack = StackingRegressor(
    estimators=[
        ("rf", best_rf2),
        ("gb", best_gb)
    ],
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1,
    passthrough=False
)

In [53]:
stack.fit(X_train, y_train)

0,1,2
,estimators,"[('rf', ...), ('gb', ...)]"
,final_estimator,Ridge()
,cv,5
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,n_estimators,143
,criterion,'squared_error'
,max_depth,40
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,np.float64(0.14)
,max_iter,np.int64(700)
,max_leaf_nodes,31
,max_depth,15
,min_samples_leaf,38
,l2_regularization,0.0
,max_features,1.0
,max_bins,255

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [54]:
y_s = stack.predict(X_test)

s_RMSE = np.sqrt(mean_squared_error(y_test, y_s))
s_r2 = r2_score(y_test, y_s)

print(f"Stack || RMSE: {s_RMSE:.3f} | R²: {s_r2:.3f}")

Stack || RMSE: 0.465 | R²: 0.669


In [55]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# Model to cross-validate: the stacked ensemble.
estimator = stack

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# greater_is_better=True leaves the metric un-negated, so cross_validate
# reports plain positive RMSE values.
# NOTE(review): RMSE is a loss. This scorer is fine for reporting, but it must
# not be reused for model selection (e.g. in a search), where it would favour
# larger errors.
rmse_scorer = make_scorer(rmse, greater_is_better=True)

# 10-fold cross-validation over the full X, y (not only the training split),
# collecting test-fold RMSE and R² only.
cv_results = cross_validate(
    estimator,
    X, y,
    cv=10,
    scoring={"RMSE": rmse_scorer, "R2": "r2"},
    return_train_score=False,
    n_jobs=-1
)

In [56]:
# Per-fold test scores returned by cross_validate.
fold_rmse = cv_results["test_RMSE"]
fold_r2 = cv_results["test_R2"]

# Mean and (population) standard deviation across the folds.
mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
mean_r2, std_r2 = np.mean(fold_r2), np.std(fold_r2)

print(f"10-Fold CV RMSE: {mean_rmse:.3f} ± {std_rmse:.3f}")
print(f"10-Fold CV R²:   {mean_r2:.3f} ± {std_r2:.3f}")

10-Fold CV RMSE: 0.481 ± 0.013
10-Fold CV R²:   0.639 ± 0.015


In [57]:
RMSE_to_pct_error(mean_rmse)

'61.75%'