In [251]:
import pandas as pd
import numpy as np

In [252]:
data = pd.read_csv("data/listings_clean.csv")

In [253]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,has_review,days_since_review
0,0,264776,Huge Four Bedroom Apartment,1389063,Sue,Lewisham,51.44306,-0.01948,Entire home/apt,297.0,3,68,2025-05-28,0.51,11,293,12,True,64.0
1,1,264777,One Bedroom Apartment,1389063,Sue,Lewisham,51.44284,-0.01997,Entire home/apt,98.0,3,24,2024-12-11,0.22,11,318,4,True,232.0
2,2,264778,Two Bedroom Newly Refurbished Apartment,1389063,Sue,Lewisham,51.44359,-0.02275,Entire home/apt,148.0,3,58,2025-05-01,0.43,11,302,6,True,91.0
3,3,264779,Refurbished Two Bedroom Apartment,1389063,Sue,Lewisham,51.44355,-0.02309,Entire home/apt,144.0,3,36,2025-04-10,0.3,11,328,7,True,112.0
4,4,264780,Spacious refurbished 2 bedroom apt with balcony,1389063,Sue,Lewisham,51.44333,-0.02307,Entire home/apt,157.0,3,54,2024-12-29,0.35,11,255,4,True,214.0


In [254]:
import re

# 1. (Re)load your cleaned data into `data`
# data = pd.read_csv("data/cleaned/listings_clean.csv")

# 2. Map number-words to integers
word_to_num = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# 3. Define the extraction function
def extract_bedroom_count(title: str) -> int:
    txt = title.lower()
    if "studio" in txt:
        return 0
    pattern = r'\b(\d+|' + '|'.join(word_to_num.keys()) + r')\b[\s-]*bedrooms?'
    match = re.search(pattern, txt)
    if match:
        val = match.group(1)
        return int(val) if val.isdigit() else word_to_num[val]
    return 1

# 4. Apply it
data['bedroom_count'] = data['name'].apply(extract_bedroom_count)

In [255]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,has_review,days_since_review,bedroom_count
0,0,264776,Huge Four Bedroom Apartment,1389063,Sue,Lewisham,51.44306,-0.01948,Entire home/apt,297.0,3,68,2025-05-28,0.51,11,293,12,True,64.0,4
1,1,264777,One Bedroom Apartment,1389063,Sue,Lewisham,51.44284,-0.01997,Entire home/apt,98.0,3,24,2024-12-11,0.22,11,318,4,True,232.0,1
2,2,264778,Two Bedroom Newly Refurbished Apartment,1389063,Sue,Lewisham,51.44359,-0.02275,Entire home/apt,148.0,3,58,2025-05-01,0.43,11,302,6,True,91.0,2
3,3,264779,Refurbished Two Bedroom Apartment,1389063,Sue,Lewisham,51.44355,-0.02309,Entire home/apt,144.0,3,36,2025-04-10,0.3,11,328,7,True,112.0,2
4,4,264780,Spacious refurbished 2 bedroom apt with balcony,1389063,Sue,Lewisham,51.44333,-0.02307,Entire home/apt,157.0,3,54,2024-12-29,0.35,11,255,4,True,214.0,2


In [256]:
data = data.drop(["Unnamed: 0", "id", "name", "host_id", "host_name", "last_review"], axis=1)

In [257]:
# 1. Log transforms
data["log_price"]               = np.log1p(data["price"])
data["log_reviews_per_month"]   = np.log1p(data["reviews_per_month"])
data["log_reviews_ltm"]         = np.log1p(data["number_of_reviews_ltm"])
data["log_days_since_review"]   = np.log1p(data["days_since_review"])
data["log_host_listings"]       = np.log1p(data["calculated_host_listings_count"])

data = data.rename(columns={"neighbourhood": "borough"})

In [258]:
data.head()

Unnamed: 0,borough,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,has_review,days_since_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,log_host_listings
0,Lewisham,51.44306,-0.01948,Entire home/apt,297.0,3,68,0.51,11,293,12,True,64.0,4,5.697093,0.41211,2.564949,4.174387,2.484907
1,Lewisham,51.44284,-0.01997,Entire home/apt,98.0,3,24,0.22,11,318,4,True,232.0,1,4.59512,0.198851,1.609438,5.451038,2.484907
2,Lewisham,51.44359,-0.02275,Entire home/apt,148.0,3,58,0.43,11,302,6,True,91.0,2,5.003946,0.357674,1.94591,4.521789,2.484907
3,Lewisham,51.44355,-0.02309,Entire home/apt,144.0,3,36,0.3,11,328,7,True,112.0,2,4.976734,0.262364,2.079442,4.727388,2.484907
4,Lewisham,51.44333,-0.02307,Entire home/apt,157.0,3,54,0.35,11,255,4,True,214.0,2,5.062595,0.300105,1.609438,5.370638,2.484907


In [259]:
data = data.drop(["price", "reviews_per_month", "number_of_reviews_ltm", "days_since_review", "calculated_host_listings_count"], axis=1)

In [260]:
data.head()

Unnamed: 0,borough,latitude,longitude,room_type,minimum_nights,number_of_reviews,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,log_host_listings
0,Lewisham,51.44306,-0.01948,Entire home/apt,3,68,293,True,4,5.697093,0.41211,2.564949,4.174387,2.484907
1,Lewisham,51.44284,-0.01997,Entire home/apt,3,24,318,True,1,4.59512,0.198851,1.609438,5.451038,2.484907
2,Lewisham,51.44359,-0.02275,Entire home/apt,3,58,302,True,2,5.003946,0.357674,1.94591,4.521789,2.484907
3,Lewisham,51.44355,-0.02309,Entire home/apt,3,36,328,True,2,4.976734,0.262364,2.079442,4.727388,2.484907
4,Lewisham,51.44333,-0.02307,Entire home/apt,3,54,255,True,2,5.062595,0.300105,1.609438,5.370638,2.484907


In [261]:
top_8_boroughs = data["borough"].value_counts().nlargest(8).index
data["borough_grouped"] = data["borough"].where(data["borough"].isin(top_8_boroughs), "Other")
dummies = pd.get_dummies(data, columns=["borough_grouped", "room_type"], prefix=["boro", "room"], drop_first=True)

In [262]:
dummies

Unnamed: 0,borough,latitude,longitude,minimum_nights,number_of_reviews,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,...,boro_Kensington and Chelsea,boro_Lambeth,boro_Other,boro_Southwark,boro_Tower Hamlets,boro_Wandsworth,boro_Westminster,room_Hotel room,room_Private room,room_Shared room
0,Lewisham,51.443060,-0.019480,3,68,293,True,4,5.697093,0.412110,...,False,False,True,False,False,False,False,False,False,False
1,Lewisham,51.442840,-0.019970,3,24,318,True,1,4.595120,0.198851,...,False,False,True,False,False,False,False,False,False,False
2,Lewisham,51.443590,-0.022750,3,58,302,True,2,5.003946,0.357674,...,False,False,True,False,False,False,False,False,False,False
3,Lewisham,51.443550,-0.023090,3,36,328,True,2,4.976734,0.262364,...,False,False,True,False,False,False,False,False,False,False
4,Lewisham,51.443330,-0.023070,3,54,255,True,2,5.062595,0.300105,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62679,Tower Hamlets,51.525360,-0.072550,1,0,365,False,1,7.241366,0.000000,...,False,False,False,False,True,False,False,False,False,False
62680,Hackney,51.543040,-0.076190,90,0,361,False,0,4.564348,0.000000,...,False,False,False,False,False,False,False,False,False,False
62681,Westminster,51.518096,-0.168024,2,0,359,False,1,5.703782,0.000000,...,False,False,False,False,False,False,True,False,False,False
62682,Westminster,51.496230,-0.133060,1,0,365,False,1,8.006701,0.000000,...,False,False,False,False,False,False,True,False,False,False


In [263]:
data = pd.concat([data, dummies], axis=1)

In [264]:
data = data.drop(["borough", "room_type", "borough_grouped"], axis=1)

In [265]:
data = data.fillna(0)

In [266]:
import sklearn as sk

In [267]:
y = data["log_price"]
X = data.drop(columns="log_price")
RAND_SEED = 42

In [268]:
bool_cols = X.select_dtypes(include="bool").columns
dtype_map = {col: "int64" for col in bool_cols}
X = X.astype(dtype_map)

In [269]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=RAND_SEED)

In [None]:
# Ridge

ridge = sk.linear_model.Ridge(alpha=1.0, random_state=RAND_SEED)
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)
print(f"Ridge RMSE: {sk.metrics.mean_squared_error(y_test, y_pred_ridge)}")
print(f"Ridge R²: {sk.metrics.r2_score(y_test, y_pred_ridge)}")

Ridge RMSE: 0.31220393716969674
Ridge R²: 0.5223781392634553


In [None]:
# Random Forest Regressor

rf = sk.ensemble.RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=5,
    random_state=RAND_SEED,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("RF RMSE:", sk.metrics.mean_squared_error(y_test, y_pred_rf))
print("RF R²:  ", sk.metrics.r2_score(y_test, y_pred_rf))

RF RMSE: 0.22634712889779435
RF R²:   0.6537252609412886


In [None]:
# Feature Importance

In [275]:
# Ridge Hyperparameter Tuning

In [None]:
# Random Forest Regressor Hyperparameter Tuning

In [None]:
# Gradient Boosting