In [63]:
import pandas as pd
import numpy as np

In [64]:
# Read the listings CSV (path is relative to the working directory) into the
# DataFrame that the following cells transform.
data = pd.read_csv("data/listings_clean.csv")

In [65]:
import re

# 1. Map number-words to integers
word_to_num = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10
}

# Compiled once instead of being rebuilt for every row: a digit run or one of
# the number-words above, optionally followed by spaces/hyphens, then
# "bedroom" or "bedrooms". Matching is done on lower-cased text, so no
# case-insensitive flag is needed.
_BEDROOM_PATTERN = re.compile(
    r'\b(\d+|' + '|'.join(re.escape(word) for word in word_to_num) + r')\b[\s-]*bedrooms?'
)

# Returned when the title gives no usable information.
_DEFAULT_BEDROOMS = 1


# 2. Define the extractor
def extract_bedroom_count(title: str) -> int:
    """Parse the number of bedrooms advertised in a listing title.

    Rules, applied in order:

    1. A title containing "studio" counts as 0 bedrooms.
    2. Otherwise the first "<number> bedroom(s)" mention wins, where
       <number> is a digit run or a word from ``word_to_num``
       (e.g. "2 bedroom", "two-bedroom", "Four Bedrooms").
    3. Otherwise the default of 1 is returned.

    Parameters
    ----------
    title : str
        Listing title. Non-string values (``None``, or the float NaN that
        pandas uses for a missing cell) are accepted and treated as
        "no information".

    Returns
    -------
    int
        The parsed bedroom count, 0 for studios, or 1 when nothing is found.
    """
    # A missing title is not a str; calling .lower() on it would raise
    # AttributeError and abort the whole .apply(), so fall back to the default.
    if not isinstance(title, str):
        return _DEFAULT_BEDROOMS

    txt = title.lower()

    # Studio → 0 bedrooms
    if "studio" in txt:
        return 0

    match = _BEDROOM_PATTERN.search(txt)
    if match is None:
        # Default if no explicit mention
        return _DEFAULT_BEDROOMS

    val = match.group(1)
    return int(val) if val.isdigit() else word_to_num[val]

# 3. Add the parsed bedroom count as a new column.
# `data` is the listings DataFrame; each listing's title is in its `name` column.
data = data.assign(bedroom_count=data['name'].apply(extract_bedroom_count))

# 4. Quick check: first few titles next to the count parsed from them
print(data.loc[:, ['name', 'bedroom_count']].head())

                                              name  bedroom_count
0                      Huge Four Bedroom Apartment              4
1                            One Bedroom Apartment              1
2          Two Bedroom Newly Refurbished Apartment              2
3                Refurbished Two Bedroom Apartment              2
4  Spacious refurbished 2 bedroom apt with balcony              2


In [66]:
# Add log1p-transformed versions of the price, review and host-listing
# columns. log1p (= log(1 + x)) is used so that zero values map to 0.
data = data.assign(
    log_price=np.log1p(data["price"]),
    log_reviews_per_month=np.log1p(data["reviews_per_month"]),
    log_reviews_ltm=np.log1p(data["number_of_reviews_ltm"]),
    log_days_since_review=np.log1p(data["days_since_review"]),
    log_host_listings=np.log1p(data["calculated_host_listings_count"]),
)

In [67]:
# Keep the eight most frequent neighbourhoods as their own categories and
# relabel every other neighbourhood as "Other".
top8 = data["neighbourhood"].value_counts().nlargest(8).index
data["borough_grouped"] = data["neighbourhood"].mask(
    ~data["neighbourhood"].isin(top8),
    "Other",
)

In [68]:
# Drop identifiers, free-text fields and the raw columns that now have a
# log-transformed or grouped replacement.
data = data.drop(
    columns=[
        "Unnamed: 0",
        "id",
        "name",
        "host_id",
        "host_name",
        "price",
        "number_of_reviews",
        "last_review",
        "reviews_per_month",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
        "days_since_review",
        "neighbourhood",
    ]
)

In [69]:
# Display the frame to check which columns remain after the drop.
data

Unnamed: 0,latitude,longitude,room_type,minimum_nights,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,log_host_listings,borough_grouped
0,51.443060,-0.019480,Entire home/apt,3,293,True,4,5.697093,0.412110,2.564949,4.204693,2.484907,Other
1,51.442840,-0.019970,Entire home/apt,3,318,True,1,4.595120,0.198851,1.609438,5.459586,2.484907,Other
2,51.443590,-0.022750,Entire home/apt,3,302,True,2,5.003946,0.357674,1.945910,4.543295,2.484907,Other
3,51.443550,-0.023090,Entire home/apt,3,328,True,2,4.976734,0.262364,2.079442,4.744932,2.484907,Other
4,51.443330,-0.023070,Entire home/apt,3,255,True,2,5.062595,0.300105,1.609438,5.379897,2.484907,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62679,51.525360,-0.072550,Entire home/apt,1,365,False,1,7.241366,0.000000,0.000000,,1.386294,Tower Hamlets
62680,51.543040,-0.076190,Entire home/apt,90,361,False,0,4.564348,0.000000,0.000000,,1.609438,Hackney
62681,51.518096,-0.168024,Entire home/apt,2,359,False,1,5.703782,0.000000,0.000000,,1.386294,Westminster
62682,51.496230,-0.133060,Entire home/apt,1,365,False,1,8.006701,0.000000,0.000000,,3.912023,Westminster


In [70]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
data = pd.get_dummies(
    data,
    columns=["borough_grouped", "room_type"],
    prefix={"borough_grouped": "boro", "room_type": "room"},
    drop_first=True,
)

In [71]:
# List the column names after one-hot encoding.
data.columns

Index(['latitude', 'longitude', 'minimum_nights', 'availability_365',
       'has_review', 'bedroom_count', 'log_price', 'log_reviews_per_month',
       'log_reviews_ltm', 'log_days_since_review', 'log_host_listings',
       'boro_Hackney', 'boro_Kensington and Chelsea', 'boro_Lambeth',
       'boro_Other', 'boro_Southwark', 'boro_Tower Hamlets', 'boro_Wandsworth',
       'boro_Westminster', 'room_Hotel room', 'room_Private room',
       'room_Shared room'],
      dtype='object')

In [72]:
# Replace every remaining NaN with 0. In the preview above, the NaNs appear in
# log_days_since_review on rows where has_review is False.
# NOTE(review): a 0 in log_days_since_review also corresponds to 0 days since
# the last review; has_review is what distinguishes the two cases — confirm
# this encoding is intended.
data = data.fillna(0)

In [73]:
# Cast every boolean column (has_review and the one-hot dummies) to 0/1
# integers so the whole feature matrix is numeric.
bool_cols = data.select_dtypes(include="bool").columns
data = data.astype({bool_col: int for bool_col in bool_cols})

In [74]:
# Display the frame again after NaN filling and the bool-to-int cast.
data

Unnamed: 0,latitude,longitude,minimum_nights,availability_365,has_review,bedroom_count,log_price,log_reviews_per_month,log_reviews_ltm,log_days_since_review,...,boro_Kensington and Chelsea,boro_Lambeth,boro_Other,boro_Southwark,boro_Tower Hamlets,boro_Wandsworth,boro_Westminster,room_Hotel room,room_Private room,room_Shared room
0,51.443060,-0.019480,3,293,1,4,5.697093,0.412110,2.564949,4.204693,...,0,0,1,0,0,0,0,0,0,0
1,51.442840,-0.019970,3,318,1,1,4.595120,0.198851,1.609438,5.459586,...,0,0,1,0,0,0,0,0,0,0
2,51.443590,-0.022750,3,302,1,2,5.003946,0.357674,1.945910,4.543295,...,0,0,1,0,0,0,0,0,0,0
3,51.443550,-0.023090,3,328,1,2,4.976734,0.262364,2.079442,4.744932,...,0,0,1,0,0,0,0,0,0,0
4,51.443330,-0.023070,3,255,1,2,5.062595,0.300105,1.609438,5.379897,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62679,51.525360,-0.072550,1,365,0,1,7.241366,0.000000,0.000000,0.000000,...,0,0,0,0,1,0,0,0,0,0
62680,51.543040,-0.076190,90,361,0,0,4.564348,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
62681,51.518096,-0.168024,2,359,0,1,5.703782,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,1,0,0,0
62682,51.496230,-0.133060,1,365,0,1,8.006701,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,1,0,0,0


In [75]:
# Seed passed as random_state to both the train/test split and the Ridge model.
RAND_SEED = 42

In [76]:
from sklearn.model_selection import train_test_split

# The target is the log-transformed price; every other column is a feature.
X = data.drop(columns=["log_price"])
y = data["log_price"]

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RAND_SEED,
)

In [78]:
from sklearn.linear_model import Ridge
# Fit an L2-regularised linear regression on the training split.
# The fit call stays as the last expression so the notebook still shows the
# fitted estimator's parameters.
ridge_model = Ridge(
    alpha=1.0,
    random_state=RAND_SEED,
)
ridge_model.fit(X=X_train, y=y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [82]:
# Predict log_price for the held-out test rows.
y_pred = ridge_model.predict(X_test)

In [94]:
def RMSE_to_pct_error(RMSE: float) -> str:
    """Convert an RMSE measured on a natural-log scale into a percentage string.

    An error of ``RMSE`` in log space corresponds to a multiplicative factor of
    ``exp(RMSE)`` on the original scale, i.e. a relative error of
    ``exp(RMSE) - 1``. This is a rough "typical relative error", not an exact
    statistic.

    Parameters
    ----------
    RMSE : float
        Root-mean-squared error computed on log-transformed targets.

    Returns
    -------
    str
        The relative error formatted as a percentage with two decimals,
        e.g. ``"100.00%"`` for ``RMSE = ln(2)``.
    """
    # expm1 computes exp(x) - 1 without the precision loss that the explicit
    # subtraction has for small x; it is the inverse of the log1p used for
    # the target.
    relative_error = np.expm1(RMSE)
    return f"{relative_error:.2%}"

In [90]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions: R² and RMSE, both on the log-price scale.
ridge_r2 = r2_score(y_true=y_test, y_pred=y_pred)
ridge_RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

print(f"Ridge || RMSE: {ridge_RMSE:.3f} | R²: {ridge_r2:.3f}")

Ridge || RMSE: 0.559 | R²: 0.522


In [96]:
# Express the Ridge RMSE (computed on the log scale) as an approximate percentage error.
RMSE_to_pct_error(ridge_RMSE)

'74.85%'

In [99]:
# Label each fitted coefficient with its feature name, then rank the features
# by absolute coefficient size and keep the ten largest.
coef_series = pd.Series(data=ridge_model.coef_, index=X_train.columns)
top10 = (
    coef_series
    .abs()
    .sort_values(ascending=False)
    .iloc[:10]
)
top10

room_Shared room               1.524520
room_Private room              0.929574
has_review                     0.551736
boro_Other                     0.350597
room_Hotel room                0.344038
boro_Hackney                   0.249941
boro_Westminster               0.227553
latitude                       0.217895
boro_Tower Hamlets             0.212755
boro_Kensington and Chelsea    0.210430
dtype: float64

In [100]:
# Show the signed coefficients for the same ten features, in the same
# largest-magnitude-first order.
coef_series.loc[top10.index]

room_Shared room              -1.524520
room_Private room             -0.929574
has_review                    -0.551736
boro_Other                    -0.350597
room_Hotel room               -0.344038
boro_Hackney                  -0.249941
boro_Westminster               0.227553
latitude                       0.217895
boro_Tower Hamlets            -0.212755
boro_Kensington and Chelsea    0.210430
dtype: float64

In [None]:
from sklearn.