In [105]:
import os
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
import time
from sklearn.cluster import KMeans

In [106]:
cwd = os.getcwd()
if os.path.basename(cwd) == "Notebooks":
    project_root = os.path.dirname(cwd)
else:
    project_root = cwd
path = os.path.join(project_root, "data", "Airbnb_DK_Processed_Data.csv")
df = pd.read_csv(path)

In [107]:
df['price_log'] = np.log1p(df['price'])

In [108]:
le = LabelEncoder()
# ------- Combining less common property types into 'Other' and encoding
common = df['property_type'].value_counts()[lambda x: x >= 50].index
df['property_type'] = df['property_type'].where(df['property_type'].isin(common), 'Other')
df['property_type_encoded'] = le.fit_transform(df['property_type'])
# ------- Encoding neighbourhood_cleansed
df['neighbourhood_cleansed_encoded'] = le.fit_transform(df['neighbourhood_cleansed'])
# ------- Converting room_type to binary feature (Entire home/private room and similar)
df['is_entire_place'] = (df['room_type'] == 'Entire home/apt').astype(int)
# ------- Combining less common bathroom info into 'Other' and encoding
common = df['bathrooms_text'].value_counts()[lambda x: x >= 100].index
df['bathrooms_text'] = df['bathrooms_text'].where(df['bathrooms_text'].isin(common), 'Other')
df['bathrooms_text_encoded'] = le.fit_transform(df['bathrooms_text'])
# ------- Capping minimum_nights at 10, removes unnecessary outliers
df['minimum_nights_capped'] = df['minimum_nights'].clip(upper=10)
# ------- Capping number_of_reviews at 25, removes unnecessary outliers
df['number_of_reviews_capped'] = df['number_of_reviews'].clip(upper=25)
# ------- Converting some features to binary feature
df['instant_bookable'] = (df['instant_bookable'] == 't').astype(int)
df['host_is_superhost'] = (df['host_is_superhost'] == 't').astype(int)
# ------- Encoding host_response_time
df['host_response_time_encoded'] = le.fit_transform(df['host_response_time'])
# ------- Dont use estimated_occupancy_l365d and estimated_revenue_l365d due to data leakag
# ------- Combining less common bathroom info into 'Other' and encoding
common = df['host_verifications'].value_counts()[lambda x: x >= 100].index
df['host_verifications'] = df['host_verifications'].where(df['host_verifications'].isin(common), 'Other')
df['host_verifications_encoded'] = le.fit_transform(df['host_verifications'])
# ------- Creating amenities count features
df['amenities_list'] = df['amenities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['amenities_count'] = df['amenities_list'].apply(len)
keywords = ['Wifi', 'Pool', 'Hot tub', 'Air conditioning', 
            'Free parking', 'Kitchen', 'Washer', 'Dryer', 
            'Heating', 'TV', 'Pets allowed']
df['count_high_value_amenities'] = df['amenities_list'].apply(lambda lst: sum(any(kw.lower() in a.lower() for a in lst) for kw in keywords))
#
coords = df[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=20, random_state=42)
df['location_cluster'] = kmeans.fit_predict(coords)
# ------- Ignore name, description, comments for now


In [109]:
feature_for_model = ['property_type_encoded', 'neighbourhood_cleansed_encoded', 'is_entire_place', 'accommodates', 'bedrooms', 'bathrooms_text_encoded', 'minimum_nights_capped', 'latitude', 'longitude', 'review_scores_rating', 'review_scores_cleanliness', 'number_of_reviews_capped', 'instant_bookable', 'host_is_superhost', 'host_response_time_encoded', 'host_verifications_encoded', 'amenities_count', 'count_high_value_amenities', 'dist_to_raadhus_km', 'location_cluster']

# ===============================
# 1Ô∏è‚É£ Prepare data
# ===============================
X = df[feature_for_model]
y = df['price_log']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# 2Ô∏è‚É£ Define base model
# ===============================
xgb_base = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',          # fast on large datasets
    n_jobs=-1,
    random_state=42
)

# ===============================
# 3Ô∏è‚É£ Define hyperparameter search space
# ===============================
param_dist = {
    'n_estimators': randint(300, 1200),
    'learning_rate': uniform(0.01, 0.2),       # 0.01‚Äì0.21
    'max_depth': randint(3, 10),
    'subsample': uniform(0.6, 0.4),            # 0.6‚Äì1.0
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 0.4),
    'reg_lambda': uniform(0.5, 2.0),
    'reg_alpha': uniform(0, 0.5)
}

# ===============================
# 4Ô∏è‚É£ Randomized Search (CV)
# ===============================
search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_dist,
    n_iter=250,                          # 50 random combos (good tradeoff)
    scoring='neg_root_mean_squared_error',
    cv=3,                               # 3-fold CV
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("üîç Running hyperparameter search...")
start = time.time()
search.fit(X_train, y_train)
print(f"‚úÖ Search done in {time.time()-start:.1f}s")
print("Best parameters:", search.best_params_)

# ===============================
# 5Ô∏è‚É£ Refit best model with early stopping
# ===============================
best_params = search.best_params_
best_model = XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    n_jobs=-1,
    random_state=42
)

best_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False
)

# ===============================
# 6Ô∏è‚É£ Evaluate
# ===============================
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))
print(f"üèÅ Final RMSE on unscaled price: {rmse:.3f}")

# ===============================
# 7Ô∏è‚É£ Feature importance
# ===============================
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüîù Top 10 important features:")
print(importance.head(10))




üîç Running hyperparameter search...
Fitting 3 folds for each of 250 candidates, totalling 750 fits
‚úÖ Search done in 39.8s
Best parameters: {'colsample_bytree': 0.8105610643744453, 'gamma': 0.29263580870213274, 'learning_rate': 0.026325996406117914, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 823, 'reg_alpha': 0.07977234005659417, 'reg_lambda': 2.2435671331844036, 'subsample': 0.6876855949432177}
üèÅ Final RMSE on unscaled price: 534.811

üîù Top 10 important features:
                           feature  importance
2                  is_entire_place    0.497974
3                     accommodates    0.141874
4                         bedrooms    0.130240
18              dist_to_raadhus_km    0.030242
5           bathrooms_text_encoded    0.024924
0            property_type_encoded    0.019022
1   neighbourhood_cleansed_encoded    0.016145
8                        longitude    0.015160
19                location_cluster    0.014743
12                instant_bookable    0.

In [111]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))
print(f"MAE: {mae:.1f}")

MAE: 320.4


In [112]:
mae / np.expm1(y_test).mean()


0.23570202552965924