In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [3]:
# Load the cleaned listings dataset and summarise its columns and dtypes.
listing_path = 'listings_clean.csv'
listing_data = pd.read_csv(listing_path)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
listing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7391 entries, 0 to 7390
Data columns (total 80 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   host_is_superhost                                                              7391 non-null   float64
 1   host_listings_count                                                            7391 non-null   float64
 2   host_identity_verified                                                         7391 non-null   int64  
 3   latitude                                                                       7391 non-null   float64
 4   longitude                                                                      7391 non-null   float64
 5   accommodates                                                                   7391 non-null   float64
 6   bedrooms                

In [None]:
# Hold out 20% of the rows as a test set (shuffled, fixed seed). The 'price'
# column is the regression target; every other column is a feature.
X_all = listing_data.drop(columns='price')
y_all = listing_data['price']
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Linear baselines: fit Ridge and Lasso with their default regularisation
# strength on the training split and report R² / RMSE on the test split.

# fit() returns the estimator itself, so construction and fitting can be chained.
ridge = Ridge(random_state=42).fit(X_train_valid, y_train_valid)
ridge_test_pred = ridge.predict(X_test)  # predict once, reuse for both metrics
print(f"R2 with Ridge: {r2_score(y_test, ridge_test_pred)}")
print(f"RMSE with Ridge: {root_mean_squared_error(y_test, ridge_test_pred)}")

lasso = Lasso(random_state=42).fit(X_train_valid, y_train_valid)
lasso_test_pred = lasso.predict(X_test)  # predict once, reuse for both metrics
print(f"R2 with Lasso: {r2_score(y_test, lasso_test_pred)}")
print(f"RMSE with Lasso: {root_mean_squared_error(y_test, lasso_test_pred)}")

R2 with Ridge: 0.4154598925377361
RMSE with Ridge: 150.78713737345626


R2 with Lasso: 0.3660244324713632
RMSE with Lasso: 157.03389375763956


In [None]:
# Search space for the Random Forest Regressor: 3 values for each of 4
# hyperparameters, i.e. 81 candidate combinations.
param_grid_rf = dict(
    n_estimators=[150, 300, 450],
    max_depth=[None, 10, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 3-fold cross-validated search that selects the candidate with the
# highest R². The forest itself is built with n_jobs=-1.
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    scoring='r2',
    cv=3,
)
grid_rf.fit(X_train_valid, y_train_valid)
print(f"Best params for Random Forest Regressor: {grid_rf.best_params_}")

Best params for Random Forest Regressor: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [None]:
# Evaluate the best random forest found by the grid search on both the
# training data (to gauge overfitting) and the held-out test data.
best_model_rf = grid_rf.best_estimator_
rf_train_r2 = best_model_rf.score(X_train_valid, y_train_valid)
rf_test_r2 = best_model_rf.score(X_test, y_test)
rf_test_mse = mean_squared_error(y_test, best_model_rf.predict(X_test))
print(f"Train Score R²: {rf_train_r2}")
print(f"Test Score R²: {rf_test_r2}")
print(f"RMSE with RF: {np.sqrt(rf_test_mse)}")

Train Score R²: 0.9519102032599146
Test Score R²: 0.7475979821675414
RMSE with RF: 99.08400444924503


In [None]:
# Search space for the Gradient Boosting Regressor: 3 values for each of 5
# hyperparameters = 243 candidates, i.e. 729 fits under 3-fold CV.
param_grid_gb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300, 450],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 0.9, 1.0],
    'min_samples_leaf': [1, 3, 5],
}

# Perform GridSearchCV to find the best hyperparameters for the Gradient Boosting Regressor.
# GradientBoostingRegressor has no n_jobs option of its own, so the candidate
# fits are parallelised at the search level instead of running all 729 fits
# serially. The fixed random_state keeps the selected parameters reproducible.
grid_gb = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid_gb,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
# Fit the model
grid_gb.fit(X_train_valid, y_train_valid)
print(f"Best params for GradientBoosting: {grid_gb.best_params_}")

Best params for GradientBoosting: {'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 3, 'n_estimators': 450, 'subsample': 0.8}


In [54]:
# Evaluate the best gradient boosting model found by the grid search on both
# the training data (to gauge overfitting) and the held-out test data.
best_model_gb = grid_gb.best_estimator_
gb_train_r2 = best_model_gb.score(X_train_valid, y_train_valid)
gb_test_r2 = best_model_gb.score(X_test, y_test)
gb_test_mse = mean_squared_error(y_test, best_model_gb.predict(X_test))
print(f"Train Score R²: {gb_train_r2}")
print(f"Test Score R²: {gb_test_r2}")
print(f"RMSE with GB: {np.sqrt(gb_test_mse)}")

Train Score R²: 0.9464959280356925
Test Score R²: 0.7690964000982994
RMSE with GB: 94.77035136285062


In [None]:
# Base learners of the stack, rebuilt from the hyperparameters selected by the
# earlier gradient boosting and random forest grid searches.
estimators = [
    ('gb', GradientBoostingRegressor(random_state=42, **grid_gb.best_params_)),
    ('rf', RandomForestRegressor(random_state=42, n_jobs=-1 , **grid_rf.best_params_)),
]
# Only the meta-learner is tuned here (3 * 2 * 3 = 18 candidates); the
# "final_estimator__" prefix routes each parameter to the stack's final estimator.
param_grid_final = {
    'final_estimator__learning_rate': [0.01, 0.05, 0.1],
    'final_estimator__n_estimators': [100, 300],
    'final_estimator__max_depth': [3, 5, 7],
}

stacking_reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42)
)

# Perform GridSearchCV to find the best hyperparameters for the Stacking Regressor
grid_stacking = GridSearchCV(stacking_reg, param_grid_final, cv=3, scoring='r2')
# Fit the model
grid_stacking.fit(X_train_valid, y_train_valid)
print(f"Best params for Stacking: {grid_stacking.best_params_}")

# Best model (refit on the full training split by GridSearchCV)
best_model = grid_stacking.best_estimator_

# Cross-validation scores of the winning candidate. The grid search already
# evaluated it with the same cv=3 splits, scoring and seeded estimator on the
# same data, so its per-fold scores are read from cv_results_ instead of
# re-fitting the whole stack three more times with cross_val_score.
cv_scores = np.array([
    grid_stacking.cv_results_[f'split{fold}_test_score'][grid_stacking.best_index_]
    for fold in range(grid_stacking.n_splits_)
])
print(f"Mean CV R²: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Evaluate the model on the held-out test split
predictions = best_model.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Print the results
print(f"Train Score R²: {best_model.score(X_train_valid, y_train_valid)}")
print(f"Final Stacking R²: {r2:.4f}")
print(f"Final Stacking RMSE: {rmse:.4f}")

Best params for Stacking: {'final_estimator__learning_rate': 0.05, 'final_estimator__max_depth': 3, 'final_estimator__n_estimators': 100}
Mean CV R²: 0.6583 ± 0.0359
Train Score R²: 0.952088184224459
Final Stacking R²: 0.7788
Final Stacking RMSE: 92.7632


In [None]:
# Feature pruning experiment: drop the features the tuned random forest rates
# as least important, retrain a forest on the reduced feature set, and compare
# its test metrics with those of the full-feature model.

# Features whose importance falls below this value are dropped.
IMPORTANCE_THRESHOLD = 0.004

# Importances come from the best estimator of the random forest grid search,
# which was fitted on X_train_valid, so they align with its columns.
feature_importances = grid_rf.best_estimator_.feature_importances_

feature_importances_df = pd.DataFrame({
    'Feature': X_train_valid.columns,
    'Importance': feature_importances
})
low_importance_mask = feature_importances_df['Importance'] < IMPORTANCE_THRESHOLD
features_to_drop = feature_importances_df.loc[low_importance_mask, 'Feature'].tolist()

# Drop the features from the training and test sets
X_train_valid_dropped = X_train_valid.drop(columns=features_to_drop)
X_test_dropped = X_test.drop(columns=features_to_drop)

# Retrain with the hyperparameters selected by the grid search. Reading them
# from best_params_ (rather than hand-copying the values) keeps this cell in
# sync if the search is re-run with a different outcome; n_jobs=-1 matches the
# other forests in this notebook.
reg_dropped = RandomForestRegressor(random_state=42, n_jobs=-1, **grid_rf.best_params_)
reg_dropped.fit(X_train_valid_dropped, y_train_valid)

# Predict once and reuse the result for both metrics.
predictions_dropped = reg_dropped.predict(X_test_dropped)
r2_score_dropped = r2_score(y_test, predictions_dropped)
print(f"Test Score R²: {r2_score_dropped}")
print(f"RMSE with RF: {np.sqrt(mean_squared_error(y_test, predictions_dropped))}")

Test Score R²: 0.7475189561885485
RMSE with RF: 99.09951462169941


Multi-Layer Perceptron 

In [None]:
# Define the hyperparameters for the MLP Regressor. Every setting lists a
# single value, so the grid contains exactly one candidate and the search
# below amounts to a 5-fold cross-validated fit of that configuration.
# NOTE(review): the features are passed to the network as loaded; if
# listings_clean.csv is not already standardised, consider scaling them first.
param_grid = dict(
    hidden_layer_sizes=[(100, 100)],
    activation=['relu'],
    solver=['adam'],
    alpha=[0.0001],
    learning_rate=['adaptive'],
    learning_rate_init=[0.001],
    max_iter=[200],
    random_state=[42],
)

# Run the grid search (5 folds, parallel workers, verbose progress output).
mlp = MLPRegressor()
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_valid, y_train_valid)
print(f"Best hyperparameters: {grid_search.best_params_}")

# Score the refit best estimator on the training and held-out test splits.
best_model = grid_search.best_estimator_
mlp_train_score = best_model.score(X_train_valid, y_train_valid)
print(f"Train Score: {mlp_train_score}")
mlp_test_score = best_model.score(X_test, y_test)
print(f"Test Score: {mlp_test_score}")
mlp_test_rmse = root_mean_squared_error(y_test, best_model.predict(X_test))
print(f"RMSE with MLP: {mlp_test_rmse}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best hyperparameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_iter': 200, 'random_state': 42, 'solver': 'adam'}
Train Score: 0.8332932049153928
Test Score: 0.7163134461742924
RMSE with MLP: 105.04527313980944
