# Modeling

## Load libraries

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import os

## Load data

In [23]:
# Read the modeling CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
modeling_data = pd.read_csv('../data/modeling/modeling_data.csv')

In [24]:
# Preview the first rows as a quick sanity check of the load.
modeling_data.head()

Unnamed: 0,id,name,host_id,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,...,property_type,host_acceptance_rate,maximum_nights,listing_url,region,distance_to_cristo_redentor,distance_to_pan_de_azucar,distance_to_copacabana_beach,distance_to_ipanema_beach,distance_to_botanical_garden
0,2979359,"Quarto em Botafogo perto da praia, show da Mad...",12459092,-22.954477,-43.182747,2,51.0,2,1,2018-02-13,...,37,99,1125,https://www.airbnb.com/rooms/2979359,130,2.856086,2.885337,1.849209,3.95772,4.96844
1,48931984,LOVELY 4BR WITH AMAZING OCEAN VIEW IN COPACABANA,235496,-22.96596,-43.17555,0,164.05,2,106,2024-06-09,...,16,98,30,https://www.airbnb.com/rooms/48931984,32,3.905034,2.832851,0.888922,3.572558,5.495067
2,1162632127795605756,"Copacabana, Av Atlântica - AC + Laundry + Wi-Fi",10406037,-22.97247,-43.18471,0,20.23,1,1,2024-06-11,...,16,98,89,https://www.airbnb.com/rooms/1162632127795605756,93,3.493401,4.013975,0.298728,2.391609,4.589571
3,11691696,Hostel Copacabana Pacotes Trilhas,16613801,-22.96726,-43.18928,2,22.95,2,4,2024-05-06,...,41,99,1125,https://www.airbnb.com/rooms/11691696,51,2.763596,4.050868,0.841259,2.418955,4.087054
4,493569,Quarto familiar em Copacabana,2438532,-22.96515,-43.1891,2,23.12,2,138,2024-06-03,...,41,99,10,https://www.airbnb.com/rooms/493569,41,2.640371,3.919596,0.967863,2.613891,4.113101


In [25]:
# List each column's dtype and non-null count to spot missing values and
# non-numeric columns before modeling.
modeling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20964 entries, 0 to 20963
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            20964 non-null  int64  
 1   name                          20964 non-null  object 
 2   host_id                       20964 non-null  int64  
 3   latitude                      20964 non-null  float64
 4   longitude                     20964 non-null  float64
 5   room_type                     20964 non-null  int64  
 6   price                         20964 non-null  float64
 7   minimum_nights                20964 non-null  int64  
 8   number_of_reviews             20964 non-null  int64  
 9   last_review                   20964 non-null  object 
 10  availability_365              20964 non-null  int64  
 11  number_of_reviews_ltm         20964 non-null  int64  
 12  neighbourhood_cleansed        20964 non-null  int64  
 13  h

Dropping the object columns that are not useful for the model.

In [26]:
# Keep only the non-object columns. `DataFrame.drop` returns a new frame, so the
# raw `modeling_data` stays untouched without an explicit copy.
object_columns = modeling_data.select_dtypes(include='object').columns
modeling_data_df = modeling_data.drop(columns=object_columns)
# NOTE(review): the identifier columns `id` and `host_id` are numeric, so they
# remain in the frame and end up as model features — confirm this is intended.
modeling_data_df.head()

Unnamed: 0,id,host_id,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,number_of_reviews_ltm,...,review_scores_value,property_type,host_acceptance_rate,maximum_nights,region,distance_to_cristo_redentor,distance_to_pan_de_azucar,distance_to_copacabana_beach,distance_to_ipanema_beach,distance_to_botanical_garden
0,2979359,12459092,-22.954477,-43.182747,2,51.0,2,1,364,0,...,5.0,37,99,1125,130,2.856086,2.885337,1.849209,3.95772,4.96844
1,48931984,235496,-22.96596,-43.17555,0,164.05,2,106,256,42,...,4.75,16,98,30,32,3.905034,2.832851,0.888922,3.572558,5.495067
2,1162632127795605756,10406037,-22.97247,-43.18471,0,20.23,1,1,18,1,...,5.0,16,98,89,93,3.493401,4.013975,0.298728,2.391609,4.589571
3,11691696,16613801,-22.96726,-43.18928,2,22.95,2,4,365,1,...,4.75,41,99,1125,51,2.763596,4.050868,0.841259,2.418955,4.087054
4,493569,2438532,-22.96515,-43.1891,2,23.12,2,138,347,12,...,4.8,41,99,10,41,2.640371,3.919596,0.967863,2.613891,4.113101


## Split the data

In [27]:
# Separate the target (`price`) from the predictors, then hold out 20% of the
# rows as a test set. The seed is fixed so the split is repeatable.
y = modeling_data_df['price']
X = modeling_data_df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Scale the data

In [28]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## General models

### 1. Random forest

In [29]:
# Baseline random forest: library-default hyperparameters, fixed seed.
model_rf = RandomForestRegressor(random_state=42)

In [30]:
# Train on the standardized training split.
model_rf.fit(X_train_scaled, y_train)

In [35]:
# Predict once and reuse the result for all four metrics instead of re-running
# inference on the test set for every metric.
y_pred_rf = model_rf.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
rmse = root_mean_squared_error(y_test, y_pred_rf)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')
print(f'RMSE: {rmse}')

MAE: 21.699486852206455
MSE: 888.5054683996179
R2: 0.5451337384357373
RMSE: 29.807808849353854


### 2. Gradient boosting regressor

In [36]:
# Baseline gradient boosting model: library-default hyperparameters, fixed seed.
# Despite the `gbc` suffix in the variable name, this is a regressor.
model_gbc = GradientBoostingRegressor(random_state=42)

In [37]:
# Train on the standardized training split.
model_gbc.fit(X_train_scaled, y_train)

In [39]:
# Predict once and reuse the result for all four metrics instead of re-running
# inference on the test set for every metric.
y_pred_gbc = model_gbc.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_gbc)
mse = mean_squared_error(y_test, y_pred_gbc)
r2 = r2_score(y_test, y_pred_gbc)
rmse = root_mean_squared_error(y_test, y_pred_gbc)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')
print(f'RMSE: {rmse}')

MAE: 22.057237204098147
MSE: 919.2205204170231
R2: 0.5294093097385522
RMSE: 30.318649712957587


### 3. XGBoost

In [40]:
# Baseline XGBoost regressor: library-default hyperparameters, fixed seed.
model_xgb = xgb.XGBRegressor(random_state=42)

In [41]:
# Train on the standardized training split.
model_xgb.fit(X_train_scaled, y_train)

In [42]:
# Predict once and reuse the result for all four metrics instead of re-running
# inference on the test set for every metric.
y_pred_xgb = model_xgb.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_xgb)
mse = mean_squared_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)
rmse = root_mean_squared_error(y_test, y_pred_xgb)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')
print(f'RMSE: {rmse}')

MAE: 20.87187120241476
MSE: 847.5889093510237
R2: 0.5660807814337177
RMSE: 29.113380246048788


### 4. K-nearest neighbors

In [43]:
# Baseline k-nearest-neighbors regressor with library-default settings.
model_knn = KNeighborsRegressor()

In [44]:
# Train on the standardized training split.
model_knn.fit(X_train_scaled, y_train)

In [45]:
# Predict once and reuse the result for all four metrics instead of re-running
# inference on the test set for every metric.
y_pred_knn = model_knn.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_knn)
mse = mean_squared_error(y_test, y_pred_knn)
r2 = r2_score(y_test, y_pred_knn)
rmse = root_mean_squared_error(y_test, y_pred_knn)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')
print(f'RMSE: {rmse}')

MAE: 25.7573088900558
MSE: 1266.6890261491733
R2: 0.35152441669632073
RMSE: 35.590574962329185


### 5. Support vector machine

In [46]:
# Baseline support vector regressor with library-default settings.
model_svr = SVR()

In [47]:
# Train on the standardized training split.
model_svr.fit(X_train_scaled, y_train)

In [48]:
# Predict once and reuse the result for all four metrics instead of re-running
# inference on the test set for every metric.
y_pred_svr = model_svr.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred_svr)
mse = mean_squared_error(y_test, y_pred_svr)
r2 = r2_score(y_test, y_pred_svr)
rmse = root_mean_squared_error(y_test, y_pred_svr)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')
print(f'RMSE: {rmse}')

MAE: 24.263971971664148
MSE: 1240.6826825399924
R2: 0.3648382439210096
RMSE: 35.22332583019372


I decided to use Random Forest and XGBoost for the final model because they have the best performance (lowest RMSE and highest R2 on the test set).

In [49]:
# Refit every candidate in one loop for a side-by-side comparison.
# The stochastic estimators get the same fixed seed as their individual cells
# above, so repeated runs print the same numbers. KNN and SVR take no seed here.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    'KNeighbors': KNeighborsRegressor(),
    'SVR': SVR()
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    # One prediction pass per model, shared by both metrics.
    y_pred = model.predict(X_test_scaled)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name}: RMSE = {rmse}')
    print(f'{name}: R2 = {r2}')


Random Forest: RMSE = 29.86438649339263
Random Forest: R2 = 0.5434053534094359
Gradient Boosting: RMSE = 30.318649712957587
Gradient Boosting: R2 = 0.5294093097385522
XGBoost: RMSE = 29.113380246048788
XGBoost: R2 = 0.5660807814337177
KNeighbors: RMSE = 35.590574962329185
KNeighbors: R2 = 0.35152441669632073
SVR: RMSE = 35.22332583019372
SVR: R2 = 0.3648382439210096


## Hyperparameter tuning: Random Forest and XGBoost

### 1. Random Forest

In [55]:
# Randomized hyperparameter search for the random forest.
# Every name used here (RandomForestRegressor, RandomizedSearchCV, the metric
# helpers) is already imported in the notebook's import cell, so nothing is
# re-imported in this cell.
rf = RandomForestRegressor(random_state=42)

# Candidate values the search samples from.
param_dist = {
    'n_estimators': list(range(200, 2001, 200)),      # 200, 400, ..., 2000
    'max_features': ['sqrt', 'log2', None],
    'max_depth': list(range(10, 111, 10)) + [None],   # 10, 20, ..., 110, or unlimited
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# 100 sampled configurations, each evaluated with 3-fold cross-validation on
# the training split. `scoring` is not set, so candidates are ranked with the
# estimator's own `score` method (R2 for scikit-learn regressors).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)

print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the best configuration on the held-out test split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
# Same RMSE helper as the baseline cells, for consistency.
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE) on Test Set: {mse}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse}')
print(f'R2 Score on Test Set: {r2}')


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
Mean Squared Error (MSE) on Test Set: 855.254084547912
Root Mean Squared Error (RMSE) on Test Set: 29.24472746578282
R2 Score on Test Set: 0.5621566304745527


### 2. XGBoost

In [56]:
# Randomized hyperparameter search for XGBoost.
# Every name used here (xgb, RandomizedSearchCV, the metric helpers) is already
# imported in the notebook's import cell, so nothing is re-imported in this cell.
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Candidate values the search samples from.
param_dist = {
    'n_estimators': list(range(100, 1001, 100)),   # 100, 200, ..., 1000
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'min_child_weight': [1, 2, 3, 4],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0, 0.01, 0.1, 1]
}

# 100 sampled configurations, each evaluated with 3-fold cross-validation on
# the training split. `scoring` is not set, so candidates are ranked with the
# estimator's own `score` method (R2 for scikit-learn regressors).
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# The shared training variables are deliberately NOT rebound here (for example
# through `np.nan_to_num`). Rebinding `X_train_scaled` / `y_train` would change
# what every other cell sees on re-run and would not be mirrored on
# `X_test_scaled`. The same arrays were already fitted by the scikit-learn
# estimators in the cells above, which validate their input for NaN/inf.
random_search.fit(X_train_scaled, y_train)

print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the best configuration on the held-out test split.
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
# Same RMSE helper as the baseline cells, for consistency.
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE) on Test Set: {mse}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse}')
print(f'R2 Score on Test Set: {r2}')


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Hyperparameters: {'subsample': 0.7, 'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimators': 1000, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.9}
Mean Squared Error (MSE) on Test Set: 782.4688124954017
Root Mean Squared Error (RMSE) on Test Set: 27.97264400258584
R2 Score on Test Set: 0.5994187135713476
