Note: this notebook must be run on Kaggle: https://www.kaggle.com/code/stegosaurus3000/tree-models

In [None]:
import pandas as pd
from scipy.stats import uniform, randint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [None]:
# Load the cleaned Airbnb CSV from the attached Kaggle input dataset.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/cleaned-df-gesa-csv/airbnb_cleaned_final.csv',
)

In [None]:
# Pairwise correlation matrix over the columns of df.
corr = df.corr()

# Plot only the 'price' column of that matrix as a one-column heatmap,
# with one annotated row per column of the correlation matrix.
price_corr_column = corr['price'].values.reshape(-1, 1)
sns.heatmap(
    price_corr_column,
    xticklabels=['price'],
    yticklabels=corr.columns,
    annot=True,
)

# Visualization Functions

In [None]:
def plot_actual_vs_predicted(y_true, y_pred, model_name):
    """Print regression metrics and plot actual vs. predicted prices.

    Args:
        y_true: Array-like of observed target values (list, NumPy array or
            pandas Series).
        y_pred: Array-like of predicted values, aligned with ``y_true``.
        model_name: Label used in the printed header and in the plot title.

    Returns:
        None. RMSE, MAE and R² are printed to stdout and the scatter plot is
        displayed via ``plt.show()``.
    """
    # Coerce once so that plain Python lists work as well: the axis-range
    # computation below relies on the ``.min()`` / ``.max()`` array methods.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name} Performance Metrics:")
    print(f"{'='*40}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"R²:   {r2:.4f}")
    print(f"{'='*40}")

    plt.figure(figsize=(10, 8))
    plt.scatter(y_true, y_pred, alpha=0.6, s=30, color='hotpink', label='Predicted vs Actual')

    # Diagonal y = x spanning the combined range of both series: points on
    # this line are perfect predictions.
    min_test = min(y_true.min(), y_pred.min())
    max_test = max(y_true.max(), y_pred.max())
    plt.plot([min_test, max_test], [min_test, max_test], color='deeppink', linestyle='--', lw=2, label='Perfect Prediction')

    plt.xlabel('Actual Price', fontsize=12, color='hotpink')
    plt.ylabel('Predicted Price', fontsize=12, color='hotpink')
    plt.title(f'{model_name} - Actual vs Predicted Prices', fontsize=14, fontweight='bold', color='hotpink')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# Model Training

## Data Splitting

In [None]:
# Target is the 'price' column; features are every remaining column except
# the 'id' and 'host_id' columns.
y = df['price']
X = df.drop(columns=['price', 'id', 'host_id'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Hyperparameter Tuning

In [None]:
# Candidate values sampled by the XGBoost randomized search.
xgb_param_grid = dict(
    n_estimators=[300, 500],
    learning_rate=[0.01, 0.03, 0.05],
    max_depth=[3, 4, 5],
    min_child_weight=[3, 5, 7],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
    reg_alpha=[0.1, 0.5, 1.0],  # L1 regularisation strength
    reg_lambda=[2, 5, 10],  # L2 regularisation strength
)

# Candidate values sampled by the GradientBoostingRegressor randomized search.
gb_param_dist = dict(
    n_estimators=[300, 500],
    learning_rate=[0.01, 0.03, 0.05],
    max_depth=[3, 4, 5],
    min_samples_split=[10, 20, 30],
    min_samples_leaf=[5, 10, 15],
    subsample=[0.7, 0.8],
    max_features=['sqrt', 'log2', 0.6],
)

# Candidate values sampled by the RandomForestRegressor randomized search.
gr_space = dict(
    n_estimators=[300, 500],
    max_depth=[3, 4, 5],
    min_samples_leaf=[5, 10, 15],
    max_features=['sqrt', 'log2', 0.6],
    criterion=['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
)

## Random Forests

In [None]:
# Randomized hyperparameter search for the random forest over gr_space.
# random_state is fixed on both the estimator and the search so the sampled
# candidates and the fitted forests are reproducible, in line with the
# gradient-boosting and XGBoost searches in this notebook.
reg_grid = RandomForestRegressor(random_state=42)
grid = RandomizedSearchCV(
    estimator=reg_grid,
    param_distributions=gr_space,
    cv=3,
    scoring='r2',
    n_jobs=-1,  # evaluate candidates in parallel, like the other searches
    verbose=1,
    random_state=42,
)
# fit() returns the fitted search object itself; gr_grid is used downstream.
gr_grid = grid.fit(X_train, y_train)

print(f'Best Random Forest Parameters: {gr_grid.best_params_}')
# '.2f' rounds to two decimals; ':2f' would only set a minimum field width of 2.
print(f'Best R2 score: {gr_grid.best_score_:.2f}')

## Gradient Boosting

In [None]:
# Base gradient-boosting estimator with early stopping: a fraction of the
# training data is held out and boosting stops once 15 consecutive iterations
# bring no improvement larger than tol.
gb_model = GradientBoostingRegressor(
    tol=1e-4,
    n_iter_no_change=15,
    validation_fraction=0.2,
    random_state=42,
)

# Settings for the randomized search: 40 sampled candidates, 3-fold CV,
# scored by R², run on all available cores with a fixed seed.
gb_search_options = dict(
    n_iter=40,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
gb_grid = RandomizedSearchCV(gb_model, gb_param_dist, **gb_search_options)
gb_grid.fit(X_train, y_train)

print(f"Best Gradient Boosting parameters: {gb_grid.best_params_}")
print(f"Best Gradient Boosting R2 score: {gb_grid.best_score_:.2f}")

## XGBoost

In [None]:
# Base XGBoost regressor: squared-error objective, MAE as evaluation metric,
# silent logging and a fixed seed.
xgb_base = xgb.XGBRegressor(
    random_state=42,
    verbosity=0,
    eval_metric='mae',
    objective='reg:squarederror',
)

# Randomized search: 50 sampled candidates from xgb_param_grid, 3-fold CV,
# scored by R², run on all available cores with a fixed seed.
xgb_grid = RandomizedSearchCV(
    xgb_base,
    xgb_param_grid,
    random_state=42,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
    cv=3,
    n_iter=50,
)
xgb_grid.fit(X_train, y_train)

print(f"Best XGBoost parameters: {xgb_grid.best_params_}")
print(f"Best XGBoost R2 score: {xgb_grid.best_score_:.2f}")

# Model Evaluation

In [None]:
#Train best models
xgb_best = xgb_grid.best_estimator_
gb_best = gb_grid.best_estimator_
gr_best = gr_grid.best_estimator_

#Get predictions from the best models
xgb_pred_best = xgb_best.predict(X_test)
gb_pred_best = gb_best.predict(X_test)
gr_pred_best = gr_best.predict(X_test)

# Visualizatuins
plot_actual_vs_predicted(y_test, xgb_pred_best, "XGBoost Best Model")
plot_actual_vs_predicted(y_test, gb_pred_best, "GradientBoosting Best Model")
plot_actual_vs_predicted(y_test, gr_pred_best, "GradientRegression Best Model")
