In [2]:
#!pip install catboost

In [3]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder,PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import tensorflow as tf
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate, KFold

import warnings
warnings.filterwarnings("ignore")


In [4]:
df = pd.read_csv("Cleaned_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,host_id,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_identity_verified,zipcode,is_location_exact,accommodates,...,host_commitment,bedroom_quality,space_per_guest,seasonal_demand,recent_review_boost,essential_amenities,luxury_amenities,review_consistency,positive_momentum,price_value
0,0,3881880.0,2,100.0,0,19.0,1,92101.0,1,1.0,...,1530.333333,0.666667,0.47619,-0.47116,1.0,3,3,0.011628,58.91751,40.0
1,1,4888818.0,2,100.0,0,1.0,0,92122.0,1,8.0,...,166.111111,1.4,1.568627,0.972118,0.5,2,0,0.132653,255.988561,18.875
2,2,9832430.0,2,100.0,1,12.0,0,92103.0,1,3.0,...,24.590643,0.666667,2.727273,-0.128748,0.5,3,1,0.858586,436.526035,39.666667
3,3,183755154.0,1,100.0,1,7.0,0,92101.0,1,2.0,...,12.14554,0.0,1.818182,0.523416,0.5,3,0,1.070707,457.937226,59.5
4,4,8336938.0,1,100.0,1,21.0,1,92109.0,0,6.0,...,1426.333333,1.333333,1.463415,-0.985948,1.0,3,0,0.009901,69.314718,77.916667


In [5]:
# Different Features than the First one to enhance Model
enhanced_features = ['host_response_rate','host_is_superhost','host_listings_count','accommodates','bathrooms','bedrooms','beds','number_of_reviews','number_of_stays','review_duration_days','host_duration_days','price_value',
'host_response_power','host_commitment','bedroom_quality','space_per_guest', 'essential_amenities','review_consistency','instant_bookable','host_identity_verified','room_type_cleaned', 'cancellation_policy_cleaned']

In [6]:
## Preprocessing using Pipeline

# Categorize the columns
num_cols = df.select_dtypes(include=['number']).columns.tolist()
binary_cols = [c for c in num_cols if df[c].nunique() == 2]
cont_cols = [c for c in num_cols if c not in binary_cols]
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numerical columns:", num_cols)
print("Binary columns:", binary_cols)
print("Continuous columns:", cont_cols)
print("Categorical columns:", cat_cols)

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, [f for f in enhanced_features if f not in ['room_type_cleaned', 'cancellation_policy_cleaned']]),
    ('cat', categorical_transformer, ['room_type_cleaned', 'cancellation_policy_cleaned'])
])


Numerical columns: ['Unnamed: 0', 'host_id', 'host_response_time', 'host_response_rate', 'host_is_superhost', 'host_listings_count', 'host_identity_verified', 'zipcode', 'is_location_exact', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'nightly_price', 'price_per_stay', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'number_of_reviews', 'number_of_stays', 'review_scores_rating', 'instant_bookable', 'has_heating', 'has_carbon_monoxide_detector', 'has_hot_water', 'has_smoke_detector', 'has_iron', 'has_laptop_friendly_workspace', 'has_kitchen', 'has_air_conditioning', 'has_essentials', 'has_hangers', 'has_wifi', 'has_tv', 'has_lock_on_bedroom_door', 'has_gym', 'has_pool', 'has_hot_tub', 'has_free_parking_on_premises', 'has_private_entrance', 'has_elevator', 'description_cleaned', 'neighborhood_overview_cleaned', 'notes_cleaned', 'transit_cleaned', 'access_cleaned', 'interaction_cleaned', 'house_rules_cleaned', 'host_location_

In [7]:
# Gradient Boosting with Early Stopping
gb_enhanced = GradientBoostingRegressor(n_estimators=200,learning_rate=0.05,max_depth=4,min_samples_split=20,subsample=0.7, validation_fraction=0.2,n_iter_no_change=10, random_state=42)

enhanced_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', gb_enhanced)
])

X_enhanced = df[enhanced_features]
y = df['review_scores_rating']

# K-Fold Cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(enhanced_pipeline,X_enhanced,y,cv=kf,scoring='r2')
print(f"\nGradient Boosting Cross-Validation R²: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_enhanced, y, test_size=0.2, random_state=42)

enhanced_pipeline.fit(X_train, y_train)

# Evaluation
train_pred = enhanced_pipeline.predict(X_train)
test_pred = enhanced_pipeline.predict(X_test)

print("Gradient Boosting Model Performance:")
print(f"Train R²: {r2_score(y_train, train_pred):.3f}")
print(f"Test R²: {r2_score(y_test, test_pred):.3f}")
print(f"Overfit: {r2_score(y_train, train_pred) - r2_score(y_test, test_pred):.3f}")


Gradient Boosting Cross-Validation R²: 0.689 (±0.014)
Gradient Boosting Model Performance:
Train R²: 0.735
Test R²: 0.691
Overfit: 0.044


In [8]:
results = []

models = {
    "Linear Regression": LinearRegression(),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0),
    "Ridge Regression": Ridge(alpha=1.0, random_state=42),
    "Lasso Regression": Lasso(alpha=0.1, random_state=42),
    "ElasticNet Regression": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
    "MLP Regressor": MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42)
}

# K-Fold Cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")

    model_pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # Cross-validation scores
    cv_train_scores = cross_val_score(model_pipeline, X_enhanced, y, cv=kf, scoring='r2')
    print(f"Cross-Validation Train R²: {cv_train_scores.mean():.3f} (±{cv_train_scores.std():.3f})")

    X_train, X_test, y_train, y_test = train_test_split(X_enhanced, y, test_size=0.2, random_state=42)

    model_pipeline.fit(X_train, y_train)

    # Predictions
    train_pred = model_pipeline.predict(X_train)
    test_pred = model_pipeline.predict(X_test)

    print(f"Final Train R²: {r2_score(y_train, train_pred):.3f}")
    print(f"Final Test R²: {r2_score(y_test, test_pred):.3f}")
    print(f"Overfit: {r2_score(y_train, train_pred) - r2_score(y_test, test_pred):.3f}")

    results.append({'Model': model_name,'Cross-Validation R² Mean': cv_train_scores.mean(),'Cross-Validation R² Std': cv_train_scores.std(),'Train R²': r2_score(y_train, train_pred),'Test R²': r2_score(y_test, test_pred),'Overfit': r2_score(y_train, train_pred) - r2_score(y_test, test_pred)})

# Convert results to a DataFrame for comparison
model_comparison = pd.DataFrame(results)

# Display the model comparison table
print("\nModel Comparison:")
print(model_comparison)



Evaluating Linear Regression...
Cross-Validation Train R²: 0.341 (±0.011)
Final Train R²: 0.345
Final Test R²: 0.341
Overfit: 0.005

Evaluating Gradient Boosting...
Cross-Validation Train R²: 0.618 (±0.018)
Final Train R²: 0.636
Final Test R²: 0.622
Overfit: 0.014

Evaluating Support Vector Regressor...
Cross-Validation Train R²: 0.104 (±0.029)
Final Train R²: 0.136
Final Test R²: 0.108
Overfit: 0.027

Evaluating XGBoost...
Cross-Validation Train R²: 0.878 (±0.006)
Final Train R²: 0.977
Final Test R²: 0.883
Overfit: 0.093

Evaluating LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1708
[LightGBM] [Info] Number of data points in the train set: 6979, number of used features: 24
[LightGBM] [Info] Start training from score 95.882075
[LightGBM] [Info] Auto-choosing row-wise 