In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder,PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
import tensorflow as tf
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate, KFold

import warnings
warnings.filterwarnings("ignore")


In [2]:
listing_features_df = pd.read_csv("Features1.csv")
listing_features_df = listing_features_df.drop(columns = "Unnamed: 0")
listing_features_df.head()

Unnamed: 0,host_listings_count,accommodates,beds,cleaning_fee,minimum_nights,maximum_nights,review_scores_rating,host_about_cleaned,house_rules_cleaned,notes_cleaned,cancellation_policy_cleaned,description_cleaned,interaction_cleaned,access_cleaned,room_type_cleaned,property_type_cleaned,host_response_time
0,19.0,1.0,2.0,50.0,5.0,1125.0,85.0,0.000986,0.000998,0.000986,1,0.00095,0.000989,0.000998,1,9,2
1,1.0,8.0,5.0,100.0,4.0,15.0,97.0,0.00099,0.000988,0.00099,1,0.000939,0.000994,0.000995,0,9,2
2,12.0,3.0,1.0,90.0,1.0,1125.0,98.0,0.000968,0.00096,0.000993,1,0.000941,0.000996,0.000993,0,9,2
3,7.0,2.0,1.0,75.0,2.0,1125.0,98.0,0.00099,0.000996,0.000991,1,0.000984,0.00099,0.000981,0,9,1
4,21.0,6.0,4.0,165.0,2.0,1125.0,100.0,0.000959,0.000998,0.000988,1,0.000953,0.000983,0.000982,0,9,1


In [3]:
binary_df = pd.read_csv("Binary_Features.csv")
binary_df = binary_df.drop(columns = "Unnamed: 0")
binary_df.head()

Unnamed: 0,has_smoke_detector,has_heating,has_hot_water,has_carbon_monoxide_detector,has_essentials,has_kitchen,has_laptop_friendly_workspace,has_iron,has_air_conditioning,has_hangers,has_tv,has_wifi,has_lock_on_bedroom_door,has_pool,has_hot_tub,has_gym,has_free_parking_on_premises,has_private_entrance,has_elevator
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,1,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,0
3,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0


In [4]:
listing_features_df = listing_features_df.join(binary_df)
listing_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8724 entries, 0 to 8723
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   host_listings_count            8724 non-null   float64
 1   accommodates                   8724 non-null   float64
 2   beds                           8724 non-null   float64
 3   cleaning_fee                   8724 non-null   float64
 4   minimum_nights                 8724 non-null   float64
 5   maximum_nights                 8724 non-null   float64
 6   review_scores_rating           8724 non-null   float64
 7   host_about_cleaned             8724 non-null   float64
 8   house_rules_cleaned            8724 non-null   float64
 9   notes_cleaned                  8724 non-null   float64
 10  cancellation_policy_cleaned    8724 non-null   int64  
 11  description_cleaned            8724 non-null   float64
 12  interaction_cleaned            8724 non-null   f

In [5]:
selected_columns = ['cancellation_policy_cleaned','description_cleaned','interaction_cleaned','access_cleaned','room_type_cleaned','property_type_cleaned','host_response_time','has_carbon_monoxide_detector', 'has_hot_water', 'has_smoke_detector','has_heating','has_air_conditioning', 'has_essentials','has_kitchen','has_iron','has_hangers','has_laptop_friendly_workspace',]

# Display the first 10 rows of the selected columns
first_10_rows = listing_features_df[selected_columns].head(10)
print(first_10_rows)

   cancellation_policy_cleaned  description_cleaned  interaction_cleaned  \
0                            1             0.000950             0.000989   
1                            1             0.000939             0.000994   
2                            1             0.000941             0.000996   
3                            1             0.000984             0.000990   
4                            1             0.000953             0.000983   
5                            1             0.000948             0.000983   
6                            1             0.000947             0.000994   
7                            1             0.000955             0.000998   
8                            1             0.000936             0.000977   
9                            1             0.000968             0.000998   

   access_cleaned  room_type_cleaned  property_type_cleaned  \
0        0.000998                  1                      9   
1        0.000995                  0 

In [6]:
# Interaction between numerical features
listing_features_df['accommodates_per_bed'] = listing_features_df['accommodates'] / (listing_features_df['beds'] + 1e-9)
# Interaction between binary features
listing_features_df['has_kitchen_and_iron'] = (listing_features_df['has_kitchen'] == 1) & (listing_features_df['has_iron'] == 1)
listing_features_df['has_kitchen_and_laptop_workspace'] = (listing_features_df['has_kitchen'] == 1) & (listing_features_df['has_laptop_friendly_workspace'] == 1)

In [7]:
# Select numerical columns for polynomial features
numerical_cols = ['host_listings_count', 'accommodates', 'beds', 'cleaning_fee', 'minimum_nights', 'maximum_nights']
poly = PolynomialFeatures(degree=2, include_bias=False)

# Generate polynomial features
poly_features = poly.fit_transform(listing_features_df[numerical_cols])
poly_feature_names = poly.get_feature_names_out(numerical_cols)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)

# Add polynomial features to the main DataFrame
listing_features_df = pd.concat([listing_features_df, poly_df], axis=1)

In [8]:
from sklearn.preprocessing import LabelEncoder

object_cols = listing_features_df.select_dtypes(include=['object']).columns
for col in object_cols:
    le = LabelEncoder()
    listing_features_df[col] = le.fit_transform(listing_features_df[col].astype(str))

In [9]:
avg_cleaning_fee_by_property_type = listing_features_df.groupby('property_type_cleaned')['cleaning_fee'].transform('mean')
listing_features_df['avg_cleaning_fee_per_property_type'] = avg_cleaning_fee_by_property_type.iloc[:, 0]

In [10]:
X = listing_features_df.drop(columns=['review_scores_rating'])
y = listing_features_df['review_scores_rating']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data, then transform the testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0),
}

results = []

# Train, predict, and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")

    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({ "Model": model_name,"MAE": mae, "MSE": mse,"R2 Score": r2})

    print(f"{model_name}:")
    print(f"  MAE: {mae:.2f}")
    print(f"  MSE: {mse:.2f}")
    print(f"  R2 Score: {r2:.2f}")
    print("-" * 50)

# Convert results to a DataFrame for comparison
model_comparison = pd.DataFrame(results)
print("\nModel Comparison:")
print(model_comparison)

Training Linear Regression...
Linear Regression:
  MAE: 3.32
  MSE: 17.95
  R2 Score: 0.11
--------------------------------------------------
Training Random Forest...
Random Forest:
  MAE: 3.27
  MSE: 17.81
  R2 Score: 0.11
--------------------------------------------------
Training Gradient Boosting...
Gradient Boosting:
  MAE: 3.25
  MSE: 17.29
  R2 Score: 0.14
--------------------------------------------------
Training XGBoost...
XGBoost:
  MAE: 3.30
  MSE: 19.25
  R2 Score: 0.04
--------------------------------------------------
Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4676
[LightGBM] [Info] Number of data points in the train set: 6979, number of used features: 66
[LightGBM] [Info] Start training from score 95.882075
LightGBM:
  MAE: 3.19
  MSE: 17.1