In [1]:
# Install the third-party libraries this notebook depends on into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh environment may resolve different
# releases than the ones that produced the saved outputs — consider pinning.
%pip install pandas numpy scikit-learn matplotlib seaborn shap missingno

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting shap
  Downloading shap-0.48.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting missingno
  Downloading missingno-0.5.2-py3-none-any.whl.metadata (639 bytes)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting tqdm>=4.27.0 (from shap)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Sebastian\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import shap

# Read the listings CSV (assumed to have been downloaded from Kaggle);
# the path is resolved relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Airbnb_Open_Data.csv", low_memory=False)

# First look: dimensions, then the leading rows as the cell's rich output
print(df.shape)
df.head(5)

  from .autonotebook import tqdm as notebook_tqdm


(102599, 26)


Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [3]:
# NOTE: this cell rebinds `df`; re-running it without re-running the load cell
# raises KeyError because the columns below have already been dropped.

# Drop non-actionable columns
df = df.drop(columns=['id', 'host id', 'NAME', 'host name', 'license', 'country'])

# Convert price to numeric: strip "$" and "," before casting.
# Raw string so "\$" reaches the regex engine untouched instead of being parsed
# as an invalid Python string escape (which emits a warning at compile time).
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove listings without price
df = df.dropna(subset=['price'])

# Filter price outliers (top 2%).
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the unfiltered data.
price_upper_limit = df['price'].quantile(0.98)
df = df[df['price'] <= price_upper_limit].copy()

# Handle missing values
df['review rate number'] = df['review rate number'].fillna(df['review rate number'].median())
df = df.drop(columns=['house_rules', 'lat', 'long'])  # High missingness

# Encode categorical features (drop_first avoids a redundant dummy per category)
categorical_cols = ['neighbourhood group', 'room type', 'instant_bookable']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Final cleaning
# NOTE(review): this drops a row if ANY remaining column is missing, including
# columns the model may never use (e.g. 'last review' is empty for listings with
# no reviews in the preview above) — confirm this filtering is intended.
df = df.dropna()  # Remove remaining missing values
print(f"Final dataset shape: {df.shape}")

  df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)


Final dataset shape: (83296, 24)


In [4]:
# Features a host can plausibly influence (chosen during EDA).
# Order matters: it fixes the column order the model is trained on.
modifiable_features = [
    'room type_Private room',
    'room type_Shared room',
    'review rate number',
    'instant_bookable_True',
    'Construction year',
    'minimum nights',
    'number of reviews',
]

# Design matrix and target
X, y = df[modifiable_features], df['price']

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Fit a 100-tree random forest with a fixed seed (fit() returns the estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both splits, then report mean absolute error for each
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
for split_label, actual, predicted in (
    ("Train", y_train, train_preds),
    ("Test", y_test, test_preds),
):
    print(f"{split_label} MAE: {mean_absolute_error(actual, predicted):.2f}")

Train MAE: 130.37
Test MAE: 244.56


6

In [None]:
# Load SHAP's JavaScript bundle into the notebook. Without this the interactive
# force plot at the end of the cell renders only a "Javascript library not
# loaded" placeholder instead of the visualization.
shap.initjs()

# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
# NOTE(review): this explains every row of X_test, which may be slow for a
# large forest — consider explaining a sample if the cell takes too long.
shap_values = explainer.shap_values(X_test)

# Global feature importance
shap.summary_plot(shap_values, X_test, feature_names=modifiable_features)

# Local explanation for a single listing.
# Kept as the cell's last expression so the notebook displays the returned plot.
sample_idx = 0
shap.force_plot(
    explainer.expected_value, 
    shap_values[sample_idx], 
    X_test.iloc[sample_idx],
    feature_names=modifiable_features
)

7

In [None]:
def generate_host_recommendations(listing_data, shap_explainer=None, feature_names=None):
    """
    Generates price optimization recommendations for hosts
    based on model feature contributions.

    Parameters
    ----------
    listing_data : pandas.Series or 1-D array-like
        Feature values of a single listing, in the same order as
        ``feature_names``.
    shap_explainer : object, optional
        Any object exposing ``shap_values(X)`` for a 2-D array ``X`` and
        returning one row of per-feature contributions per input row.
        Defaults to the notebook-level ``explainer``.
    feature_names : sequence of str, optional
        Feature names aligned with the SHAP output. Defaults to the
        notebook-level ``modifiable_features``.

    Returns
    -------
    list of str
        Human-readable suggestions; empty when no rule fires.
    """
    if shap_explainer is None:
        shap_explainer = explainer
    if feature_names is None:
        feature_names = modifiable_features

    # A row taken from a frame that mixes bool dummies with float columns may
    # carry object dtype; cast so the explainer receives a numeric (1, n) array.
    listing_row = np.asarray(listing_data, dtype=float).reshape(1, -1)

    # Get SHAP values for the listing
    listing_shap = shap_explainer.shap_values(listing_row)[0]

    # Contribution and actual value of each feature, keyed by name
    feature_effects = dict(zip(feature_names, listing_shap))
    feature_values = dict(zip(feature_names, listing_row[0]))

    recommendations = []

    # Private room: only advise an upgrade when the listing actually IS a
    # private room and that fact is pulling the predicted price down.
    private_effect = feature_effects.get('room type_Private room')
    if (
        private_effect is not None
        and feature_values['room type_Private room']
        and private_effect < -10
    ):
        recommendations.append(
            "⚠️ Consider upgrading to ENTIRE HOME: Private rooms reduce price by "
            f"${abs(private_effect):.2f} on average"
        )

    # Instant booking: a SHAP value describes the effect of the listing's
    # CURRENT setting, so "enable it" is only sound advice when it is currently
    # off and being off is costing price (negative contribution).
    instant_effect = feature_effects.get('instant_bookable_True')
    if (
        instant_effect is not None
        and not feature_values['instant_bookable_True']
        and instant_effect < -5
    ):
        recommendations.append(
            "✅ Enable INSTANT BOOKING: Increases price by "
            f"${abs(instant_effect):.2f}"
        )

    # Ratings: the feature is named 'review rate number' (with a space); the
    # message is an f-string so the current rating is actually interpolated.
    rating_effect = feature_effects.get('review rate number')
    if rating_effect is not None and rating_effect > 8:
        current_rating = feature_values['review rate number']
        recommendations.append(
            f"⭐ Maintain HIGH RATINGS (Current: {current_rating:g}/5): "
            f"+${rating_effect:.2f} to price"
        )

    return recommendations

# Demo: print the recommendations for the first held-out listing
sample_listing = X_test.iloc[0]
print("Host Recommendations:")
recommendations_for_sample = generate_host_recommendations(sample_listing)
for suggestion in recommendations_for_sample:
    print("- " + f"{suggestion}")