# Airbnb Price Prediction
This notebook loads the trained model and generates predictions for the test dataset.

In [2]:
import numpy as np
import pandas as pd
import joblib

# Load the cleaned test dataset
%store -r test_df_cleaned
print("Test data shape:", test_df_cleaned.shape)

Test data shape: (20000, 32)


In [3]:
# Load the trained model
pipe = joblib.load('../models/airbnb_price_model.joblib')
print("Model loaded successfully")

Model loaded successfully


In [4]:
# Define feature columns (same as in training)
numeric_features = [
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'review_scores_rating', 'number_of_reviews', 'month'
]

binary_features = [
    'has_wifi', 'host_verified', 'host_has_pic', 'instant_bookable'
]

categorical_features = [
    'room_type', 'property_type', 'cancellation_policy', 'city'
]

# Combine all features
feature_cols = numeric_features + binary_features + categorical_features

# Prepare test features
X_test = test_df_cleaned[feature_cols]
print("Test features shape:", X_test.shape)

Test features shape: (20000, 15)


In [7]:
# Generate predictions
print("Generating predictions...")
y_pred_test = pipe.predict(X_test)

# Convert log predictions back to original scale
y_pred_test_original = np.exp(y_pred_test) - 1

# Create submission dataframe
submission = pd.DataFrame({
    'id': test_df_cleaned.index,
    'price': y_pred_test_original
})

# Ensure output directory exists and save predictions
import os
output_dir = os.path.abspath(os.path.join('..', 'output'))
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'predictions.csv')
submission.to_csv(output_path, index=False)
print(f"\nPredictions saved to '{output_path}'")
print("Preview of predictions:")
print(submission.head())

Generating predictions...

Predictions saved to 'c:\Users\MOHAMMED AFFAN\OneDrive\Desktop\AIRBNB\output\predictions.csv'
Preview of predictions:
   id       price
0   0   61.403321
1   1  128.005335
2   2  164.199786
3   3  168.448307
4   4  144.088999




In [3]:
# SHAP explanations (robust: loads model/data if missing)
import os
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib

# Ensure model is loaded
if 'pipe' not in globals():
    try:
        pipe = joblib.load('../models/airbnb_price_model.joblib')
        print("Model loaded from '../models/airbnb_price_model.joblib'")
    except Exception as e:
        raise RuntimeError("Trained model not found. Run training notebook first or ensure model file exists.") from e

# Ensure train/test cleaned data and feature_cols are available
if 'train_df_cleaned' not in globals():
    get_ipython().run_line_magic('store', '-r train_df_cleaned')
if 'test_df_cleaned' not in globals():
    get_ipython().run_line_magic('store', '-r test_df_cleaned')

if 'feature_cols' not in globals():
    # Define fallback feature lists (must match training)
    numeric_features = [
        'accommodates', 'bathrooms', 'bedrooms', 'beds',
        'review_scores_rating', 'number_of_reviews', 'month'
    ]
    binary_features = [
        'has_wifi', 'host_verified', 'host_has_pic', 'instant_bookable'
    ]
    categorical_features = [
        'room_type', 'property_type', 'cancellation_policy', 'city'
    ]
    feature_cols = numeric_features + binary_features + categorical_features

# Use the pipeline's preprocessor to transform features
pre = pipe.named_steps.get('preprocessor') or pipe.named_steps.get('pre')
reg = pipe.named_steps.get('regressor') or pipe.named_steps.get('model')
if pre is None or reg is None:
    raise RuntimeError('Expected pipeline to contain "preprocessor" and "regressor" steps')

X_train_pre = pre.transform(train_df_cleaned[feature_cols])
X_test_pre = pre.transform(test_df_cleaned[feature_cols])

# Prepare background sample
rng = np.random.default_rng(42)
bg_size = min(100, X_train_pre.shape[0])
bg_idx = rng.choice(X_train_pre.shape[0], size=bg_size, replace=False)
background = X_train_pre[bg_idx]

# Try LinearExplainer first (fast for linear models), else fall back
try:
    explainer = shap.LinearExplainer(reg, background, feature_dependence='independent')
    shap_values = explainer.shap_values(X_test_pre[:200])
except Exception as e:
    print('LinearExplainer failed (falling back to KernelExplainer):', e)
    explainer = shap.KernelExplainer(reg.predict, background)
    shap_values = explainer.shap_values(X_test_pre[:200], nsamples=100)

# Build feature names after preprocessing (numeric + binary + one-hoted categorical)
feature_names = []
try:
    # numeric and binary keep their names
    feature_names.extend(numeric_features)
    feature_names.extend(binary_features)
    # categorical one-hot names
    cat_tm = pre.named_transformers_.get('cat')
    if cat_tm is not None:
        # cat_tm is a Pipeline with a OneHotEncoder named 'onehot'
        if hasattr(cat_tm, 'named_steps') and 'onehot' in cat_tm.named_steps:
            onehot = cat_tm.named_steps['onehot']
        else:
            onehot = cat_tm
        try:
            cat_names = onehot.get_feature_names_out(categorical_features).tolist()
        except AttributeError:
            cat_names = onehot.get_feature_names(categorical_features).tolist()
        feature_names.extend(cat_names)
except Exception:
    # fallback to generic names
    feature_names = [f'f{i}' for i in range(X_test_pre.shape[1])]

# Plot and save
plt.figure(figsize=(10,6))
shap.summary_plot(shap_values, features=X_test_pre[:200], feature_names=feature_names, show=False)
output_dir = os.path.abspath(os.path.join('..', 'output'))
os.makedirs(output_dir, exist_ok=True)
shap_output = os.path.join(output_dir, 'shap_summary.png')
plt.savefig(shap_output, bbox_inches='tight')
print(f"SHAP summary saved to {shap_output}")
plt.close()

Model loaded from '../models/airbnb_price_model.joblib'




LinearExplainer failed (falling back to KernelExplainer): The option feature_dependence has been renamed to feature_perturbation!


100%|██████████| 200/200 [00:02<00:00, 75.13it/s]



SHAP summary saved to c:\Users\MOHAMMED AFFAN\OneDrive\Desktop\AIRBNB\output\shap_summary.png


In [9]:

%store -r train_df_cleaned
# Save listings for dashboard (robust)
import os

# Restore train dataframe if needed
if 'train_df_cleaned' not in globals():
    get_ipython().run_line_magic('store', '-r', 'train_df_cleaned')

# Prepare output directory at project root
output_dir = os.path.abspath(os.path.join('..', 'output'))
os.makedirs(output_dir, exist_ok=True)

train_df_export = train_df_cleaned.copy()
if 'id' not in train_df_export.columns:
    train_df_export = train_df_export.reset_index().rename(columns={'index': 'id'})

cols_list = ['id', 'neighbourhood', 'room_type', 'property_type', 'accommodates',
             'latitude', 'longitude', 'review_scores_rating', 'price']
cols_list_existing = [c for c in cols_list if c in train_df_export.columns]

if not cols_list_existing:
    print('No expected columns found in train_df_cleaned; skipping CSV save.')
else:
    listings_path = os.path.join(output_dir, 'listings_for_dashboard.csv')
    train_df_export[cols_list_existing].to_csv(listings_path, index=False)
    print(f"Saved listings for dashboard to {listings_path}")

Saved listings for dashboard to c:\Users\MOHAMMED AFFAN\OneDrive\Desktop\AIRBNB\output\listings_for_dashboard.csv
