# NZ Airbnb Data Analysis

### Data preprocessing

In [97]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

from sklearn.model_selection import (train_test_split,StratifiedKFold,cross_val_score,cross_validate,cross_val_predict)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler,OneHotEncoder,LabelEncoder)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import metrics
from sklearn.metrics import (roc_curve,precision_recall_curve,auc,make_scorer,log_loss)
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [98]:
# Load the raw listings export; the file is decoded as cp1252.
df = pd.read_csv('listings.csv', encoding='cp1252')
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
df.head()
# Report the dimensions from a copy of the loaded frame.
d= df.copy()
print("Shape:", d.shape)
print("Columns:", len(d.columns))

Shape: (47097, 85)
Columns: 85


### Block processing

In [99]:
# Part 1: Build Feature Blocks (Branch A)
def detect_col(columns, candidates):
    """Return the first column name matching one of *candidates*, else None.

    Matching happens in two passes:

    1. Case-insensitive exact match, tried candidate by candidate.
    2. Fuzzy match, tried column by column: a column matches when a
       candidate, lower-cased with whitespace and underscores removed,
       is a substring of the column name normalised the same way.
    """
    by_lower = {}
    for col in columns:
        by_lower[col.lower()] = col

    for wanted in candidates:
        key = wanted.lower()
        if key in by_lower:
            return by_lower[key]

    def _squash(text):
        # Lower-case and drop whitespace/underscores so "Host Is_Superhost"
        # and "host_is_superhost" compare equal.
        return re.sub(r"[\s_]+", "", str(text).lower())

    patterns = [_squash(wanted) for wanted in candidates]
    for col in columns:
        squashed_col = _squash(col)
        if any(pattern in squashed_col for pattern in patterns):
            return col
    return None

def percent_to_01(s):
    """Convert a Series of percentage-like values to floats on a 0-1 scale.

    Parameters
    ----------
    s : pandas.Series
        Values such as ``"97%"``, ``"0.85"`` or ``85``.

    Returns
    -------
    pandas.Series
        Float Series with the index of *s*. Unparseable entries become NaN.

    Notes
    -----
    A value written with a ``%`` sign is always divided by 100, so ``"1%"``
    becomes 0.01 rather than being mistaken for the fraction 1.0. Values
    without a ``%`` sign keep the original heuristic: numbers greater than 1
    are treated as percentages, numbers in [0, 1] are left as fractions.
    """
    text = s.astype(str).str.strip()
    # Remember which entries were explicitly written as percentages before
    # the sign is stripped for numeric parsing.
    has_percent_sign = text.str.contains("%", regex=False, na=False)
    values = pd.to_numeric(text.str.replace("%", "", regex=False), errors="coerce")
    as_percent = has_percent_sign | (values > 1)
    scaled = np.where(as_percent, values / 100.0, values)
    return pd.Series(scaled, index=s.index).astype(float)

def to_bool01(s):
    """Map textual boolean flags to 1.0 / 0.0.

    Values are stringified, stripped and lower-cased before lookup.
    Anything outside the recognised true/false spellings becomes NaN.
    The result is a float Series.
    """
    truthy = ("t", "true", "1", "y", "yes")
    falsy = ("f", "false", "0", "n", "no")
    lookup = {**dict.fromkeys(truthy, 1), **dict.fromkeys(falsy, 0)}

    cleaned = s.astype(str).str.strip().str.lower()
    return cleaned.map(lookup).astype(float)

def price_to_num(s):
    """Parse price strings such as "$1,234.50" into floats.

    Dollar signs and thousands separators are removed first.
    Entries that still fail to parse become NaN.
    """
    stripped = s.astype(str).str.strip()
    without_symbols = stripped.str.replace(r"[\$,]", "", regex=True)
    parsed = pd.to_numeric(without_symbols, errors="coerce")
    return parsed.astype(float)

# ---- target y ----
# Locate the Superhost flag column; without it no target can be built.
SUPER = detect_col(df.columns, ["host_is_superhost"])
if SUPER is None:
    raise ValueError("Cannot find host_is_superhost column.")

# Work on a copy so the derived columns are not written into the loaded frame.
df = df.copy()
df["y_superhost"] = to_bool01(df[SUPER])
# Drop rows whose flag could not be mapped to 0/1 (to_bool01 returns NaN for those).
df = df[df["y_superhost"].notna()].copy()
df["y_superhost"] = df["y_superhost"].astype(int)
# Binary target used by every feature block.
y = df["y_superhost"]

# ---- minimal derived fields used by blocks ----
# Each derived column is only created when its source column exists; most are
# also skipped when the derived column is already present.

# Response time -> ordinal (faster is smaller)
RSP_TIME = detect_col(df.columns, ["host_response_time"])
rsp_map = {"within an hour":0, "within a few hours":1, "within a day":2, "a few days or more":3}
if RSP_TIME and "host_response_time_ord" not in df.columns:
    # Labels missing from rsp_map (including missing values) map to NaN.
    df["host_response_time_ord"] = (df[RSP_TIME].astype(str).str.strip().str.lower().map(rsp_map))

# Response / acceptance rate -> numeric 0~1
RSP_RATE = detect_col(df.columns, ["host_response_rate"])
ACC_RATE = detect_col(df.columns, ["host_acceptance_rate"])
if RSP_RATE and "host_response_rate_num" not in df.columns:
    df["host_response_rate_num"] = percent_to_01(df[RSP_RATE])
if ACC_RATE and "host_acceptance_rate_num" not in df.columns:
    df["host_acceptance_rate_num"] = percent_to_01(df[ACC_RATE])

# Instant bookable -> 0/1
INSTANT = detect_col(df.columns, ["instant_bookable"])
if INSTANT and "instant_bookable_01" not in df.columns:
    # Unrecognised or missing flags are treated as 0 (not instantly bookable).
    df["instant_bookable_01"] = to_bool01(df[INSTANT]).fillna(0).astype(int)

# last_review -> days_since_last_review
SCRAPED = detect_col(df.columns, ["last_scraped"])
LAST_REVIEW = detect_col(df.columns, ["last_review"])
# Reference date: the per-row scrape date when available, otherwise today's date.
snap = pd.to_datetime(df[SCRAPED], errors="coerce") if SCRAPED else pd.Timestamp.today().normalize()
if LAST_REVIEW:
    lr = pd.to_datetime(df[LAST_REVIEW], errors="coerce")
    # Unparseable or missing dates yield NaN day counts.
    df["days_since_last_review"] = (snap - lr).dt.days

# price -> numeric + log
PRICE = detect_col(df.columns, ["price"])
if PRICE and "price_num" not in df.columns:
    df["price_num"] = price_to_num(df[PRICE])
# log1p keeps a zero price finite; the column is all-NaN when no price column was found.
df["log_price"] = np.log1p(df["price_num"]) if "price_num" in df.columns else np.nan

# bathrooms_text -> numeric (optional)
BATH_TXT = detect_col(df.columns, ["bathrooms_text"])
if BATH_TXT and "bathrooms_num" not in df.columns:
    # Take the first integer/decimal number in the text; text without digits becomes NaN.
    df["bathrooms_num"] = pd.to_numeric(df[BATH_TXT].astype(str).str.extract(r"(\d+(\.\d+)?)")[0],
                                        errors="coerce")

# ---- Block A: Host Response ----
# Only columns that actually exist in df are kept, so a block can be smaller than its wish list.
cols_A = [c for c in ["host_response_rate_num", "host_response_time_ord",
                      "host_acceptance_rate_num", "instant_bookable_01"]
          if c in df.columns]
X_A = df[cols_A]

# ---- Block B: Ratings (review_scores_*) ----
cols_B = [c for c in df.columns if c.startswith("review_scores_")]
# Fall back to an empty frame with the same index when no rating columns exist.
X_B = df[cols_B] if cols_B else pd.DataFrame(index=df.index)

# ---- Block C: Review Activity ----
cols_C = [c for c in ["number_of_reviews", "reviews_per_month",
                      "number_of_reviews_ltm", "number_of_reviews_l30d",
                      "days_since_last_review"]
          if c in df.columns]
X_C = df[cols_C]

# ---- Block D: Listing Structure (type/capacity/price/location) ----
PROP = detect_col(df.columns, ["property_type"])
ROOM = detect_col(df.columns, ["room_type"])
LOC1 = detect_col(df.columns, ["region_name", "region_parent_name"])
LOC2 = detect_col(df.columns, ["neighbourhood", "neighborhood"])
# choose one or two location columns
loc_cols = [c for c in [LOC1, LOC2] if c is not None]

# Numeric and categorical parts are listed separately; the combined frame keeps numeric columns first.
num_D = [c for c in ["accommodates", "bedrooms", "beds", "bathrooms_num",
                     "minimum_nights", "log_price"]
         if c in df.columns]
cat_D = [c for c in [PROP, ROOM] if c is not None] + loc_cols
cols_D = num_D + cat_D
X_D = df[cols_D]

# ---- pack blocks ----
# Registry of feature blocks: each entry holds the feature frame, the shared target and the column list.
blocks = {
    "A_response": {"X": X_A, "y": y, "cols": cols_A},
    "B_ratings":  {"X": X_B, "y": y, "cols": cols_B},
    "C_activity": {"X": X_C, "y": y, "cols": cols_C},
    "D_listing":  {"X": X_D, "y": y, "cols": cols_D},
}

# quick check
# Print each block's shape and the share of positive (Superhost) labels.
for k, v in blocks.items():
    print(f"{k}: X shape={v['X'].shape}, cols={len(v['cols'])}")
print("label superhost rate:", y.mean())

A_response: X shape=(45991, 4), cols=4
B_ratings: X shape=(45991, 7), cols=7
C_activity: X shape=(45991, 5), cols=5
D_listing: X shape=(45991, 10), cols=10
label superhost rate: 0.4803331086516927


In [100]:
def eval_binary_from_proba(y_true, proba, threshold=0.5):
    """Summarise binary-classification quality from predicted probabilities.

    Ranking metrics (AUC, PR_AUC) use the raw probabilities. Accuracy and F1
    use hard labels obtained with ``proba >= threshold``. The observed and
    predicted positive rates are included for a quick calibration check.
    """
    hard_labels = (proba >= threshold).astype(int)

    scores = {}
    scores["AUC"] = roc_auc_score(y_true, proba)
    scores["PR_AUC"] = average_precision_score(y_true, proba)
    scores["Accuracy"] = accuracy_score(y_true, hard_labels)
    scores["F1"] = f1_score(y_true, hard_labels)
    scores["PosRate(y)"] = float(np.mean(y_true))
    scores["PosRate(pred)"] = float(np.mean(hard_labels))
    return scores

### Logistic Regression Block ABC

In [101]:
def run_logit_numeric_block(X, y, cv=5, threshold=0.5, C=1.0):
    """Fit and evaluate a logistic regression on an all-numeric feature block.

    The pipeline is median imputation -> standardisation -> logistic regression.
    Metrics come from out-of-fold probabilities of a shuffled stratified
    ``cv``-fold split. Coefficients come from a final fit on all rows.

    Returns
    -------
    (pipe, metrics, coef_df)
        The fitted pipeline, the metric dict from ``eval_binary_from_proba``,
        and a frame of per-feature coefficients, odds ratios and absolute
        coefficients sorted by absolute coefficient (descending).
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000, C=C, solver="lbfgs")),
    ]
    pipe = Pipeline(steps)

    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    oof = cross_val_predict(pipe, X, y, cv=folds, method="predict_proba")
    block_metrics = eval_binary_from_proba(y, oof[:, 1], threshold=threshold)

    # Refit on every row so the reported coefficients use all available data.
    pipe.fit(X, y)
    weights = pipe.named_steps["clf"].coef_[0]

    coef_df = pd.DataFrame({
        "feature": X.columns.to_list(),
        "coef": weights,
        "odds_ratio": np.exp(weights),
        "abs_coef": np.abs(weights),
    })
    coef_df = coef_df.sort_values("abs_coef", ascending=False)

    return pipe, block_metrics, coef_df


In [103]:
# Per-block logistic-regression results: block name -> (metrics dict, coefficient table).
results = {}

# A / B / C
# These three blocks are fitted with the numeric-only pipeline.
for name in ["A_response", "B_ratings", "C_activity"]:
    X = blocks[name]["X"]
    y = blocks[name]["y"]
    # A block can be empty when none of its columns exist in the data.
    if X.shape[1] == 0:
        print(name, "skipped (no features)")
        continue
    _, m, coef_df = run_logit_numeric_block(X, y, cv=5, threshold=0.5, C=1.0)
    results[name] = (m, coef_df)
    print(f"\n=== {name} metrics ===")
    print(pd.Series(m).round(4))
    print("\nTop 10 factors:")
    display(coef_df.head(10))



=== A_response metrics ===
AUC              0.6201
PR_AUC           0.5414
Accuracy         0.5812
F1               0.6450
PosRate(y)       0.4803
PosRate(pred)    0.6994
dtype: float64

Top 10 factors:


Unnamed: 0,feature,coef,odds_ratio,abs_coef
2,host_acceptance_rate_num,0.844442,2.32668,0.844442
0,host_response_rate_num,0.321745,1.379532,0.321745
3,instant_bookable_01,-0.155313,0.856147,0.155313
1,host_response_time_ord,-0.073526,0.929112,0.073526



=== B_ratings metrics ===
AUC              0.5921
PR_AUC           0.5116
Accuracy         0.5536
F1               0.5873
PosRate(y)       0.4803
PosRate(pred)    0.6012
dtype: float64

Top 10 factors:


Unnamed: 0,feature,coef,odds_ratio,abs_coef
2,review_scores_cleanliness,0.28901,1.335105,0.28901
0,review_scores_rating,0.230825,1.259638,0.230825
6,review_scores_value,0.128109,1.136676,0.128109
4,review_scores_communication,0.090798,1.095048,0.090798
5,review_scores_location,-0.07754,0.92539,0.07754
1,review_scores_accuracy,0.045229,1.046268,0.045229
3,review_scores_checkin,0.042571,1.04349,0.042571



=== C_activity metrics ===
AUC              0.8122
PR_AUC           0.7851
Accuracy         0.7408
F1               0.6949
PosRate(y)       0.4803
PosRate(pred)    0.3691
dtype: float64

Top 10 factors:


Unnamed: 0,feature,coef,odds_ratio,abs_coef
2,number_of_reviews_ltm,1.533968,4.636539,1.533968
4,days_since_last_review,-0.418649,0.657935,0.418649
1,reviews_per_month,-0.307407,0.735351,0.307407
0,number_of_reviews,0.237938,1.26863,0.237938
3,number_of_reviews_l30d,-0.014573,0.985532,0.014573


### Logistic regression Block D

In [None]:
def run_logit_mixed_block(X, y, cv=5, threshold=0.5, C=1.0):
    """Fit and evaluate a logistic regression on a mixed numeric/categorical block.

    Numeric columns are median-imputed and standardised. All other columns are
    imputed with the most frequent value and one-hot encoded (unknown
    categories are ignored at predict time). Metrics come from out-of-fold
    probabilities of a shuffled stratified ``cv``-fold split. Coefficients
    come from a final fit on all rows.

    Returns
    -------
    (pipe, metrics, coef_df)
        The fitted pipeline, the metric dict from ``eval_binary_from_proba``,
        and a coefficient table sorted by absolute coefficient (descending).
    """
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [col for col in X.columns if col not in num_cols]

    numeric_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    pre = ColumnTransformer(
        [
            ("num", numeric_branch, num_cols),
            ("cat", categorical_branch, cat_cols),
        ],
        remainder="drop",
    )

    classifier = LogisticRegression(max_iter=3000, C=C, solver="lbfgs")
    pipe = Pipeline([("pre", pre), ("clf", classifier)])

    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    oof = cross_val_predict(pipe, X, y, cv=folds, method="predict_proba")
    block_metrics = eval_binary_from_proba(y, oof[:, 1], threshold=threshold)

    pipe.fit(X, y)

    # Feature names follow the transformer order: numeric columns first,
    # then the expanded one-hot columns.
    fitted_cat_branch = pipe.named_steps["pre"].named_transformers_["cat"]
    encoder = fitted_cat_branch.named_steps["onehot"]
    feat_names = num_cols + encoder.get_feature_names_out(cat_cols).tolist()

    weights = pipe.named_steps["clf"].coef_[0]

    coef_df = pd.DataFrame({
        "feature": feat_names,
        "coef": weights,
        "odds_ratio": np.exp(weights),
        "abs_coef": np.abs(weights),
    })
    coef_df = coef_df.sort_values("abs_coef", ascending=False)

    return pipe, block_metrics, coef_df

In [104]:
# D: mixed numeric + categorical block, fitted with the mixed pipeline.
name = "D_listing"
X = blocks[name]["X"]
y = blocks[name]["y"]
_, m, coef_df = run_logit_mixed_block(X, y, cv=5, threshold=0.5, C=1.0)
# Stored next to the A/B/C results so later cells can compare all blocks.
results[name] = (m, coef_df)

print(f"\n=== {name} metrics ===")
print(pd.Series(m).round(4))
print("\nTop 15 factors:")
display(coef_df.head(15))


=== D_listing metrics ===
AUC              0.6954
PR_AUC           0.6591
Accuracy         0.6417
F1               0.6150
PosRate(y)       0.4803
PosRate(pred)    0.4503
dtype: float64

Top 15 factors:


Unnamed: 0,feature,coef,odds_ratio,abs_coef
460,"neighbourhood_Auckland, Au, New Zealand",2.629722,13.869914,2.629722
1477,"neighbourhood_Onetangi, Waiheke, New Zealand",2.190159,8.936637,2.190159
2260,"neighbourhood_West Coast, NZ, New Zealand",-2.099172,0.122558,2.099172
90,property_type_Room in hostel,-2.029424,0.131411,2.029424
990,"neighbourhood_Katikati, Bay Of Plenty, New Zea...",1.908536,6.743206,1.908536
1312,"neighbourhood_Mount Maunganui, New Zealand",1.854916,6.391163,1.854916
263,region_name_Ruataniwha Ward,1.821743,6.182623,1.821743
949,"neighbourhood_Kaikoura Flat, Canterbury, New Z...",-1.787464,0.167384,1.787464
1576,"neighbourhood_Papatowai, Otago, New Zealand",-1.738909,0.175712,1.738909
2125,"neighbourhood_Waiheke Island, Auckland, New Ze...",1.685513,5.395215,1.685513


### Top 25 features

In [108]:
# Take the topN strongest coefficients from each block, then rank them together.
topN = 8
all_top = []
for k, (m, dfc) in results.items():
    # Each coefficient table is already sorted by abs_coef, so head() gives the strongest rows.
    tmp = dfc.head(topN).copy()
    tmp["block"] = k
    all_top.append(tmp)

# NOTE(review): coefficients of standardised numeric features and one-hot dummies
# are on different scales; confirm that ranking them in one list is intended.
priority = (pd.concat(all_top, ignore_index=True)
              .sort_values(["abs_coef"], ascending=False))

display(priority.head(25))

Unnamed: 0,feature,coef,odds_ratio,abs_coef,block
16,"neighbourhood_Auckland, Au, New Zealand",2.629722,13.869914,2.629722,D_listing
17,"neighbourhood_Onetangi, Waiheke, New Zealand",2.190159,8.936637,2.190159,D_listing
18,"neighbourhood_West Coast, NZ, New Zealand",-2.099172,0.122558,2.099172,D_listing
19,property_type_Room in hostel,-2.029424,0.131411,2.029424,D_listing
20,"neighbourhood_Katikati, Bay Of Plenty, New Zea...",1.908536,6.743206,1.908536,D_listing
21,"neighbourhood_Mount Maunganui, New Zealand",1.854916,6.391163,1.854916,D_listing
22,region_name_Ruataniwha Ward,1.821743,6.182623,1.821743,D_listing
23,"neighbourhood_Kaikoura Flat, Canterbury, New Z...",-1.787464,0.167384,1.787464,D_listing
11,number_of_reviews_ltm,1.533968,4.636539,1.533968,C_activity
0,host_acceptance_rate_num,0.844442,2.32668,0.844442,A_response


### HistGradientBoostingClassifier + Permutation Importance

#### Block A B C

In [109]:
def eval_metrics(y_true, proba, thr=0.5):
    """Return AUC, PR_AUC, Accuracy and F1 for predicted probabilities.

    AUC and PR_AUC use the probabilities directly. Accuracy and F1 use
    hard labels obtained with ``proba >= thr``.
    """
    hard_labels = (proba >= thr).astype(int)

    scores = {}
    scores["AUC"] = roc_auc_score(y_true, proba)
    scores["PR_AUC"] = average_precision_score(y_true, proba)
    scores["Accuracy"] = accuracy_score(y_true, hard_labels)
    scores["F1"] = f1_score(y_true, hard_labels)
    return scores

def run_hgb_numeric_block(X, y, block_name="block", thr=0.5,
                          max_depth=3, learning_rate=0.05, max_iter=400):
    """Train a HistGradientBoostingClassifier on one numeric block and explain it.

    The data is split 70/30 (stratified, fixed seed). The model is fitted on
    the training part and scored on the hold-out part. Permutation importance
    (scored by ROC-AUC, 20 repeats) is computed on the hold-out part.

    Returns
    -------
    (model, metrics, importance)
        The fitted model, the metric dict from ``eval_metrics``, and a Series
        of mean permutation importances sorted descending. Returns
        ``(None, None, None)`` when the block has no features.
    """
    no_features = X is None or X.shape[1] == 0
    if no_features:
        print(f"\n=== {block_name} skipped (no features) ===")
        return None, None, None

    split = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
    X_fit, X_hold, y_fit, y_hold = split

    hgb_params = {
        "random_state": 42,
        "max_depth": max_depth,
        "learning_rate": learning_rate,
        "max_iter": max_iter,
    }
    model = HistGradientBoostingClassifier(**hgb_params)
    model.fit(X_fit, y_fit)

    hold_proba = model.predict_proba(X_hold)[:, 1]
    hold_scores = eval_metrics(y_hold, hold_proba, thr=thr)

    # Permutation importance
    perm = permutation_importance(
        model, X_hold, y_hold,
        n_repeats=20, random_state=42, scoring="roc_auc",
    )
    imp = pd.Series(perm.importances_mean, index=X.columns)
    imp = imp.sort_values(ascending=False)

    print(f"\n=== {block_name} | HGB metrics ===")
    print(pd.Series(hold_scores).round(4))

    print("\nTop 10 permutation importance (AUC drop):")
    display(imp.head(10).to_frame("perm_importance"))

    return model, hold_scores, imp


# Run A / B / C
# Per-block HGB results: block name -> {"model", "metrics", "importance"}.
hgb_results = {}

for block_name in ["A_response", "B_ratings", "C_activity"]:
    X = blocks[block_name]["X"]
    y = blocks[block_name]["y"]

    # Bind the metric dict to `block_metrics`, not `metrics`: a global named
    # `metrics` would shadow the module imported by `from sklearn import metrics`.
    model, block_metrics, importance = run_hgb_numeric_block(
        X, y, block_name=block_name, thr=0.5
    )

    hgb_results[block_name] = {
        "model": model,
        "metrics": block_metrics,
        "importance": importance
    }

# One row per block; blocks skipped for lack of features have no metrics and are left out.
summary = pd.DataFrame({
    k: v["metrics"] for k, v in hgb_results.items() if v["metrics"] is not None
}).T

print("\n===== HGB Metrics Summary (A/B/C) =====")
display(summary.round(4))


=== A_response | HGB metrics ===
AUC         0.8084
PR_AUC      0.7504
Accuracy    0.7351
F1          0.7593
dtype: float64

Top 10 permutation importance (AUC drop):


Unnamed: 0,perm_importance
host_acceptance_rate_num,0.190362
host_response_rate_num,0.070503
host_response_time_ord,0.016515
instant_bookable_01,0.006623



=== B_ratings | HGB metrics ===
AUC         0.7984
PR_AUC      0.7721
Accuracy    0.7421
F1          0.7304
dtype: float64

Top 10 permutation importance (AUC drop):


Unnamed: 0,perm_importance
review_scores_rating,0.018202
review_scores_cleanliness,0.016192
review_scores_checkin,0.008762
review_scores_location,0.008744
review_scores_value,0.00806
review_scores_communication,0.005949
review_scores_accuracy,0.005479



=== C_activity | HGB metrics ===
AUC         0.8166
PR_AUC      0.7916
Accuracy    0.7674
F1          0.7518
dtype: float64

Top 10 permutation importance (AUC drop):


Unnamed: 0,perm_importance
number_of_reviews_ltm,0.201001
days_since_last_review,0.011398
number_of_reviews,0.005342
reviews_per_month,0.003541
number_of_reviews_l30d,0.000484



===== HGB Metrics Summary (A/B/C) =====


Unnamed: 0,AUC,PR_AUC,Accuracy,F1
A_response,0.8084,0.7504,0.7351,0.7593
B_ratings,0.7984,0.7721,0.7421,0.7304
C_activity,0.8166,0.7916,0.7674,0.7518


#### Block D

In [112]:
def run_hgb_mixed_block_dense(X, y, block_name="D_listing", thr=0.5):
    """Train and explain an HGB classifier on a mixed numeric/categorical block.

    Numeric columns pass through unchanged. All other columns are one-hot
    encoded into a dense matrix (presumably because the classifier needs
    dense input; confirm against the scikit-learn docs). The pipeline is
    fitted on a stratified 70/30 split and scored on the hold-out part.
    Permutation importance (ROC-AUC, 10 repeats) is computed on the original
    hold-out columns, so a categorical column is permuted as a whole.

    Returns
    -------
    (pipe, metrics, importance)
        The fitted pipeline, a dict with AUC / PR_AUC / Accuracy / F1, and a
        Series of mean permutation importances sorted descending.
    """
    from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [col for col in X.columns if col not in numeric_cols]

    # The keyword for dense output was renamed between scikit-learn versions,
    # so try the newer name first and fall back to the older one.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    preprocess = ColumnTransformer([
        ("num", "passthrough", numeric_cols),
        ("cat", encoder, categorical_cols),
    ])
    booster = HistGradientBoostingClassifier(
        random_state=42, max_depth=3, learning_rate=0.05, max_iter=400
    )
    pipe = Pipeline([("pre", preprocess), ("clf", booster)])

    split = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
    X_fit, X_hold, y_fit, y_hold = split

    pipe.fit(X_fit, y_fit)
    hold_proba = pipe.predict_proba(X_hold)[:, 1]

    # metrics
    hard_labels = (hold_proba >= thr).astype(int)
    m = {}
    m["AUC"] = roc_auc_score(y_hold, hold_proba)
    m["PR_AUC"] = average_precision_score(y_hold, hold_proba)
    m["Accuracy"] = accuracy_score(y_hold, hard_labels)
    m["F1"] = f1_score(y_hold, hard_labels)

    print(f"\n=== {block_name} | HGB metrics ===")
    print(pd.Series(m).round(4))

    # permutation importance
    perm = permutation_importance(
        pipe, X_hold, y_hold,
        n_repeats=10, random_state=42, scoring="roc_auc",
    )
    imp = pd.Series(perm.importances_mean, index=X_hold.columns)
    imp = imp.sort_values(ascending=False)

    print("\nTop permutation importance (AUC drop) — original features:")
    display(imp.head(15).to_frame("perm_importance"))

    return pipe, m, imp

# run D
# NOTE(review): the returned (pipeline, metrics, importance) tuple is not assigned,
# so it is only echoed as the cell output and is not kept for later cells.
X = blocks["D_listing"]["X"]
y = blocks["D_listing"]["y"]
run_hgb_mixed_block_dense(X, y, block_name="D_listing")


=== D_listing | HGB metrics ===
AUC         0.7160
PR_AUC      0.6849
Accuracy    0.6578
F1          0.6415
dtype: float64

Top permutation importance (AUC drop) — original features:


Unnamed: 0,perm_importance
region_name,0.067468
log_price,0.052671
room_type,0.033169
neighbourhood,0.032775
property_type,0.027509
minimum_nights,0.02427
bathrooms_num,0.007695
beds,0.001495
accommodates,0.000782
bedrooms,0.000315


(Pipeline(steps=[('pre',
                  ColumnTransformer(transformers=[('num', 'passthrough',
                                                   ['accommodates', 'bedrooms',
                                                    'beds', 'bathrooms_num',
                                                    'minimum_nights',
                                                    'log_price']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse_output=False),
                                                   ['property_type', 'room_type',
                                                    'region_name',
                                                    'neighbourhood'])])),
                 ('clf',
                  HistGradientBoostingClassifier(learning_rate=0.05, max_depth=3,
                                           

### Evaluation

In [113]:
def evaluate_cv_hgb(X, y, name="block", thr=0.5, cv=5):
    """Cross-validate a HistGradientBoostingClassifier on one feature block.

    Out-of-fold probabilities from a shuffled stratified ``cv``-fold split
    are scored, so every row is predicted by a model that never saw it.

    Parameters
    ----------
    X, y : feature frame and binary target.
    name : label stored under the ``"block"`` key of the result.
    thr : probability threshold used to derive hard labels.
    cv : number of stratified folds.

    Returns
    -------
    dict
        Block name, AUC, PR_AUC, Accuracy, F1, Precision, Recall, the
        threshold, and the observed / predicted positive rates.
    """
    # precision_score and recall_score are not imported at module level,
    # so bring them into scope here to avoid a NameError on a clean run.
    from sklearn.metrics import precision_score, recall_score

    model = HistGradientBoostingClassifier(
        random_state=42, max_depth=3, learning_rate=0.05, max_iter=400
    )
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # out-of-fold probability
    proba = cross_val_predict(model, X, y, cv=skf, method="predict_proba")[:, 1]
    pred  = (proba >= thr).astype(int)

    out = {
        "block": name,
        "AUC": roc_auc_score(y, proba),
        "PR_AUC": average_precision_score(y, proba),
        "Accuracy": accuracy_score(y, pred),
        "F1": f1_score(y, pred),
        # zero_division=0 keeps the metric defined when no positives are predicted.
        "Precision": precision_score(y, pred, zero_division=0),
        "Recall": recall_score(y, pred, zero_division=0),
        "thr": thr,
        "pos_rate(y)": float(np.mean(y)),
        "pos_rate(pred)": float(np.mean(pred)),
    }
    return out

# run A/B/C
# Collect one cross-validated metric row per non-empty block.
rows = []
for bn in ["A_response","B_ratings","C_activity"]:
    X = blocks[bn]["X"]
    y = blocks[bn]["y"]
    # Skip blocks that ended up with no feature columns.
    if X.shape[1] == 0:
        continue
    rows.append(evaluate_cv_hgb(X, y, name=bn, thr=0.5, cv=5))

# Index the table by block name for display.
summary = pd.DataFrame(rows).set_index("block")
display(summary.round(4))

Unnamed: 0_level_0,AUC,PR_AUC,Accuracy,F1,Precision,Recall,thr,pos_rate(y),pos_rate(pred)
block,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A_response,0.8077,0.7528,0.7347,0.759,0.6733,0.8697,0.5,0.4803,0.6205
B_ratings,0.7973,0.7704,0.7385,0.7274,0.7284,0.7263,0.5,0.4803,0.4789
C_activity,0.8155,0.792,0.7656,0.7512,0.7665,0.7365,0.5,0.4803,0.4615


In [114]:
def evaluate_cv_logit_mixed(X, y, name="D_listing", thr=0.5, cv=5):
    """Cross-validate a logistic regression on a mixed numeric/categorical block.

    Numeric columns are median-imputed and standardised. All other columns
    are imputed with the most frequent value and one-hot encoded. Metrics are
    computed from out-of-fold probabilities of a shuffled stratified
    ``cv``-fold split.

    Parameters
    ----------
    X, y : feature frame and binary target.
    name : label stored under the ``"block"`` key of the result.
    thr : probability threshold used to derive hard labels.
    cv : number of stratified folds.

    Returns
    -------
    dict
        Block name, AUC, PR_AUC, Accuracy, F1, Precision, Recall and the
        threshold.
    """
    # precision_score and recall_score are not imported at module level,
    # so bring them into scope here to avoid a NameError on a clean run.
    from sklearn.metrics import precision_score, recall_score

    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]

    pre = ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ])

    pipe = Pipeline([
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=3000))
    ])

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    proba = cross_val_predict(pipe, X, y, cv=skf, method="predict_proba")[:, 1]
    pred  = (proba >= thr).astype(int)

    out = {
        "block": name,
        "AUC": roc_auc_score(y, proba),
        "PR_AUC": average_precision_score(y, proba),
        "Accuracy": accuracy_score(y, pred),
        "F1": f1_score(y, pred),
        # zero_division=0 keeps the metric defined when no positives are predicted.
        "Precision": precision_score(y, pred, zero_division=0),
        "Recall": recall_score(y, pred, zero_division=0),
        "thr": thr
    }
    return out

# Run D: cross-validated logistic regression on the mixed listing block.
X = blocks["D_listing"]["X"]
y = blocks["D_listing"]["y"]
d_eval = evaluate_cv_logit_mixed(X, y, name="D_listing(Logit)", thr=0.5, cv=5)
# The result mixes a string label with floats, so the printed Series has object dtype.
print(pd.Series(d_eval).round(4))

block        D_listing(Logit)
AUC                  0.695387
PR_AUC               0.659085
Accuracy             0.641713
F1                      0.615
Precision            0.635521
Recall               0.595763
thr                       0.5
dtype: object


### Part 1 — Objective & Approach

Objective
Predict whether a host is a Superhost (y=1) and identify the most actionable factors to prioritize.

Approach
We structured features into four blocks and evaluated each block’s explanatory/predictive power using ROC-AUC, PR-AUC, Accuracy, and F1:

Block A (Host Response): response rate, response time, acceptance rate, instant book

Block B (Ratings): review_scores_* dimensions

Block C (Review Activity): review volume and recency signals

Block D (Listing Structure): price, property/room type, and location

### Part 2 — Model Evaluation (Key Finding: C strongest, D second)

| Block               |    ROC-AUC |     PR-AUC | Accuracy |     F1 | Interpretation                                                               |
| ------------------- | ---------: | ---------: | -------: | -----: | ---------------------------------------------------------------------------- |
| **C_activity**      | **0.8122** | **0.7851** |   0.7408 | 0.6949 | Strongest: closely linked to **review numbers & recent activity**           |
| **D_listing (HGB)** | **0.7160** | **0.6849** |   0.6578 | 0.6415 | Moderate: **location + pricing + property structure** explain part of the gap |
| A_response          |     0.6201 |     0.5414 |   0.5812 | 0.6450 | Useful but limited alone; more like a **baseline operational requirement**   |
| B_ratings           |     0.5921 |     0.5116 |   0.5536 | 0.5873 | Weakest: ratings show a **ceiling effect** → less discriminative             |

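Becoming a Superhost depends on consistent review momentum, recent activity levels, and listing-structure factors (location/price). Note that the table mixes models: the A, B, and C rows are cross-validated logistic-regression metrics, while the D row is the HGB hold-out result. Under HGB, Block A (AUC ≈ 0.81) and Block B (AUC ≈ 0.80) both outperform Block D (AUC ≈ 0.72), so the relative ranking of A, B, and D depends on the model used; only Block C is strongest under both.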

### Part 3 — Key Drivers by Block (What matters most)
Block A — Host Response (Logistic Regression): “Operational baseline”

Top drivers (Odds Ratio, OR):

Acceptance rate: OR ≈ 2.33 (positive; critical baseline)

Response rate: OR ≈ 1.38 (positive)

Slower response time: OR ≈ 0.93 (negative)

Instant bookable: OR ≈ 0.86 (slightly negative; interpret cautiously)

Interpretation: Response/acceptance are strongly associated with Superhost status, but the block alone has limited predictive power—best viewed as minimum operational compliance.

Block B — Ratings (Logistic Regression): “Cleanliness is the strongest lever”

Top drivers (OR):

Cleanliness: OR ≈ 1.34 (strongest)

Overall rating: OR ≈ 1.26

Value: OR ≈ 1.14

Communication: OR ≈ 1.10

Interpretation: Ratings are not highly discriminative overall (AUC ~0.59), but cleanliness is the most actionable rating lever.

Block C — Review Activity (Logistic Regression): “Momentum + recency = strongest signal”

Top drivers (OR):

Reviews in last 12 months (LTM): OR ≈ 4.64 (strongest positive)

Days since last review: OR ≈ 0.66 (negative; longer inactivity is worse)

Total reviews: OR ≈ 1.27 (positive)

Interpretation: The strongest signal is consistent review momentum and recent ongoing activity, not just high scores.

Block D — Listing Structure (HGB + permutation importance): “Location & price dominate”

Permutation importance (AUC drop; higher = more important):

region_name ≈ 0.067

log_price ≈ 0.053

room_type ≈ 0.033

neighbourhood ≈ 0.033

property_type ≈ 0.028

minimum_nights ≈ 0.024
(others contribute less)

Interpretation: Location and pricing are the most influential structural drivers, followed by room/property type and minimum nights.

### Part 4 — Leadership Takeaways & Priority Actions 
Priority 1 — Review Momentum (Highest impact)

Goal: Increase LTM review volume and reduce time since last review.
Actions: Improve guest experience consistency, structured follow-up after checkout, and review-friendly workflows.

Priority 2 — Location & Pricing Strategy (Structural optimization)

Goal: Optimize within comparable markets (same region/room type).
Actions: Calibrate pricing and minimum nights by region/season; benchmark against local comps.

Priority 3 — Response & Acceptance (Operational SLA)

Goal: Maintain high acceptance + response rate; minimize response time.
Actions: Set response SLA, automate messaging, and streamline booking approvals.

Priority 4 — Ratings Improvement (Focus on cleanliness)

Goal: Improve the most actionable rating lever: cleanliness.
Actions: Cleaning SOP, checklist QA, alignment between listing description and actual experience.

### Final Conclusion
Overall, Superhost differentiation is primarily driven by sustained review momentum and recent activity (AUC ≈ 0.81), followed by structural factors such as location and pricing (AUC ≈ 0.72). Response and acceptance act as baseline operational requirements, while ratings are less discriminative overall—yet cleanliness remains the most actionable rating lever. Resources should be prioritized toward: review momentum & recency + market-based pricing optimization + response/acceptance SLA + cleanliness SOP.