In [13]:
import os
import ast
import time
from collections import Counter
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from category_encoders import TargetEncoder
from xgboost import XGBRegressor

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# ===========================================================
# 0Ô∏è‚É£ LOAD DATA
# ===========================================================
cwd = os.getcwd()
if os.path.basename(cwd) == "Notebooks":
    project_root = os.path.dirname(cwd)
else:
    project_root = cwd

path = os.path.join(project_root, "data", "Airbnb_DK_Processed_Data.csv")
df = pd.read_csv(path)

# Keep a pristine copy if you want
df_original = df.copy()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11579 entries, 0 to 11578
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11579 non-null  int64  
 1   name                            11579 non-null  object 
 2   description                     11353 non-null  object 
 3   host_id                         11579 non-null  int64  
 4   host_name                       11243 non-null  object 
 5   host_since                      11239 non-null  object 
 6   host_location                   9406 non-null   object 
 7   host_response_time              10120 non-null  object 
 8   host_response_rate              10120 non-null  object 
 9   host_acceptance_rate            10852 non-null  object 
 10  host_is_superhost               11490 non-null  object 
 11  host_listings_count             11239 non-null  float64
 12  host_total_listings_count       

In [None]:
# Target in log space
df["price_log"] = np.log1p(df["price"])

# ===========================================================
# 1Ô∏è‚É£ GLOBAL FEATURE ENGINEERING (NO TARGET USED HERE)
# ===========================================================

# --- Bathrooms parsing ---
def parse_bathroom_text(x):
    if pd.isna(x):
        return np.nan, np.nan
    s = str(x).lower()
    nums = re.findall(r'([\d\.]+)', s)
    num = float(nums[0]) if nums else np.nan
    shared = 1 if 'shared' in s else 0
    return num, shared

df[["bathrooms_num", "bathrooms_shared"]] = df["bathrooms_text"].apply(
    lambda x: pd.Series(parse_bathroom_text(x))
)

# --- Capped features ---
df["minimum_nights_capped"] = df["minimum_nights"].clip(upper=10)
df["number_of_reviews_capped"] = df["number_of_reviews"].clip(upper=25)

# --- Binary flags ---
df["instant_bookable_bin"] = (df["instant_bookable"] == "t").astype(int)
df["host_is_superhost_bin"] = (df["host_is_superhost"] == "t").astype(int)

# --- Review presence & missingness ---
df["has_reviews"] = (df["number_of_reviews"] > 0).astype(int)
df["review_scores_rating_missing"] = df["review_scores_rating"].isna().astype(int)

# --- Dates & time-based features ---
for col in ["host_since", "first_review", "last_review"]:
    df[col + "_dt"] = pd.to_datetime(df[col], errors="coerce")

ref_date = df["last_review_dt"].max()

df["host_since_days"] = (ref_date - df["host_since_dt"]).dt.days
df["days_since_first_review"] = (ref_date - df["first_review_dt"]).dt.days
df["days_since_last_review"] = (ref_date - df["last_review_dt"]).dt.days

# --- Comments features & sentiment ---
df["comments"] = df["comments"].fillna("")
df["comments_length"] = df["comments"].str.len()
df["comments_word_count"] = df["comments"].apply(lambda x: len(x.split()))
df["comments_exclamation"] = df["comments"].str.count("!")
df["comments_sentiment"] = df["comments"].apply(
    lambda x: sia.polarity_scores(x)["compound"]
)

# --- Amenities basic parsing ---
df["amenities_list"] = df["amenities"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)
df["amenities_count"] = df["amenities_list"].apply(len)

important_amenities = [
    "Wifi", "Kitchen", "Heating", "Air conditioning", "Washer", "Dryer",
    "TV", "Hot tub", "Pool", "Pets allowed", "Free parking"
]

def count_high_value_amenities(lst):
    s = [str(a).lower() for a in lst]
    return sum(any(kw.lower() in a for a in s) for kw in important_amenities)

df["count_high_value_amenities"] = df["amenities_list"].apply(count_high_value_amenities)

# ===========================================================
# 2Ô∏è‚É£ OUT-OF-FOLD SETUP
# ===========================================================
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_predictions = np.zeros(len(df))  # on ORIGINAL price scale

def safe_col_name(s):
    s = str(s).lower()
    s = re.sub(r"[^0-9a-z]+", "_", s)
    return s.strip("_")

# For sanity checks later
all_feature_names = None

# ===========================================================
# 3Ô∏è‚É£ FOLD LOOP (NO LEAKAGE)
# ===========================================================
fold = 1
for train_idx, valid_idx in kf.split(df):

    print(f"\n======================")
    print(f"üîÅ Fold {fold}/{n_splits}")
    print(f"======================")

    # Fresh copies per fold (we never write back into df inside loop)
    df_train = df.iloc[train_idx].copy()
    df_valid = df.iloc[valid_idx].copy()

    y_train = df_train["price_log"]
    y_valid = df_valid["price_log"]

    # -------------------------------
    # 3.1 KMeans on train coords only
    # -------------------------------
    kmeans = KMeans(n_clusters=100, random_state=42)
    df_train["location_cluster"] = kmeans.fit_predict(
        df_train[["latitude", "longitude"]]
    )
    df_valid["location_cluster"] = kmeans.predict(
        df_valid[["latitude", "longitude"]]
    )

    # Radian coords
    df_train["lat_rad"] = np.radians(df_train["latitude"])
    df_train["lon_rad"] = np.radians(df_train["longitude"])
    df_valid["lat_rad"] = np.radians(df_valid["latitude"])
    df_valid["lon_rad"] = np.radians(df_valid["longitude"])

    # -------------------------------
    # 3.2 TF-IDF on name (train only)
    # -------------------------------
    tfidf = TfidfVectorizer(max_features=80)
    name_train = tfidf.fit_transform(df_train["name"].fillna(""))
    name_valid = tfidf.transform(df_valid["name"].fillna(""))

    tfidf_cols = [f"name_tfidf_{i}" for i in range(name_train.shape[1])]

    df_train = pd.concat(
        [df_train,
         pd.DataFrame(name_train.toarray(), columns=tfidf_cols, index=df_train.index)],
        axis=1
    )
    df_valid = pd.concat(
        [df_valid,
         pd.DataFrame(name_valid.toarray(), columns=tfidf_cols, index=df_valid.index)],
        axis=1
    )

    # -------------------------------
    # 3.3 Target Encoding (train only)
    # -------------------------------
    te_cols = [
        "property_type",
        "neighbourhood_cleansed",
        "host_response_time",
        "host_verifications",
        "bathrooms_text",
        "room_type",
    ]

    te = TargetEncoder(cols=te_cols, smoothing=5.0)
    te.fit(df_train[te_cols], y_train)

    df_train_te = te.transform(df_train[te_cols]).add_suffix("_te")
    df_valid_te = te.transform(df_valid[te_cols]).add_suffix("_te")

    df_train = pd.concat([df_train, df_train_te], axis=1)
    df_valid = pd.concat([df_valid, df_valid_te], axis=1)

    # -------------------------------
    # 3.4 Build numeric feature matrix
    # -------------------------------
    drop_cols = [
        "id",
        "price",
        "price_log",
        "name",
        "description",
        "comments",
        "amenities",
        "amenities_list",
        "property_type",
        "neighbourhood_cleansed",
        "host_response_time",
        "host_verifications",
        "bathrooms_text",
        "room_type",
        "first_review",
        "last_review",
        "first_review_dt",
        "last_review_dt",
        "host_since",
        "host_since_dt",
        "estimated_occupancy_l365d",
        "estimated_revenue_l365d",
    ]

    df_train_model = df_train.drop(columns=drop_cols, errors="ignore")
    df_valid_model = df_valid.drop(columns=drop_cols, errors="ignore")

    feature_cols = df_train_model.select_dtypes(include=[np.number]).columns.tolist()

    # Optionally check all folds use same features
    if all_feature_names is None:
        all_feature_names = feature_cols
    else:
        if feature_cols != all_feature_names:
            raise ValueError("Feature mismatch between folds ‚Äì check your pipeline!")

    X_train = df_train_model[feature_cols]
    X_valid = df_valid_model[feature_cols]

    # -------------------------------
    # 3.5 Train XGBoost (no HP search here)
    # -------------------------------
    model = XGBRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=50,
        tree_method="hist",
        random_state=42,
        n_jobs=-1,
        objective="reg:squarederror",
        eval_metric="rmse",
    )

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    preds_log = model.predict(X_valid)
    preds_price = np.expm1(preds_log)  # back to original scale

    # Store OOF preds for this fold
    oof_predictions[valid_idx] = preds_price

    # Optional: fold metrics
    fold_mae = mean_absolute_error(np.expm1(y_valid), preds_price)
    fold_rmse = np.sqrt(mean_squared_error(np.expm1(y_valid), preds_price))
    print(f"Fold {fold} MAE:  {fold_mae:.3f}")
    print(f"Fold {fold} RMSE: {fold_rmse:.3f}")

    fold += 1

# ===========================================================
# 4Ô∏è‚É£ GLOBAL OOF EVALUATION (NO LEAKAGE)
# ===========================================================
y_true = df["price"].values

oof_mae = mean_absolute_error(y_true, oof_predictions)
oof_rmse = np.sqrt(mean_squared_error(y_true, oof_predictions))

print("\n======================")
print(f"üìä OOF MAE:  {oof_mae:.3f}")
print(f"üìä OOF RMSE: {oof_rmse:.3f}")
print("======================")

# ===========================================================
# 5Ô∏è‚É£ SAVE PREDICTIONS
# ===========================================================
# a) Just id + prediction (for later merge)
oof_df = pd.DataFrame({
    "id": df["id"],
    "price_oof_pred": oof_predictions
})
pred_path = os.path.join(project_root, "data", "Airbnb_OOF_Predictions.csv")
oof_df.to_csv(pred_path, index=False)
print(f"\nüíæ Saved id + OOF predictions to:\n{pred_path}")

# b) Full dataset with new column
df_with_pred = df_original.copy()
df_with_pred["price_oof_pred"] = oof_predictions
full_path = os.path.join(project_root, "data", "Airbnb_DK_Processed_Data.csv")
df_with_pred.to_csv(full_path, index=False)
print(f"üíæ Saved full dataset with OOF preds to:\n{full_path}")



üîÅ Fold 1/5




Fold 1 MAE:  274.060
Fold 1 RMSE: 458.998

üîÅ Fold 2/5




Fold 2 MAE:  273.738
Fold 2 RMSE: 469.989

üîÅ Fold 3/5




Fold 3 MAE:  292.152
Fold 3 RMSE: 539.499

üîÅ Fold 4/5




Fold 4 MAE:  291.634
Fold 4 RMSE: 516.743

üîÅ Fold 5/5




Fold 5 MAE:  339.461
Fold 5 RMSE: 1706.760

üìä OOF MAE:  294.205
üìä OOF RMSE: 883.375

üíæ Saved id + OOF predictions to:
/Users/benyla/Documents/GitHub/02807-Comp-Tools-for-DS-Project/data/Airbnb_OOF_Predictions.csv
üíæ Saved full dataset with OOF preds to:
/Users/benyla/Documents/GitHub/02807-Comp-Tools-for-DS-Project/data/Airbnb_DK_Processed_Data_with_oof.csv
