In [1]:
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import KFold, cross_validate, HalvingGridSearchCV
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor

RANDOM_STATE = 42

处理category列

In [2]:
def clean_category(string):
    if not isinstance(string, str):
        return "others"
        
    if "housing/rent/apartment" in string:
        return "apartment"
    elif "housing/rent/commercial/retail" in string:
        return "commercial/retail"
    elif "housing/rent/home" in string:
        return "home"
    elif "housing/rent/condo" in string:
        return "condo"
    elif "housing/rent/short_term" in string:
        return "short_term"
    else:
        return "others"

def apply_clean_category(X):
    if isinstance(X, pd.DataFrame):
        s = X.iloc[:, 0]
    else:
        s = pd.Series(X.ravel())
    cleaned_series = s.apply(clean_category)
    return cleaned_series.to_frame()   # 保证二维

category_cleaner = FunctionTransformer(apply_clean_category)

category_pipe = Pipeline(steps=[
    ('clean', category_cleaner),
    ('one_hot', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.05))
])

处理title和body列

In [3]:
class SimpleTextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.url_pat = re.compile(r'https?://\S+|www\.\S+', re.I)
        self.mail_pat = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')
        self.ws_pat = re.compile(r'\s+')
    def fit(self, X, y=None): return self
    def transform(self, X):
        def norm(s):
            if s is None: return ""
            if isinstance(s, float) and np.isnan(s): return ""
            return str(s)
        cleaned = []
        for t in X:
            s = norm(t).lower()
            s = self.url_pat.sub(" ", s)
            s = self.mail_pat.sub(" ", s)
            s = self.ws_pat.sub(" ", s).strip()
            cleaned.append(s)
        return np.array(cleaned, dtype=object)

class PriceMasker(BaseEstimator, TransformerMixin):
    _pat = re.compile(
        r'((?:\$|€|£)\s*\d[\d,\.]*|\b(?:usd|eur|gbp)\b\s*\d[\d,\.]*|\b\d[\d,\.]{3,}\b)\s*(?:per\s*(?:month|mo)|/mo|/m|/month)?',
        flags=re.I
    )
    def fit(self, X, y=None): return self
    def transform(self, X):
        def norm(s):
            if s is None: return ""
            if isinstance(s, float) and np.isnan(s): return ""
            return str(s)
        return np.array([self._pat.sub(" __PRICE__ ", norm(t)) for t in X], dtype=object)

In [4]:
class EnsureTextDF(BaseEstimator, TransformerMixin):
    def __init__(self, title_col="title", body_col="body"):
        self.title_col = title_col
        self.body_col = body_col
    def fit(self, X, y=None): return self
    def transform(self, X):
        if hasattr(X, "columns"):  # pandas DataFrame
            df = X
            if self.title_col in df.columns and self.body_col in df.columns:
                return df[[self.title_col, self.body_col]]
            df2 = df.iloc[:, :2].copy()
            df2.columns = [self.title_col, self.body_col]
            return df2
        arr = np.asarray(X, dtype=object)
        if arr.ndim == 1:
            arr = np.c_[arr, np.full_like(arr, "", dtype=object)]
        if arr.shape[1] < 2:
            arr = np.hstack([arr, np.full((arr.shape[0], 1), "", dtype=object)])
        return pd.DataFrame({self.title_col: arr[:, 0], self.body_col: arr[:, 1]})

In [5]:
def title_body_pipeline(title_weight: float = 3.0) -> Pipeline:
    title_pipe = Pipeline([
        ("clean", SimpleTextCleaner()),
        ("mask_price", PriceMasker()),
        ("tfidf_word", TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            min_df=3,
            strip_accents="unicode",
            sublinear_tf=True,
            smooth_idf=True,
            stop_words=None,
            max_features=15355
        ))
    ])

    body_pipe = Pipeline([
        ("clean", SimpleTextCleaner()),
        ("mask_price", PriceMasker()),
        ("tfidf_char", TfidfVectorizer(
            analyzer="char",
            ngram_range=(3, 5),
            min_df=3,
            strip_accents="unicode",
            sublinear_tf=True,
            smooth_idf=True,
            max_features=210849
        ))
    ])

    text_union = ColumnTransformer(
        transformers=[
            ("title", title_pipe, "title"),
            ("body",  body_pipe,  "body"),
        ],
        transformer_weights={"title": title_weight, "body": 1.0},
        remainder="drop"
    )

    return Pipeline([
        ("ensure_df", EnsureTextDF("title", "body")),
        ("text_union", text_union)
    ])

处理amenities列

In [6]:
to_list = FunctionTransformer(
    lambda X: [[t.strip() for t in str(s).split(',') if t.strip()] for s in np.asarray(X).reshape(-1)]
)

In [7]:
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        self.mlb.fit(X)
        return self

    def transform(self, X):
        return self.mlb.transform(X)

    def get_feature_names_out(self):
        return self.mlb.classes_

In [8]:
amenities_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='Nothing')),
    ('to_list', to_list),
    ('multi_hot', MultiHotEncoder())
])

处理pets_allowed列

In [9]:
pets_allowed_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='No')),
    ('to_list', to_list),
    ('multi_hot', MultiHotEncoder())
])

处理bathrooms, bedrooms和square_feet列

In [10]:
class BedBathSqftSimilarityImputer(BaseEstimator, TransformerMixin):
    """
    bedrooms:   面积相似 + bathrooms 相同 -> 平均
    bathrooms:  面积相似 + bedrooms 相同 -> 平均
    square_feet: bedrooms & bathrooms 相同 -> 平均
    """
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y=None):
        self.train_ = X.copy()
        self.global_means_ = self.train_.mean(numeric_only=True)
        self.mean_by_bath_ = self.train_.dropna(subset=['bedrooms']).groupby('bathrooms')['bedrooms'].mean()
        self.mean_by_bed_  = self.train_.dropna(subset=['bathrooms']).groupby('bedrooms')['bathrooms'].mean()
        self.mean_by_pair_ = self.train_.dropna(subset=['square_feet']).groupby(['bedrooms','bathrooms'])['square_feet'].mean()
        return self

    def _knn_mean_by_area(self, df_cand, target_col, sf_value):
        if df_cand.empty: return np.nan
        cand = df_cand.dropna(subset=[target_col, 'square_feet'])
        if cand.empty: return np.nan
        if pd.isna(sf_value): return cand[target_col].mean()
        d = (cand['square_feet'] - sf_value).abs()
        topk = cand.loc[d.nsmallest(min(self.k, len(cand))).index, target_col]
        return topk.mean() if len(topk) else np.nan

    def transform(self, X):
        X = X.copy()

        # ---- square_feet ----
        mask_sf = X['square_feet'].isna() & X['bedrooms'].notna() & X['bathrooms'].notna()
        for idx in X.loc[mask_sf].index:
            key = (X.at[idx, 'bedrooms'], X.at[idx, 'bathrooms'])
            if key in self.mean_by_pair_.index:
                X.at[idx, 'square_feet'] = self.mean_by_pair_.loc[key]

        # ---- bedrooms ----
        mask_bed = X['bedrooms'].isna()
        cand_bed = self.train_.dropna(subset=['bedrooms'])
        for idx in X.loc[mask_bed].index:
            bth, sf = X.at[idx,'bathrooms'], X.at[idx,'square_feet']
            cand = cand_bed if pd.isna(bth) else cand_bed[cand_bed['bathrooms']==bth]
            val = self._knn_mean_by_area(cand,'bedrooms',sf)
            if pd.isna(val): val = self.mean_by_bath_.get(bth,np.nan)
            if pd.isna(val): val = self.global_means_['bedrooms']
            X.at[idx,'bedrooms']=val

        # ---- bathrooms ----
        mask_bth = X['bathrooms'].isna()
        cand_bth = self.train_.dropna(subset=['bathrooms'])
        for idx in X.loc[mask_bth].index:
            bed, sf = X.at[idx,'bedrooms'], X.at[idx,'square_feet']
            cand = cand_bth if pd.isna(bed) else cand_bth[cand_bth['bedrooms']==bed]
            val = self._knn_mean_by_area(cand,'bathrooms',sf)
            if pd.isna(val): val = self.mean_by_bed_.get(bed,np.nan)
            if pd.isna(val): val = self.global_means_['bathrooms']
            X.at[idx,'bathrooms']=val

        # ---- square_feet 剩余兜底 ----
        mask_sf2 = X['square_feet'].isna()
        for idx in X.loc[mask_sf2].index:
            key = (X.at[idx,'bedrooms'],X.at[idx,'bathrooms'])
            val = self.mean_by_pair_.get(key,np.nan)
            if pd.isna(val): val = self.global_means_['square_feet']
            X.at[idx,'square_feet']=val

        return X

In [11]:
coerce_bed_bath_sqft_to_numeric = FunctionTransformer(lambda X: X.apply(pd.to_numeric, errors='coerce'))

bed_bath_sqft_pipe = Pipeline(steps=[
    ('to_numeric', coerce_bed_bath_sqft_to_numeric),
    ('similar_impute', BedBathSqftSimilarityImputer(k=8))
])

处理cityname, state, latitude和longitude列

In [12]:
class GeoCityStateImputer(BaseEstimator, TransformerMixin):
    """
      state 缺失：优先用 cityname + (lat,lon)；再仅用 cityname 的众数；再用 (lat,lon) 最近邻；最后全局众数
      cityname 缺失：先保证 state 已填；在该 state 内按 (lat,lon) 最近邻；再用该 state 的城市众数；最后全局众数
      latitude/longitude 缺失：用相同 (state, cityname) 组内的均值；若无该组则用全局均值
    """
    def __init__(self):
        pass

    @staticmethod
    def _mode(s):
        s = pd.Series(s).dropna()
        return s.mode().iloc[0] if not s.empty else np.nan

    @staticmethod
    def _norm_key(x):
        if pd.isna(x):
            return pd.NA
        return str(x).strip().lower()

    @staticmethod
    def _nearest_row(train_df, lat, lon, mask=None):
        cand = train_df if mask is None else train_df[mask]
        cand = cand.dropna(subset=['latitude','longitude'])
        if cand.empty or pd.isna(lat) or pd.isna(lon):
            return None
        d2 = (cand['latitude'] - lat)**2 + (cand['longitude'] - lon)**2
        return cand.loc[[d2.idxmin()]]

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['latitude','longitude','cityname','state'])
        self.train_ = X.copy()

        # 规范化键
        self.train_['city_key']  = self.train_['cityname'].apply(self._norm_key)
        self.train_['state_key'] = self.train_['state'].apply(self._norm_key)

        # 全局统计
        tmp = self.train_.copy()
        tmp['latitude']  = pd.to_numeric(tmp['latitude'], errors='coerce')
        tmp['longitude'] = pd.to_numeric(tmp['longitude'], errors='coerce')
        self.global_state_mode_ = self._mode(self.train_['state'])
        self.global_city_mode_  = self._mode(self.train_['cityname'])
        self.global_lat_mean_   = tmp['latitude'].mean()
        self.global_lon_mean_   = tmp['longitude'].mean()

        # city -> state 众数
        self.state_mode_by_city_ = (
            self.train_.dropna(subset=['city_key','state'])
                       .groupby('city_key')['state']
                       .agg(self._mode)
        )
        # state -> city 众数
        self.city_mode_by_state_ = (
            self.train_.dropna(subset=['state_key','cityname'])
                       .groupby('state_key')['cityname']
                       .agg(self._mode)
        )
        # (state, city) -> (lat, lon) 均值
        self.mean_latlon_by_pair_ = (
            tmp.dropna(subset=['state_key','city_key'])
               .groupby(['state_key','city_key'])[['latitude','longitude']]
               .mean()
        )
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['latitude','longitude','cityname','state'])
        X = X.copy()

        # 辅助键
        X['city_key']  = X['cityname'].apply(self._norm_key)
        X['state_key'] = X['state'].apply(self._norm_key)

        # ---------- 1) 填 state ----------
        mask_state_na = X['state'].isna()
        if mask_state_na.any():
            for idx in X.index[mask_state_na]:
                city_k = X.at[idx, 'city_key']
                lat    = X.at[idx, 'latitude']
                lon    = X.at[idx, 'longitude']
                val = np.nan

                # 用该 city 的 state 众数
                if city_k is not pd.NA and city_k in self.state_mode_by_city_.index:
                    val = self.state_mode_by_city_.loc[city_k]
                    # 若有经纬度，尝试同 city 最近邻校正
                    nr = self._nearest_row(self.train_, lat, lon, mask=(self.train_['city_key'] == city_k))
                    if nr is not None and not nr['state'].isna().all():
                        val = nr['state'].iloc[0]

                # 仅用经纬度最近邻
                if pd.isna(val):
                    nr = self._nearest_row(self.train_, lat, lon)
                    if nr is not None and not nr['state'].isna().all():
                        val = nr['state'].iloc[0]

                # 全局众数兜底
                if pd.isna(val):
                    val = self.global_state_mode_

                X.at[idx, 'state'] = val
                X.at[idx, 'state_key'] = self._norm_key(val)

        # ---------- 2) 填 cityname ----------
        mask_city_na = X['cityname'].isna()
        if mask_city_na.any():
            for idx in X.index[mask_city_na]:
                st_k = X.at[idx, 'state_key']
                lat  = X.at[idx, 'latitude']
                lon  = X.at[idx, 'longitude']
                val = np.nan

                if st_k is not pd.NA:
                    # 先在该 state 内最近邻
                    nr = self._nearest_row(self.train_, lat, lon, mask=(self.train_['state_key'] == st_k))
                    if nr is not None and not nr['cityname'].isna().all():
                        val = nr['cityname'].iloc[0]
                    # 再用该 state 的众数
                    if pd.isna(val) and st_k in self.city_mode_by_state_.index:
                        val = self.city_mode_by_state_.loc[st_k]

                # 全局众数兜底
                if pd.isna(val):
                    val = self.global_city_mode_

                X.at[idx, 'cityname'] = val
                X.at[idx, 'city_key'] = self._norm_key(val)

        # ---------- 3) 填经纬度 ----------
        X['latitude']  = pd.to_numeric(X['latitude'], errors='coerce')
        X['longitude'] = pd.to_numeric(X['longitude'], errors='coerce')

        for idx in X.index:
            st_k = X.at[idx, 'state_key']
            ct_k = X.at[idx, 'city_key']

            if pd.isna(X.at[idx, 'latitude']):
                val_lat = np.nan
                if (st_k is not pd.NA) and (ct_k is not pd.NA):
                    try:
                        val_lat = self.mean_latlon_by_pair_.loc[(st_k, ct_k), 'latitude']
                    except KeyError:
                        pass
                if pd.isna(val_lat):
                    val_lat = self.global_lat_mean_
                X.at[idx, 'latitude'] = val_lat

            if pd.isna(X.at[idx, 'longitude']):
                val_lon = np.nan
                if (st_k is not pd.NA) and (ct_k is not pd.NA):
                    try:
                        val_lon = self.mean_latlon_by_pair_.loc[(st_k, ct_k), 'longitude']
                    except KeyError:
                        pass
                if pd.isna(val_lon):
                    val_lon = self.global_lon_mean_
                X.at[idx, 'longitude'] = val_lon

        X.drop(columns=['city_key','state_key'], inplace=True, errors='ignore')
        return X

In [13]:
def _geo_to_typed(X):
    # 兼容 ndarray / DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=geo_cols)
    df = X.copy()
    # 经纬度：非数字 -> NaN
    df['latitude']  = pd.to_numeric(df['latitude'], errors='coerce')
    df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
    # 文本列：去空白，空串 -> NaN
    for c in ['cityname', 'state']:
        df[c] = (df[c].astype('string')
                        .str.strip()
                        .replace({'': pd.NA}))
    return df

geo_to_typed = FunctionTransformer(_geo_to_typed)

In [14]:
geo_pipe = Pipeline(steps=[
    ('to_typed', geo_to_typed),
    ('impute_geo', GeoCityStateImputer()),
    ('make_city_state', FunctionTransformer(
        lambda X: X.assign(
            city_state=(
                X['cityname'].astype('string').str.strip() + ', ' + X['state'].astype('string').str.strip()
            )
        )
    )),
    ('encode', ColumnTransformer(
        transformers=[
            # 对合并列做独热编码
            ('ohe_city_state', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.05), ['city_state']),
            # 经纬度保留为数值特征
            ('pass_latlon', 'passthrough', ['latitude', 'longitude']),
        ],
        remainder='drop'
    ))
])

处理fee, has_photo和source列

In [15]:
fee_photo_source_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.05))
])

处理time列

In [16]:
class TimeEncoder(BaseEstimator, TransformerMixin):
    """
    时间戳 -> [sin(month), cos(month), days_since_first]
    """
    def __init__(self):
        self.t0_ = None

    def fit(self, X, y=None):
        x = pd.to_datetime(pd.Series(np.ravel(X)), errors="coerce", utc=True)
        if x.notna().any():
            self.t0_ = x.min()
        else:
            # 极端：全缺失，给个稳定兜底
            self.t0_ = pd.Timestamp("1970-01-01", tz="UTC")
        return self

    def transform(self, X):
        x = pd.to_datetime(pd.Series(np.ravel(X)), errors="coerce", utc=True).fillna(self.t0_)
        month = x.dt.month.to_numpy()
        sin_month = np.sin(2*np.pi*month/12.0)
        cos_month = np.cos(2*np.pi*month/12.0)
        days_since = ((x - self.t0_).dt.total_seconds()/86400.0).to_numpy()
        return np.vstack([sin_month, cos_month, days_since]).T

    def get_feature_names_out(self, input_features=None):
        return np.array(["time_sin_month", "time_cos_month", "time_days_since_first"])

In [17]:
time_pipe = Pipeline(steps=[
    ("encode", TimeEncoder())
])

In [18]:
bed_bath_sqft_cols = ['bedrooms','bathrooms','square_feet']
geo_cols = ['latitude', 'longitude', 'cityname', 'state']

preprocessor = ColumnTransformer(
    transformers=[
        ('category', category_pipe, ['category']),
        ('text', title_body_pipeline(title_weight=3.0), ['title', 'body']),
        ('amenities', amenities_pipe, ['amenities']),
        ('fee_photo_source', fee_photo_source_pipe, ['fee', 'has_photo', 'source']),
        ('pets', pets_allowed_pipe, ['pets_allowed']),
        ('layout', bed_bath_sqft_pipe, bed_bath_sqft_cols),
        ('geo', geo_pipe, geo_cols),
        ('time', time_pipe, ['time']),
    ],
    remainder='drop'
)

In [19]:
full_pipe_xgb = Pipeline([
    ('preprocess', preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:absoluteerror',
        eval_metric='mae',
        subsample=0.8,
        reg_alpha=0.1, 
        reg_lambda=5.0,
        max_bin=128,
        min_child_weight=6.0,
        tree_method="hist",
        device="cuda",
        n_jobs=-1
    ))
])

In [20]:
param_grid_xgb = {
    'xgb__max_depth':       [2, 3, 4],
    'xgb__n_estimators':    [2000, 2200, 2400, 2600, 2800],
    'xgb__colsample_bytree':[0.25, 0.26, 0.27]
}

In [21]:
grid_xgb = HalvingGridSearchCV(
    estimator=full_pipe_xgb,
    param_grid=param_grid_xgb,
    cv=5,
    scoring='neg_mean_absolute_error',
    refit=True,
    verbose=1,
    random_state=RANDOM_STATE
)

In [22]:
df_train = pd.read_csv('train.csv')
        
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
print(f"训练数据加载完成，包含 {len(df_train)} 条记录。")

训练数据加载完成，包含 63663 条记录。


In [23]:
grid_xgb.fit(X_train, y_train)

print("\n✅ 训练完成！")
print("-" * 50)
print("📊 交叉验证结果:")
print(f"  - 最佳参数: {grid_xgb.best_params_}")
print(f"  - 最佳交叉验证 MAE: {-grid_xgb.best_score_:.4f}")
print("-" * 50)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 2357
max_resources_: 63663
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 2357
Fitting 5 folds for each of 45 candidates, totalling 225 fits
----------
iter: 1
n_candidates: 15
n_resources: 7071
Fitting 5 folds for each of 15 candidates, totalling 75 fits
----------
iter: 2
n_candidates: 5
n_resources: 21213
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 63639
Fitting 5 folds for each of 2 candidates, totalling 10 fits

✅ 训练完成！
--------------------------------------------------
📊 交叉验证结果:
  - 最佳参数: {'xgb__colsample_bytree': 0.26, 'xgb__max_depth': 2, 'xgb__n_estimators': 2800}
  - 最佳交叉验证 MAE: 203.5634
--------------------------------------------------


In [26]:
df_test = pd.read_csv('test.csv')

if 'price' not in df_test.columns:
    raise ValueError("test.csv 中未找到目标列 'price'。请确认测试集包含真实的 price 以便计算指标。")

X_test = df_test.drop(columns=['price'])
y_test = df_test['price'].to_numpy()

# 预测
y_pred = grid_xgb.predict(X_test)

# 指标：MAE
mae = mean_absolute_error(y_test, y_pred)

# 指标：MAPE（对 y_true=0 的样本进行掩码，避免除零）
mask = y_test != 0
if mask.sum() == 0:
    mape = np.nan
    mape_note = "（所有真实值为0，MAPE无法计算）"
else:
    mape = np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100
    mape_note = ""

print("🧪 测试集评估：")
print(f"  - MAE : {mae:.4f}")
print(f"  - MAPE: {mape:.2f}% {mape_note}")

🧪 测试集评估：
  - MAE : 196.9422
  - MAPE: 12.10% 
