<a href="https://colab.research.google.com/github/NZLouislu/nzlouis-property-ai-engine/blob/main/notebooks/Wellington_Property_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a># Wellington房产预测模型 - 基于真实数据

这个notebook使用real_estate表和properties表中的真实数据进行训练，预测房产价格。

## 1. 环境设置和依赖安装

In [1]:
# 安装必要的包
!pip install pandas numpy scikit-learn matplotlib seaborn supabase python-dotenv

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
from supabase import create_client

Collecting supabase
  Downloading supabase-2.20.0-py3-none-any.whl.metadata (4.5 kB)
Collecting realtime (from supabase)
  Downloading realtime-2.20.0-py3-none-any.whl.metadata (6.9 kB)
Collecting supabase-functions (from supabase)
  Downloading supabase_functions-2.20.0-py3-none-any.whl.metadata (2.2 kB)
Collecting storage3 (from supabase)
  Downloading storage3-2.20.0-py3-none-any.whl.metadata (2.0 kB)
Collecting supabase-auth (from supabase)
  Downloading supabase_auth-2.20.0-py3-none-any.whl.metadata (6.3 kB)
Collecting postgrest (from supabase)
  Downloading postgrest-2.20.0-py3-none-any.whl.metadata (3.3 kB)
Collecting deprecation>=2.1.0 (from postgrest->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting strenum>=0.4.15 (from supabase-functions->supabase)
  Downloading StrEnum-0.4.15-py3-none-any.whl.metadata (5.3 kB)
Downloading supabase-2.20.0-py3-none-any.whl (16 kB)
Downloading postgrest-2.20.0-py3-none-any.whl (22 kB)
Downloading real

## 2. 设置Supabase凭据

In [2]:
# 设置Supabase凭据
# 方法1: 使用Colab的secrets功能（推荐）
try:
    from google.colab import userdata
    os.environ['SUPABASE_URL'] = userdata.get('SUPABASE_URL').strip()
    os.environ['SUPABASE_KEY'] = userdata.get('SUPABASE_KEY').strip()
    print("✅ 从Colab secrets加载数据库配置")
except:
    print("⚠️ 未找到Colab secrets，请手动设置SUPABASE_URL和SUPABASE_KEY")

    # 方法2: 直接设置（不推荐用于生产环境）
    # os.environ['SUPABASE_URL'] = 'your_supabase_url_here'
    # os.environ['SUPABASE_KEY'] = 'your_supabase_key_here'

# 创建Supabase客户端
def create_supabase_client():
    """创建Supabase客户端"""
    try:
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")

        if not url or not key:
            raise ValueError("SUPABASE_URL和SUPABASE_KEY环境变量必须设置")

        return create_client(url, key)
    except Exception as e:
        print(f"❌ 创建Supabase客户端失败: {e}")
        return None

supabase_client = create_supabase_client()
if supabase_client:
    print("✅ 数据库连接成功")
else:
    print("❌ 数据库连接失败")

✅ 从Colab secrets加载数据库配置
✅ 数据库连接成功


## 3. 从real_estate表和properties表获取数据

In [10]:
# 从properties表获取数据
print("🔄 从properties表获取数据...")
try:
    properties_data = supabase_client.table('properties').select('*').execute()
    properties_df = pd.DataFrame(properties_data.data)
    print(f"✅ 成功获取 {len(properties_df)} 条properties记录")
    print(f"📋 properties数据列: {list(properties_df.columns)}")
    print("Properties表前5行数据:")
    display(properties_df.head())

    # Clean floor_size and land_area
    def clean_size(size_str):
        if isinstance(size_str, str):
            # Attempt to remove ' m2', commas, and handle potential ranges or non-numeric inputs
            size_str = size_str.replace(' m2', '').replace(',', '').strip()
            # Try converting to float, coercing errors to NaN
            return pd.to_numeric(size_str, errors='coerce')
        return size_str

    if 'floor_size' in properties_df.columns:
        properties_df['floor_size'] = properties_df['floor_size'].apply(clean_size)
        print("✅ 清理 floor_size")

    if 'land_area' in properties_df.columns:
        properties_df['land_area'] = properties_df['land_area'].apply(clean_size)
        print("✅ 清理 land_area")

    # Save the processed data (optional, but good practice)
    properties_df.to_csv('processed_property_data.csv', index=False)
    print("✅ 数据已保存到 processed_property_data.csv")

except Exception as e:
    print(f"❌ 获取数据时发生错误: {e}")

# Assign properties_df to merged_df for subsequent cells that expect it
merged_df = properties_df

🔄 从properties表获取数据...
✅ 成功获取 1000 条properties记录
📋 properties数据列: ['id', 'address', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status', 'property_history', 'normalized_address', 'property_url', 'created_at', 'region', 'cover_image_url']
Properties表前5行数据:


Unnamed: 0,id,address,suburb,city,postcode,year_built,bedrooms,bathrooms,car_spaces,floor_size,...,improvement_value,has_rental_history,is_currently_rented,status,property_history,normalized_address,property_url,created_at,region,cover_image_url
0,f92f77b8cab0bb9c18507ab3c57b2c13,"Aitken Street Apartments, Thorndon, 6011",Thorndon,Wellington City,6011,2005.0,1.0,1.0,,21 m2,...,120000.0,False,False,,Historical data migrated - contains transactio...,Aitken Street Apartments,https://propertyvalue.co.nz/wellington/welling...,2025-06-05T00:53:13.409709+00:00,Wellington,
1,f92f7bde532ebc3cb8ff2c63e503c63d,"11 Brian Place, Titahi Bay, 5022",Titahi Bay,Porirua City,5022,1973.0,4.0,1.0,2.0,200 m2,...,200000.0,False,False,,Historical data migrated - contains transactio...,"Brian Place, Titahi Bay, 5022",https://propertyvalue.co.nz/wellington/porirua...,2025-05-30T13:44:52.345112+00:00,Wellington,
2,f930fdb42f169d3ca4b1c883d9d29202,"St Peters Apartments, Te Aro, 6011",Te Aro,Wellington City,6011,2000.0,1.0,1.0,,51 m2,...,265000.0,False,False,,Historical data migrated - contains transactio...,St Peters Apartments,https://propertyvalue.co.nz/wellington/welling...,2025-06-09T10:02:47.035917+00:00,Wellington,
3,f931dbc16dbec808f948718776fca296,"77 Fraser Crescent, Elderslea, 5018",Elderslea,Upper Hutt City,5018,1964.0,3.0,1.0,3.0,140 m2,...,250000.0,False,False,,Historical data migrated - contains transactio...,"Fraser Crescent, Elderslea, 5018",https://propertyvalue.co.nz/wellington/upper-h...,2025-05-31T10:44:41.448545+00:00,Wellington,
4,f9320dc5db7c51047c89bb282056367a,"404/2 Colombo Street, Newtown, 6021",Newtown,Wellington City,6021,2021.0,1.0,1.0,,38 m2,...,320000.0,False,False,,Historical data migrated - contains transactio...,"Colombo Street, Newtown, 6021",https://propertyvalue.co.nz/wellington/welling...,2025-06-05T01:20:13.989957+00:00,Wellington,


✅ 清理 floor_size
✅ 清理 land_area
✅ 数据已保存到 processed_property_data.csv


## 4. 特征工程

In [17]:
# 特征工程
print("🔄 开始特征工程...")

# Drop rows where the target variable is missing
initial_rows = merged_df.shape[0]
merged_df.dropna(subset=['last_sold_price'], inplace=True)
rows_after_dropping_target_na = merged_df.shape[0]
print(f"✅ 删除缺失 last_sold_price 的记录，剩余 {rows_after_dropping_target_na} 条记录 (删除了 {initial_rows - rows_after_dropping_target_na} 条)")

# Reset index after dropping rows
merged_df.reset_index(drop=True, inplace=True)
print("✅ 重置 DataFrame 索引")

# Process categorical features
if 'suburb' in merged_df.columns:
    # Handle potential NaN values in 'suburb' before one-hot encoding
    merged_df['suburb'] = merged_df['suburb'].fillna('Unknown')
    suburb_dummies = pd.get_dummies(merged_df['suburb'], prefix='suburb')
    merged_df = pd.concat([merged_df, suburb_dummies], axis=1)
    merged_df.drop('suburb', axis=1, inplace=True)
    print(f"✅ 对suburb进行独热编码，新增 {len(suburb_dummies.columns)} 个特征")


# Add new features
if 'year_built' in merged_df.columns:
    # Ensure year_built is numeric before imputing and calculating age
    merged_df['year_built'] = pd.to_numeric(merged_df['year_built'], errors='coerce')
    if merged_df['year_built'].isnull().any():
        year_built_median = merged_df['year_built'].median()
        merged_df['year_built'].fillna(year_built_median, inplace=True)
        print(f"✅ 使用中位数 {year_built_median:.0f} 填充缺失的 year_built")
    merged_df['property_age'] = 2025 - merged_df['year_built']
    print("✅ 添加特征: property_age (房产年龄)")

if 'bedrooms' in merged_df.columns and 'bathrooms' in merged_df.columns:
    # Ensure bedrooms and bathrooms are numeric before imputing and calculating ratio
    merged_df['bedrooms'] = pd.to_numeric(merged_df['bedrooms'], errors='coerce')
    merged_df['bathrooms'] = pd.to_numeric(merged_df['bathrooms'], errors='coerce')

    # Impute missing 'bedrooms' and 'bathrooms' before creating the ratio
    if merged_df['bedrooms'].isnull().any():
        bedrooms_median = merged_df['bedrooms'].median()
        merged_df['bedrooms'].fillna(bedrooms_median, inplace=True)
        print(f"✅ 使用中位数 {bedrooms_median:.0f} 填充缺失的 bedrooms")
    if merged_df['bathrooms'].isnull().any():
        bathrooms_median = merged_df['bathrooms'].median()
        merged_df['bathrooms'].fillna(bathrooms_median, inplace=True)
        print(f"✅ 使用中位数 {bathrooms_median:.0f} 填充缺失的 bathrooms")

    # Avoid division by zero
    merged_df['bedroom_bathroom_ratio'] = merged_df['bedrooms'] / merged_df['bathrooms'].replace(0, 1)
    print("✅ 添加特征: bedroom_bathroom_ratio (卧室与浴室比例)")


# Ensure floor_size and bedrooms are numeric before calculations
if 'floor_size' in merged_df.columns and 'last_sold_price' in merged_df.columns:
    # Explicitly convert to numeric before calculations
    merged_df['floor_size'] = pd.to_numeric(merged_df['floor_size'], errors='coerce')
    # Impute missing 'floor_size' before creating price_per_sqm and sqm_per_bedroom
    if merged_df['floor_size'].isnull().any():
        floor_size_median = merged_df['floor_size'].median()
        merged_df['floor_size'].fillna(floor_size_median, inplace=True)
        print(f"✅ 使用中位数 {floor_size_median:.2f} 填充缺失的 floor_size")

    # Calculate price_per_sqm using vectorized operations, handling division by zero and non-positive floor_size
    merged_df['price_per_sqm'] = 0
    valid_floor_size_mask = np.greater(merged_df['floor_size'].values, 0) # Convert to numpy array
    merged_df.loc[valid_floor_size_mask, 'price_per_sqm'] = merged_df.loc[valid_floor_size_mask, 'last_sold_price'] / merged_df.loc[valid_floor_size_mask, 'floor_size']
    print("✅ 添加特征: price_per_sqm (每平方米价格)")


if 'bedrooms' in merged_df.columns and 'floor_size' in merged_df.columns:
     # Explicitly convert bedrooms to numeric before calculations
    merged_df['bedrooms'] = pd.to_numeric(merged_df['bedrooms'], errors='coerce')

    # Check data types before calculation
    print(f"Debug: Data type of 'bedrooms' before sqm_per_bedroom calculation: {merged_df['bedrooms'].dtype}")
    print(f"Debug: Data type of 'floor_size' before sqm_per_bedroom calculation: {merged_df['floor_size'].dtype}")

     # Only calculate sqm_per_bedroom if bedrooms is positive
    merged_df['sqm_per_bedroom'] = 0
    valid_bedrooms_mask = np.greater(merged_df['bedrooms'].values, 0) # Convert to numpy array
    merged_df.loc[valid_bedrooms_mask, 'sqm_per_bedroom'] = merged_df.loc[valid_bedrooms_mask, 'floor_size'] / merged_df.loc[valid_bedrooms_mask, 'bedrooms']
    print("✅ 添加特征: sqm_per_bedroom (每卧室平方米)")


# Prepare features and target variable
y = merged_df['last_sold_price']
X = merged_df.drop(['last_sold_price', 'id', 'created_at', 'address', 'normalized_address', 'property_history', 'property_url', 'cover_image_url', 'last_sold_date'], axis=1, errors='ignore')


# Check and drop non-numeric features and impute remaining missing numeric values
numeric_cols = X.select_dtypes(include=np.number).columns
non_numeric_cols = X.select_dtypes(exclude=np.number).columns

if len(non_numeric_cols) > 0:
    print(f"⚠️ 删除非数值型特征: {list(non_numeric_cols)}")
    X = X.drop(non_numeric_cols, axis=1)

# Impute missing values in remaining numeric features
for col in X.columns:
    if X[col].isnull().any():
        median_val = X[col].median()
        X[col].fillna(median_val, inplace=True)
        print(f"✅ 使用中位数 {median_val:.2f} 填充特征 {col} 中的缺失值")


# Standardization is done in the next cell after splitting

print(f"✅ 特征工程完成，最终特征数量: {X.shape[1]}")
print(f"📊 特征列表: {list(X.columns)}")

🔄 开始特征工程...
✅ 删除缺失 last_sold_price 的记录，剩余 744 条记录 (删除了 0 条)
✅ 重置 DataFrame 索引
✅ 添加特征: property_age (房产年龄)
✅ 添加特征: bedroom_bathroom_ratio (卧室与浴室比例)
✅ 添加特征: price_per_sqm (每平方米价格)
Debug: Data type of 'bedrooms' before sqm_per_bedroom calculation: float64
Debug: Data type of 'floor_size' before sqm_per_bedroom calculation: float64
✅ 添加特征: sqm_per_bedroom (每卧室平方米)
⚠️ 删除非数值型特征: ['city', 'postcode', 'has_rental_history', 'is_currently_rented', 'status', 'region', 'suburb_Alicetown', 'suburb_Aotea', 'suburb_Aro Valley', 'suburb_Ascot Park', 'suburb_Avalon', 'suburb_Belmont', 'suburb_Berhampore', 'suburb_Birchville', 'suburb_Blue Mountains', 'suburb_Boulcott', 'suburb_Broadmeadows', 'suburb_Brooklyn', 'suburb_Camborne', 'suburb_Cannons Creek', 'suburb_Churton Park', 'suburb_Clouston Park', 'suburb_Crofton Downs', 'suburb_Eastbourne', 'suburb_Ebdentown', 'suburb_Elderslea', 'suburb_Elsdon', 'suburb_Epuni', 'suburb_Fairfield', 'suburb_Glenside', 'suburb_Grenada Village', 'suburb_Harbour View', '

 11052.63157895  7548.07692308  5044.24778761  1785.71428571
  3194.44444444  5294.11764706  8950.          1368.79432624
  4320.98765432  4715.63981043  4500.          5384.61538462
   900.          1538.46153846  3435.71428571  2300.88495575
  2628.74251497  1089.28571429  2807.29166667  5566.66666667
  7000.           700.           857.14285714  2321.6374269
  1310.52631579   514.70588235  1823.07692308  2700.
  6303.03030303 10472.52747253  6418.91891892    70.58823529
 24264.70588235  6420.38216561  5366.66666667  7231.40495868
 10183.33333333  2073.17073171  2940.29850746  2372.88135593
  8455.88235294  2590.90909091  5428.57142857    78.57142857
 10595.23809524  3913.63636364  4342.46575342  3782.60869565
  2152.17391304  7450.          4000.          5700.
  3358.20895522  8055.55555556  2786.36363636  5131.57894737
  4881.88976378  2604.34782609   866.43835616  1005.71428571
  6366.66666667  1000.          7853.33333333  5000.
  4750.          1393.33333333  7058.82352941  95

## 5. 模型训练和评估

In [None]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"✅ 训练集大小: {X_train.shape[0]}，测试集大小: {X_test.shape[0]}")

# 训练随机森林模型
print("🔄 训练随机森林模型...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 评估模型
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"✅ 模型评估:")
print(f"   - 均方误差 (MSE): {mse:.2f}")
print(f"   - 均方根误差 (RMSE): {rmse:.2f}")
print(f"   - R² 分数: {r2:.4f}")

## 6. 特征重要性分析

In [None]:
# 特征重要性分析
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("📊 特征重要性:")
display(feature_importance)

# 可视化特征重要性
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 最重要特征')
plt.tight_layout()
plt.show()

## 7. 预测样本房产

In [None]:
# 创建样本房产
sample_properties = pd.DataFrame([
    {
        'bedrooms': 3,
        'bathrooms': 2,
        'floor_size': 120,
        'year_built': 2000,
        'description': 'Wellington Central的三居室房产'
    },
    {
        'bedrooms': 4,
        'bathrooms': 2,
        'floor_size': 180,
        'year_built': 1990,
        'description': 'Lower Hutt的四居室房产'
    },
    {
        'bedrooms': 2,
        'bathrooms': 1,
        'floor_size': 80,
        'year_built': 2010,
        'description': 'Wellington Central的两居室公寓'
    }
])

# 为样本房产添加相同的特征
if 'property_age' in X.columns:
    sample_properties['property_age'] = 2025 - sample_properties['year_built']

if 'bedroom_bathroom_ratio' in X.columns:
    sample_properties['bedroom_bathroom_ratio'] = sample_properties['bedrooms'] / sample_properties['bathrooms']

if 'sqm_per_bedroom' in X.columns:
    sample_properties['sqm_per_bedroom'] = sample_properties['floor_size'] / sample_properties['bedrooms']

# 添加缺失的特征列
for col in X.columns:
    if col not in sample_properties.columns:
        sample_properties[col] = 0

# 确保列顺序与训练数据相同
sample_properties = sample_properties[X.columns]

# 标准化样本特征
sample_properties_scaled = scaler.transform(sample_properties)

# 预测价格
predicted_prices = model.predict(sample_properties_scaled)

# 显示预测结果
results = pd.DataFrame({
    '描述': ['Wellington Central的三居室房产', 'Lower Hutt的四居室房产', 'Wellington Central的两居室公寓'],
    '预测价格 (NZD)': predicted_prices.round(2)
})

print("📊 样本房产价格预测:")
display(results)

## 8. 保存模型

In [None]:
# 保存模型
model_data = {
    'model': model,
    'scaler': scaler,
    'feature_names': list(X.columns),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'r2_score': r2
}

with open('wellington_property_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

# 在Colab环境中下载模型文件
try:
    from google.colab import files
    files.download('wellington_property_model.pkl')
    print("✅ 模型已保存并准备好下载")
except:
    print("✅ 模型已保存为 wellington_property_model.pkl")