<a href="https://colab.research.google.com/github/NZLouislu/nzlouis-property-ai-engine/blob/main/notebooks/Wellington_Property_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a># Wellington房产预测模型 - 基于真实数据

这个notebook使用real_estate表和properties表中的真实数据进行训练，预测房产价格。

## 1. 环境设置和依赖安装

In [6]:
# 安装必要的包
!pip install pandas numpy scikit-learn matplotlib seaborn supabase python-dotenv

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
from supabase import create_client




[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: E:\Next.js\realEstate\property-forecast-system\property_forecast_env\Scripts\python.exe -m pip install --upgrade pip


## 2. 设置Supabase凭据

In [7]:
# 设置Supabase凭据
# 方法1: 使用Colab的secrets功能（推荐）
try:
    from google.colab import userdata
    os.environ['SUPABASE_URL'] = userdata.get('SUPABASE_URL').strip()
    os.environ['SUPABASE_KEY'] = userdata.get('SUPABASE_KEY').strip()
    print("✅ 从Colab secrets加载数据库配置")
except:
    print("⚠️ 未找到Colab secrets，请手动设置SUPABASE_URL和SUPABASE_KEY")

    # 方法2: 直接设置（不推荐用于生产环境）
    # os.environ['SUPABASE_URL'] = 'your_supabase_url_here'
    # os.environ['SUPABASE_KEY'] = 'your_supabase_key_here'

# 创建Supabase客户端
def create_supabase_client():
    """创建Supabase客户端"""
    try:
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")

        if not url or not key:
            raise ValueError("SUPABASE_URL和SUPABASE_KEY环境变量必须设置")

        return create_client(url, key)
    except Exception as e:
        print(f"❌ 创建Supabase客户端失败: {e}")
        return None

supabase_client = create_supabase_client()
if supabase_client:
    print("✅ 数据库连接成功")
else:
    print("❌ 数据库连接失败")

⚠️ 未找到Colab secrets，请手动设置SUPABASE_URL和SUPABASE_KEY
✅ 数据库连接成功


## 3. 从real_estate表和properties表获取数据

In [8]:
# 从real_estate表获取数据
print("🔄 从real_estate表获取数据...")
try:
    real_estate_data = supabase_client.table('real_estate').select('*').execute()
    real_estate_df = pd.DataFrame(real_estate_data.data)
    print(f"✅ 成功获取 {len(real_estate_df)} 条real_estate记录")
    print(f"📋 real_estate数据列: {list(real_estate_df.columns)}")
    print("Real Estate表前5行数据:")
    display(real_estate_df.head())
except Exception as e:
    print(f"❌ 获取数据时发生错误: {e}")

# 从properties表获取数据
print("\n🔄 从properties表获取数据...")
try:
    properties_data = supabase_client.table('properties').select('*').execute()
    properties_df = pd.DataFrame(properties_data.data)
    print(f"✅ 成功获取 {len(properties_df)} 条properties记录")
    print(f"📋 properties数据列: {list(properties_df.columns)}")
    print("Properties表前5行数据:")
    display(properties_df.head())
except Exception as e:
    print(f"❌ 获取数据时发生错误: {e}")

# 合并两个表的数据
print("\n🔄 合并real_estate和properties表数据...")
try:
    merged_df = pd.merge(
        real_estate_df,
        properties_df,
        left_on='normalized_lead_address',
        right_on='normalized_address',
        how='inner'
    )
    print(f"✅ 合并后的数据: {len(merged_df)} 条记录")
    print(f"?? 合并后的数据列: {list(merged_df.columns)}")
    print("合并后的数据前5行:")
    display(merged_df.head())
    
    # 保存合并后的数据
    merged_df.to_csv('processed_property_data.csv', index=False)
    print("✅ 数据已保存到 processed_property_data.csv")
except Exception as e:
    print(f"❌ 合并数据时发生错误: {e}")

🔄 从real_estate表获取数据...
✅ 成功获取 1000 条real_estate记录
📋 real_estate数据列: ['id', 'address', 'status', 'data', 'normalized_lead_address']
Real Estate表前5行数据:
✅ 成功获取 1000 条real_estate记录
📋 real_estate数据列: ['id', 'address', 'status', 'data', 'normalized_lead_address']
Real Estate表前5行数据:


Unnamed: 0,id,address,status,data,normalized_lead_address
0,beac910ca5f6eb78c64318622ff39931,"51 Leicester Street, Cannons Creek",for Sale,2025-05-27T11:13:56.813162,51leicesterstreet
1,8eea86691c11dbcbf8e140d0636db31e,"1/35 Torrens Terrace, Mount Cook",for Sale,2025-05-27T11:13:57.969601,1/35torrensterrace
2,6df140516d55c84c2485da82244a0dd4,"159 Sievers Grove, Cannons Creek",for Sale,2025-05-27T11:13:59.106338,159sieversgrove
3,1be5736c077e8e27b4c2e0067865aecf,"15 Kiriwai Road, Paremata",for Sale,2025-05-27T11:14:00.286188,15kiriwairoad
4,98edc5b4a6301279d9c46aca4ddb8117,"80 Greenwood Boulevard, Otaki",for Sale,2025-05-27T11:14:01.430086,80greenwoodboulevard



🔄 从properties表获取数据...
✅ 成功获取 1000 条properties记录
📋 properties数据列: ['id', 'address', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status', 'property_history', 'normalized_address', 'property_url', 'created_at', 'region', 'cover_image_url']
Properties表前5行数据:
✅ 成功获取 1000 条properties记录
📋 properties数据列: ['id', 'address', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status', 'property_history', 'normalized_address', 'property_url', 'created_at', 'region', 'cover_image_url']
Properties表前5行数据:


Unnamed: 0,id,address,suburb,city,postcode,year_built,bedrooms,bathrooms,car_spaces,floor_size,...,improvement_value,has_rental_history,is_currently_rented,status,property_history,normalized_address,property_url,created_at,region,cover_image_url
0,f3258321aa04a5e6548b94eaa2c755df,"6 Bledisloe Crescent, Wainuiomata, 5014",Wainuiomata,Lower Hutt City,5014,1954.0,3.0,1.0,2.0,110 m2,...,145000.0,False,False,,Historical data migrated - contains transactio...,"Bledisloe Crescent, Wainuiomata, 5014",https://propertyvalue.co.nz/wellington/lower-h...,2025-05-31T04:56:34.629347+00:00,Wellington,
1,f32635a1bdcf4eadce4e3e58b8b29048,"9 Baylands Drive, Newlands, 6037",Newlands,Wellington City,6037,1964.0,3.0,1.0,1.0,150 m2,...,345000.0,False,False,,Historical data migrated - contains transactio...,"Baylands Drive, Newlands, 6037",https://propertyvalue.co.nz/wellington/welling...,2025-06-05T02:31:33.78117+00:00,Wellington,
2,f32650bc505663d705cde9d6e1bb3137,"41b Herewini Street, Titahi Bay, 5022",Titahi Bay,Porirua City,5022,1999.0,3.0,2.0,2.0,147 m2,...,430000.0,False,False,,Historical data migrated - contains transactio...,"Herewini Street, Titahi Bay, 5022",https://propertyvalue.co.nz/wellington/porirua...,2025-05-30T14:28:32.778597+00:00,Wellington,
3,f3267f01d056e75b8d7498428dee651d,"12 Willoughby Street, Woburn, 5010",Woburn,Lower Hutt City,5010,1961.0,3.0,1.0,2.0,250 m2,...,260000.0,False,False,,Historical data migrated - contains transactio...,"Willoughby Street, Woburn, 5010",https://propertyvalue.co.nz/wellington/lower-h...,2025-05-31T00:51:18.237223+00:00,Wellington,
4,f326f7a0e7e1659cec52d2f920e2c10f,"3/131 Queens Drive, Lyall Bay, 6022",Lyall Bay,Wellington City,6022,2003.0,3.0,1.0,1.0,99 m2,...,450000.0,False,False,,Historical data migrated - contains transactio...,"Queens Drive, Lyall Bay, 6022",https://propertyvalue.co.nz/wellington/welling...,2025-06-05T03:30:41.611954+00:00,Wellington,



🔄 合并real_estate和properties表数据...
✅ 合并后的数据: 0 条记录
?? 合并后的数据列: ['id_x', 'address_x', 'status_x', 'data', 'normalized_lead_address', 'id_y', 'address_y', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status_y', 'property_history', 'normalized_address', 'property_url', 'created_at', 'region', 'cover_image_url']
合并后的数据前5行:


Unnamed: 0,id_x,address_x,status_x,data,normalized_lead_address,id_y,address_y,suburb,city,postcode,...,improvement_value,has_rental_history,is_currently_rented,status_y,property_history,normalized_address,property_url,created_at,region,cover_image_url


✅ 数据已保存到 processed_property_data.csv


## 4. 特征工程

In [9]:
# 特征工程
print("🔄 开始特征工程...")

# 处理类别型特征
if 'suburb' in merged_df.columns:
    suburb_dummies = pd.get_dummies(merged_df['suburb'], prefix='suburb')
    merged_df = pd.concat([merged_df, suburb_dummies], axis=1)
    merged_df.drop('suburb', axis=1, inplace=True)
    print(f"✅ 对suburb进行独热编码，新增 {len(suburb_dummies.columns)} 个特征")

# 添加新特征
if 'year_built' in merged_df.columns:
    merged_df['property_age'] = 2025 - merged_df['year_built']
    print("✅ 添加特征: property_age (房产年龄)")

if 'bedrooms' in merged_df.columns and 'bathrooms' in merged_df.columns:
    merged_df['bedroom_bathroom_ratio'] = merged_df['bedrooms'] / merged_df['bathrooms'].replace(0, 1)
    print("✅ 添加特征: bedroom_bathroom_ratio (卧室与浴室比例)")

if 'last_sold_price' in merged_df.columns and 'floor_size' in merged_df.columns:
    merged_df['price_per_sqm'] = merged_df['last_sold_price'] / merged_df['floor_size'].replace(0, 1)
    print("✅ 添加特征: price_per_sqm (每平方米价格)")

if 'bedrooms' in merged_df.columns and 'floor_size' in merged_df.columns:
    merged_df['sqm_per_bedroom'] = merged_df['floor_size'] / merged_df['bedrooms'].replace(0, 1)
    print("✅ 添加特征: sqm_per_bedroom (每卧室平方米)")

# 准备特征和目标变量
y = merged_df['last_sold_price']
X = merged_df.drop(['last_sold_price', 'id', 'created_at', 'updated_at', 'address', 'normalized_address', 'data'], axis=1, errors='ignore')

# 检查并删除非数值型特征
non_numeric_cols = X.select_dtypes(exclude=['int64', 'float64']).columns
if len(non_numeric_cols) > 0:
    print(f"⚠️ 删除非数值型特征: {list(non_numeric_cols)}")
    X = X.drop(non_numeric_cols, axis=1)

# 标准化特征
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"✅ 特征工程完成，最终特征数量: {X.shape[1]}")
print(f"📊 特征列表: {list(X.columns)}")

🔄 开始特征工程...
✅ 对suburb进行独热编码，新增 0 个特征
✅ 添加特征: property_age (房产年龄)
✅ 添加特征: bedroom_bathroom_ratio (卧室与浴室比例)
✅ 添加特征: price_per_sqm (每平方米价格)
✅ 添加特征: sqm_per_bedroom (每卧室平方米)
⚠️ 删除非数值型特征: ['id_x', 'address_x', 'status_x', 'normalized_lead_address', 'id_y', 'address_y', 'city', 'postcode', 'floor_size', 'land_area', 'last_sold_date', 'has_rental_history', 'is_currently_rented', 'status_y', 'property_history', 'property_url', 'region', 'cover_image_url', 'price_per_sqm', 'sqm_per_bedroom']


ValueError: Found array with 0 sample(s) (shape=(0, 9)) while a minimum of 1 is required by StandardScaler.

## 5. 模型训练和评估

In [None]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"✅ 训练集大小: {X_train.shape[0]}，测试集大小: {X_test.shape[0]}")

# 训练随机森林模型
print("🔄 训练随机森林模型...")
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 评估模型
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"✅ 模型评估:")
print(f"   - 均方误差 (MSE): {mse:.2f}")
print(f"   - 均方根误差 (RMSE): {rmse:.2f}")
print(f"   - R² 分数: {r2:.4f}")

## 6. 特征重要性分析

In [None]:
# 特征重要性分析
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("📊 特征重要性:")
display(feature_importance)

# 可视化特征重要性
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 最重要特征')
plt.tight_layout()
plt.show()

## 7. 预测样本房产

In [None]:
# 创建样本房产
sample_properties = pd.DataFrame([
    {
        'bedrooms': 3,
        'bathrooms': 2,
        'floor_size': 120,
        'year_built': 2000,
        'description': 'Wellington Central的三居室房产'
    },
    {
        'bedrooms': 4,
        'bathrooms': 2,
        'floor_size': 180,
        'year_built': 1990,
        'description': 'Lower Hutt的四居室房产'
    },
    {
        'bedrooms': 2,
        'bathrooms': 1,
        'floor_size': 80,
        'year_built': 2010,
        'description': 'Wellington Central的两居室公寓'
    }
])

# 为样本房产添加相同的特征
if 'property_age' in X.columns:
    sample_properties['property_age'] = 2025 - sample_properties['year_built']

if 'bedroom_bathroom_ratio' in X.columns:
    sample_properties['bedroom_bathroom_ratio'] = sample_properties['bedrooms'] / sample_properties['bathrooms']

if 'sqm_per_bedroom' in X.columns:
    sample_properties['sqm_per_bedroom'] = sample_properties['floor_size'] / sample_properties['bedrooms']

# 添加缺失的特征列
for col in X.columns:
    if col not in sample_properties.columns:
        sample_properties[col] = 0

# 确保列顺序与训练数据相同
sample_properties = sample_properties[X.columns]

# 标准化样本特征
sample_properties_scaled = scaler.transform(sample_properties)

# 预测价格
predicted_prices = model.predict(sample_properties_scaled)

# 显示预测结果
results = pd.DataFrame({
    '描述': ['Wellington Central的三居室房产', 'Lower Hutt的四居室房产', 'Wellington Central的两居室公寓'],
    '预测价格 (NZD)': predicted_prices.round(2)
})

print("📊 样本房产价格预测:")
display(results)

## 8. 保存模型

In [None]:
# 保存模型
model_data = {
    'model': model,
    'scaler': scaler,
    'feature_names': list(X.columns),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'r2_score': r2
}

with open('wellington_property_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

# 在Colab环境中下载模型文件
try:
    from google.colab import files
    files.download('wellington_property_model.pkl')
    print("✅ 模型已保存并准备好下载")
except:
    print("✅ 模型已保存为 wellington_property_model.pkl")