<a href="https://colab.research.google.com/github/NZLouislu/nzlouis-property-ai-engine/blob/main/notebooks/Wellington_Property_Prediction_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wellington房产预测模型 - 基于真实数据 (第2部分)

这个notebook是Wellington房产预测模型的第2部分，包含特征工程、模型训练和预测功能。

## 1. 加载数据和环境设置

In [None]:
# 安装必要的包
!pip install pandas numpy scikit-learn matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

# Load the dataset produced by part 1 of this notebook series.
#
# Sources are tried in this order:
#   1. Colab upload prompt, then read the uploaded CSV.
#   2. Read 'processed_property_data.csv' from the working directory.
#   3. Generate a small synthetic dataset so the remaining cells still run.
#
# The handlers catch `Exception` instead of using a bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed, and the reason for
# each fallback is printed instead of being hidden.
try:
    from google.colab import files
    print("请上传第1部分生成的processed_property_data.csv文件")
    uploaded = files.upload()

    # Load the uploaded data file
    df = pd.read_csv('processed_property_data.csv')
    print(f"✅ 成功加载数据，共 {len(df)} 条记录")
except Exception as upload_error:
    print("⚠️ 无法上传文件或不在Colab环境中，尝试直接读取文件")
    print(f"   原因: {upload_error!r}")
    try:
        df = pd.read_csv('processed_property_data.csv')
        print(f"✅ 成功加载数据，共 {len(df)} 条记录")
    except Exception as read_error:
        print("❌ 无法加载数据，将创建模拟数据")
        print(f"   原因: {read_error!r}")
        # Build synthetic data. A seeded generator keeps the fallback dataset
        # (and therefore every downstream metric) reproducible, matching the
        # fixed random_state used for the split and the model.
        n_samples = 500
        rng = np.random.default_rng(42)
        df = pd.DataFrame({
            'bedrooms': rng.integers(1, 6, size=n_samples),
            'bathrooms': rng.integers(1, 4, size=n_samples),
            'floor_size': rng.integers(50, 300, size=n_samples),
            'year_built': rng.integers(1950, 2023, size=n_samples),
            'suburb': rng.choice(
                ['Wellington Central', 'Lower Hutt', 'Upper Hutt', 'Porirua'],
                size=n_samples,
            ),
        })
        # Synthetic price: linear in bedrooms, bathrooms and floor size, plus
        # Gaussian noise.
        df['last_sold_price'] = (
            df['bedrooms'] * 200000
            + df['bathrooms'] * 100000
            + df['floor_size'] * 2000
            + rng.normal(0, 100000, size=n_samples)
        )

## 2. 特征工程

In [None]:
# Feature engineering: one-hot encode `suburb`, derive ratio/age features,
# then build the feature matrix X, the target y and a standardised copy
# X_scaled (with the fitted `scaler`) for the cells that follow.
print("🔄 开始特征工程...")

# Encode the categorical feature
if 'suburb' in df.columns:
    # dtype=int matters: by default get_dummies yields bool or uint8 indicator
    # columns (depending on the pandas version), and the numeric filter further
    # down previously discarded every one of them right after creating them.
    suburb_dummies = pd.get_dummies(df['suburb'], prefix='suburb', dtype=int)
    df = pd.concat([df, suburb_dummies], axis=1)
    df.drop('suburb', axis=1, inplace=True)
    print(f"✅ 对suburb进行独热编码，新增 {len(suburb_dummies.columns)} 个特征")

# Derived features. Zero denominators are replaced by 1 to avoid inf values.
if 'year_built' in df.columns:
    # The reference year is hard-coded; the sample-prediction cell uses the
    # same constant, so keep the two in sync if this is ever changed.
    df['property_age'] = 2025 - df['year_built']
    print("✅ 添加特征: property_age (房产年龄)")

if 'bedrooms' in df.columns and 'bathrooms' in df.columns:
    df['bedroom_bathroom_ratio'] = df['bedrooms'] / df['bathrooms'].replace(0, 1)
    print("✅ 添加特征: bedroom_bathroom_ratio (卧室与浴室比例)")

if 'last_sold_price' in df.columns and 'floor_size' in df.columns:
    # Kept on df for analysis only: it is computed from the target, so it is
    # excluded from X below.
    df['price_per_sqm'] = df['last_sold_price'] / df['floor_size'].replace(0, 1)
    print("✅ 添加特征: price_per_sqm (每平方米价格)")

if 'bedrooms' in df.columns and 'floor_size' in df.columns:
    df['sqm_per_bedroom'] = df['floor_size'] / df['bedrooms'].replace(0, 1)
    print("✅ 添加特征: sqm_per_bedroom (每卧室平方米)")

# Target and feature matrix.
y = df['last_sold_price']
# 'price_per_sqm' is dropped together with the identifier/metadata columns:
# it is derived from last_sold_price (target leakage) and cannot be known for
# a property whose price is being predicted.
X = df.drop(
    [
        'last_sold_price', 'price_per_sqm',
        'id', 'created_at', 'updated_at',
        'address', 'normalized_address', 'data',
    ],
    axis=1,
    errors='ignore',
)

# Boolean flags are usable as 0/1 features, so convert them instead of
# discarding them with the non-numeric columns.
bool_cols = X.select_dtypes(include=['bool']).columns
if len(bool_cols) > 0:
    X[bool_cols] = X[bool_cols].astype(int)

# Drop whatever is still non-numeric. 'number' covers every numeric width
# (int32, float32, ...), not only int64/float64.
non_numeric_cols = X.select_dtypes(exclude=['number']).columns
if len(non_numeric_cols) > 0:
    print(f"⚠️ 删除非数值型特征: {list(non_numeric_cols)}")
    X = X.drop(non_numeric_cols, axis=1)

# Standardise features.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in the next cell, so test rows influence the scaling statistics.
# Left as is because later cells depend on `scaler` and `X_scaled`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"✅ 特征工程完成，最终特征数量: {X.shape[1]}")
print(f"📊 特征列表: {list(X.columns)}")

## 3. 模型训练和评估

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"✅ 训练集大小: {len(X_train)}，测试集大小: {len(X_test)}")

# Fit a 100-tree random forest on the training portion.
print("🔄 训练随机森林模型...")
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows: MSE, its square root (RMSE) and R².
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n".join([
    "✅ 模型评估:",
    f"   - 均方误差 (MSE): {mse:.2f}",
    f"   - 均方根误差 (RMSE): {rmse:.2f}",
    f"   - R² 分数: {r2:.4f}",
]))

## 4. 特征重要性分析

In [None]:
# Pair every feature name with the importance the fitted forest assigned to
# it, ordered from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("📊 特征重要性:")
display(feature_importance)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.iloc[:10], x='importance', y='feature')
plt.title('Top 10 最重要特征')
plt.tight_layout()
plt.show()

## 5. 预测样本房产

In [None]:
# Predict prices for a few hand-written sample properties using the fitted
# `model` and `scaler` and the training feature layout `X.columns`.
sample_properties = pd.DataFrame([
    {
        'bedrooms': 3,
        'bathrooms': 2,
        'floor_size': 120,
        'year_built': 2000,
        'suburb': 'Wellington Central',
        'description': 'Wellington Central的三居室房产'
    },
    {
        'bedrooms': 4,
        'bathrooms': 2,
        'floor_size': 180,
        'year_built': 1990,
        'suburb': 'Lower Hutt',
        'description': 'Lower Hutt的四居室房产'
    },
    {
        'bedrooms': 2,
        'bathrooms': 1,
        'floor_size': 80,
        'year_built': 2010,
        'suburb': 'Wellington Central',
        'description': 'Wellington Central的两居室公寓'
    }
])

# Keep the labels before the frame is reduced to the model's feature columns;
# the results table reads them from here instead of repeating them by hand.
sample_descriptions = sample_properties['description'].tolist()

# Derive the same engineered features as the training data, including the same
# zero-denominator guard.
if 'property_age' in X.columns:
    sample_properties['property_age'] = 2025 - sample_properties['year_built']

if 'bedroom_bathroom_ratio' in X.columns:
    sample_properties['bedroom_bathroom_ratio'] = (
        sample_properties['bedrooms'] / sample_properties['bathrooms'].replace(0, 1)
    )

if 'sqm_per_bedroom' in X.columns:
    sample_properties['sqm_per_bedroom'] = (
        sample_properties['floor_size'] / sample_properties['bedrooms'].replace(0, 1)
    )

# One-hot encode the sample suburbs with the same 'suburb_' prefix used for
# the training data. Indicator columns the model was not trained on are
# discarded by the reindex below.
sample_suburb_dummies = pd.get_dummies(sample_properties['suburb'], prefix='suburb', dtype=int)
sample_properties = pd.concat([sample_properties, sample_suburb_dummies], axis=1)

# Report the training features the samples cannot provide instead of filling
# them silently.
missing_features = [col for col in X.columns if col not in sample_properties.columns]
if missing_features:
    print(f"⚠️ 样本缺少以下特征，已用0填充: {missing_features}")

# Align to the training layout in one step: same columns, same order, absent
# columns filled with 0, extra columns (description, suburb, ...) dropped.
sample_properties = sample_properties.reindex(columns=X.columns, fill_value=0)

# Apply the scaling fitted on the training features
sample_properties_scaled = scaler.transform(sample_properties)

# Predict prices
predicted_prices = model.predict(sample_properties_scaled)

# Show the predictions next to their descriptions
results = pd.DataFrame({
    '描述': sample_descriptions,
    '预测价格 (NZD)': predicted_prices.round(2)
})

print("📊 样本房产价格预测:")
display(results)

## 6. 保存模型

In [None]:
# Persist the trained model together with what is needed to reuse it: the
# fitted scaler, the feature order, the training date and the R² score.
import pickle

model_data = {
    'model': model,
    'scaler': scaler,
    'feature_names': list(X.columns),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'r2_score': r2
}

# Write the file first so the model is saved in every environment.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a source you trust.
with open('wellington_property_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

# The browser download only exists in Colab. Importing it after the save (and
# tolerating its absence) keeps this cell usable in a local Jupyter session,
# consistent with the data-loading cell's non-Colab fallback.
try:
    from google.colab import files
except ImportError:
    print("✅ 模型已保存到 wellington_property_model.pkl（非Colab环境，跳过下载）")
else:
    files.download('wellington_property_model.pkl')
    print("✅ 模型已保存并准备好下载")