<a href="https://colab.research.google.com/github/NZLouislu/nzlouis-property-ai-engine/blob/main/notebooks/Wellington_Property_Prediction_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a># Wellington房产预测模型 - 基于真实数据 (第1部分)

这个notebook使用real_estate表和properties表中的真实数据进行训练，预测房产价格。

## 1. 环境设置和依赖安装

In [1]:
# 安装必要的包
!pip install pandas numpy scikit-learn matplotlib seaborn supabase python-dotenv

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from supabase import create_client

Collecting supabase
  Downloading supabase-2.20.0-py3-none-any.whl.metadata (4.5 kB)
Collecting realtime (from supabase)
  Downloading realtime-2.20.0-py3-none-any.whl.metadata (6.9 kB)
Collecting supabase-functions (from supabase)
  Downloading supabase_functions-2.20.0-py3-none-any.whl.metadata (2.2 kB)
Collecting storage3 (from supabase)
  Downloading storage3-2.20.0-py3-none-any.whl.metadata (2.0 kB)
Collecting supabase-auth (from supabase)
  Downloading supabase_auth-2.20.0-py3-none-any.whl.metadata (6.3 kB)
Collecting postgrest (from supabase)
  Downloading postgrest-2.20.0-py3-none-any.whl.metadata (3.3 kB)
Collecting deprecation>=2.1.0 (from postgrest->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting strenum>=0.4.15 (from supabase-functions->supabase)
  Downloading StrEnum-0.4.15-py3-none-any.whl.metadata (5.3 kB)
Downloading supabase-2.20.0-py3-none-any.whl (16 kB)
Downloading postgrest-2.20.0-py3-none-any.whl (22 kB)
Downloading real

## 2. 设置Supabase凭据

In [2]:
# 设置Supabase凭据
# 方法1: 使用Colab的secrets功能（推荐）
try:
    from google.colab import userdata
    os.environ['SUPABASE_URL'] = userdata.get('SUPABASE_URL').strip()
    os.environ['SUPABASE_KEY'] = userdata.get('SUPABASE_KEY').strip()
    print("✅ 从Colab secrets加载数据库配置")
except:
    print("⚠️ 未找到Colab secrets，请手动设置SUPABASE_URL和SUPABASE_KEY")

    # 方法2: 直接设置（不推荐用于生产环境）
    # os.environ['SUPABASE_URL'] = 'your_supabase_url_here'
    # os.environ['SUPABASE_KEY'] = 'your_supabase_key_here'

# 创建Supabase客户端
def create_supabase_client():
    """创建Supabase客户端"""
    try:
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")

        if not url or not key:
            raise ValueError("SUPABASE_URL和SUPABASE_KEY环境变量必须设置")

        return create_client(url, key)
    except Exception as e:
        print(f"❌ 创建Supabase客户端失败: {e}")
        return None

supabase_client = create_supabase_client()
if supabase_client:
    print("✅ 数据库连接成功")
else:
    print("❌ 数据库连接失败")

✅ 从Colab secrets加载数据库配置
✅ 数据库连接成功


## 3. 从real_estate表和properties表获取数据

In [3]:
# 从real_estate表获取数据并与properties表关联
print("🔄 从real_estate表获取数据...")
try:
    # 获取real_estate表数据
    real_estate_response = supabase_client.table('real_estate').select('*').execute()

    if real_estate_response.data:
        real_estate_df = pd.DataFrame(real_estate_response.data)
        print(f"✅ 成功获取 {len(real_estate_df)} 条real_estate记录")
        print(f"📋 real_estate数据列: {list(real_estate_df.columns)}")

        # 显示real_estate前5行数据
        print("Real Estate表前5行数据:")
        display(real_estate_df.head())

        # 获取properties表数据
        print("\n🔄 从properties表获取数据...")
        properties_response = supabase_client.table('properties').select('*').execute()

        if properties_response.data:
            properties_df = pd.DataFrame(properties_response.data)
            print(f"✅ 成功获取 {len(properties_df)} 条properties记录")
            print(f"📋 properties数据列: {list(properties_df.columns)}")

            # 显示properties前5行数据
            print("Properties表前5行数据:")
            display(properties_df.head())

            # 合并两个表的数据（基于address字段）
            print("\n🔄 合并real_estate和properties表数据...")

            # 确保address列存在于两个表中
            if 'address' in real_estate_df.columns and 'address' in properties_df.columns:
                # 对地址进行标准化处理（去除空格、转小写）以提高匹配率
                real_estate_df['normalized_address'] = real_estate_df['address'].str.lower().str.strip()
                properties_df['normalized_address'] = properties_df['address'].str.lower().str.strip()

                # 使用标准化后的地址进行合并
                merged_df = pd.merge(
                    real_estate_df,
                    properties_df,
                    left_on='normalized_address',
                    right_on='normalized_address',
                    how='left',
                    suffixes=('', '_prop')
                )

                print(f"✅ 合并后的数据: {len(merged_df)} 条记录")
                print(f"📋 合并后的数据列: {list(merged_df.columns)}")

                # 显示合并后的前5行数据
                print("合并后的数据前5行:")
                display(merged_df.head())

                # 使用合并后的数据
                df = merged_df
            else:
                print("⚠️ 两个表中至少有一个没有address列，无法合并")
                df = real_estate_df
        else:
            print("⚠️ properties表中没有数据，将只使用real_estate表数据")
            df = real_estate_df
    else:
        print("⚠️ real_estate表中没有数据")
except Exception as e:
    print(f"❌ 获取数据时发生错误: {e}")

🔄 从real_estate表获取数据...
✅ 成功获取 1000 条real_estate记录
📋 real_estate数据列: ['id', 'address', 'status', 'data', 'normalized_lead_address']
Real Estate表前5行数据:


Unnamed: 0,id,address,status,data,normalized_lead_address
0,beac910ca5f6eb78c64318622ff39931,"51 Leicester Street, Cannons Creek",for Sale,2025-05-27T11:13:56.813162,51leicesterstreet
1,8eea86691c11dbcbf8e140d0636db31e,"1/35 Torrens Terrace, Mount Cook",for Sale,2025-05-27T11:13:57.969601,1/35torrensterrace
2,6df140516d55c84c2485da82244a0dd4,"159 Sievers Grove, Cannons Creek",for Sale,2025-05-27T11:13:59.106338,159sieversgrove
3,1be5736c077e8e27b4c2e0067865aecf,"15 Kiriwai Road, Paremata",for Sale,2025-05-27T11:14:00.286188,15kiriwairoad
4,98edc5b4a6301279d9c46aca4ddb8117,"80 Greenwood Boulevard, Otaki",for Sale,2025-05-27T11:14:01.430086,80greenwoodboulevard



🔄 从properties表获取数据...
✅ 成功获取 1000 条properties记录
📋 properties数据列: ['id', 'address', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status', 'property_history', 'normalized_address', 'property_url', 'created_at', 'region', 'cover_image_url']
Properties表前5行数据:


Unnamed: 0,id,address,suburb,city,postcode,year_built,bedrooms,bathrooms,car_spaces,floor_size,...,improvement_value,has_rental_history,is_currently_rented,status,property_history,normalized_address,property_url,created_at,region,cover_image_url
0,f0bef14bf96f1176fb18c9edb5619762,"47 Andrew Street, Waikanae, 5036",Waikanae,Kapiti Coast District,5036,1990.0,4.0,1.0,1.0,221 m2,...,430000.0,False,False,,Historical data migrated - contains transactio...,"Andrew Street, Waikanae, 5036",https://propertyvalue.co.nz/wellington/kapiti-...,2025-05-31T09:34:40.150915+00:00,Wellington,
1,f0bef434e8ed78eafd66a104aee18244,"Soho Apartments, Te Aro, 6011",Te Aro,Wellington City,6011,2010.0,1.0,1.0,,34 m2,...,200000.0,False,False,,Historical data migrated - contains transactio...,Soho Apartments,https://propertyvalue.co.nz/wellington/welling...,2025-06-09T05:38:04.095139+00:00,Wellington,
2,f0bf2e51997e92a9e6116a7ec066ebcf,"32 Natone Street, Waitangirua, 5024",Waitangirua,Porirua City,5024,1967.0,3.0,1.0,1.0,105 m2,...,220000.0,False,False,,Historical data migrated - contains transactio...,"Natone Street, Waitangirua, 5024",https://propertyvalue.co.nz/wellington/porirua...,2025-05-30T13:02:11.556991+00:00,Wellington,
3,f0bf4aede1b7d4a21b9ef0ebee09cbae,"34 Leinster Avenue, Raumati South, 5032",Raumati South,Kapiti Coast District,5032,1986.0,3.0,1.0,2.0,198 m2,...,370000.0,False,False,,Historical data migrated - contains transactio...,"Leinster Avenue, Raumati South, 5032",https://propertyvalue.co.nz/wellington/kapiti-...,2025-05-31T15:35:19.622508+00:00,Wellington,
4,f0bf535c9384a7664ae5d83ca618d708,"42 Haumia Street, Johnsonville, 6037",Johnsonville,Wellington City,6037,1949.0,3.0,1.0,1.0,84 m2,...,320000.0,False,False,,Historical data migrated - contains transactio...,"Haumia Street, Johnsonville, 6037",https://propertyvalue.co.nz/wellington/welling...,2025-06-05T07:17:33.51782+00:00,Wellington,



🔄 合并real_estate和properties表数据...
✅ 合并后的数据: 1000 条记录
📋 合并后的数据列: ['id', 'address', 'status', 'data', 'normalized_lead_address', 'normalized_address', 'id_prop', 'address_prop', 'suburb', 'city', 'postcode', 'year_built', 'bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area', 'last_sold_price', 'last_sold_date', 'capital_value', 'land_value', 'improvement_value', 'has_rental_history', 'is_currently_rented', 'status_prop', 'property_history', 'property_url', 'created_at', 'region', 'cover_image_url']
合并后的数据前5行:


Unnamed: 0,id,address,status,data,normalized_lead_address,normalized_address,id_prop,address_prop,suburb,city,...,land_value,improvement_value,has_rental_history,is_currently_rented,status_prop,property_history,property_url,created_at,region,cover_image_url
0,beac910ca5f6eb78c64318622ff39931,"51 Leicester Street, Cannons Creek",for Sale,2025-05-27T11:13:56.813162,51leicesterstreet,"51 leicester street, cannons creek",,,,,...,,,,,,,,,,
1,8eea86691c11dbcbf8e140d0636db31e,"1/35 Torrens Terrace, Mount Cook",for Sale,2025-05-27T11:13:57.969601,1/35torrensterrace,"1/35 torrens terrace, mount cook",,,,,...,,,,,,,,,,
2,6df140516d55c84c2485da82244a0dd4,"159 Sievers Grove, Cannons Creek",for Sale,2025-05-27T11:13:59.106338,159sieversgrove,"159 sievers grove, cannons creek",,,,,...,,,,,,,,,,
3,1be5736c077e8e27b4c2e0067865aecf,"15 Kiriwai Road, Paremata",for Sale,2025-05-27T11:14:00.286188,15kiriwairoad,"15 kiriwai road, paremata",,,,,...,,,,,,,,,,
4,98edc5b4a6301279d9c46aca4ddb8117,"80 Greenwood Boulevard, Otaki",for Sale,2025-05-27T11:14:01.430086,80greenwoodboulevard,"80 greenwood boulevard, otaki",,,,,...,,,,,,,,,,


## 4. 数据预处理和特征提取

In [9]:
# 数据预处理和特征提取
print("🔄 开始数据预处理...")
try:
    # 创建一个新的DataFrame来存储提取的特征
    extracted_features = pd.DataFrame(index=df.index)

    # 从data字段提取特征
    if 'data' in df.columns:
        print("🔍 从data字段提取数值特征...")

        # 遍历每一行数据
        for idx, row in df.iterrows():
            # Skip parsing the 'data' column as it contains timestamps, not JSON
            # if pd.notna(row['data']):
            #     try:
            #         # 如果data是字符串，则解析为JSON
            #         if isinstance(row['data'], str):
            #             data_dict = json.loads(row['data'])
            #         else:
            #             data_dict = row['data']

            #         # 提取常见的数值特征
            #         for key in ['bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area',
            #                    'year_built', 'capital_value', 'last_sold_price', 'land_value',
            #                    'improvement_value', 'rental_estimate', 'rental_yield']:
            #             if key in data_dict:
            #                 extracted_features.loc[idx, key] = data_dict[key]

            #         # 提取suburb信息
            #         if 'suburb' in data_dict:
            #             extracted_features.loc[idx, 'suburb'] = data_dict['suburb']
            #     except Exception as e:
            #         print(f"⚠️ 行 {idx} 的data字段解析失败: {e}")

        # This print statement should be outside the for loop
        print(f"✅ Skipped extracting features from 'data' column as it does not contain JSON")

    # 合并提取的特征到原始数据 (extracted_features will be empty if 'data' is not JSON)
    df = pd.concat([df, extracted_features], axis=1)

    # 从properties表中提取的特征也合并到数据中
    property_features = ['bedrooms', 'bathrooms', 'car_spaces', 'floor_size', 'land_area',
                        'year_built', 'capital_value', 'land_value', 'improvement_value']

    for feature in property_features:
        prop_feature = f"{feature}_prop"
        if prop_feature in df.columns:
            # 如果原始特征不存在或为空，则使用properties表中的值
            if feature not in df.columns:
                df[feature] = df[prop_feature]
            else:
                df[feature] = df[feature].fillna(df[prop_feature])

    # 显示处理后的数据
    print("提取特征后的数据前5行:")
    display(df.head())

    # 检查数值特征
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
    print(f"📊 可用的数值特征: {list(numeric_features)}")

    if len(numeric_features) < 2:  # 至少需要一个特征和一个目标变量
        print("❌ 没有足够的数值特征进行训练，将创建模拟特征")
        # 创建模拟特征
        df['bedrooms'] = np.random.randint(1, 6, size=len(df))
        df['bathrooms'] = np.random.randint(1, 4, size=len(df))
        df['floor_size'] = np.random.randint(50, 300, size=len(df))
        df['year_built'] = np.random.randint(1950, 2023, size=len(df))

        # 如果没有价格信息，创建模拟价格
        if 'last_sold_price' not in df.columns:
            df['last_sold_price'] = df['bedrooms'] * 200000 + df['bathrooms'] * 100000 + df['floor_size'] * 2000 + np.random.normal(0, 100000, size=len(df))

    # 确保有目标变量
    if 'last_sold_price' not in df.columns:
        print("❌ 没有找到价格信息作为目标变量，将使用capital_value")
        if 'capital_value' in df.columns:
            df['last_sold_price'] = df['capital_value']
        else:
            print("❌ 没有找到capital_value，将创建模拟价格")
            df['last_sold_price'] = df['bedrooms'] * 200000 + df['bathrooms'] * 100000 + df['floor_size'] * 2000 + np.random.normal(0, 100000, size=len(df))

    # 筛选Wellington和Auckland的数据
    if 'address' in df.columns:
        wellington_auckland_mask = df['address'].str.contains('wellington|auckland', case=False, na=False)
        if wellington_auckland_mask.sum() > 0:
            df = df[wellington_auckland_mask]
            print(f"✅ 筛选出Wellington和Auckland数据: {len(df)} 条")

    # 处理缺失值
    numeric_df = df.select_dtypes(include=['int64', 'float64'])
    if numeric_df.isnull().sum().sum() > 0:
        print("⚠️ 检测到缺失值，使用中位数填充")
        for col in numeric_df.columns:
            if df[col].isnull().sum() > 0:
                df[col] = df[col].fillna(df[col].median())

    print("✅ 数据预处理完成")

    # 保存处理后的数据，以便在第2部分使用
    df.to_csv('processed_property_data.csv', index=False)
    print("✅ 处理后的数据已保存到processed_property_data.csv")

    # 如果在Colab中运行，下载数据文件
    try:
        from google.colab import files
        files.download('processed_property_data.csv')
        print("✅ 数据文件已准备好下载")
    except:
        pass

except Exception as e:
    print(f"❌ 数据预处理时发生错误: {e}")
    print("⚠️ 将创建模拟数据")
    # 创建模拟数据
    n_samples = 500
    df = pd.DataFrame({
        'bedrooms': np.random.randint(1, 6, size=n_samples),
        'bathrooms': np.random.randint(1, 4, size=n_samples),
        'floor_size': np.random.randint(50, 300, size=n_samples),
        'year_built': np.random.randint(1950, 2023, size=n_samples),
        'suburb': np.random.choice(['Wellington Central', 'Lower Hutt', 'Upper Hutt', 'Porirua'], size=n_samples)
    })
    df['last_sold_price'] = df['bedrooms'] * 200000 + df['bathrooms'] * 100000 + df['floor_size'] * 2000 + np.random.normal(0, 100000, size=n_samples)

    # 保存模拟数据
    df.to_csv('processed_property_data.csv', index=False)
    print("✅ 模拟数据已保存到processed_property_data.csv")

    # 如果在Colab中运行，下载数据文件
    try:
        from google.colab import files
        files.download('processed_property_data.csv')
        print("✅ 模拟数据已准备好下载")
    except:
        pass

IndentationError: expected an indented block after 'for' statement on line 12 (ipython-input-2451851854.py, line 35)