In [14]:
!pip install pandas scikit-learn torch

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ==== 2. 读取并预处理数据 ====
df = pd.read_csv("/content/HDBPrice.csv")

# 去重 + 去掉 block/street_name
df = df.drop_duplicates()
df = df.drop(columns=['block', 'street_name'], errors='ignore')

# storey_range → storey_median
def storey_median(s):
    parts = [int(x) for x in s.split(' TO ')]
    return sum(parts) / len(parts)

df['storey_median'] = df['storey_range'].apply(storey_median)
df = df.drop(columns=['storey_range'])

# 类别编码
df['town'] = df['town'].astype('category').cat.codes
df['flat_type'] = df['flat_type'].astype('category').cat.codes

# ==== 3. 特征处理 ====
X_num = df[['floor_area_sqm', 'lease_commence_date', 'storey_median']].copy()
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

X_cat_town = df['town'].values
X_cat_flat = df['flat_type'].values
y = df['resale_price'].values

# ==== 4. 数据切分 ====
X_train_num, X_test_num, X_train_town, X_test_town, X_train_flat, X_test_flat, y_train, y_test = train_test_split(
    X_num_scaled, X_cat_town, X_cat_flat, y, test_size=0.2, random_state=4248)

# ==== 5. 数据封装为Tensor ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = TensorDataset(
    torch.tensor(X_train_num, dtype=torch.float32),
    torch.tensor(X_train_town, dtype=torch.long),
    torch.tensor(X_train_flat, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
)

test_dataset = TensorDataset(
    torch.tensor(X_test_num, dtype=torch.float32),
    torch.tensor(X_test_town, dtype=torch.long),
    torch.tensor(X_test_flat, dtype=torch.long),
    torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)
)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256)

# ==== 6. 构建 NN 模型 ====
class HDBNet(nn.Module):
    def __init__(self, n_num, n_town, n_flat):
        super().__init__()
        self.emb_town = nn.Embedding(n_town + 1, 6)
        self.emb_flat = nn.Embedding(n_flat + 1, 4)
        self.fc1 = nn.Linear(n_num + 6 + 4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.drop1 = nn.Dropout(0.3)
        self.drop2 = nn.Dropout(0.2)

    def forward(self, x_num, x_town, x_flat):
        x = torch.cat([
            x_num,
            self.emb_town(x_town),
            self.emb_flat(x_flat)
        ], dim=1)
        x = self.relu(self.fc1(x))
        x = self.drop1(x)
        x = self.relu(self.fc2(x))
        x = self.drop2(x)
        return self.out(x)

model = HDBNet(n_num=3, n_town=df['town'].nunique(), n_flat=df['flat_type'].nunique()).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

# ==== 7. 训练模型 ====
for epoch in range(20):
    model.train()
    for x_num, x_town, x_flat, y_batch in train_loader:
        x_num, x_town, x_flat, y_batch = x_num.to(device), x_town.to(device), x_flat.to(device), y_batch.to(device)
        pred = model(x_num, x_town, x_flat)
        loss = criterion(pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# ==== 8. 测试性能 ====
model.eval()
with torch.no_grad():
    y_preds = []
    y_true = []
    for x_num, x_town, x_flat, y_batch in test_loader:
        x_num, x_town, x_flat = x_num.to(device), x_town.to(device), x_flat.to(device)
        preds = model(x_num, x_town, x_flat).cpu().numpy()
        y_preds.append(preds)
        y_true.append(y_batch.numpy())

    y_preds = np.vstack(y_preds)
    y_true = np.vstack(y_true)

    rmse = np.sqrt(mean_squared_error(y_true, y_preds))
    mape_f = np.mean(np.abs((y_t - y_p) / y_t)) * 100
    r2 = r2_score(y_true, y_preds)

    print(f"Overall - RMSE: {rmse:.2f}, MAPE: {mape:.2f}, R²: {r2:.4f}")


Overall - RMSE: 94183.68, MAPE: 15.19, R²: 0.7265


In [13]:
# 还原 flat_type 的原始标签
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
flat_type_all = pd.read_csv("/content/HDBPrice.csv")['flat_type'].astype('category')
flat_type_labels = flat_type_all.cat.categories
flat_type_test_labels = flat_type_labels[X_test_flat]

# 转为 DataFrame
results_df = pd.DataFrame({
    'y_true': y_true.flatten(),
    'y_pred': y_preds.flatten(),
    'flat_type': flat_type_test_labels
})

# 筛选指定的 flat_type 并计算指标
target_types = ["3 ROOM", "4 ROOM", "5 ROOM"]

print("\n=== Metrics by Flat Type ===")
for flat in target_types:
    group = results_df[results_df['flat_type'] == flat]
    if len(group) == 0:
        continue
    y_t = group['y_true']
    y_p = group['y_pred']
    rmse_f = np.sqrt(mean_squared_error(y_t, y_p))
    mape_f = np.mean(np.abs((y_t - y_p) / y_t)) * 100
    r2_f = r2_score(y_t, y_p)
    print(f"{flat:>6} - RMSE: {rmse_f:.2f}, MAPE: {mape_f:.2f}, R²: {r2_f:.4f}")


=== Metrics by Flat Type ===
3 ROOM - RMSE: 77543.00, MAPE: 17.33, R²: 0.3958
4 ROOM - RMSE: 88779.50, MAPE: 14.26, R²: 0.6414
5 ROOM - RMSE: 109721.21, MAPE: 14.63, R²: 0.5803
