## 6. 数据可视化

In [None]:
# Data visualisation: draw a 2x2 overview figure with
#   1. the class balance of the target,
#   2. the correlation matrix of the numeric columns,
#   3. the top-10 random-forest feature importances,
#   4. the mean target value per region.
# Reads processed_data, X, y and feature_names, which are expected to have
# been created by earlier cells; only processed_data is checked for presence.
if 'processed_data' in locals() and len(processed_data) > 0:
    plt.figure(figsize=(15, 10))

    # 1. Target distribution
    ax = plt.subplot(2, 2, 1)
    # value_counts() orders by frequency, so sort by class value instead;
    # otherwise the fixed labels/colours below would swap whenever the
    # higher-valued class is the majority.
    target_counts = pd.Series(y).value_counts().sort_index()
    # NOTE(review): assumes the lower class value means "unlikely to sell"
    # and the higher one "likely to sell" — confirm against the labelling step.
    if len(target_counts) == 2:
        pie_labels = ['不太可能出售', '可能出售']
    else:
        pie_labels = target_counts.index
    ax.pie(target_counts.values, labels=pie_labels,
           autopct='%1.1f%%', colors=['lightcoral', 'lightgreen'])
    ax.set_title('目标变量分布', fontsize=14)

    # 2. Correlation between numeric features (needs at least two columns)
    ax = plt.subplot(2, 2, 2)
    numeric_cols = processed_data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        corr = processed_data[numeric_cols].corr()
        sns.heatmap(corr, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title('特征相关性矩阵', fontsize=14)

    # 3. Feature importance (only when there are features to rank)
    ax = plt.subplot(2, 2, 3)
    if len(feature_names) > 0:
        # A throwaway random forest is fitted purely to rank the features.
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        importances = pd.Series(rf.feature_importances_, index=feature_names)
        importances = importances.sort_values(ascending=False)
        top_features = importances.head(10)
        top_features.plot(kind='barh', ax=ax)
        ax.set_title('特征重要性 (Top 10)', fontsize=14)
        ax.set_xlabel('重要性')

    # 4. Mean target per region (only when the required columns exist)
    ax = plt.subplot(2, 2, 4)
    if {'is_wellington', 'is_auckland', 'target'}.issubset(processed_data.columns):
        in_wellington = processed_data['is_wellington'] == 1
        in_auckland = processed_data['is_auckland'] == 1
        elsewhere = ((processed_data['is_wellington'] == 0) &
                     (processed_data['is_auckland'] == 0))
        location_data = pd.DataFrame({
            'Wellington': [processed_data.loc[in_wellington, 'target'].mean()],
            'Auckland': [processed_data.loc[in_auckland, 'target'].mean()],
            'Other': [processed_data.loc[elsewhere, 'target'].mean()]
        })
        # DataFrame.plot opens a new figure unless given an axes, which would
        # leave this panel empty; draw into the subplot explicitly.
        location_data.plot(kind='bar', ax=ax)
        ax.set_title('各地区出售可能性', fontsize=14)
        ax.set_ylabel('出售可能性')

    plt.tight_layout()
    plt.show()
else:
    print("⚠️ 没有可视化的数据")

## 7. 模型训练

In [None]:
print("🔄 训练模型...")

# Hold out 20% of the samples for testing; the split is stratified on the
# labels and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base learners for the ensemble.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
)
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    random_state=42,
)
lr = LogisticRegression(max_iter=1000, random_state=42)

# Soft voting combines the three learners' predicted class probabilities.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[('rf', rf), ('gb', gb), ('lr', lr)],
)

# `model` is the name later cells use; it refers to the same ensemble object.
model = ensemble
model.fit(X_train_scaled, y_train)

print("✅ 模型训练完成!")

## 8. 模型评估

In [None]:
print("🔄 评估模型...")

# Predict on the held-out split and score the model on both splits.
y_pred = model.predict(X_test_scaled)
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)

# Dictionary form of the report, used to pull out the weighted averages.
classification_rep = classification_report(y_test, y_pred, output_dict=True)

# Summary metrics, in the order they are printed below.
evaluation_results = {
    "训练准确率": train_accuracy,
    "测试准确率": test_accuracy,
    **{
        title: classification_rep['weighted avg'][field]
        for title, field in (
            ("精确率", 'precision'),
            ("召回率", 'recall'),
            ("F1分数", 'f1-score'),
        )
    },
}

print("\n📋 模型评估结果:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")

# Full per-class text report.
print("\n📊 详细分类报告:")
print(classification_report(y_test, y_pred))

# Confusion-matrix heatmap
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
plt.title('混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()