In [12]:
## ✅ 步骤 1.1：读取数据 + 添加标签 + 合并数据

import pandas as pd
import numpy as np

# 读取数据
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# 添加标识列 + 填空标签以便合并
train_df["Dataset"] = "train"
test_df["Dataset"] = "test"
test_df["Survived"] = np.nan

# 保留原始 test_id
test_passenger_ids = test_df["PassengerId"]

# 合并数据以便统一处理
full_df = pd.concat([train_df, test_df], ignore_index=True)

print("✅ 数据读取完成，合并后样本数量：", full_df.shape[0])

✅ 数据读取完成，合并后样本数量： 1309


In [13]:
# 📦 步骤 1.2：增强型特征工程模块（A）

# ✅ 特征包括：
# 	•	Title（称呼）
# 	•	FamilySize + IsAlone
# 	•	Cabin_known（是否知道舱号）
# 	•	AgeGroup（年龄分组）
# 	•	FareGroup（票价分箱）
# 	•	Deck（从 Cabin 中提取甲板）
# 	•	TicketPrefix（票号前缀）
# 	•	FamilyGroup（根据姓氏构建家庭群）

In [15]:
## 📦 步骤 1.2：增强型特征工程模块（A）

# 我们将在此步骤中统一构造以下强力变量：

# - `Title`（称呼）
# - `FamilySize` / `IsAlone`
# - `Cabin_known`（是否知道舱号）
# - `AgeGroup`（年龄分组）
# - `FareGroup`（票价分组）
# - `Deck`（从 Cabin 中提取甲板）
# - `TicketPrefix`（票号前缀）
# - `FamilyGroup`（姓氏家族）

# ```python
# 1. 提取 Title（称呼）并归类稀有值
full_df["Title"] = full_df["Name"].str.extract(" ([A-Za-z]+)\\.", expand=False)
full_df["Title"] = full_df["Title"].replace({
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare',
    'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare',
    'Major': 'Rare', 'Rev': 'Rare', 'Sir': 'Rare',
    'Jonkheer': 'Rare', 'Dona': 'Rare'
})

# 2. 构造家庭相关变量
full_df["FamilySize"] = full_df["SibSp"] + full_df["Parch"] + 1
full_df["IsAlone"] = (full_df["FamilySize"] == 1).astype(int)

# 3. 舱位信息是否已知
full_df["Cabin_known"] = full_df["Cabin"].notnull().astype(int)

# 4. 年龄分组（用称呼中位数填补 + 分箱）
def age_to_group(age):
    if age <= 12: return 0
    elif age <= 18: return 1
    elif age <= 35: return 2
    elif age <= 60: return 3
    else: return 4

full_df["Age"] = full_df.groupby("Title")["Age"].transform(lambda x: x.fillna(x.median()))
full_df["AgeGroup"] = full_df["Age"].apply(age_to_group)

# 5. 票价缺失填补 + 分组（qcut 自动等频分箱）
full_df["Fare"] = full_df["Fare"].fillna(full_df["Fare"].median())
full_df["FareGroup"] = pd.qcut(full_df["Fare"], 4, labels=False)

# 6. 提取甲板 Deck 信息
full_df["Deck"] = full_df["Cabin"].astype(str).str[0]
full_df["Deck"] = full_df["Deck"].replace("n", "U")  # 未知统一设为 U

# 7. 提取票号前缀
full_df["TicketPrefix"] = full_df["Ticket"].apply(lambda x: x.replace(".", "").replace("/", "").split()[0] if len(x.split()) > 1 else "None")

# 8. 构造家庭组 FamilyGroup（姓氏频次 >= 2）
full_df["Surname"] = full_df["Name"].apply(lambda x: x.split(",")[0])
surname_counts = full_df["Surname"].value_counts()
full_df["FamilyGroup"] = full_df["Surname"].apply(lambda x: x if surname_counts[x] >= 2 else "NoGroup")

print("✅ 所有增强特征已构造完成！当前列数：", full_df.shape[1])

✅ 所有增强特征已构造完成！当前列数： 23


In [3]:
## 📦 步骤 1.3：类别变量编码 + 缺失值检查 + 特征清理

```python
from sklearn.preprocessing import LabelEncoder

# 编码所有类别变量（将字符串转换为整数）
categorical_cols = ["Sex", "Embarked", "Title", "Deck", "TicketPrefix", "FamilyGroup"]
for col in categorical_cols:
    le = LabelEncoder()
    full_df[col] = le.fit_transform(full_df[col].astype(str))

# 检查所有字段缺失值（调试用）
print("剩余缺失值：")
print(full_df.isnull().sum())

# 拆分回训练集和测试集
train_final = full_df[full_df["Dataset"] == "train"].copy()
test_final = full_df[full_df["Dataset"] == "test"].copy()

# 选择最终特征列
selected_features = [
    "Pclass", "Sex", "AgeGroup", "FareGroup", "Title",
    "FamilySize", "IsAlone", "Cabin_known", "Deck",
    "TicketPrefix", "FamilyGroup"
]

X_train = train_final[selected_features]
y_train = train_final["Survived"].astype(int)
X_test = test_final[selected_features]

print("✅ 编码与缺失值处理完成，训练维度：", X_train.shape)

✅ 缺失值和类别变量处理完成！


In [16]:
## 🤖 步骤 2：构建多模型 + Voting + Stacking 融合系统

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# 构建基础模型（已调优）
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
xgb_clf = XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.05, eval_metric='logloss', use_label_encoder=False, random_state=42)
lgbm_clf = LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=42)

# 构建 Soft Voting 融合模型
voting_soft = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('xgb', xgb_clf),
        ('lgbm', lgbm_clf)
    ],
    voting='soft'
)

# 训练 Voting 模型
voting_soft.fit(X_train, y_train)
voting_preds = voting_soft.predict(X_test)

# 构建 Stacking 模型（以 Logistic 为元模型）
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('xgb', xgb_clf),
        ('lgbm', lgbm_clf)
    ],
    final_estimator=LogisticRegression(max_iter=200),
    cv=5
)

# 训练 stacking 模型
stacking_clf.fit(X_train, y_train)
stacking_preds = stacking_clf.predict(X_test)

print("✅ Voting & Stacking 模型训练完成！")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 158
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 11
[LightGBM] [Info] [binary:BoostF

In [18]:
## 📤 步骤 3：生成提交文件 + 可选手工规则修正

# 保存 Voting 模型预测结果
submission_voting = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": voting_preds.astype(int)
})
submission_voting.to_csv("submission_voting_soft.csv", index=False)

# 保存 Stacking 模型预测结果
submission_stacking = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": stacking_preds.astype(int)
})
submission_stacking.to_csv("submission_stacking.csv", index=False)

# ✅ 可选：对极端样本进行手工规则修正（冲击 0.81+）
submission_stacking["Survived"] = np.where(
    (X_test["Sex"] == 1) & (X_test["Pclass"] == 1) & (X_test["FamilySize"] >= 2),
    1,
    submission_stacking["Survived"]
)

submission_stacking["Survived"] = np.where(
    (X_test["Sex"] == 0) & (X_test["AgeGroup"] == 0) & (X_test["FamilySize"] == 1),
    0,
    submission_stacking["Survived"]
)

submission_stacking.to_csv("submission_final_with_rules.csv", index=False)
print("✅ 所有提交文件已保存，包括融合 + 人工规则增强版本")

✅ 所有提交文件已保存，包括融合 + 人工规则增强版本


In [19]:
## 🚀 步骤 4：最终冲击 0.80+ 提分策略（伪标签 + 精修规则）

# ```python
# ========== 第一步：Voting 模型输出预测概率 ==========
probs = voting_soft.predict_proba(X_test)[:, 1]

# 构建伪标签 DataFrame
pseudo_df = X_test.copy()
pseudo_df["Survived"] = (probs > 0.98).astype(int)  # 正类
pseudo_df = pseudo_df[(probs > 0.98) | (probs < 0.02)]  # 高置信度样本（正/负类）

print("🎯 可用于伪标签增强的样本数量：", pseudo_df.shape[0])

# ========== 第二步：构建增强训练集 ==========
X_train_augmented = pd.concat([X_train, pseudo_df[selected_features]])
y_train_augmented = pd.concat([y_train, pseudo_df["Survived"]])

# 重新训练 Voting 模型
voting_soft.fit(X_train_augmented, y_train_augmented)
voting_preds_aug = voting_soft.predict(X_test)

# ========== 第三步：使用 stacking 结果 + 精修规则组合 ==========
final_preds = stacking_preds.copy()

# 示例规则：如果是女性 + 一等舱 + 高票价组 且 stacking 预测错误 → 纠正为生还
final_preds = np.where(
    ((X_test["Sex"] == 1) & (X_test["Pclass"] == 1) & (X_test["FareGroup"] >= 2) & (voting_preds_aug == 1)),
    1,
    final_preds
)

# 示例规则：三等舱男童（高死亡风险）纠正为 0
final_preds = np.where(
    ((X_test["Sex"] == 0) & (X_test["Pclass"] == 3) & (X_test["AgeGroup"] == 0) & (X_test["FamilySize"] == 1)),
    0,
    final_preds
)

# 保存最终提交文件
submission_final = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": final_preds.astype(int)
})
submission_final.to_csv("submission_final_ensemble_boosted.csv", index=False)
print("✅ 冲击版本提交文件已生成：submission_final_ensemble_boosted.csv")

🎯 可用于伪标签增强的样本数量： 4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 346, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 895, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.386592 -> initscore=-0.461660
[LightGBM] [Info] Start training from score -0.461660
✅ 冲击版本提交文件已生成：submission_final_ensemble_boosted.csv
