预处理

In [5]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_titanic(df):
    df = df.copy()

    # ------- 1. Title 提取 -------
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace([
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
        'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # ------- 2. 填 Embarked 缺失（用众数填临时）+ 编码 -------
    df['Embarked'] = df['Embarked'].fillna('S')  # 众数
    le_embarked = LabelEncoder()
    df['Embarked_code'] = le_embarked.fit_transform(df['Embarked'])

    # ------- 3. 填 Fare 缺失（用中位数） -------
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # ------- 4. 组合要补值的列并做标准化 -------
    impute_cols = ['Age', 'Fare', 'Embarked_code']
    impute_data = df[impute_cols]
    scaler = StandardScaler()
    impute_scaled = scaler.fit_transform(impute_data)

    # ------- 5. KNN 补值 -------
    imputer = KNNImputer(n_neighbors=3)
    imputed_scaled = imputer.fit_transform(impute_scaled)
    imputed = scaler.inverse_transform(imputed_scaled)

    # ------- 6. 还原补值结果 -------
    df['Age'] = imputed[:, 0]
    df['Fare'] = imputed[:, 1]
    df['Embarked'] = le_embarked.inverse_transform(imputed[:, 2].round().astype(int))

    # ------- 7. 衍生特征 -------
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # ------- 8. 类别变量编码 -------
    le_sex = LabelEncoder()
    le_title = LabelEncoder()
    le_embarked_final = LabelEncoder()

    df['Sex'] = le_sex.fit_transform(df['Sex'])
    df['Title'] = le_title.fit_transform(df['Title'])
    df['Embarked'] = le_embarked_final.fit_transform(df['Embarked'])

    # ------- 9. 去除不用的列 -------
    drop_cols = ['Name', 'Ticket', 'Cabin', 'Embarked_code']
    for col in drop_cols:
        if col in df.columns:
            df.drop(columns=col, inplace=True)

    return df


In [6]:
# 读取原始数据
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# 保存 PassengerId 供提交用
test_passenger_ids = test_df['PassengerId']

# 分离标签
y_train = train_df['Survived']

# 处理数据
X_train = preprocess_titanic(train_df.drop(columns=['Survived']))
X_test = preprocess_titanic(test_df)


In [7]:
print(X_train.shape, y_train.shape)
print(X_test.shape)
print(X_train.head())


(891, 11) (891,)
(418, 11)
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Title  \
0            1       3    1  22.0      1      0   7.2500         2      2   
1            2       1    0  38.0      1      0  71.2833         0      3   
2            3       3    0  26.0      0      0   7.9250         2      1   
3            4       1    0  35.0      1      0  53.1000         2      3   
4            5       3    1  35.0      0      0   8.0500         2      2   

   FamilySize  IsAlone  
0           2        0  
1           2        0  
2           1        1  
3           2        0  
4           1        1  


In [8]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [9]:
# 基础模型
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
rf = RandomForestClassifier(random_state=42)
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=42))


In [10]:
# Stacking 模型：融合 xgb, rf, lr，最终用 lr 做二层预测器
stack_model = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('rf', rf),
        ('lr', lr)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=-1
)

param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.05, 0.1],
    'rf__n_estimators': [100],
    'rf__max_depth': [4, 6]
}

grid_search = GridSearchCV(
    estimator=stack_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("✅ 最佳融合参数组合：", grid_search.best_params_)
print("✅ 最佳交叉验证准确率：", grid_search.best_score_)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
✅ 最佳融合参数组合： {'rf__max_depth': 4, 'rf__n_estimators': 100, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
✅ 最佳交叉验证准确率： 0.8115184232000502


In [11]:
# 最优模型预测
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": y_pred.astype(int)
})
submission.to_csv("submission_stacking.csv", index=False)
print("🎯 已保存融合提交文件：submission_stacking.csv")


🎯 已保存融合提交文件：submission_stacking.csv
