In [1]:
import pandas as pd

In [2]:
# Read the Kaggle Titanic training and test CSVs into DataFrames
# (training file first, then the test file).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Feature engineering: add high-value derived features.
def add_features(df, fare_median=None, fare_edges=None):
    """Return a copy of ``df`` with engineered Titanic features added.

    Adds the columns Title, FamilySize, IsAlone, FareBin, AgeBin and
    RichFemale, and fills missing Fare values. The function only touches
    the frame it is given; it no longer reads or mutates the module-level
    ``train`` / ``test`` frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the Name, SibSp, Parch, Fare, Age, Pclass and Sex
        columns.
    fare_median : float, optional
        Value used to fill missing Fare entries. Defaults to the median
        Fare of ``df`` itself.
    fare_edges : sequence of float, optional
        Monotonically increasing bin edges for FareBin. When omitted,
        quartile edges are computed from ``df`` with ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    out = df.copy()

    # Title: the run of letters that follows a space and precedes a period in Name.
    # Raw string so that ``\.`` is a regex escape, not an invalid string escape.
    title = out['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Collapse infrequent titles into a single 'Rare' bucket.
    title = title.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    # Fold spelling variants into the common titles.
    out['Title'] = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Family size (self included) and whether the passenger travels alone.
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = (out['FamilySize'] == 1).astype(int)

    # Fill missing Fare with the supplied statistic (or this frame's own median).
    if fare_median is None:
        fare_median = out['Fare'].median()
    out['Fare'] = out['Fare'].fillna(fare_median)

    # Fare bins (Fare is heavily skewed). Using caller-supplied edges keeps the
    # bin codes comparable across frames; otherwise fall back to own quartiles.
    if fare_edges is None:
        out['FareBin'] = pd.qcut(out['Fare'], 4, labels=False)
    else:
        out['FareBin'] = pd.cut(out['Fare'], bins=fare_edges, labels=False, include_lowest=True)

    # Age bins; rows whose Age is missing get a missing AgeBin.
    out['AgeBin'] = pd.cut(out['Age'], bins=[0, 12, 30, 50, 100], labels=False)

    # Interaction feature: first-class female passengers.
    out['RichFemale'] = ((out['Pclass'] == 1) & (out['Sex'] == 'female')).astype(int)
    return out

# Learn the Fare statistics on the training set only and reuse them for both
# frames, so that a given FareBin code means the same fare range everywhere.
train_fare_median = train['Fare'].median()
train_fare_edges = pd.qcut(train['Fare'].fillna(train_fare_median), 4, retbins=True)[1]
# Open the outer edges so fares outside the training range still land in a bin.
train_fare_edges[0], train_fare_edges[-1] = float('-inf'), float('inf')

train = add_features(train, fare_median=train_fare_median, fare_edges=train_fare_edges)
test = add_features(test, fare_median=train_fare_median, fare_edges=train_fare_edges)

In [5]:
# Impute missing Age with the per-Title median age (more precise than a single
# global median), then rebuild AgeBin from the imputed ages.
# The medians are learned on the training set and applied to both frames; a
# Title with no known age in training falls back to the overall training median.
title_age_median = train.groupby('Title')['Age'].median()
overall_age_median = train['Age'].median()

for frame in (train, test):
    age_filled = frame['Age'].fillna(frame['Title'].map(title_age_median))
    frame['Age'] = age_filled.fillna(overall_age_median)
    # AgeBin must stay a bin index; previously it was overwritten with the
    # continuous imputed age and merely duplicated the Age column.
    frame['AgeBin'] = pd.cut(frame['Age'], bins=[0, 12, 30, 50, 100], labels=False)

In [6]:
# Ordinal-encode Pclass as integer category codes.
# Both frames share one category list taken from the training set, so a given
# class always receives the same code; a class absent from training becomes -1.
pclass_dtype = pd.CategoricalDtype(
    categories=sorted(train['Pclass'].dropna().unique()),
    ordered=True,
)
train['Pclass'] = train['Pclass'].astype(pclass_dtype).cat.codes
test['Pclass'] = test['Pclass'].astype(pclass_dtype).cat.codes

In [7]:
# Label-encode Title (the author's note: better suited to tree models than one-hot).
# Codes follow the order in which titles first appear in the training set.
title_map = {title: code for code, title in enumerate(train['Title'].unique())}
# Titles missing from the mapping get the sentinel -1 instead of NaN, so the
# column stays integer-typed and no NaN reaches the models.
train['Title'] = train['Title'].map(title_map).fillna(-1).astype('int64')
test['Title'] = test['Title'].map(title_map).fillna(-1).astype('int64')

In [8]:
# Binary-encode Sex into a new Sex_encoded column using one shared lookup table.
sex_codes = {'male': 0, 'female': 1}
train['Sex_encoded'] = train['Sex'].map(sex_codes)
test['Sex_encoded'] = test['Sex'].map(sex_codes)

In [9]:
# Drop training rows that have no Embarked value.
train = train.dropna(subset=['Embarked'])
# Test rows are deliberately NOT dropped: the submission is built from
# test['PassengerId'], so removing a row would remove that passenger's
# prediction, and Embarked is not one of the model's feature columns.

In [10]:
# Report the number of missing values in each training column.
print(train.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
FamilySize       0
IsAlone          0
FareBin          0
AgeBin           0
RichFemale       0
Sex_encoded      0
dtype: int64


In [11]:
# Report the number of missing values in each test column.
print(test.isna().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
FamilySize       0
IsAlone          0
FareBin          0
AgeBin           0
RichFemale       0
Sex_encoded      0
dtype: int64


In [12]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Feature columns fed to both classifiers.
features = [
    'Pclass', 'Sex_encoded', 'Age', 'Fare', 'FamilySize',
    'IsAlone', 'Title', 'RichFemale', 'FareBin', 'AgeBin',
]

# Random forest with a fixed seed.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=5,
    min_samples_split=5,
)

# XGBoost with hand-tuned hyperparameters and a fixed seed.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
)

# Soft-voting ensemble of the two classifiers, fitted on the training frame.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[('xgb', xgb), ('rf', rf)],
)
ensemble.fit(train[features], train['Survived'])

In [13]:
# Predict survival for the test passengers and write the submission file.
predictions = ensemble.predict(test[features])
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
