In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the Kaggle Titanic training and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)

In [3]:
# Preview the first rows of the training split (includes the Survived label).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test split (no Survived column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Women of all ages in 1st and 2nd class almost all survived (0.97)
# Everyone under 18 in 1st and 2nd class survived (1.0)
# Women and children in 3rd class had only about a 50% chance of surviving (0.5)
# Families with children had a survival rate about 20% higher than those without (0.52)
# The strongest factor is "women and children first"
# Titles: Master - underage boy
#         Miss   - unmarried woman (including minors)
#         Mrs    - married woman
#         Mr     - adult man
# Titles other than these four (40 people):
# the captain (Capt) and clergy (Rev) all died
# among these titles, men survived about half the time and women all survived

In [6]:
# Pull the honorific out of names shaped like "Surname, Title. Given names".
# The optional "the " prefix is skipped so that a name written as
# "Surname, the Countess. of ..." yields "Countess" rather than
# "the Countess", which the grouping below would otherwise fail to match.
train['Title'] = (
    train['Name']
    .str.extract(r',\s*(?:the\s+)?([^\.]+)\.', expand=False)
    .str.strip()
)

# Collapse rare titles into the broader groups that the age defaults are keyed on.
train['Title'] = train['Title'].replace(['Mlle', 'Ms'], 'Miss')
train['Title'] = train['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
train['Title'] = train['Title'].replace(['Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Jonkheer'], 'Officer')

# title_counts = train['Title'].value_counts()
# print(title_counts)

In [7]:
# Pull the honorific out of names shaped like "Surname, Title. Given names".
# The optional "the " prefix is skipped so that a name written as
# "Surname, the Countess. of ..." yields "Countess" rather than
# "the Countess", which the grouping below would otherwise fail to match.
test['Title'] = (
    test['Name']
    .str.extract(r',\s*(?:the\s+)?([^\.]+)\.', expand=False)
    .str.strip()
)

# Collapse rare titles into the broader groups that the age defaults are keyed on.
test['Title'] = test['Title'].replace(['Mlle', 'Ms'], 'Miss')
test['Title'] = test['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
test['Title'] = test['Title'].replace(['Rev', 'Col', 'Major', 'Capt', 'Sir', 'Don', 'Jonkheer'], 'Officer')

In [8]:
# Default age assigned to a passenger with a missing Age, keyed by title.
title_defaults = {
    'Master': 5,    # child
    'Miss': 22,     # young woman
    'Mr': 30,       # adult man
    'Mrs': 35,      # married woman
    'Dr': 40,       # doctor
    'Officer': 45   # officer / nobility
}

# Fallback for titles absent from title_defaults. Computed once here, before
# any ages are filled, instead of once per missing row inside the row function.
train_age_median = train['Age'].median()

def fill_default_age_train(row):
    """Return the age to store for one training row.

    Args:
        row: a row of ``train`` exposing ``Age`` and ``Title``.

    Returns:
        ``row['Age']`` when it is present; otherwise the default for the
        row's title from ``title_defaults``, or the median of the known
        training ages when the title has no entry.
    """
    if pd.isnull(row['Age']):
        return title_defaults.get(row['Title'], train_age_median)
    return row['Age']

train['Age'] = train.apply(fill_default_age_train, axis=1)

In [9]:
# Fallback for titles absent from title_defaults. Computed once here, before
# any ages are filled, instead of once per missing row inside the row function.
test_age_median = test['Age'].median()

def fill_default_age_test(row):
    """Return the age to store for one test row.

    Args:
        row: a row of ``test`` exposing ``Age`` and ``Title``.

    Returns:
        ``row['Age']`` when it is present; otherwise the default for the
        row's title from ``title_defaults``, or the median of the known
        test ages when the title has no entry.
    """
    if pd.isnull(row['Age']):
        return title_defaults.get(row['Title'], test_age_median)
    return row['Age']

test['Age'] = test.apply(fill_default_age_test, axis=1)

In [10]:
# Build the rule-based features
def create_rule_features(train):
    """Add four boolean rule columns to ``train`` and return the same frame.

    The frame is modified in place. It must provide the ``Pclass``, ``Sex``,
    ``Age`` and ``Parch`` columns.

    Columns written:
        is_p12_female:     Pclass is 1 or 2 and Sex is 'female'.
        is_p12_under18:    Pclass is 1 or 2 and Age is below 18.
        is_p3_women_child: Pclass is 3 and (Sex is 'female' or Age is below 18).
        family_with_child: Parch is greater than zero.
    """
    # Reusable masks shared by several rules.
    in_upper_class = train['Pclass'].isin([1, 2])
    in_third_class = train['Pclass'] == 3
    is_female = train['Sex'] == 'female'
    is_minor = train['Age'] < 18

    # Rule 1: women in 1st/2nd class
    train['is_p12_female'] = in_upper_class & is_female
    # Rule 2: under-18s in 1st/2nd class
    train['is_p12_under18'] = in_upper_class & is_minor
    # Rule 3: women and children in 3rd class
    train['is_p3_women_child'] = in_third_class & (is_female | is_minor)
    # Rule 4: families with children
    train['family_with_child'] = train['Parch'] > 0

    return train

In [11]:
# Apply the feature engineering to the training split, then the test split
train, test = (create_rule_features(frame) for frame in (train, test))

In [12]:
# Model inputs: raw passenger attributes plus the boolean rule features.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
            'is_p12_female', 'is_p12_under18',
            'is_p3_women_child', 'family_with_child']

# One-hot encode any non-numeric columns among the selected features.
X_train = pd.get_dummies(train[features])
y_train = train["Survived"]

# The two splits are encoded independently, so a category present in only one
# of them would give the matrices different columns. Force the test matrix
# onto the training columns (same set, same order; absent dummies become 0).
X_test = pd.get_dummies(test[features]).reindex(columns=X_train.columns, fill_value=0)

# Fixed random_state keeps the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [13]:
# Pair each test PassengerId with its predicted label and write the
# submission file without the index column.
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
