# 泰塔尼克号生还预测

In [17]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [18]:
'''
PassengerId     编号
Survived        是否存活 0-死亡 1-存活
Pclass          舱位等级 1-一等舱 2-二等舱 3-三等舱
Name            姓名
Sex             性别
Age             年龄
SibSp           兄弟姐妹和配偶的数量
Parch           父母和孩子的数量
Ticket          船票编号
Fare            票价
Cabin           船舱号
Embarked        登船港口
'''

# Load the dataset
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [19]:
print(train_df.isnull().sum())
print()
print(test_df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


## 缺失值处理

In [20]:
# 删除少量缺失值
train_df = train_df.drop(train_df[train_df['Embarked'].isnull()].index)
# 填充 Fare 缺失值
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [21]:
# 处理年龄缺失值
def loc_age(df):
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_train = age_df[age_df['Age'].notnull()].values
    age_pred = age_df[age_df['Age'].isnull()].values
    x = age_train[:, 1:]
    y = age_train[:, 0]
    
    # KNN
    regressor = KNeighborsRegressor(n_neighbors=3)
    regressor.fit(x, y)
    pred_age = regressor.predict(age_pred[:, 1:])
    
    df.loc[df['Age'].isnull(), 'Age'] = np.round(pred_age)


loc_age(train_df)
loc_age(test_df)

## 无关字段处理

In [22]:
# PassengerId、Name、Cabin、Ticket 可以丢弃
train_df = train_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
test_df = test_df.drop(['Name', 'Cabin', 'Ticket'], axis=1)     # 为了提交结果，没有丢弃 PassengerId

In [23]:
print(train_df.isnull().sum())
print()
print(test_df.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


## 数据预处理

In [24]:
# 年龄离散化
train_df['Age'] = pd.cut(train_df['Age'], 5, labels=[0, 1, 2, 3, 4])
test_df['Age'] = pd.cut(test_df['Age'], 5, labels=[0, 1, 2, 3, 4])

# 票价离散化
train_df['Fare'] = pd.qcut(train_df['Fare'], 4, labels=[0, 1, 2, 3])
test_df['Fare'] = pd.qcut(test_df['Fare'], 4, labels=[0, 1, 2, 3])

# 性别编码
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

# Embarked 编码
train_df['Embarked'] = train_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test_df['Embarked'] = test_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Parch + SibSp > 0 --> 1
train_df['IsAlone'] = (train_df['Parch'] + train_df['SibSp'] == 0).astype(int)
train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df['IsAlone'] = (test_df['Parch'] + test_df['SibSp'] == 0).astype(int)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)

In [25]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,892,3,0,2,0,2,1
1,893,3,1,3,0,0,0
2,894,2,0,4,1,2,1
3,895,3,0,1,1,0,1
4,896,3,1,1,1,0,0


## 模型训练及评估

In [26]:
x_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']
x_test = test_df.drop('PassengerId', axis=1).copy()

In [27]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
accuracy_knn = knn.score(x_train, y_train)
print(accuracy_knn)

0.8481439820022497


In [28]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)
accuracy_dt = dt.score(x_train, y_train)
print(accuracy_dt)

0.8627671541057368


In [29]:
# Naive Bayes
nb = GaussianNB()
nb.fit(x_train, y_train)
y_pred_nb = nb.predict(x_test)
accuracy_nb = nb.score(x_train, y_train)
print(accuracy_nb)

0.7739032620922385


In [30]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
accuracy_lr = lr.score(x_train, y_train)
print(accuracy_lr)

0.7941507311586051


In [31]:
# SVM
svm = SVC()
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)
accuracy_svm = svm.score(x_train, y_train)
print(accuracy_svm)

0.8200224971878515


## 提交结果

In [32]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_dt
})
submission.to_csv('data/submission.csv', index=False)