### titanic：哪类人会存活下来？

### 添加模块

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sklearn
sklearn.__version__

'0.21.3'

### 获取数据

In [4]:
gender_submission = pd.read_csv("./data/titanic/gender_submission.csv")
test = pd.read_csv("./data/titanic/test.csv")
train = pd.read_csv("./data/titanic/train.csv")

In [5]:
#gender_submission.head()

In [6]:
#test.head()

#### 表数据如下：

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 数据探索

#### 表字段信息介绍


Variable | Definition | Key
---|---|---
survival|Survival|0 = No, 1 = Yes
pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd
sex|Sex|
Age|Age in years|
sibsp|# of siblings / spouses aboard the Titanic (在船上充当什么家庭身份，平辈亲人：亲兄弟，表兄弟，丈夫、妻)|
parch|# of parents / children aboard the Titanic (在船上充当什么家庭身份，非平辈亲人：父母、儿子、孙子等)|
ticket|Ticket number|
fare|Passenger fare|
cabin|Cabin number(舱号)|
embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton


    Variable Notes
    pclass: A proxy for socio-economic status (SES)
    1st = Upper
    2nd = Middle
    3rd = Lower

    age: Age is fractional（小数） if less than 1. If the age is estimated(估计), is it in the form of xx.5

    sibsp: The dataset defines family relations in this way...
    Sibling = brother, sister, stepbrother, stepsister
    Spouse = husband, wife (mistresses and fiancés were ignored)

    parch: The dataset defines family relations in this way...
    Parent = mother, father
    Child = daughter, son, stepdaughter, stepson
    Some children travelled only with a nanny, therefore parch=0 for them.

In [8]:
df = train.copy()

In [9]:
df.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Baclini, Miss. Eugenie",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [10]:
print("Age有空值个数：", len(df[df.Age.isna()]))

Age有空值个数： 177


In [11]:
print("Cabin有空值个数：", len(df[df.Cabin.isna()]))

Cabin有空值个数： 687


In [12]:
print("Embarked有空值个数：", len(df[df.Embarked.isna()]))

Embarked有空值个数： 2


#### 发现1 ：存在缺失值
1. 根据`df.describe().T`方法，可知`Age`有空值
2. 依据原始表数据,`Cabin`存在缺失值
3. 依`df.describe().T`方法，可知`Embarked`有空值

ps:处理这类问题，一般有：
1. 删除这部分数据
2. 填充

#### 发现2 ：存在定性特征
1. 依据原始表数据,可知`Sex`是以 `male`和`female`数据形式
2. 依据原始表数据,可知`Embarked`是以 `S`和`Q`、`C`数据形式）

ps:处理这类问题，一般有：
1. 采用哑变量方式（one-hot），进行处理，结果特征有非线性效果


#### 发现3：无量纲化
1. 依原表，Age数据以23.0, 0.9, 18.1形式
2. 依原表，Fare数据以7.2500, 71.2833, 7.9200形式

ps:处理这类问题，一般有：
1. 标准化
2. 区间缩放法
3. 归一化

### 数据处理

In [13]:
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

#### 训练数据与测试数据合并处理

In [14]:
df_train = train.copy()
df_train["train"] = 1
df_test = test.copy()
df_test["train"] = 0

In [15]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [16]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [17]:
df = df_train.append(df_test)
df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,train
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,1
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,1
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,1


In [18]:
print("测试数据个数为：", len(df_test))

测试数据个数为： 418


In [19]:
print("总数据集中由于加入了测试数据，所以Survived部分是null的，Survived为null的个数为：", df.Survived.isnull().sum())

总数据集中由于加入了测试数据，所以Survived部分是null的，Survived为null的个数为： 418


#### 针对测试集Fare存在缺失值
采用中位数填充

In [20]:
print("未处理前，空值有", df.Fare.isnull().sum(), "个")

未处理前，空值有 1 个


In [21]:
#### 针对Age存在缺失值
imp = SimpleImputer(missing_values=np.nan, strategy = "median")

df["Fare"]= imp.fit_transform(df.Fare.values.reshape(-1,1))

In [22]:
print("处理后，空值有", df.Fare.isnull().sum(), "个")

处理后，空值有 0 个


#### 针对Age存在缺失值
采用平均数填充

In [23]:
print("未处理前，空值有", df.Age.isnull().sum(), "个")

未处理前，空值有 263 个


In [24]:
#### 针对Age存在缺失值
imp = SimpleImputer(missing_values=np.nan, strategy = "mean")

df["Age"]= imp.fit_transform(df.Age.values.reshape(-1,1))

In [25]:
print("处理后，空值有", df.Age.isnull().sum(), "个")

处理后，空值有 0 个


#### 针对Cabin存在缺失值
将Cabin列删除

In [26]:
print("未处理前，列名有：")
df.columns

未处理前，列名有：


Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket', 'train'],
      dtype='object')

In [27]:
newCols = [i for i in df.columns if i != "Cabin"]

In [28]:
df = df[newCols]

In [29]:
print("处理后，列名有：")
df.columns

处理后，列名有：


Index(['Age', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass',
       'Sex', 'SibSp', 'Survived', 'Ticket', 'train'],
      dtype='object')

#### 针对Embarked存在缺失值
采用most_frequent填充

In [30]:
print("未处理前，空值有", df.Embarked.isnull().sum(), "个")

未处理前，空值有 2 个


In [31]:
imp = SimpleImputer(strategy="most_frequent")
df.Embarked = imp.fit_transform(df.Embarked.values.reshape(-1, 1))

In [32]:
print("处理后，空值有", df.Embarked.isnull().sum(), "个")

处理后，空值有 0 个


#### 针对Sex存在定性特征
采用ordinal处理

In [33]:
df.Age.values.reshape(-1,1)

array([[22.        ],
       [38.        ],
       [26.        ],
       ...,
       [38.5       ],
       [29.88113767],
       [29.88113767]])

In [34]:
print("未处理前:")
df.Sex.head()

未处理前:


0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [35]:
enc = preprocessing.OrdinalEncoder()
df.Sex = enc.fit_transform(df.Sex.values.reshape(-1, 1))

In [36]:
print("处理后：")
df.Sex.head()

处理后：


0    1.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: Sex, dtype: float64

#### 针对Embarked存在定性特征
采用onehot

In [37]:
print("处理前：")
df.head()

处理前：


Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,train
0,22.0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1.0,1,0.0,A/5 21171,1
1,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0.0,1,1.0,PC 17599,1
2,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0.0,0,1.0,STON/O2. 3101282,1
3,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0.0,1,1.0,113803,1
4,35.0,S,8.05,"Allen, Mr. William Henry",0,5,3,1.0,0,0.0,373450,1


In [38]:
onehot = preprocessing.OneHotEncoder()
data = onehot.fit_transform(df.Embarked.values.reshape(-1,1)).toarray()

In [39]:
col_names = ["Embarked_"+onehot.categories_[0][i] for i in range(len(onehot.categories_[0]))]

In [40]:
col_names

['Embarked_C', 'Embarked_Q', 'Embarked_S']

In [41]:
for i in range(len(col_names)):
    df[col_names[i]] = data[:,i]

In [42]:
print("处理后：")
df.head()

处理后：


Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,train,Embarked_C,Embarked_Q,Embarked_S
0,22.0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1.0,1,0.0,A/5 21171,1,0.0,0.0,1.0
1,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0.0,1,1.0,PC 17599,1,1.0,0.0,0.0
2,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0.0,0,1.0,STON/O2. 3101282,1,0.0,0.0,1.0
3,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0.0,1,1.0,113803,1,0.0,0.0,1.0
4,35.0,S,8.05,"Allen, Mr. William Henry",0,5,3,1.0,0,0.0,373450,1,0.0,0.0,1.0


#### 针对Age存在无量纲化
采用标准化处理

In [43]:
### print('未处理前：')
df.Age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [44]:
scaler = preprocessing.StandardScaler()
df.Age = scaler.fit_transform(df.Age.values.reshape(-1, 1))

In [45]:
print('处理后：')
df.Age.head()

处理后：


0   -0.611972
1    0.630431
2   -0.301371
3    0.397481
4    0.397481
Name: Age, dtype: float64

#### 针对Fare存在无量纲化
采用标准化处理

In [46]:
print('未处理前：')
df.Fare.head()

未处理前：


0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64

In [47]:
fare_scaler = preprocessing.StandardScaler()
df.Fare = fare_scaler.fit_transform(df.Fare.values.reshape(-1, 1))

In [48]:
print('处理后：')
df.Fare.head()

处理后：


0   -0.503291
1    0.734744
2   -0.490240
3    0.383183
4   -0.487824
Name: Fare, dtype: float64

### 特征构建

In [49]:
print('未处理前：')
df.columns

未处理前：


Index(['Age', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass',
       'Sex', 'SibSp', 'Survived', 'Ticket', 'train', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [50]:
x_col_names = ['PassengerId',"Pclass", "Sex", 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S','Survived', "train"]
df = df[x_col_names]
df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived,train
0,1,3,1.0,-0.611972,1,0,-0.503291,0.0,0.0,1.0,0.0,1
1,2,1,0.0,0.630431,1,0,0.734744,1.0,0.0,0.0,1.0,1
2,3,3,0.0,-0.301371,0,0,-0.49024,0.0,0.0,1.0,1.0,1
3,4,1,0.0,0.397481,1,0,0.383183,0.0,0.0,1.0,1.0,1
4,5,3,1.0,0.397481,0,0,-0.487824,0.0,0.0,1.0,0.0,1


In [51]:
# df_tmp = df.copy()
# df = df_tmp

In [52]:
drop_train_col_names = [i for i in df.columns if  i != "train"]
df_train = df.loc[ df.train == 1, drop_train_col_names]

In [53]:
df_test = df.loc[ df.train == 0, drop_train_col_names]

In [54]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,3,1.0,-0.611972,1,0,-0.503291,0.0,0.0,1.0,0.0
1,2,1,0.0,0.630431,1,0,0.734744,1.0,0.0,0.0,1.0
2,3,3,0.0,-0.301371,0,0,-0.49024,0.0,0.0,1.0,1.0
3,4,1,0.0,0.397481,1,0,0.383183,0.0,0.0,1.0,1.0
4,5,3,1.0,0.397481,0,0,-0.487824,0.0,0.0,1.0,0.0


In [55]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived
0,892,3,1.0,0.358655,0,0,-0.492093,0.0,1.0,0.0,
1,893,3,0.0,1.329283,1,0,-0.508125,0.0,0.0,1.0,
2,894,2,1.0,2.494035,0,0,-0.456164,0.0,1.0,0.0,
3,895,3,1.0,-0.223721,0,0,-0.475981,0.0,0.0,1.0,
4,896,3,0.0,-0.611972,1,1,-0.405895,0.0,0.0,1.0,


### 模型构造

In [56]:
from sklearn.svm import SVC
svc_linear = SVC(kernel='linear')#线性核,可以选用不同的核

In [57]:
x_col_names = [i for i in df_train.columns if  i != "Survived"]

In [58]:
x_col_names

['PassengerId',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [59]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,3,1.0,-0.611972,1,0,-0.503291,0.0,0.0,1.0,0.0
1,2,1,0.0,0.630431,1,0,0.734744,1.0,0.0,0.0,1.0
2,3,3,0.0,-0.301371,0,0,-0.49024,0.0,0.0,1.0,1.0
3,4,1,0.0,0.397481,1,0,0.383183,0.0,0.0,1.0,1.0
4,5,3,1.0,0.397481,0,0,-0.487824,0.0,0.0,1.0,0.0


In [60]:
model = svc_linear.fit(df_train[x_col_names], df_train["Survived"])

### 模型评价

In [61]:
from sklearn import metrics

In [62]:
df_test = pd.merge(df_test,gender_submission,on='PassengerId')

In [63]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived_x,Survived_y
0,892,3,1.0,0.358655,0,0,-0.492093,0.0,1.0,0.0,,0
1,893,3,0.0,1.329283,1,0,-0.508125,0.0,0.0,1.0,,1
2,894,2,1.0,2.494035,0,0,-0.456164,0.0,1.0,0.0,,0
3,895,3,1.0,-0.223721,0,0,-0.475981,0.0,0.0,1.0,,0
4,896,3,0.0,-0.611972,1,1,-0.405895,0.0,0.0,1.0,,1


In [64]:
df_test[x_col_names].describe(include="all")

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,0.636364,0.024143,0.447368,0.392344,0.044381,0.244019,0.110048,0.645933
std,120.810458,0.841838,0.481622,0.981151,0.89676,0.981429,1.079819,0.430019,0.313324,0.478803
min,892.0,1.0,0.0,-2.307075,0.0,0.0,-0.643464,0.0,0.0,0.0
25%,996.25,1.0,0.0,-0.534322,0.0,0.0,-0.490805,0.0,0.0,0.0
50%,1100.5,3.0,1.0,0.0,0.0,0.0,-0.364003,0.0,0.0,1.0
75%,1204.75,3.0,1.0,0.455718,1.0,0.0,-0.03498,0.0,0.0,1.0
max,1309.0,3.0,1.0,3.581138,8.0,9.0,9.262028,1.0,1.0,1.0


In [65]:
# df_test = df_test[df_test.Fare.isna() == False]

In [66]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Survived_x,Survived_y
0,892,3,1.0,0.358655,0,0,-0.492093,0.0,1.0,0.0,,0
1,893,3,0.0,1.329283,1,0,-0.508125,0.0,0.0,1.0,,1
2,894,2,1.0,2.494035,0,0,-0.456164,0.0,1.0,0.0,,0
3,895,3,1.0,-0.223721,0,0,-0.475981,0.0,0.0,1.0,,0
4,896,3,0.0,-0.611972,1,1,-0.405895,0.0,0.0,1.0,,1


In [67]:
y_predict = model.predict(df_test[x_col_names])

In [71]:
# 实际上， 这个gender_submission数据集只是一个样例而已，初看accuracy以为评分很高的，只有提交到官网排行之后，才是真正的分数了。
# y_test  = df_test.Survived_y.values
# accuracy_rate = metrics.accuracy_score(y_test, y_predict)
# print(metrics.classification_report(y_test, y_predict))#可以获取准确率,召回率等数据

In [69]:
result = pd.DataFrame({'PassengerId':df_test['PassengerId'].as_matrix(), 'Survived':y_predict.astype(np.int32)})
result.to_csv("./output/titanic_predictions.csv", index=False)