## 项目流程：
### 提出问题-->理解数据-->数据清洗-->构建模型-->方案实施

### 1.导入数据
数据集来源：https://www.kaggle.com/c/titanic/data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 读取数据
train = pd.read_csv("D:\\Anaconda\\jupyter_data\\titanic\\train.csv")
test = pd.read_csv("D:\\Anaconda\\jupyter_data\\titanic\\test.csv")

# train = pd.read_csv("D:/Anaconda/jupyter_data/titanic/train.csv")
# test = pd.read_csv("D:/Anaconda/jupyter_data/titanic/test.csv")
print('train:',train.shape, "test:",test.shape)

# 合并数据集，方便同时对两个数据进行清洗
full = train.append(test, ignore_index = True)
print('合并后的数据集：',full.shape)

train: (891, 12) test: (418, 11)
合并后的数据集： (1309, 12)


### 2.初步查看数据集概要信息

In [3]:
# 查看数据
full.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [4]:
# 获取数据类型描述统计信息
full.describe()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
count,1046.0,1308.0,1309.0,1309.0,1309.0,1309.0,891.0
mean,29.881138,33.295479,0.385027,655.0,2.294882,0.498854,0.383838
std,14.413493,51.758668,0.86556,378.020061,0.837836,1.041658,0.486592
min,0.17,0.0,0.0,1.0,1.0,0.0,0.0
25%,21.0,7.8958,0.0,328.0,2.0,0.0,0.0
50%,28.0,14.4542,0.0,655.0,3.0,0.0,0.0
75%,39.0,31.275,0.0,982.0,3.0,1.0,1.0
max,80.0,512.3292,9.0,1309.0,3.0,8.0,1.0


In [5]:
# 查看每一列数据类型和数据总数
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


### 3.数据清洗
#### （1）.处理缺失数据

In [6]:
# age和fare都存在缺失值，且为浮点数类型，可使用平均值进行填充

full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())

In [7]:
# 填充客舱号（Cabin）
full['Cabin'].head()

0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object

In [8]:
# Cabin 这一列缺失值较多且无规律，直接填充U（unknow）
full['Cabin'] = full['Cabin'].fillna('U')

In [9]:
# 出发地点：S=南安普敦  途经地点：C=瑟堡，Q=皇后镇
full['Embarked'].head()

0    S
1    C
2    S
3    S
4    S
Name: Embarked, dtype: object

In [10]:
# 统计各个出发地点出现的次数
from collections import Counter
print(Counter(full['Embarked']))

Counter({'S': 914, 'C': 270, 'Q': 123, nan: 2})


In [11]:
# 由于登船港口（Emabrked）这一列只有两个缺失值，将填充为 最频繁出现的S
full['Embarked'] = full['Embarked'].fillna('S')

In [12]:
# 再次查看最终缺失值处理情况
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


#### (2).提取特征

In [13]:
"""
将性别的值映射为数值，
male --> 1
female --> 0
"""
sex_mapDict = {'male':1, 'female':0}
# map函数：对于Series 每个数据应用自定义函数计算
full['Sex'] = full['Sex'].map(sex_mapDict)
full.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,U,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599
2,26.0,U,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803
4,35.0,U,S,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450


In [14]:
full['Embarked'].head()

0    S
1    C
2    S
3    S
4    S
Name: Embarked, dtype: object

In [15]:
# 存放提取后的特征
embarkedDF = pd.DataFrame()

# 使用get_dummies进行one-hot编码，列名前缀为Embarked
embarkedDF = pd.get_dummies(full['Embarked'],prefix = 'Embarked')
embarkedDF.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [16]:
# 添加one-hot编码产生的虚拟变量（dummy variables）到泰坦尼克号数据集full
full = pd.concat([full, embarkedDF], axis = 1)

# 因已对登船港口（Embarked）进行了one-hot编码产生虚拟变量，故删除 Embarked
full.drop('Embarked', axis = 1, inplace = True)
full.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,0,0,1
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,1,0,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0,0,1
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,0,1
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0,0,1


In [17]:
# 存放提取后的特征
pclassDf = pd.DataFrame()

# 使用get_dummies进行one-hot编码，列名前缀为Pclass
pclassDf = pd.get_dummies(full['Pclass'], prefix = 'Pclass')
pclassDf.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [18]:
full = pd.concat([full, pclassDf], axis = 1)

full.drop('Pclass', axis = 1, inplace = True)
full.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,1,0.0,A/5 21171,0,0,1,0,0,1
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,0,1,1.0,PC 17599,1,0,0,1,0,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,0,1.0,STON/O2. 3101282,0,0,1,0,0,1
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,0,1,1.0,113803,0,0,1,1,0,0
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,1,0,0.0,373450,0,0,1,0,0,1


In [19]:
full['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

In [20]:
# 从姓名中获取头衔
# split（）通过制定分隔符对字符串进行切片
def getTitle(name):
    str1 = name.split(',')[1]
    str2 = str1.split('.')[0]
    # strip()移除字符串头尾制定的字符（默认为空格）
    str3 = str2.strip()
    return str3

titleDf = pd.DataFrame()

# map函数：对于Series每个数据应用自定义函数计数
titleDf['Title'] = full['Name'].map(getTitle)
titleDf.head()

Unnamed: 0,Title
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr


In [21]:
# 从姓名中头衔字符串与自定义头衔类别的映射
title_mapDict = {
    'Capt':'Officer',
    'Col':'Officer',
    'Major':'Officer',
    'Jonkheer':'Royalty',
    'Don':'Royalty',
    'Sir':'Royalty',
    'Dr':'Officer',
    'Rev':'Officer',
    'the Countess':'Royalty',
    'Dona':'Royalty',
    'Mme':'Mrs',
    'Mlle':'Miss',
    'Mr':'Mr',
    'Mrs':'Mrs',
    'Miss':'Miss',
    'Master':'Master',
    'Lady':'Royalty'
}
# print(title_mapDict)
titleDf['Title'] = titleDf['Title'].map(title_mapDict)
# 使用get——dummies进行one-hot编码
titleDf = pd.get_dummies(titleDf['Title'])
titleDf.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0


In [22]:
# 添加one-hot编码产生的虚拟变量到泰坦尼克号数据集full
full = pd.concat([full, titleDf], axis = 1)

# 删除姓名（Name）这一列
full.drop('Name', axis = 1, inplace = True)
full.head()

Unnamed: 0,Age,Cabin,Fare,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,...,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty
0,22.0,U,7.25,0,1,1,1,0.0,A/5 21171,0,...,1,0,0,1,0,0,1,0,0,0
1,38.0,C85,71.2833,0,2,0,1,1.0,PC 17599,1,...,0,1,0,0,0,0,0,1,0,0
2,26.0,U,7.925,0,3,0,0,1.0,STON/O2. 3101282,0,...,1,0,0,1,0,1,0,0,0,0
3,35.0,C123,53.1,0,4,0,1,1.0,113803,0,...,1,1,0,0,0,0,0,1,0,0
4,35.0,U,8.05,0,5,1,0,0.0,373450,0,...,1,0,0,1,0,0,1,0,0,0


In [23]:
full['Cabin'].head()

0       U
1     C85
2       U
3    C123
4       U
Name: Cabin, dtype: object

In [24]:
# 存放客舱号信息
cabinDf = pd.DataFrame()

# 客舱号的类别值是首字母， eg：C85

#定义匿名函数 lambda，用于查找首字母
full['Cabin'] = full['Cabin'].map(lambda c:c[0])

# 使用get_dummies 进行one-hot 编码， 列名前缀为Cabin
cabinDf = pd.get_dummies(full['Cabin'], prefix = 'Cabin')
cabinDf.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1


In [25]:
# 添加one-hot编码产生的虚拟变量到泰坦尼克号数据集full
full = pd.concat([full, cabinDf], axis = 1)
# 删除客舱号等级（Pclass）这一列
full.drop('Cabin', axis = 1, inplace = True)
full.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,...,Royalty,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,22.0,7.25,0,1,1,1,0.0,A/5 21171,0,0,...,0,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,0,2,0,1,1.0,PC 17599,1,0,...,0,0,0,1,0,0,0,0,0,0
2,26.0,7.925,0,3,0,0,1.0,STON/O2. 3101282,0,0,...,0,0,0,0,0,0,0,0,0,1
3,35.0,53.1,0,4,0,1,1.0,113803,0,0,...,0,0,0,1,0,0,0,0,0,0
4,35.0,8.05,0,5,1,0,0.0,373450,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# 存放家庭信息
familyDf = pd.DataFrame()
# 家庭人数 = 同代直系亲属数（SibSp）+ 不同代直系亲属数（Parch）+ 乘客自己
familyDf['Familysize'] = full['SibSp'] + full['Parch'] + 1
"""
家庭类别：
小家庭Family_Single：家庭人数=1
中等家庭Family_Small：2<=家庭人数<=4
大家庭Family_Large:家庭人数>=5
"""
# if条件为真是返回if前面内容， 否则返回0
familyDf['Family_Single'] = familyDf['Familysize'].map(lambda s : 1 if s == 1 else 0)
familyDf['Family_Small'] = familyDf['Familysize'].map(lambda s : 1 if 2 <= s <=4 else 0)
familyDf['Family_Large'] = familyDf['Familysize'].map(lambda s : 1 if s >= 5 else 0)
familyDf.head()

Unnamed: 0,Familysize,Family_Single,Family_Small,Family_Large
0,2,0,1,0
1,2,0,1,0
2,1,1,0,0
3,2,0,1,0
4,1,1,0,0


In [27]:
# 添加one-hot编码产生的虚拟变量到泰坦尼克号数据集full
full = pd.concat([full, familyDf], axis = 1)
full.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Familysize,Family_Single,Family_Small,Family_Large
0,22.0,7.25,0,1,1,1,0.0,A/5 21171,0,0,...,0,0,0,0,0,1,2,0,1,0
1,38.0,71.2833,0,2,0,1,1.0,PC 17599,1,0,...,0,0,0,0,0,0,2,0,1,0
2,26.0,7.925,0,3,0,0,1.0,STON/O2. 3101282,0,0,...,0,0,0,0,0,1,1,1,0,0
3,35.0,53.1,0,4,0,1,1.0,113803,0,0,...,0,0,0,0,0,0,2,0,1,0
4,35.0,8.05,0,5,1,0,0.0,373450,0,0,...,0,0,0,0,0,1,1,1,0,0


In [28]:
# 存放年龄信息
ageDf = pd.DataFrame()
"""
年龄类别：
儿童Child：0<年龄<=6
青少年Teenager：6<年龄<18
青年Youth：18<=年龄<=40
中年Middle_age：40<年龄<=60
老年Older:60<年龄
"""
ageDf['Child'] = full['Age'].map(lambda a : 1 if 0 < a <= 6 else 0)
ageDf['Teenager'] = full['Age'].map(lambda a : 1 if 6 < a < 18 else 0)
ageDf['Youth'] = full['Age'].map(lambda a : 1 if 18 <= a <= 40 else 0)
ageDf['Middle_age'] = full['Age'].map(lambda a : 1 if 40 < a <= 60 else 0)
ageDf['Older'] = full['Age'].map(lambda a : 1 if a > 60 else 0)
ageDf.head()

Unnamed: 0,Child,Teenager,Youth,Middle_age,Older
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0


In [29]:
# 添加one-hot编码产生的虚拟变量到泰坦尼克号数据集full
full = pd.concat([full, ageDf], axis = 1)
# 删除Age这一列
full.drop('Age', axis = 1, inplace = True)
full.head()


Unnamed: 0,Fare,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,...,Cabin_U,Familysize,Family_Single,Family_Small,Family_Large,Child,Teenager,Youth,Middle_age,Older
0,7.25,0,1,1,1,0.0,A/5 21171,0,0,1,...,1,2,0,1,0,0,0,1,0,0
1,71.2833,0,2,0,1,1.0,PC 17599,1,0,0,...,0,2,0,1,0,0,0,1,0,0
2,7.925,0,3,0,0,1.0,STON/O2. 3101282,0,0,1,...,1,1,1,0,0,0,0,1,0,0
3,53.1,0,4,0,1,1.0,113803,0,0,1,...,0,2,0,1,0,0,0,1,0,0
4,8.05,0,5,1,0,0.0,373450,0,0,1,...,1,1,1,0,0,0,0,1,0,0


In [30]:
# 查看现已有的特征
full.shape

(1309, 37)

#### (3).特征选择

In [31]:
# 相关矩阵
corrDf = full.corr()
corrDf

Unnamed: 0,Fare,Parch,PassengerId,Sex,SibSp,Survived,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,...,Cabin_U,Familysize,Family_Single,Family_Small,Family_Large,Child,Teenager,Youth,Middle_age,Older
Fare,1.0,0.221522,0.031416,-0.185484,0.160224,0.257307,0.286241,-0.130054,-0.169894,0.599956,...,-0.507197,0.226465,-0.274826,0.197281,0.170853,-0.012853,-0.00461,-0.137112,0.150441,0.081268
Parch,0.221522,1.0,0.008942,-0.213125,0.373587,0.081629,-0.008635,-0.100943,0.071881,-0.013033,...,-0.036806,0.792296,-0.549022,0.248532,0.624627,0.254087,0.147065,-0.204682,0.005728,-0.003976
PassengerId,0.031416,0.008942,1.0,0.013406,-0.055224,-0.005007,0.048101,0.011585,-0.049836,0.026495,...,0.000208,-0.031437,0.028546,0.002975,-0.063415,-0.028229,-0.030378,0.005034,0.036701,-0.009982
Sex,-0.185484,-0.213125,0.013406,1.0,-0.109609,-0.543351,-0.066564,-0.088651,0.115193,-0.107371,...,0.137396,-0.188583,0.284537,-0.255196,-0.077748,-0.059549,-0.057724,0.050455,-0.008696,0.048323
SibSp,0.160224,0.373587,-0.055224,-0.109609,1.0,-0.035322,-0.048396,-0.048678,0.073709,-0.034256,...,0.009064,0.861952,-0.591077,0.25359,0.699681,0.22475,0.206952,-0.177561,-0.042904,-0.030244
Survived,0.257307,0.081629,-0.005007,-0.543351,-0.035322,1.0,0.16824,0.00365,-0.149683,0.285904,...,-0.316912,0.016639,-0.203367,0.279855,-0.125147,0.154447,0.023499,-0.07615,0.005716,-0.051224
Embarked_C,0.286241,-0.008635,0.048101,-0.066564,-0.048396,0.16824,1.0,-0.164166,-0.778262,0.325722,...,-0.258257,-0.036553,-0.107874,0.159594,-0.092825,-0.015896,0.007561,-0.055751,0.079627,-0.009716
Embarked_Q,-0.130054,-0.100943,0.011585,-0.088651,-0.048678,0.00365,-0.164166,1.0,-0.491656,-0.166101,...,0.142369,-0.08719,0.127214,-0.122491,-0.018423,-0.047156,-0.016847,0.114127,-0.112223,0.015018
Embarked_S,-0.169894,0.071881,-0.049836,0.115193,0.073709,-0.149683,-0.778262,-0.491656,1.0,-0.1818,...,0.137351,0.087771,0.014246,-0.062909,0.093671,0.044051,0.00405,-0.023436,0.001147,-0.000983
Pclass_1,0.599956,-0.013033,0.026495,-0.107371,-0.034256,0.285904,0.325722,-0.166101,-0.1818,1.0,...,-0.776987,-0.029656,-0.126551,0.165965,-0.067523,-0.094254,-0.081114,-0.194994,0.299883,0.145332


In [32]:
# 查看各个特征与生成情况（Survived）的相关系数，ascending = False表示按降序排列

corrDf['Survived'].sort_values(ascending = False)

Survived         1.000000
Mrs              0.341994
Miss             0.332795
Pclass_1         0.285904
Family_Small     0.279855
Fare             0.257307
Cabin_B          0.175095
Embarked_C       0.168240
Child            0.154447
Cabin_D          0.150716
Cabin_E          0.145321
Cabin_C          0.114652
Pclass_2         0.093349
Master           0.085221
Parch            0.081629
Cabin_F          0.057935
Royalty          0.033391
Teenager         0.023499
Cabin_A          0.022287
Familysize       0.016639
Cabin_G          0.016040
Middle_age       0.005716
Embarked_Q       0.003650
PassengerId     -0.005007
Cabin_T         -0.026456
Officer         -0.031316
SibSp           -0.035322
Older           -0.051224
Youth           -0.076150
Family_Large    -0.125147
Embarked_S      -0.149683
Family_Single   -0.203367
Cabin_U         -0.316912
Pclass_3        -0.322308
Sex             -0.543351
Mr              -0.549199
Name: Survived, dtype: float64

In [33]:
# 特征选择
full_X = pd.concat([
    titleDf,  # 头衔
    pclassDf,
    full['Fare'],
    full['Sex'],
    cabinDf,
    embarkedDF
], axis = 1)
full_X.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty,Pclass_1,Pclass_2,Pclass_3,Fare,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1,0,0,0,0,0,1,7.25,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,1,0,0,1,0,0,71.2833,...,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,1,7.925,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,1,0,0,1,0,0,53.1,...,1,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,1,8.05,...,0,0,0,0,0,0,1,0,0,1


### 4.构建模型
#### （1）.建立训练集和测试集

In [34]:
# 原始数据共有891行
sourceRow = 891
"""
原始数据集sourceRow是从Kaggle下载的训练集，可知共有891条数据从特征集
full_X中提取原始数据前891行数据时需减去1，因为行号是从0开始
"""
# 原始数据集：特征
source_X = full_X.loc[0:sourceRow-1,:]
# 原始数据集：标签
source_y = full.loc[0:sourceRow-1,'Survived']
# 预测数据集：特征
pred_X = full_X.loc[sourceRow:,:]
# 查看原始数据集有多少行
print('原始数据集：', source_X.shape[0])
# 查看预测数据集有多少行
print('预测数据集：',pred_X.shape[0])

原始数据集： 891
预测数据集： 418


In [35]:
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# 建立模型所需的训练数据集合测试集
train_X,test_X,train_y,test_y = train_test_split(source_X,source_y,train_size=0.8)
# 输出数据集大小
print('原始数据集特征：',source_X.shape,
     '训练数据集特征：',train_X.shape,
     '测试数据集特征：',test_X.shape,)
print('原始数据集标签：',source_y.shape,
     '训练数据集标签：',train_y.shape,
     '测试数据集标签：',test_y.shape,)

原始数据集特征： (891, 23) 训练数据集特征： (712, 23) 测试数据集特征： (179, 23)
原始数据集标签： (891,) 训练数据集标签： (712,) 测试数据集标签： (179,)




In [36]:
# 查看原始数据集标签
source_y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

#### (2).选择机器学习算法

In [37]:
# 第一步：导入算法
from sklearn.linear_model import LogisticRegression
# 第二步：创建模型：逻辑回归
model = LogisticRegression()

In [38]:
# 第三步：训练模型
model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
# 第四步评估模型
# 分类问题 score 得到的是模型正确率
model.score(test_X, test_y)

0.8100558659217877

### 5.实施方案

In [40]:
# 使用机器学习模型，对预测数据集中的生存情况进行预测
pred_y = model.predict(pred_X)

# 生成的预测值是浮点数，但是Kaggle要求提交的结果是整数型
# 使用astype对数据类型进行转换
pred_y = pred_y.astype(int)
# 乘客id
passenger_id = full.loc[sourceRow:,'PassengerId']
# 数据框：乘客id， 预测生存情况
predDf = pd.DataFrame({'PassengerId':passenger_id, 'Survived':pred_y})
predDf.shape
predDf.head()


Unnamed: 0,PassengerId,Survived
891,892,0
892,893,1
893,894,0
894,895,0
895,896,1


In [41]:
# 保存结果
predDf.to_csv('./titanic_pred.csv', index=False)