In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import matplotlib as mpl
# pandas一些属性设置
pd.set_option('max_colwidth',20000)
pd.set_option('display.width',200)
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',1000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# matlab支持汉字
mpl.rcParams["font.family"]="sans-serif"
mpl.rcParams['font.sans-serif'] = ['SimHei']

In [2]:
print('read data begin')
train_file_path = 'data/Titanic/train.csv'
test_file_path = 'data/Titanic/test.csv'
train_data = pd.read_csv(train_file_path, header=0)
test_data = pd.read_csv(test_file_path, header=0)
df = pd.concat([train_data, test_data], ignore_index=True)
print(df.head())

read data begin
    Age Cabin Embarked     Fare                                                 Name  Parch  PassengerId  Pclass     Sex  SibSp  Survived            Ticket
0  22.0   NaN        S   7.2500                              Braund, Mr. Owen Harris      0            1       3    male      1       0.0         A/5 21171
1  38.0   C85        C  71.2833  Cumings, Mrs. John Bradley (Florence Briggs Thayer)      0            2       1  female      1       1.0          PC 17599
2  26.0   NaN        S   7.9250                               Heikkinen, Miss. Laina      0            3       3  female      0       1.0  STON/O2. 3101282
3  35.0  C123        S  53.1000         Futrelle, Mrs. Jacques Heath (Lily May Peel)      0            4       1  female      1       1.0            113803
4  35.0   NaN        S   8.0500                             Allen, Mr. William Henry      0            5       3    male      0       0.0            373450


In [3]:
print('开始处理缺省值')
df.info()
print('--**--'*10)
test_data.info()

开始处理缺省值
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB
--**----**----**----**----**----**----**----**----**----**--
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418

In [4]:
# 缺省值为：Age,Cabin,Embarked,Fare,Survived，其中Survived的缺省值可以忽略
train_data_index = train_data.shape[0]
# Cabin的缺省值过多
df['Cabin'] = df['Cabin'].fillna(value='U0')
# Embarked使用众数填充,对于分类变量，使用众数或许比平均值更好
df['Embarked'] = df['Embarked'].fillna(value=df['Embarked'].mode().values[0])
df['Embarked'].isnull().sum()

0

In [5]:
# Fare的缺省值根据社会等级的平均票价决定
print(df[df['Fare'].isnull()])
df['Fare'] = df[['Fare','Pclass']].groupby('Pclass')['Fare'].transform(lambda x : x.fillna(x.mean()))

       Age Cabin Embarked  Fare                Name  Parch  PassengerId  Pclass   Sex  SibSp  Survived Ticket
1043  60.5    U0        S   NaN  Storey, Mr. Thomas      0         1044       3  male      0       NaN   3701


In [7]:
'''
使用回归 随机森林等模型来预测缺失属性的值。因为Age在该数据集里是一个相当重要的特征
所以保证一定的缺失值填充准确率是非常重要的，对结果也会产生较大影响。一般情况下，会使用数据完整的条目作为模型的训练集，以此来预测缺失值。
'''
from sklearn.ensemble import RandomForestRegressor
train_data = df.iloc[:train_data_index, :]
age_train = train_data[['Age','Survived','Fare', 'Parch', 'SibSp', 'Pclass']]
age_isnull = age_train[age_train['Age'].isnull()]
age_notnull = age_train[age_train['Age'].notnull()]
X = age_notnull.iloc[:, 1:]
y = age_notnull.iloc[:, 0]
print(age_isnull.shape)
print(age_notnull.shape)
rdf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rdf.fit(X, y)
predict_ages = rdf.predict(age_isnull.iloc[:, 1:])
predict_ages
train_data.loc[train_data['Age'].isnull(), 'Age']= predict_ages
print(train_data['Age'].isnull().sum())

(0, 6)
(891, 6)


ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required.

In [8]:
print('开始数据可视化处理')
print('生存关系的整体分布')
#DataFrame.plot( )画图函数
train_data['Survived'].value_counts().plot(autopct = '%1.2f%%', kind='pie')
plt.show()

开始数据可视化处理
生存关系的整体分布


In [9]:
print("分析性別与生存之间的关系")
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
plt.show()

分析性別与生存之间的关系


In [10]:
print("船舱等级和生存的关系")
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
plt.show()

船舱等级和生存的关系


In [11]:
print("姓名和生存的关系")
# 对姓名只取称呼部门，因为称呼决定了社会关系等。。。
train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train_data['Title'], train_data['Sex'])
a = train_data[['Title','Survived']].groupby(['Title']).mean()
a.plot(kind='bar')
plt.legend(loc='best')
plt.show()

姓名和生存的关系


In [12]:
print('年龄和生存的关系')
print('各个年龄分布的直方图')
print(np.max(train_data['Age']))
plt.figure(figsize=(8,10))
train_data['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')
plt.show()

年龄和生存的关系
各个年龄分布的直方图
80.0


In [13]:
fig, ax = plt.subplots(1, 2, figsize = (18, 8))
# 琴式图
sns.violinplot("Pclass", "Age", hue="Survived", data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot("Sex", "Age", hue="Survived", data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))

plt.show()


In [14]:

train_data.boxplot(column='Age', showfliers=False)
plt.show()


In [15]:
facet = sns.FacetGrid(train_data, hue="Survived",aspect=4)
# kdeplot核密度估计图
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, train_data['Age'].max()))
facet.add_legend()
plt.show()

In [16]:
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
train_data["Age_int"] = train_data["Age"].astype(int)
average_age = train_data[["Age_int", "Survived"]].groupby(['Age_int'],as_index=False).mean()
sns.barplot(x='Age_int', y='Survived', data=average_age)
plt.show()

In [17]:
train_data['Age'].describe()

count    891.000000
mean      29.653887
std       13.729017
min        0.420000
25%       21.000000
50%       28.000000
75%       37.000000
max       80.000000
Name: Age, dtype: float64

In [18]:
# 按照年龄，将乘客划分为儿童、少年、成年和老年，分析四个群体的生还情况
split = [0, 12, 18, 65, 100]
train_data['Age_group'] = pd.cut(train_data['Age'], bins=split)
by_age = train_data.groupby('Age_group')['Survived'].mean()
by_age.plot(kind='bar')
plt.show()

In [19]:
title='SibSp relationship with Survived'
print('船上兄弟姐妹和配偶的数目与生存关系')
train_data[['SibSp','Survived']].groupby(['SibSp']).mean().plot.bar()
plt.title(title)
plt.show()

船上兄弟姐妹和配偶的数目与生存关系


In [20]:
title='Parch relationship with Survived'
print('船上父母，子女的数目与生存关系')
train_data[['Parch','Survived']].groupby(['Parch']).mean().plot.bar()
plt.title(title)
plt.show()

船上父母，子女的数目与生存关系


In [21]:
print('船上亲戚的数目与生存的关系')
train_data['Family'] = train_data['Parch']+train_data['SibSp']
train_data[['Family','Survived']].groupby(['Family']).mean().plot.bar()
plt.show()
print('从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。')

船上亲戚的数目与生存的关系
从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。


In [22]:
print('票价分布的直方图')
plt.figure(figsize=(8,15))
train_data['Fare'].hist(bins=70)
plt.show()

票价分布的直方图


In [23]:
print('票价与社会阶级之间的关系')
plt.figure(figsize=(8,10))
train_data.groupby(['Pclass'])['Fare'].mean().plot.bar(width=0.1)
plt.show()

票价与社会阶级之间的关系


In [24]:
train_data['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [25]:
print('船舱编号和生存之间的关系')
# Cabin缺省的字段较多，先将缺省船舱编号的和非缺省船舱编码号的进行对比
train_data['Has_Cabin'] = train_data['Cabin'].apply(lambda x : 0 if x=='U0' else 1)
train_data[['Has_Cabin','Survived']].groupby(['Has_Cabin']).mean().plot.bar(width=0.2)
plt.show()
# 有船舱的人存活率更高

船舱编号和生存之间的关系


In [26]:
print('不同登录港口和生存之间的关系')
train_data[['Embarked','Survived']].groupby(['Embarked']).mean().plot.bar(width=0.2)
plt.show()
# 由上可以看出，在不同的港口上船，生还率不同，C最高，Q次之，S最低。

不同登录港口和生存之间的关系


In [27]:
print('开始特征工程')
# 在进行特征工程的时候，我们不仅需要对训练数据进行处理，还需要同时将测试数据同训练数据一起处理，使得二者具有相同的数据类型和数据分布。
print(df.info())

开始特征工程
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1223 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB
None


In [28]:
print('首先对离散型类别变量进行处理')
# 使用one-hot编码对离散型类别进行处理
print('开始对Embarked进行处理')
emb_dummies_df = pd.get_dummies(df['Embarked'], prefix=df[['Embarked']].columns[0])
df = pd.concat([emb_dummies_df, df], axis=1,)
print(df.info())

首先对离散型类别变量进行处理
开始对Embarked进行处理
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
Embarked_C     1309 non-null uint8
Embarked_Q     1309 non-null uint8
Embarked_S     1309 non-null uint8
Age            1223 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5), uint8(3)
memory usage: 126.6+ KB
None


In [29]:
print('开始对性别进行处理')
sex_dummies_df = pd.get_dummies(df['Sex'], prefix=df[['Sex']].columns[0])
df = pd.concat([sex_dummies_df, df], axis=1)
print(df.info())

开始对性别进行处理
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 17 columns):
Sex_female     1309 non-null uint8
Sex_male       1309 non-null uint8
Embarked_C     1309 non-null uint8
Embarked_Q     1309 non-null uint8
Embarked_S     1309 non-null uint8
Age            1223 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5), uint8(5)
memory usage: 129.2+ KB
None
None


In [30]:
print('开始对姓名进行处理')
print('首先对称呼进行提取')
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
print(df['Title'].unique())
print(df['Title'].isnull().sum())
# dict.fromkeys用于创建一个新字典，以序列seq中元素做字典的键，value为字典所有键对应的初始值
title_Dict = {}
# 对应官员
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
# 对应皇室成员
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'Countess', 'Dona', 'Lady'], 'Royalty'))
# Mrs已婚女性
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
# Master 未成年男少主人的称呼,相当于汉语的"少爷
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))
df['Title'] = df['Title'].map(title_Dict)
title_dummies_df = pd.get_dummies(df['Title'], prefix=df[['Title']].columns[0])
df = pd.concat([title_dummies_df, df], axis=1)
print(df.info())

开始对姓名进行处理
首先对称呼进行提取
['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'Countess' 'Jonkheer' 'Dona']
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 24 columns):
Title_Master     1309 non-null uint8
Title_Miss       1309 non-null uint8
Title_Mr         1309 non-null uint8
Title_Mrs        1309 non-null uint8
Title_Officer    1309 non-null uint8
Title_Royalty    1309 non-null uint8
Sex_female       1309 non-null uint8
Sex_male         1309 non-null uint8
Embarked_C       1309 non-null uint8
Embarked_Q       1309 non-null uint8
Embarked_S       1309 non-null uint8
Age              1223 non-null float64
Cabin            1309 non-null object
Embarked         1309 non-null object
Fare             1309 non-null float64
Name             1309 non-null object
Parch            1309 non-null int64
PassengerId      1309 non-null int64
Pclass           1309 non-null int64
Sex              1309 non-null ob

In [38]:
print('开始对Ticket进行处理')
m = df.shape[0]
ticket_len = len(df['Ticket'].unique())
print('数据集的长度为：%d;不同Ticket的长度为：%d'%(m,ticket_len))
if m == ticket_len:
    print('没有重复的船票编号')
else:
    print('有重复的船票编号，因此可能出现家庭/团体票')

开始对Ticket进行处理
数据集的长度为：1309;不同Ticket的长度为：929
有重复的船票编号，因此可能出现家庭/团体票


In [61]:
# 船票编号和票价有必然关系，因此将团体票的人所付的票价与非团体票所付的票价相对比
df['Group_Ticket'] = df[['Ticket','Fare']].groupby('Ticket').transform('count')
# 通过Group_Ticket可以看出每个票编码的购买人数
df['Fare_mean'] = df.groupby('Ticket')['Fare'].transform('mean')
print(df[['Group_Ticket','Fare_mean','Pclass']].head())

   Group_Ticket  Fare_mean  Pclass
0             1     7.2500       3
1             2    71.2833       1
2             1     7.9250       3
3             2    53.1000       1
4             1     8.0500       3


In [62]:
df['Fare'] = df['Fare']/df['Group_Ticket']
df['Fare'].head()

0     7.25000
1    35.64165
2     7.92500
3    26.55000
4     8.05000
Name: Fare, dtype: float64

In [85]:
# 将票价划分为5组,qcut将数据分组
df['Fare_bin'] = pd.qcut(df['Fare'], 5)
df["Fare_bin"].unique()
# factorize方法将标称型类别变量映射为数值型类别变量，返回值为一个二元组
df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0]
df['Fare_bin_id']

0       0
1       1
2       2
3       1
4       2
5       2
6       3
7       4
8       4
9       3
10      4
11      1
12      2
13      4
14      0
15      3
16      4
17      3
18      2
19      4
20      3
21      3
22      2
23      1
24      4
25      4
26      4
27      1
28      0
29      0
30      1
31      1
32      0
33      2
34      1
35      3
36      4
37      2
38      2
39      4
40      2
41      2
42      0
43      2
44      0
45      2
46      0
47      0
48      4
49      2
50      4
51      0
52      3
53      3
54      1
55      1
56      2
57      4
58      4
59      4
60      4
61      1
62      1
63      4
64      1
65      4
66      2
67      2
68      2
69      2
70      2
71      4
72      2
73      4
74      4
75      0
76      0
77      2
78      2
79      4
80      2
81      2
82      0
83      3
84      2
85      2
86      4
87      2
88      1
89      2
90      2
91      0
92      1
93      4
94      0
95      2
96      1
97      1
98      3
99      3


In [88]:
fare_bin_dummies_df = pd.get_dummies(df['Fare_bin_id'])
df = pd.concat([fare_bin_dummies_df, df], axis=1)
print(df.columns)

Index([              0,               1,               2,               3,               4,               0,               1,               2,               3,               4,
       ...
                 'Sex',         'SibSp',      'Survived',        'Ticket',         'Title',  'Group_Ticket', 'Group_Ticket2',     'Fare_mean',      'Fare_bin',   'Fare_bin_id'],
      dtype='object', length=852)
