In [60]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Raise the pandas display limits (column width, line width, max columns/rows).
for option_name, option_value in (
    ('max_colwidth', 200),
    ('display.width', 200),
    ('display.max_columns', 500),
    ('display.max_rows', 1000),
):
    pd.set_option(option_name, option_value)
# Make matplotlib use the SimHei sans-serif font so Chinese labels can be drawn.
mpl.rcParams.update({
    "font.family": "sans-serif",
    'font.sans-serif': ['SimHei'],
})

In [61]:
# Load the Titanic train/test CSV files and print a quick structural summary.
print('read data begin')
train_file_path = 'data/Titanic/train.csv'
test_file_path = 'data/Titanic/test.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
print(train_data.columns)
print(train_data.head(1))
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print(): doing so emitted the report first and then a stray
# "train_data_info: None" line. Print the label, then call info() on its own.
print('train_data_info:')
train_data.info()
print('test_data_info:')
test_data.info()


read data begin
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
   PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  Parch     Ticket  Fare Cabin Embarked
0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1      0  A/5 21171  7.25   NaN        S
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
train_data_info: None
<clas

In [62]:
# Missing-value notes (author's observations):
#   train_data: Age, Embarked and Cabin contain NaN values
#   test_data:  Age, Cabin and Fare contain NaN values
print('begin data visualization')
# Pie chart of the Survived value counts; each wedge is labelled with its
# percentage formatted to two decimals.
survived_counts = train_data['Survived'].value_counts()
survived_counts.plot.pie(autopct='%1.2f%%')
plt.show()

begin data visualization


In [63]:
print("分析性別与生存之间的关系")
# Mean of Survived per Sex (i.e. the survival rate of each sex), as a bar chart.
survival_by_sex = train_data.loc[:, ['Sex', 'Survived']].groupby('Sex').mean()
survival_by_sex.plot(kind='bar')
plt.show()

分析性別与生存之间的关系


In [64]:
print("船舱等级和生存的关系")
# Mean of Survived per passenger class (Pclass), drawn as a bar chart.
survival_by_pclass = train_data.loc[:, ['Pclass', 'Survived']].groupby('Pclass').mean()
survival_by_pclass.plot(kind='bar')
plt.show()

船舱等级和生存的关系


In [65]:
print("姓名和生存的关系")
# Keep only the honorific from each name: the word that follows a space and is
# terminated by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Plot the survival rate per title (mean of Survived), consistent with the
# Sex/Pclass cells. The previous count()/len(train_data) expression plotted each
# title's share of all passengers under a "Survived" legend, which does not
# show any relationship with survival.
a = train_data[['Title','Survived']].groupby(['Title']).mean()
a.plot(kind='bar')
plt.legend(loc='best')
plt.show()

姓名和生存的关系


In [87]:
print('年龄和生存的关系')
print('各个年龄段的生存率')
# Largest recorded age in the training set.
print(train_data['Age'].max())
# Wide figure: one bar per distinct Age value showing the mean of Survived.
fig, axis1 = plt.subplots(nrows=1, ncols=1, figsize=(18, 4))
average_age = (
    train_data.loc[:, ["Age", "Survived"]]
    .groupby("Age", as_index=False)
    .mean()
)
sns.barplot(x='Age', y='Survived', data=average_age, ax=axis1)
print('各个年龄的人数')
# Second figure: histogram of passenger ages (70 bins).
plt.figure(figsize=(8, 10))
age_hist_axes = train_data['Age'].hist(bins=70)
age_hist_axes.set_xlabel('Age')
age_hist_axes.set_ylabel('Num')
plt.show()

年龄和生存的关系
各个年龄段的生存率
80.0


各个年龄的人数


In [82]:
title='SibSp relationship with Survived'
print('船上兄弟姐妹和配偶的数目与生存关系')
# Mean of Survived for each SibSp value, drawn as a titled bar chart.
survival_by_sibsp = train_data.loc[:, ['SibSp', 'Survived']].groupby('SibSp').mean()
sibsp_axes = survival_by_sibsp.plot(kind='bar')
sibsp_axes.set_title(title)
plt.show()

船上兄弟姐妹和配偶的数目与生存关系


In [68]:
title='Parch relationship with Survived'
print('船上父母，子女的数目与生存关系')
# Mean of Survived for each Parch value, drawn as a titled bar chart.
survival_by_parch = train_data.loc[:, ['Parch', 'Survived']].groupby('Parch').mean()
parch_axes = survival_by_parch.plot(kind='bar')
parch_axes.set_title(title)
plt.show()

Parch relationship with Survived


In [84]:
print('船上亲戚的数目与生存的关系')
# Family = Parch + SibSp, stored as a new column on train_data.
train_data['Family'] = train_data['Parch'].add(train_data['SibSp'])
# Mean of Survived for each Family value, as a bar chart.
survival_by_family = train_data.loc[:, ['Family', 'Survived']].groupby('Family').mean()
survival_by_family.plot(kind='bar')
plt.show()
print('从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。')

船上亲戚的数目与生存的关系


从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。


In [85]:
print('票价分布的直方图')
# Histogram of the Fare column using 70 bins.
fare_values = train_data['Fare']
fare_values.hist(bins=70)
plt.show()

票价分布的直方图


In [91]:
print('票价与社会阶级之间的关系')
# Mean Fare per passenger class as a bar chart.
# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) only produced an extra empty figure and its size was
# never applied to the chart. Pass figsize straight to the plot call instead.
train_data[['Fare','Pclass']].groupby(['Pclass']).mean().plot.bar(width=0.1, figsize=(8,10))
plt.show()

票价与社会阶级之间的关系


In [99]:
print('船舱编号缺省值填充')
# Replace missing Cabin values with the placeholder 'N0'.
# Assign the filled column back rather than calling fillna(inplace=True) on
# train_data['Cabin']: the chained in-place form acts on an intermediate Series,
# warns on recent pandas versions and leaves train_data unchanged under
# Copy-on-Write. Re-running this cell is harmless (nothing is left to fill).
train_data['Cabin'] = train_data['Cabin'].fillna(value='N0')
# Number of Cabin values that are still missing after the fill.
print(train_data['Cabin'].isnull().sum())

船舱编号和生存之间的关系
0


In [None]:
print('船舱编号和生存之间的关系')
# Cabin has many missing values, so first compare passengers whose cabin number is missing against those whose cabin number is recorded.
# NOTE(review): the print below looks like a leftover placeholder in an unfinished cell — confirm before keeping.
print('a')