In [22]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import matplotlib as mpl
# pandas一些属性设置
pd.set_option('max_colwidth',20000)
pd.set_option('display.width',200)
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',1000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# matlab支持汉字
mpl.rcParams["font.family"]="sans-serif"
mpl.rcParams['font.sans-serif'] = ['SimHei']

In [25]:
print('read data begin')
train_file_path = 'data/Titanic/train.csv'
test_file_path = 'data/Titanic/test.csv'
train_data = pd.read_csv(train_file_path, header=0)
test_data = pd.read_csv(test_file_path, header=0)
test_data['Survived'] = 0
df = pd.concat([train_data, test_data], ignore_index=True)
print(df.head())

read data begin
    Age Cabin Embarked     Fare                                                 Name  Parch  PassengerId  Pclass     Sex  SibSp  Survived            Ticket
0  22.0   NaN        S   7.2500                              Braund, Mr. Owen Harris      0            1       3    male      1         0         A/5 21171
1  38.0   C85        C  71.2833  Cumings, Mrs. John Bradley (Florence Briggs Thayer)      0            2       1  female      1         1          PC 17599
2  26.0   NaN        S   7.9250                               Heikkinen, Miss. Laina      0            3       3  female      0         1  STON/O2. 3101282
3  35.0  C123        S  53.1000         Futrelle, Mrs. Jacques Heath (Lily May Peel)      0            4       1  female      1         1            113803
4  35.0   NaN        S   8.0500                             Allen, Mr. William Henry      0            5       3    male      0         0            373450


In [23]:
'''
train_data:Age,Embarked and Cabin have nan value
test_data:Age,Cabin,Fare have nan value
'''
print('begin data visualization')
#DataFrame.plot( )画图函数
train_data['Survived'].value_counts().plot(autopct = '%1.2f%%', kind='pie')
plt.show()

begin data visualization


In [24]:
print("分析性別与生存之间的关系")
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
plt.show()

分析性別与生存之间的关系


In [25]:
print("船舱等级和生存的关系")
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
plt.show()

船舱等级和生存的关系


In [26]:
print("姓名和生存的关系")
# 对姓名只取称呼部门，因为称呼决定了社会关系等。。。
train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train_data['Title'], train_data['Sex'])
a = train_data[['Title','Survived']].groupby(['Title']).count()/(train_data.shape[0])
a.plot(kind='bar')
plt.legend(loc='best')
plt.show()

姓名和生存的关系


In [28]:
print('年龄和生存的关系')
print('各个年龄段的生存率')
print(np.max(train_data['Age']))
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
average_age = train_data[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)
print('各个年龄的人数')
plt.figure(figsize=(8,10))
train_data['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')
plt.show()

年龄和生存的关系
各个年龄段的生存率
80.0
各个年龄的人数




In [31]:
title='SibSp relationship with Survived'
print('船上兄弟姐妹和配偶的数目与生存关系')
train_data[['SibSp','Survived']].groupby(['SibSp']).mean().plot.bar()
plt.title(title)
plt.show()

船上兄弟姐妹和配偶的数目与生存关系


In [34]:
title='Parch relationship with Survived'
print('船上父母，子女的数目与生存关系')
train_data[['Parch','Survived']].groupby(['Parch']).mean().plot.bar()
plt.title(title)
plt.show()

船上父母，子女的数目与生存关系


In [35]:
print('船上亲戚的数目与生存的关系')
train_data['Family'] = train_data['Parch']+train_data['SibSp']
train_data[['Family','Survived']].groupby(['Family']).mean().plot.bar()
plt.show()
print('从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。')

船上亲戚的数目与生存的关系
从图表中可以看出，若独自一人，那么其存活率比较低；但是如果亲友太多的话，存活率也会很低。


In [36]:
print('票价分布的直方图')
train_data['Fare'].hist(bins=70)
plt.show()

票价分布的直方图


In [37]:
print('票价与社会阶级之间的关系')
plt.figure(figsize=(8,10))
train_data[['Fare','Pclass']].groupby(['Pclass']).mean().plot.bar(width=0.1)
plt.show()

票价与社会阶级之间的关系


In [38]:
print('船舱编号缺省值填充')
train_data['Cabin'].fillna(value='N0', inplace=True)
print(train_data['Cabin'].isnull().sum())

船舱编号缺省值填充
0


In [39]:
print('船舱编号和生存之间的关系')
# Cabin缺省的字段较多，先将缺省船舱编号的和非缺省船舱编码号的进行对比
train_data['Has_Cabin'] = train_data['Cabin'].apply(lambda x : 0 if x=='N0' else 1)
train_data[['Has_Cabin','Survived']].groupby(['Has_Cabin']).mean().plot.bar()
plt.show()
# 沒有船舱的人存活率更高

船舱编号和生存之间的关系


In [40]:
print('不同登录港口和生存之间的关系')
train_data[['Embarked','Survived']].groupby(['Embarked']).mean().plot.bar()
plt.show()
# 由上可以看出，在不同的港口上船，生还率不同，C最高，Q次之，S最低。

不同登录港口和生存之间的关系


In [41]:
sns.countplot('Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')
plt.show()

In [42]:
print('开始缺省值的填充')
print('train_data_info:',train_data.info())


开始缺省值的填充
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       889 non-null object
Title          891 non-null object
Family         891 non-null int64
Has_Cabin      891 non-null int64
dtypes: float64(2), int64(7), object(6)
memory usage: 104.5+ KB
train_data_info: None


In [61]:
from sklearn.ensemble import RandomForestRegressor
train_data.Embarked[train_data.Embarked.isnull()] = train_data.Embarked.dropna().mode().values
'''
使用回归 随机森林等模型来预测缺失属性的值。因为Age在该数据集里是一个相当重要的特征（先对Age进行分析即可得知），
所以保证一定的缺失值填充准确率是非常重要的，对结果也会产生较大影响。一般情况下，会使用数据完整的条目作为模型的训练集，以此来预测缺失值。
'''
age_df = train_data[['Age','Survived','Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_null = age_df.loc[(train_data['Age'].isnull())]
age_df_notnull = age_df.loc[(train_data['Age'].notnull())]
X = age_df_notnull.iloc[:, 1:]
y = age_df_notnull.iloc[:, 0]
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X, y)
predictAges = RFR.predict(age_df_null.values[:, 1:])
train_data.loc[train_data['Age'].isnull(), ['Age']]= predictAges

Empty DataFrame
Columns: [Age, Survived, Fare, Parch, SibSp, Pclass]
Index: []


ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required.

In [63]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null object
Title          891 non-null object
Family         891 non-null int64
Has_Cabin      891 non-null int64
dtypes: float64(2), int64(7), object(6)
memory usage: 104.5+ KB
None


In [65]:
print('开始将非数值型特征进行转换')
print(train_data['Sex'].unique())
sex_map = {'male': 1, 'female':2}

开始将非数值型特征进行转换
['male' 'female']
