# 使用Decision Tree Classifiers 預測 Titanic 乘客的存活機率
by 蔡敏麒

In [2]:
import pandas as pd
import numpy as np

#資料來源網址
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_train.csv"

#讀取資料
titanic_train = pd.read_csv(url)

#預覽資料
titanic_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## 先了解辨識的意義
變數意義解釋網址：https://www.kaggle.com/c/titanic/data

## 選擇變數
1. Pclass:艙等，與社會地位相關，可能影響逃亡順序。
2. Fare:票價，與艙等相關聯，票價越高可能位置越好。影響逃亡順序
3. Sex:性別，女性可能優先疏散。
4. Age:年齡，青壯年存活率可能較高。
5. Parch：直系血親，家庭的大小。

## 檢查數據

In [3]:
#用描述性統計檢查數據
titanic_train.describe()   #Age 中有 Nan



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
print(np.any(titanic_train['Sex'].isnull()) == True)  #檢查Sex欄位中是否有Nan，回傳True(有Nan)回傳False(沒有Nan)


False


In [5]:
#用Age中位數取代Nan
age_median = np.nanmedian(titanic_train["Age"]) 
imputed_Age = np.where(titanic_train["Age"].isnull(), age_median, titanic_train["Age"])
titanic_train['Age'] = imputed_Age

titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 資料預處理
* 用preprocessing中的LabelEncoder 
* 用preprocessing中的scale  標準化數據

In [28]:
#用sklearn的preprocessing來做預處理
from sklearn import preprocessing

#創建x,y
titanic_x = pd.DataFrame([titanic_train['Pclass'],
                          titanic_train['Fare'],
                          titanic_train['Age'],
                          titanic_train['Parch'],
                         titanic_train["Sex"]]).T

titanic_y = titanic_train['Survived']


#由於“Sex"欄位內容是Male，Female
#先將“Sex”類別變數轉換成 dummy variables
label_encoder = preprocessing.LabelEncoder()
titanic_x["Sex"] = label_encoder.fit_transform(titanic_x["Sex"])

#標準化數據
titanic_x = preprocessing.scale(titanic_x)

## 創建70%訓練資料及30%測試資料

In [29]:
from sklearn.cross_validation import train_test_split
train_x,test_x,train_y,test_y = train_test_split(titanic_x,titanic_y,test_size = 0.3)

## 使用Decision Tree Classifiers 決策樹分類器

In [31]:
from sklearn import tree

#建立模型
Dt = tree.DecisionTreeClassifier()
titanic_Dt = Dt.fit(train_x,train_y)

## 用cross_val_score評模型準確率

In [32]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(titanic_Dt,titanic_x,titanic_y,cv=10,scoring='accuracy')  
print(scores.mean())

0.794727613211


## 將原始資料套用模型

In [34]:
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_test.csv"
to_submit = pd.read_csv(url)

# 跟訓練資料作一樣的整理
to_submit["Sex"] = label_encoder.fit_transform(to_submit["Sex"])

#Age裡面的NaN
age_median = np.nanmedian(to_submit["Age"]) 
to_submit["Age"] = np.where(to_submit["Age"].isnull(), age_median, to_submit["Age"])

#Fare裡面也有NaN用np.where(np.isnan(to_submit_X))找到
fare_median = np.nanmedian(to_submit["Fare"]) 
to_submit["Fare"] = np.where(to_submit["Fare"].isnull(), fare_median, to_submit["Fare"])

to_submit_X = pd.DataFrame([to_submit['Pclass'],
                            to_submit['Fare'],
                            to_submit["Age"],
                            to_submit['Parch'],
                            to_submit["Sex"]]).T

to_submit_X = preprocessing.scale(to_submit_X)

# 預測
to_submit_y = titanic_Dt.predict(to_submit_X)
print(to_submit_y)

# 準備要上傳的檔案
to_submit_dict = {
    "PassengerId": to_submit["PassengerId"],
    "Survived": to_submit_y
}
to_submit_df = pd.DataFrame(to_submit_dict)

# 輸出成 csv
to_submit_df.to_csv("to_submit.csv", index = False)

[0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0
 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 1 0 0 1 0 1 0 1 0 0 1 1 0 1
 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 1 0
 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1
 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0
 1 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 1
 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 0
 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 1 0 1 0 1 1 0 1 0]
