In [29]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier

In [38]:
#將資料匯入

train = pd.read_csv("train.csv", dtype=object)

test = pd.read_csv("test.csv", dtype=object)



In [39]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [40]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [41]:
#觀察有無缺失值

train.info()

# Age , Embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  891 non-null    object
 1   Survived     891 non-null    object
 2   Pclass       891 non-null    object
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    object
 6   SibSp        891 non-null    object
 7   Parch        891 non-null    object
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    object
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: object(12)
memory usage: 83.7+ KB


In [42]:
#將缺失值補齊 ("Age")

train["Age"] = train["Age"].fillna(train["Age"].median())
train["Age"]

0      22
1      38
2      26
3      35
4      35
       ..
886    27
887    19
888    28
889    26
890    32
Name: Age, Length: 891, dtype: object

In [43]:
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Age"]

0      34.5
1        47
2        62
3        27
4        22
       ... 
413      27
414      39
415    38.5
416      27
417      27
Name: Age, Length: 418, dtype: object

In [46]:
#將缺失值補齊（"Embarked"）

#把2筆缺失直變成"S" ，並把 "S"、"C"、"Q" 分別轉變成數值 0 , 1 , 2

train["Embarked"] = train["Embarked"].fillna('S')

train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2


test.loc[test["Embarked"]=="S", "Embarked"] = 0
test.loc[test["Embarked"]=="C", "Embarked"] = 1
test.loc[test["Embarked"]=="Q", "Embarked"] = 2




In [48]:
#將 "Sex" 轉變為數值型態

train.loc[train["Sex"] == "male" , "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1

test.loc[test["Sex"] == "male" , "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1

In [51]:
#預測特徵宣告

predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked"]

train[predictors]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,0,22,7.25,0
1,1,1,38,71.2833,1
2,3,1,26,7.925,0
3,1,1,35,53.1,0
4,3,0,35,8.05,0
...,...,...,...,...,...
886,2,0,27,13,0
887,1,1,19,30,0
888,3,1,28,23.45,0
889,1,0,26,30,1


In [69]:
#使用RandomForestClassifier

RFC = RandomForestClassifier(random_state=2, n_estimators=100, min_samples_split=4, oob_score=True)

# random_state：用來固定隨機值

# n_estimators ：森林中樹木的數量(base estimator的數量)，通常此值越大，模型效果往往越好，但同時也會讓代碼變慢

# min_sample_split ：內部節點再劃分所需最小樣本數，默認2。這個值限制了子樹繼續劃分的條件，如果某節點的樣本數少於min_samples_split
#                           ，則不會繼續再嘗試選擇最優特徵來進行劃分。默認是2，如果樣本量不大，不需要管這個值。如果樣本量數量級非常大，則推薦增大這個值。

# min_samples_leaf：葉子節點最少樣本數。 這個值限制了葉子節點最少的樣本數，如果某葉子節點數目小於樣本數，則會和兄弟節點一起被剪枝。
#                            默認是1,可以輸入最少的樣本數的整數，或者最少樣本數佔樣本總數的百分比。如果樣本量不大，不需要管這個值。
#                           如果樣本量數量級非常大，則推薦增大這個值。

# oob_score ：默認識False，即是否採用袋外樣本來評估模型的好壞。有放回採樣中大約36.8%的沒有被採樣到的數據，我們常常稱之爲袋外數據(Out Of Bag, 簡稱OOB)，
#                 這些數據沒有參與訓練集模型的擬合，因此可以用來檢測模型的泛化能力。個人推薦設置爲True，因爲袋外分數反應了一個模型擬合後的泛化能力。
#                對單個模型的參數訓練，我們知道可以用cross validation（cv）來進行，但是特別消耗時間，而且對於隨機森林這種情況也沒有大的必要，
#               所以就用這個數據對決策樹模型進行驗證，算是一個簡單的交叉驗證，性能消耗小，但是效果不錯。



In [70]:
RFC.fit(train[predictors], train["Survived"])

print(RFC.oob_score_)

0.8316498316498316
