# 資料讀取
### 讀取 train.csv 和 test.csv


In [1]:
import pandas as pd

# 從URL讀取CSV文件
url = '../content/drive/MyDrive/Colab Notebooks/Titanic專案/Dataset/train.csv'
traindf = pd.read_csv(url, encoding="utf-8")
traindf

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
# 從URL讀取CSV文件
url = '../content/drive/MyDrive/Colab Notebooks/Titanic專案/Dataset/test.csv'
testdf = pd.read_csv(url, encoding="utf-8")
testdf

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 資料預處理
### 填補缺失值
1. 欄位的值如果有大小關係，遇到缺失值時可以用中位數填補
2. 欄位的值如果是類別，遇到缺失值時可以用One hot encoding

### 資料清洗
1. 刪除不重要的欄位

In [3]:
# 計算每一行 NaN 的出現次數
traindf.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# 訓練資料 填補缺失值
# 1.數值->補中位數
med = traindf.median()
traindf = traindf.fillna(med)
traindf.isna().sum()

  med = traindf.median()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# 測試資料 填補缺失值
# 用訓練資料的來填補缺失值
testdf = testdf.fillna(med)
testdf.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [6]:
# 訓練資料 填補缺失值
# 2.分類->最常出現的值
most = traindf['Embarked'].value_counts().idxmax()
traindf['Embarked'] = traindf['Embarked'].fillna(most)
traindf.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [7]:
# 測試資料 填補缺失值
# 用訓練資料的來填補缺失值
testdf['Embarked'] = testdf['Embarked'].fillna(most)
testdf.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [8]:
traindf['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [9]:
# 如果直接將C, Q, S換成0, 1, 2，就是承認大小關係
# 因此使用 One hot encoding
dummy = pd.get_dummies(traindf['Embarked'])
dummy

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [10]:
# 將traindf和dummy合併
traindf = pd.concat([traindf, dummy], axis=1)
traindf

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1,0,0


In [11]:
# 測試資料 One hot encoding
dummy = pd.get_dummies(testdf['Embarked'])
testdf = pd.concat([testdf, dummy], axis=1)
testdf

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,28.0,0,0,A.5. 3236,8.0500,,S,0,0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1
416,1308,3,"Ware, Mr. Frederick",male,28.0,0,0,359309,8.0500,,S,0,0,1


In [12]:
# Sex 欄位進行 One hot encoding
dummy = pd.get_dummies(traindf['Sex'])
traindf = pd.concat([traindf, dummy], axis=1)
dummy = pd.get_dummies(testdf['Sex'])
testdf = pd.concat([testdf, dummy], axis=1)

In [13]:
# 將名字改成稱位
mid = 'Braund, Mr. Owen Harris'.split(',')[-1].split('.')[0]
mid = mid.replace(" ","")
mid

'Mr'

In [14]:
# 函式: 將所有人的名字改成稱位
def nameflow(s):
  mid = s.split(',')[-1].split('.')[0]
  mid = mid.replace(" ","")
  return mid
name = traindf['Name'].apply(nameflow)
pd.crosstab(name, traindf['Survived']).T

Name,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,theCountess
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,1,1,4,1,0,1,17,55,0,0,436,26,0,6,0,0
1,0,1,0,3,0,1,1,23,127,2,1,81,99,1,0,1,1


In [15]:
# 將稱位較常見的(筆數較多)進行 One hot encoding
def nameflow(s):
  mid = s.split(',')[-1].split('.')[0]
  mid = mid.replace(" ","")
  if mid == 'Miss':
    return 'Miss'
  elif mid == 'Mr':
    return 'Mr'
  elif mid == 'Mrs':
    return 'Mrs'
  else:
    return 'Throw'
name = traindf['Name'].apply(nameflow)
dummy = pd.get_dummies(name)
traindf = pd.concat([traindf, dummy], axis=1)
traindf

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,C,Q,S,female,male,Miss,Mr,Mrs,Throw
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,S,0,0,1,0,1,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,1,0,0,1,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,S,0,0,1,1,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,S,0,0,1,1,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,S,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,S,0,0,1,0,1,0,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,S,0,0,1,1,0,1,0,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,...,S,0,0,1,1,0,1,0,0,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,C,1,0,0,0,1,0,1,0,0


In [16]:
# 測試資料 稱位較常見的(筆數較多)進行 One hot encoding
name = testdf['Name'].apply(nameflow)
dummy = pd.get_dummies(name)
testdf = pd.concat([testdf, dummy], axis=1)
testdf

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S,female,male,Miss,Mr,Mrs,Throw
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,0,0,1,0,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0,0,1,1,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,0,0,1,0,1,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0,1,0,1,0,1,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,28.0,0,0,A.5. 3236,8.0500,,S,0,0,1,0,1,0,1,0,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1,0,0,1,0,0,0,0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1,0,1,0,1,0,0
416,1308,3,"Ware, Mr. Frederick",male,28.0,0,0,359309,8.0500,,S,0,0,1,0,1,0,1,0,0


In [17]:
# 資料清洗，print 所有欄位
traindf.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'C', 'Q', 'S', 'female',
       'male', 'Miss', 'Mr', 'Mrs', 'Throw'],
      dtype='object')

In [18]:
# 答案
y_train = traindf["Survived"]

# 保留
predict_id = testdf["PassengerId"]

# 刪掉毫無預測價值的直列
x_train = traindf.drop(["PassengerId", "Survived", 'Name', 'Sex', "Ticket", "Cabin", 'Embarked', 'Throw'], axis = 1)

# test中刪掉一樣的直列
x_predict = testdf.drop(["PassengerId", 'Name', 'Sex', "Ticket", "Cabin", 'Embarked', 'Throw'], axis = 1)

# 模型訓練
### 隨機森林模型參數
1. max_depth: 樹的最大深度
2. n_estimators: 這是森林中樹木的數量。n_estimators值越大，模型的準確性往往越好。但是相應的，任何模型都有決策邊界，n_estimators達到一定程度後，隨機森林的準確性往往不再上升或者開始波動。並且n_estimators越大，需要的計算量也越大，訓練時間也越長。對於此參數，我們會在訓練難度和模型效果上取得平衡。

In [19]:
# 隨機森林 開始
# ps.先算預測力
# 這區是先算預測力後來拿調參數衝預測力用的

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()

params = {
    "n_estimators": range(20, 110, 10),
    "max_depth": range(5, 11)
}

#----------------------------------------------------------
# 算最佳參數

Grid = GridSearchCV(clf, params, cv = 10, n_jobs = 4 )
Grid.fit(x_train, y_train)

print("最佳參數:", Grid.best_params_)
print("最佳平均分數:", Grid.best_score_)

最佳參數: {'max_depth': 9, 'n_estimators': 40}
最佳平均分數: 0.8350936329588015


In [20]:
# 先寫這欄
# 算預測力
# ps.先算預測力

from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators = 60, max_depth = 7)

scores = cross_val_score(clf, x_train, y_train, cv = 10, n_jobs = 4)

import numpy as np
scores = np.around(scores, decimals=3)

print("10次分數:", list(scores))
print("")
print("average:", sum(scores)/len(scores))

10次分數: [0.778, 0.876, 0.742, 0.888, 0.888, 0.82, 0.831, 0.775, 0.854, 0.82]

average: 0.8272


In [21]:
# '真樹' 在此
# 來預測喔--->

clf = RandomForestClassifier(n_estimators = 27, max_depth = 7)
clf.fit(x_train, y_train)

# 模型預測
### 預測每一筆測試資料
### 將結果寫入csv檔

In [22]:
# 來預言喔

pre = clf.predict(x_predict)
# pre

In [23]:
# 預言要補上對應 ID

result = pd.DataFrame({
        "PassengerId":predict_id,
        "Survived":pre})

# 存檔
result.to_csv("../content/drive/MyDrive/Colab Notebooks/Titanic專案/titanic_rf.csv", encoding="UTF8", index=False)

# result