# Overview
- nb008、kaggle_nb001をベースに、いくつかのモデルを組み合わせて多数決によるアンサンブルモデルを構築する。
- SVC、ランダムフォレスト、AdaBoostによるアンサンブルモデルを作成する。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
# Load the training CSV into `data_train` and preview the first rows.
# On Google Colab, use this line instead of the local one below:
#data_train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/train.csv')
data_train = pd.read_csv('../data/train.csv')   # path for a local run
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [25]:
# Load the test CSV into `data_test` and preview the first rows.
# On Google Colab, use this line instead of the local one below:
#data_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/test.csv')
data_test = pd.read_csv('../data/test.csv')   # path for a local run
data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### データ前処理

In [42]:
# Preprocess the Titanic data: combine train and test, fill missing values,
# derive features, one-hot encode the categoricals, then split back into
# X_train / y_train / X_test NumPy arrays.

# Tag each row with its origin so the combined frame can be split again later.
data_train['TrainFlag'] = True
data_test['TrainFlag'] = False

# Combine training and test data so both receive identical preprocessing,
# and use 'PassengerId' as the index.
df = pd.concat([data_train, data_test])
df = df.set_index('PassengerId')

# ==============
# Missing values
# ==============

# 'Embarked'
# ----------
# Only a couple of values are missing, so fill them with the most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])

# 'Age'
# -----
# Group by 'Pclass' and 'Sex' and fill each group's gaps with the group median.
# `transform` always returns a result aligned to df's index, whereas
# `groupby.apply` may prepend the group keys to the index depending on the
# pandas version, which breaks the column assignment.
df['Age'] = (
    df.groupby(['Pclass', 'Sex'])['Age']
      .transform(lambda ages: ages.fillna(ages.median()))
)

# 'Cabin'
# -------
# Take the first character as a new 'Deck' column; missing cabins become 'Z'.
df['Deck'] = df['Cabin'].str[0].fillna('Z')

# 'Fare'
# ------
# Fill the missing fare(s) with the mean over the combined data.
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# ===================
# Feature engineering
# ===================

# 'Name'
# ------
# Extract the honorific (the word followed by a period, e.g. 'Mr') into 'Title'.
# Raw string: '\.' is not a valid escape sequence in a normal string literal.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Merge rare titles into 'Miss', 'Mrs' or 'Others'.
# NOTE(review): any title not listed below (e.g. 'Dona', if it occurs in the
# data) stays as its own category — confirm this is intended.
df['Title'] = df['Title'].replace(['Mlle'], 'Miss')
df['Title'] = df['Title'].replace(['Countess', 'Mme', 'Lady'], 'Mrs')
df['Title'] = df['Title'].replace(
    ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Major', 'Ms', 'Rev', 'Sir'], 'Others')

# 'Ticket'
# --------
# First character of the ticket string, and the ticket string length.
ticket_text = df['Ticket'].astype(str)
df['Ticket_first'] = ticket_text.str[0]
df['Ticket_length'] = ticket_text.str.len()

# 'Family_size'
# -------------
# Siblings/spouses + parents/children + the passenger themself.
df['Family_size'] = df['SibSp'] + df['Parch'] + 1

# ==================
# Feature selection
# ==================

# One-hot encode 'Sex', 'Embarked', 'Deck', 'Title' and 'Ticket_first'.
# The dtype is pinned to uint8 so the encoded columns stay numeric on every
# pandas version (newer versions default to bool, which would turn `.values`
# on the mixed frame into an object array).
df_oh = pd.get_dummies(
    df[['Sex', 'Embarked', 'Deck', 'Title', 'Ticket_first']],
    drop_first=True,
    dtype=np.uint8,
)

# Append the encoded columns to the original frame.
df_added = pd.concat([df, df_oh], axis=1)

# Drop the raw text columns and the categoricals that are now encoded.
df_deleted = df_added.drop(
    ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Deck', 'Title', 'Ticket_first'], axis=1)
df_deleted.info()

# Split back into training and test rows using the boolean flag column.
is_train = df_deleted['TrainFlag']
df_train = df_deleted[is_train].drop(['TrainFlag'], axis=1)
df_test = df_deleted[~is_train].drop(['TrainFlag'], axis=1)

# Convert to NumPy arrays: features X and label y.
X_train = df_train.drop(['Survived'], axis=1).values
y_train = df_train['Survived'].values
X_test = df_test.drop(['Survived'], axis=1).values

print('\nX_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 40 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Survived        891 non-null    float64
 1   Pclass          1309 non-null   int64  
 2   Age             1309 non-null   float64
 3   SibSp           1309 non-null   int64  
 4   Parch           1309 non-null   int64  
 5   Fare            1309 non-null   float64
 6   TrainFlag       1309 non-null   bool   
 7   Ticket_length   1309 non-null   int64  
 8   Family_size     1309 non-null   int64  
 9   Sex_male        1309 non-null   uint8  
 10  Embarked_Q      1309 non-null   uint8  
 11  Embarked_S      1309 non-null   uint8  
 12  Deck_B          1309 non-null   uint8  
 13  Deck_C          1309 non-null   uint8  
 14  Deck_D          1309 non-null   uint8  
 15  Deck_E          1309 non-null   uint8  
 16  Deck_F          1309 non-null   uint8  
 17  Deck_G          1309 non-null   u

### データ解析

In [57]:
# Split the labelled data into a training part and a hold-out part.
from sklearn.model_selection import train_test_split

# Always start from the full preprocessed training frame rather than
# re-splitting X_train / y_train in place: otherwise every re-run of this
# cell would split the already-reduced training set again and keep
# shrinking it.
X_labeled = df_train.drop(['Survived'], axis=1).values
y_labeled = df_train['Survived'].values

# train : hold-out = 80 : 20, stratified on the label
X_train, X_train_test, y_train, y_train_test = train_test_split(
    X_labeled, y_labeled, test_size=0.2, random_state=21, stratify=y_labeled)

print('Label counts in y_train: [0 1] =', np.bincount(y_train.astype(np.int64)))
print('Label counts in y_train_test: [0 1] =', np.bincount(y_train_test.astype(np.int64)))

Label counts in y_train: [0 1] = [439 273]
Label counts in y_train_test: [0 1] = [110  69]


In [60]:
# ===============================================
# Pipeline: pl_svc
# SVC / k-fold cross-validation / grid search
# ===============================================

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardize the features, then fit a support vector classifier.
pl_svc = make_pipeline(
    StandardScaler(),
    SVC(random_state=21, max_iter=5000),
)

# The same candidate values are searched for both C and gamma.
svc_param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
svc_param_grid = [
    # The linear-kernel grid searches C only.
    {'svc__C': svc_param_range, 'svc__kernel': ['linear']},
    # The other kernels additionally search gamma.
    {
        'svc__C': svc_param_range,
        'svc__kernel': ['poly', 'rbf', 'sigmoid'],
        'svc__gamma': svc_param_range,
    },
]

# 10-fold cross-validated grid search scored on accuracy; the best
# parameter combination is refit afterwards (refit=True).
svc_gs = GridSearchCV(
    estimator=pl_svc,
    param_grid=svc_param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
)
svc_gs.fit(X_train, y_train)

print('CV best accuracy:', svc_gs.best_score_)
print('Best parameters:', svc_gs.best_params_)

# Score the refit best pipeline on the hold-out split.
svc_bestclf = svc_gs.best_estimator_
print('Test accuracy: %f' % svc_bestclf.score(X_train_test, y_train_test))

# Predictions for the unlabelled test set.
svc_pred = svc_bestclf.predict(X_test)

CV best accuracy: 0.8411971830985916
Best parameters: {'svc__C': 10.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Test accuracy: 0.826816


In [61]:
# ========================================================
# Pipeline: pl_randf
# Random forest / k-fold cross-validation / grid search
# ========================================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Single-step pipeline wrapping the random forest.
# NOTE(review): both the forest and the grid search below request n_jobs=-1 —
# confirm that this nested parallelism is intended.
pl_randf = make_pipeline(
    RandomForestClassifier(random_state=21, n_jobs=-1),
)

# Candidate values for each searched hyperparameter.
rf_param_estimators_range = [100, 200, 300, 400, 500]
rf_param_depth_range = [5, 10, 15, 20, 25, 30]
rf_param_split_range = [5, 10, 15, 20, 25, 30]
rf_param_grid = [
    {
        'randomforestclassifier__criterion': ['gini', 'entropy'],
        'randomforestclassifier__n_estimators': rf_param_estimators_range,
        'randomforestclassifier__max_depth': rf_param_depth_range,
        'randomforestclassifier__min_samples_split': rf_param_split_range,
    },
]

# 10-fold cross-validated grid search scored on accuracy; the best
# parameter combination is refit afterwards (refit=True).
rf_gs = GridSearchCV(
    estimator=pl_randf,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
)
rf_gs.fit(X_train, y_train)

print('CV best accuracy:', rf_gs.best_score_)
print(rf_gs.best_params_)

# Score the refit best pipeline on the hold-out split.
rf_bestclf = rf_gs.best_estimator_
print('Test accuracy: %f' % rf_bestclf.score(X_train_test, y_train_test))

# Predictions for the unlabelled test set.
rf_pred = rf_bestclf.predict(X_test)

CV best accuracy: 0.8426838810641628
{'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_split': 20, 'randomforestclassifier__n_estimators': 400}
Test accuracy: 0.810056


In [74]:
# ===================================================
# Pipeline: pl_ada
# AdaBoost / k-fold cross-validation / grid search
# ===================================================

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import make_pipeline
# Imported here as well so this cell does not depend on an earlier cell
# having imported GridSearchCV (the sibling model cells import it themselves).
from sklearn.model_selection import GridSearchCV

# Single-step pipeline: AdaBoost over decision-tree base learners.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version before upgrading.
pl_ada = make_pipeline(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=21), random_state=21))

# min_samples_split starts at 2: an integer value of 1 is rejected by
# DecisionTreeClassifier, so every grid point using it would be a failed fit.
ada_param_grid = [{'adaboostclassifier__base_estimator__criterion': ['gini', 'entropy'],
                   'adaboostclassifier__base_estimator__max_depth': [1, 5, 10, 15, 20],
                   'adaboostclassifier__base_estimator__min_samples_split': [2, 5, 10, 15, 20],
                   'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
                   'adaboostclassifier__n_estimators': [1, 5, 10, 15, 20, 25, 30],
                   'adaboostclassifier__learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}]

# 10-fold cross-validated grid search scored on accuracy; the best
# parameter combination is refit afterwards (refit=True).
ada_gs = GridSearchCV(estimator=pl_ada, param_grid=ada_param_grid, scoring='accuracy', cv=10, refit=True, n_jobs=-1)
ada_gs.fit(X_train, y_train)

print('CV best accuracy:', ada_gs.best_score_)
print(ada_gs.best_params_)

# Score the refit best pipeline on the hold-out split.
ada_bestclf = ada_gs.best_estimator_
print('Test accuracy: %f' % ada_bestclf.score(X_train_test, y_train_test))

# Predictions for the unlabelled test set.
ada_pred = ada_bestclf.predict(X_test)

CV best accuracy: 0.8427230046948357
{'adaboostclassifier__algorithm': 'SAMME', 'adaboostclassifier__base_estimator__criterion': 'entropy', 'adaboostclassifier__base_estimator__max_depth': 10, 'adaboostclassifier__base_estimator__min_samples_split': 15, 'adaboostclassifier__learning_rate': 0.1, 'adaboostclassifier__n_estimators': 20}
Test accuracy: 0.776536




In [78]:
# ================================
# Majority-vote ensemble model
# ================================

from sklearn.ensemble import VotingClassifier

# Combine the three tuned pipelines (SVC, random forest, AdaBoost)
# under the names used to identify them inside the ensemble.
vote_clf = VotingClassifier(
    estimators=[
        ('svc', svc_bestclf),
        ('rf', rf_bestclf),
        ('ada', ada_bestclf),
    ],
)
vote_clf.fit(X_train, y_train)

# Accuracy of the ensemble on the hold-out split.
print('Test accuracy: %f' % vote_clf.score(X_train_test, y_train_test))

# Predictions for the unlabelled test set.
vote_pred = vote_clf.predict(X_test)

Test accuracy: 0.815642
