In [None]:
#!/usr/bin/env python3

import numpy as np
import pandas as pd

#pipeline
from sklearn.pipeline import Pipeline

#標準化→Standard Scaler
from sklearn.preprocessing import StandardScaler

#Support vector machine
from sklearn.svm import SVC

#RandomForest
from sklearn.ensemble import RandomForestClassifier as RFC

# XGBoost: shared base classifier instance; its hyper-parameters are set by the grid search.
import xgboost as xgb
XGB = xgb.XGBClassifier()

# LightGBM: shared base classifier instance; its hyper-parameters are set by the grid search.
# The former `silent=False` argument was deprecated in LightGBM 3.3 and removed in 4.0
# (where it is reported as an unknown parameter). `verbose=1` expresses the same intent
# (keep LightGBM's log output enabled) through the supported parameter.
import lightgbm as lgb
LGB = lgb.LGBMClassifier(verbose=1)

#次元削減→PCA
from sklearn.decomposition import PCA

#特徴量選択→Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

#特徴量選択→Recursive feature elimination
from sklearn.feature_selection import RFE

#特徴量選択→Select From Model
from sklearn.feature_selection import SelectFromModel

# GridSearch
from sklearn.model_selection import GridSearchCV

# Stratified k-fold cross-validation
from sklearn.model_selection import StratifiedKFold

# 指標を計算するため
from sklearn.metrics import accuracy_score, cohen_kappa_score, make_scorer, f1_score, recall_score

In [None]:
# Load the data table.
df = pd.read_csv('Table.csv')

# Split into features and target.
# x: every column except 'subject' (drop(columns=...) removes a column, not a row).
x = df.drop(columns='subject')

# y: the 'subject' column only, used as the classification target.
y = df['subject']

In [None]:
# Shape of the initial feature table.
initial_shape = x.shape
print(initial_shape)

In [None]:
# List of the feature names in the initial data.
initial_feature_names = x.columns
display(initial_feature_names)

In [None]:
# Pipeline layout: optional scaling -> optional feature selection -> PCA -> classifier.
# The None steps are placeholders that each parameter grid below fills in.
# PCA(0.80): per scikit-learn, a float in (0, 1) selects the number of components
# by explained-variance ratio.
pipe = Pipeline([('preprocessing', None), ('feature_selection', None),
                 ('pca', PCA(0.80)), ('classifier', None)])

# Seed shared by the stochastic estimators in this cell. It is the value the
# gradient-boosting selectors already used; the random forests were previously
# unseeded, which made repeated searches non-reproducible.
RANDOM_STATE = 10


def _gbc_selector_model():
    """Return the gradient-boosting model used to rank features for selection."""
    return GradientBoostingClassifier(
        learning_rate=0.01, min_samples_split=500, min_samples_leaf=17, max_depth=8,
        max_features=0.3, subsample=0.8, random_state=RANDOM_STATE, n_estimators=100)


def _rf_selector_model():
    """Return the small random forest used to rank features for selection."""
    return RFC(n_estimators=20, random_state=RANDOM_STATE)


def _feature_selectors():
    """Return a fresh list of the candidate 'feature_selection' steps.

    Candidates: RFE and SelectFromModel, each driven by gradient boosting and by
    a random forest, plus None (no feature selection). A new list is built on
    every call so the parameter grids do not share estimator instances.
    """
    return [
        RFE(_gbc_selector_model(), n_features_to_select=14),
        SelectFromModel(_gbc_selector_model(), threshold="median"),
        RFE(_rf_selector_model(), n_features_to_select=14),
        SelectFromModel(_rf_selector_model(), threshold="median"),
        None,
    ]


# Parameter / model settings.
# The search is very expensive, so adjust the candidate feature selectors and
# classifiers here as needed.

# SVC: settings common to every kernel.
svc_base = {
    'classifier': [SVC()],
    'preprocessing': [StandardScaler(), None],
    'feature_selection': _feature_selectors(),
    'classifier__C': [1, 10, 100, 1000],
}

# One SVC grid per kernel family so that only the parameters a kernel actually
# uses are varied: 'linear' uses neither gamma nor degree, 'rbf'/'sigmoid' use
# gamma only, 'poly' uses both. Crossing everything with every kernel refits
# identical models many times without adding any distinct candidate.
svc_grids = [
    {**svc_base,
     'classifier__kernel': ['linear']},
    {**svc_base,
     'classifier__kernel': ['rbf', 'sigmoid'],
     'classifier__gamma': [0.001, 0.0001]},
    {**svc_base,
     'classifier__kernel': ['poly'],
     'classifier__gamma': [0.001, 0.0001],
     'classifier__degree': [2, 3, 4]},
]

# Random forest.
# NOTE(review): max_features is applied to the PCA output; confirm that PCA(0.80)
# keeps at least 10 components for this data set, otherwise the larger values
# are not meaningful.
rf_grid = {
    'classifier': [RFC(random_state=RANDOM_STATE)],
    'preprocessing': [None],
    'feature_selection': _feature_selectors(),
    'classifier__n_estimators': [100, 500, 1000],
    'classifier__max_features': [1, 2, 3, 4, 5, 7, 10],
    'classifier__max_depth': [3, 5, 7, 10, 15, None],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_samples_split': [2, 5, 10],
}

# XGBoost.
# 'eta' is XGBoost's alias for 'learning_rate'; tuning both set two conflicting
# values for the same parameter and tripled the grid, so only 'learning_rate'
# is searched. The former eta candidates were [0.3, 0.15, 0.10]; add them to
# 'classifier__learning_rate' if those step sizes should be explored.
xgb_grid = {
    'classifier': [XGB],
    'preprocessing': [None],
    'feature_selection': _feature_selectors(),
    'classifier__max_depth': [3, 6, 10, 25],
    'classifier__learning_rate': [0.0001, 0.001, 0.01],  # also considered: 0.05, 0.1
    'classifier__min_child_weight': [1, 3, 6],
    'classifier__n_estimators': [100, 200, 300],  # also considered: 500
    'classifier__subsample': [0.5, 0.75, 0.9],
    'classifier__gamma': [0, 0.1, 0.2],
}

# LightGBM.
lgb_grid = {
    'classifier': [LGB],
    'preprocessing': [None],
    'feature_selection': _feature_selectors(),
    'classifier__max_depth': [10, 25, 50, 75],
    'classifier__learning_rate': [0.001, 0.01, 0.05, 0.1],
    'classifier__num_leaves': [100, 300, 900, 1200],
    'classifier__n_estimators': [100, 200, 500],
}

# GridSearchCV accepts a list of grids and searches each one independently.
param_grid = [*svc_grids, rf_grid, xgb_grid, lgb_grid]

In [None]:
# Stratified k-fold cross-validation: with n_splits=10 each fold holds out 1/10
# of the data, and the evaluation is run 10 times.
cv_settings = {'n_splits': 10, 'shuffle': True, 'random_state': 0}
skf = StratifiedKFold(**cv_settings)

In [None]:
%%time

# パラメータチューニングをグリッドサーチ
gs = GridSearchCV(estimator = pipe,
                           param_grid = param_grid,
                           scoring = 'accuracy', 
                           cv = skf,
                           return_train_score = True,
                           n_jobs = -1)

gs.fit(x, y)

In [None]:
# Show the best pipeline found by the grid search.
best_pipeline = gs.best_estimator_
print(best_pipeline)

In [None]:
# Show the parameter combination that produced the best score.
best_parameters = gs.best_params_
print(best_parameters)

In [None]:
# Show the best cross-validated score (accuracy, per the search's scoring setting).
best_cv_score = gs.best_score_
print(best_cv_score)

In [None]:
# External validation: x_test / y_test are meant to hold external data.
# NOTE(review): neither name is defined in the cells visible here - confirm they
# are loaded before this cell, otherwise it raises NameError on a fresh kernel.

# Evaluate the best pipeline found by the grid search on the external data.
clf = gs.best_estimator_
external_score = clf.score(x_test, y_test)
print(external_score)