## 1. 必要なものをimport

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the three Titanic CSVs in one pass: the training data, the test data,
# and the sample submission that later cells copy as the output template.
train, test, sample_submission = map(pd.read_csv, (
    '../input/titanic/train.csv',
    '../input/titanic/test.csv',
    '../input/titanic/gender_submission.csv',
))

## 2. カテゴリカル変数を用いたデータの整形

In [3]:
# Integer codes for the two string-valued columns encoded in this step.
genders = {'female': 0, 'male': 1}
embarked = {'S': 0, 'C': 1, 'Q': 2}

# Apply the same encoding to both frames. Values that are not keys of a
# mapping (missing values included) come out of Series.map as NaN.
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(genders)
    frame['Embarked'] = frame['Embarked'].map(embarked)

# Cabinの変換
def cabin_to_num_v1(cabin):
    """Binary cabin indicator.

    Returns 0 when ``cabin`` equals the code 'Z' and 1 for any other value.
    """
    return 0 if cabin == 'Z' else 1

def cabin_to_num_v2(cabin):
    """Bucket a cabin letter into one of three integer groups.

    Returns 0 for 'Z', 1 for 'B', 'C', 'D' or 'E', and 2 for every other value.
    """
    if cabin == 'Z':
        return 0
    return 1 if cabin in ('B', 'C', 'D', 'E') else 2


# For each frame: reduce Cabin to its first character ('Z' when the value is
# null), bucket that character with cabin_to_num_v2, then drop the columns
# this notebook treats as unnecessary.
for frame in (train, test):
    deck_letters = ['Z' if pd.isnull(value) else value[0] for value in frame['Cabin']]
    frame['Cabin'] = [cabin_to_num_v2(letter) for letter in deck_letters]
    frame.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [4]:
# Show the first rows of the processed train/test frames and of the sample submission.
display(train.head(), test.head(), sample_submission.head())

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,0,0.0
1,1,1,0,38.0,1,0,71.2833,1,1.0
2,1,3,0,26.0,0,0,7.925,0,0.0
3,1,1,0,35.0,1,0,53.1,1,0.0
4,0,3,1,35.0,0,0,8.05,0,0.0


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,34.5,0,0,7.8292,0,2
1,3,0,47.0,1,0,7.0,0,0
2,2,1,62.0,0,0,9.6875,0,2
3,3,1,27.0,0,0,8.6625,0,0
4,3,0,22.0,1,1,12.2875,0,0


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [5]:
# Separate the target column from the remaining feature columns.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,0,0.0
1,1,0,38.0,1,0,71.2833,1,1.0
2,3,0,26.0,0,0,7.925,0,0.0
3,1,0,35.0,1,0,53.1,1,0.0
4,3,1,35.0,0,0,8.05,0,0.0


In [6]:
# Frequency of each encoded value in the Embarked, Sex and Cabin columns of the training frame.
display(train['Embarked'].value_counts(), train['Sex'].value_counts(), train['Cabin'].value_counts())

0.0    644
1.0    168
2.0     77
Name: Embarked, dtype: int64

1    577
0    314
Name: Sex, dtype: int64

0    687
1    171
2     33
Name: Cabin, dtype: int64

## 3. scikit-learn を使ったモデルの学習とハイパーパラメータ探索

In [7]:
# Split the training data into a training part and a validation part
# (test_size=0.33, fixed random_state so the split is repeatable).
train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train, test_size=0.33, random_state=0)

In [8]:
# GridSearchCVをimport
from sklearn.model_selection import GridSearchCV

In [9]:
# XGBoost classifier tuned with an exhaustive grid search.
from xgboost import XGBClassifier

XGB = XGBClassifier()

# Candidate values per hyper-parameter; the search tries every combination.
params = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 2, 3],
    n_estimators=[150, 200, 250, 300],
)

# 3-fold cross-validated search scored with 'roc_auc'; n_jobs=-1 runs it in parallel.
GridSearch_xgb = GridSearchCV(estimator=XGB, param_grid=params,
                              scoring='roc_auc', cv=3, n_jobs=-1)
GridSearch_xgb.fit(train_x, train_y)

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,
                                     verbosity=None),
             iid=

In [10]:
print(GridSearch_xgb.best_score_)  # show the best cross-validated score
print(GridSearch_xgb.best_params_)  # show the parameters that produced the best score
# Score the fitted search object on the training split and on the validation split.
# NOTE(review): the search was built with scoring='roc_auc', so these are presumably ROC AUC values, not accuracy.
display(GridSearch_xgb.score(train_x,train_y), GridSearch_xgb.score(valid_x,valid_y))

0.8659782840110709
{'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 300}


0.9023186858803297

0.8831521739130435

In [11]:
# Predict the test frame with the tuned XGBoost search object and write the
# predictions out in the layout of the sample submission.
pred_xgb = pd.DataFrame(GridSearch_xgb.predict(test))
xgb_submission = sample_submission.assign(Survived=pred_xgb)
xgb_submission.to_csv('xgb_submission.csv', index=False)

In [12]:
# Inspect the XGBoost submission frame that was just written to disk.
display(xgb_submission)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [13]:
# Instantiate a LightGBM binary classifier.
gbm = lgb.LGBMClassifier(objective='binary')

# Fit on the training split while monitoring both splits; training stops early
# once the validation metric has not improved for 20 rounds.
gbm.fit(
    train_x,
    train_y,
    categorical_feature=['Sex', 'Embarked', 'Cabin'],
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    early_stopping_rounds=20,
    verbose=-1,
)

# Predict the validation split at the best iteration and report accuracy in percent.
oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
validation_accuracy = accuracy_score(valid_y, oof)
print('score', round(validation_accuracy * 100, 2))

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[38]	training's binary_logloss: 0.27058	valid_1's binary_logloss: 0.399486
score 81.69


### パラメータの調整

In [14]:
# Show the current parameter values of the LightGBM classifier as a starting point for tuning.
gbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [15]:
gbm = lgb.LGBMClassifier(objective='binary')

# Candidate values to try for each parameter.
params = dict(
    num_leaves=[20, 31, 40, 50],
    reg_alpha=[0, 1, 10, 100],
    reg_lambda=[0, 1, 10, 100],
)

# Grid search over the classifier, checking each combination with
# 3-fold cross-validation on the full training data.
grid_search = GridSearchCV(estimator=gbm, param_grid=params, cv=3)
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that achieved it.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8338945005611672
{'num_leaves': 20, 'reg_alpha': 0, 'reg_lambda': 10}


In [16]:
# K-fold cross-validation of LightGBM with the parameters picked by the grid
# search; each fold's test-set probabilities are averaged for the submission.
kf = KFold(n_splits=3, shuffle=True, random_state=0)

# Per-fold validation accuracies (in percent).
score_list = []
# One column of test-set probabilities per fold; the width follows the
# splitter instead of repeating the fold count as a second literal.
test_pred = np.zeros((len(test), kf.get_n_splits()))

for fold_, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, so rows are selected with .iloc for
    # both features and target. Plain y_train[...] would be a label lookup
    # that is only correct while the index is the default 0..n-1 range.
    # NOTE: these names overwrite the hold-out split created earlier.
    train_x = X_train.iloc[train_index]
    valid_x = X_train.iloc[valid_index]
    train_y = y_train.iloc[train_index]
    valid_y = y_train.iloc[valid_index]

    print(f'fold{fold_ + 1} start')

    # Parameters taken from the grid-search result.
    gbm = lgb.LGBMClassifier(objective='binary',
                             num_leaves=20,
                             reg_alpha=0,
                             reg_lambda=10)
    gbm.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            early_stopping_rounds=20,
            verbose=-1)

    # Accuracy on this fold's validation rows, predicted at the best iteration.
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
    score_list.append(round(accuracy_score(valid_y, oof) * 100, 2))
    # Probability of the positive class for every test row.
    test_pred[:, fold_] = gbm.predict_proba(test)[:, 1]
    print(f'fold{fold_ + 1} end\n')

print(score_list, '平均score', np.mean(score_list))

# Average the fold probabilities and threshold at 0.5 to get class labels.
pred_gbm = (np.mean(test_pred, axis=1) > 0.5).astype(int)
gbm_submission = sample_submission.copy()
gbm_submission['Survived'] = pred_gbm
gbm_submission.to_csv('gbm_submission.csv', index=False)

fold1 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[76]	training's binary_logloss: 0.323036	valid_1's binary_logloss: 0.394643
fold1 end

fold2 start
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.306045	valid_1's binary_logloss: 0.391796
fold2 end

fold3 start
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.284898	valid_1's binary_logloss: 0.452694
fold3 end

[82.49, 84.18, 80.81] 平均score 82.49333333333334


In [17]:
# Display the sample submission frame loaded at the top of the notebook.
display(sample_submission)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
