## 1. 必要なものをimport

In [1]:
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## 2. カテゴリカル変数を用いたデータの整形

In [2]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the submission template.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
sample_submission = pd.read_csv('../input/titanic/gender_submission.csv')

# Integer codes for the two string-valued categorical columns.
# Values missing from a mapping (including NaN) stay NaN after .map().
genders = {'female': 0, 'male': 1}
embarked = {'S': 0, 'C': 1, 'Q': 2}

# Encode Sex / Embarked and drop the columns that are not used as features.
train = (
    train
    .assign(Sex=train['Sex'].map(genders),
            Embarked=train['Embarked'].map(embarked))
    .drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])
)
test = (
    test
    .assign(Sex=test['Sex'].map(genders),
            Embarked=test['Embarked'].map(embarked))
    .drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])
)

In [3]:
# Separate the target column from the feature matrix, then preview the features.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0.0
1,1,0,38.0,1,0,71.2833,1.0
2,3,0,26.0,0,0,7.925,0.0
3,1,0,35.0,1,0,53.1,0.0
4,3,1,35.0,0,0,8.05,0.0


In [4]:
# Show how many rows fall into each code of the two encoded categorical columns.
display(*(train[column].value_counts() for column in ('Embarked', 'Sex')))

0.0    644
1.0    168
2.0     77
Name: Embarked, dtype: int64

1    577
0    314
Name: Sex, dtype: int64

In [5]:
# Hold out 33% of the labelled rows for validation (fixed seed).
train_x, valid_x, train_y, valid_y = train_test_split(
    X_train, y_train, test_size=0.33, random_state=0
)

# Wrap both splits in lgb.Dataset, declaring Sex and Embarked as categorical.
lgb_train = lgb.Dataset(data=train_x, label=train_y,
                        categorical_feature=['Sex', 'Embarked'])
lgb_valid = lgb.Dataset(data=valid_x, label=valid_y,
                        categorical_feature=['Sex', 'Embarked'])

lgbm_params = dict(objective='binary')

# Train with the native lgb.train API, evaluating on both splits and stopping
# once the validation score has not improved for 20 rounds.
gbm1 = lgb.train(
    lgbm_params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    early_stopping_rounds=20,
    verbose_eval=-1,
)

# Predict on the hold-out rows and threshold at 0.5 to obtain class labels.
oof = gbm1.predict(valid_x)
preds = (oof > 0.5).astype(int)

# Hold-out accuracy in percent, rounded to two decimals.
print('score', round(accuracy_score(valid_y, preds)*100,2))

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.268824	valid_1's binary_logloss: 0.402566
score 81.69


## 3. 重要度を表示

In [6]:
# Raw importance values of the trained booster; 'split' is the default importance type.
gbm1.feature_importance(importance_type='split')

array([ 56,  38, 389,  43,  16, 314,  38], dtype=int32)

In [7]:
# Pair each feature name with its (default, split-based) importance and list
# the features from most to least important.
(
    pd.DataFrame({'特徴': X_train.columns,
                  'importance': gbm1.feature_importance()})
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,特徴,importance
2,Age,389
5,Fare,314
0,Pclass,56
3,SibSp,43
1,Sex,38
6,Embarked,38
4,Parch,16


In [8]:
# Same table as above, but ranked by gain-based importance.
(
    pd.DataFrame({'特徴': X_train.columns,
                  'importance': gbm1.feature_importance(importance_type='gain')})
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,特徴,importance
1,Sex,1025.068321
2,Age,597.739877
5,Fare,389.990502
0,Pclass,302.335525
3,SibSp,89.421663
6,Embarked,74.457972
4,Parch,21.785036


## 4. LightGBMで一度提出

In [9]:
# Predict on the test features with the first model and threshold at 0.5.
test_scores_v1 = gbm1.predict(test)
test_pred_v1 = (test_scores_v1 > 0.5).astype(int)

# Fill the submission template with the predicted labels and write it out.
sample_submission['Survived'] = test_pred_v1
sample_submission.to_csv('sample_submission.csv', index=False)

## 5. Scikit-learn API(LGBMClassifier)でモデルを調整する

In [10]:
# Same 67/33 hold-out split (same seed) as used for the native-API model.
train_x, valid_x, train_y, valid_y = train_test_split(
    X_train, y_train, test_size=0.33, random_state=0
)

# scikit-learn style LightGBM classifier.
# (Pass importance_type='gain' here to change the reported importance type.)
gbm = lgb.LGBMClassifier(objective='binary')

# Fit while evaluating on both splits; stop once the validation score has not
# improved for 20 rounds.
gbm.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    categorical_feature=['Sex', 'Embarked'],
    early_stopping_rounds=20,
    verbose=-1,
)

# Class predictions for the hold-out rows using the best iteration found.
oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)

# Hold-out accuracy in percent, rounded to two decimals.
print('score', round(accuracy_score(valid_y, oof)*100,2))

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.268824	valid_1's binary_logloss: 0.402566
score 81.69


### パラメータの調整

In [11]:
# Inspect the classifier's current hyper-parameters before tuning them.
gbm.get_params(deep=True)

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [12]:
# Exhaustive grid search over tree size and L1/L2 regularisation strength.
# GridSearchCV is imported with the other dependencies in the notebook's
# import cell, so this cell does not carry its own import.
gbm = lgb.LGBMClassifier(objective='binary')

# Candidate values for each hyper-parameter (4 x 4 x 4 = 64 combinations).
params = {
    'num_leaves': [20, 31, 40, 50],
    'reg_alpha': [0, 1, 10, 100],
    'reg_lambda': [0, 1, 10, 100],
}

grid_search = GridSearchCV(
    gbm,                # estimator to tune
    param_grid=params,  # hyper-parameter candidates to try
    cv=3,               # score each combination with 3-fold cross-validation
)

# Fit every combination on the full labelled training data.
grid_search.fit(X_train, y_train)

print(grid_search.best_score_)   # best cross-validated score
print(grid_search.best_params_)  # parameter combination that achieved it

0.8361391694725029
{'num_leaves': 20, 'reg_alpha': 0, 'reg_lambda': 10}


In [13]:
# K-fold cross-validation with the parameters chosen by the grid search.
# Each fold's model predicts the test set; the per-fold probabilities are
# averaged and thresholded to build the submission.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
n_folds = kf.get_n_splits()  # single source of truth for the fold count

# Validation accuracy (in percent) of each fold.
score_list = []
# One column of predicted positive-class probabilities per fold.
test_pred = np.zeros((len(test), n_folds))

for fold_, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, so features AND labels are sliced
    # with .iloc. Label-based y_train[...] only works while the index happens
    # to be the default 0..n-1 range.
    train_x = X_train.iloc[train_index]
    valid_x = X_train.iloc[valid_index]
    train_y = y_train.iloc[train_index]
    valid_y = y_train.iloc[valid_index]

    print(f'fold{fold_ + 1} start')

    # Parameters taken from the grid-search result.
    gbm = lgb.LGBMClassifier(objective='binary',
                             num_leaves=20,
                             reg_alpha=0,
                             reg_lambda=10)
    gbm.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            early_stopping_rounds=20,
            verbose=-1)

    # Score this fold on its held-out rows using the best iteration found.
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
    score_list.append(round(accuracy_score(valid_y, oof)*100,2))

    # Keep the positive-class probability for every test row.
    test_pred[:, fold_] = gbm.predict_proba(test)[:, 1]
    print(f'fold{fold_ + 1} end\n' )

print(score_list, '平均score', np.mean(score_list))

# Average the fold probabilities, threshold at 0.5, and write the submission.
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)
sample_submission['Survived'] = pred
# NOTE(review): 'glid_search' looks like a typo for 'grid_search'; the file
# name is kept unchanged so the output path stays the same.
sample_submission.to_csv('glid_search.csv', index=False)

fold1 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[61]	training's binary_logloss: 0.340861	valid_1's binary_logloss: 0.406633
fold1 end

fold2 start
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.310629	valid_1's binary_logloss: 0.395367
fold2 end

fold3 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[65]	training's binary_logloss: 0.320685	valid_1's binary_logloss: 0.455512
fold3 end

[82.15, 83.84, 80.81] 平均score 82.26666666666667
