In [70]:
import numpy as np 
import pandas as pd
import os
import pickle
import gc

# 分布確認
!pip install pandas-profiling
import pandas_profiling as pdp
# 可視化
import matplotlib.pyplot as plt

# モデリング
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
!pip install LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

!pip install japanize-matplotlib
import japanize_matplotlib
%matplotlib inline



In [11]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
print(df_train.shape)
print('レコード数', len(df_train))
print('カラム数', len(df_train.columns))

(891, 12)
レコード数 891
カラム数 12


In [20]:
x_train, y_train, id_train = df_train[['Pclass', 'Fare']], df_train[['Survived']], df_train[['PassengerId']]

In [31]:
# クロスバリデーション
n_splits = 5
cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(x_train, y_train))

for nfold in np.arange(n_splits):
    print('-'*20, nfold, '-'*20)
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr = x_train.loc[idx_tr, :], y_train.loc[idx_tr, :]
    x_va, y_va = x_train.loc[idx_va, :], y_train.loc[idx_va, :]
    print(x_tr.shape, y_tr.shape)
    print(x_va.shape, y_va.shape)
    print('y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f}'.format(
        y_train['Survived'].mean(),
        y_tr['Survived'].mean(),
        y_va['Survived'].mean(),
    ))

-------------------- 0 --------------------
(712, 2) (712, 1)
(179, 2) (179, 1)
y_train:0.384, y_tr:0.383, y_va:0.385
-------------------- 1 --------------------
(713, 2) (713, 1)
(178, 2) (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
-------------------- 2 --------------------
(713, 2) (713, 1)
(178, 2) (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
-------------------- 3 --------------------
(713, 2) (713, 1)
(178, 2) (178, 1)
y_train:0.384, y_tr:0.384, y_va:0.382
-------------------- 4 --------------------
(713, 2) (713, 1)
(178, 2) (178, 1)
y_train:0.384, y_tr:0.383, y_va:0.388


In [39]:
# ホールドアウト検証の場合
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=123)
print(x_tr.shape, y_tr.shape)
print(x_va.shape, y_va.shape)
print('y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f}'.format(
    y_train['Survived'].mean(),
    y_tr['Survived'].mean(),
    y_va['Survived'].mean(),
))

# ハイパーパラメータ
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr,y_tr),(x_va, y_va)],
          callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True),lgb.log_evaluation(1)]
         )


(712, 2) (712, 1)
(179, 2) (179, 1)
y_train:0.384, y_tr:0.383, y_va:0.385
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[1]	training's auc: 0.76049	valid_1's auc: 0.752964
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.762184	valid_1's auc: 0.757312
[3]	training's auc: 0.764174	valid_1's auc: 0.75527
[4]	training's auc: 0.77276	valid_1's auc: 0.74058
[5]	training's auc: 0.778075	valid_1's auc: 0.731094
[6]	training's auc: 0.785205	valid_1's auc: 0.74249
[7]	training's auc: 0.784321	valid_1's auc: 0.752503
[8]	training'

In [40]:
y_tr_pred = model.predict(x_tr)
y_va_pred = model.predict(x_va)
print(accuracy_score(y_tr, y_tr_pred))
print(accuracy_score(y_va, y_va_pred))

0.6165730337078652
0.6145251396648045


In [41]:
imp = pd.DataFrame({'col': x_train.columns, "imp": model.feature_importances_})
imp.sort_values('imp', ascending=False, ignore_index=True)

Unnamed: 0,col,imp
0,Fare,129.049957
1,Pclass,128.762055


In [52]:
# クロスバリデーションの場合

# ハイパーパラメータ
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

metrics = []
imp = pd.DataFrame()

n_splits = 5
cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(x_train, y_train))

for nfold in np.arange(n_splits):
    print('-'*20, nfold, '-'*20)
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr = x_train.loc[idx_tr, :], y_train.loc[idx_tr, :]
    x_va, y_va = x_train.loc[idx_va, :], y_train.loc[idx_va, :]

    model = lgb.LGBMClassifier(**params, force_row_wise=True)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr,y_tr),(x_va, y_va)],
              callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True),lgb.log_evaluation(1)]
             )
    y_tr_pred = model.predict(x_tr)
    y_va_pred = model.predict(x_va)
    metric_tr = accuracy_score(y_tr, y_tr_pred)
    metric_va = accuracy_score(y_va, y_va_pred)
    metrics.append([nfold, metric_tr, metric_va])
    _imp = pd.DataFrame({'col': x_train.columns, "imp": model.feature_importances_})
    imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

print('-'*20, 'result', '-'*20)
metrics = np.array(metrics)
print(metrics)

print('[cv ] tr: {:.2f}+-{:.2f}, va:{:.2f}+-{:.2f}'.format(
    metrics[:,1].mean(),metrics[:,1].std(),
    metrics[:,2].mean(),metrics[:,1].std(),
))

-------------------- 0 --------------------
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[1]	training's auc: 0.762985	valid_1's auc: 0.729381
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.763607	valid_1's auc: 0.730237
[3]	training's auc: 0.763607	valid_1's auc: 0.730237
[4]	training's auc: 0.777045	valid_1's auc: 0.732411
[5]	training's auc: 0.775702	valid_1's auc: 0.735046
[6]	training's auc: 0.777383	valid_1's auc: 0.72747
[7]	training's auc: 0.774955	valid_1's auc: 0.738274
[8]	training's auc: 0.776974	valid_1's auc: 0.739394
[9]	training's auc: 0.780612	valid_1's auc: 0.729644
[10]	training's auc: 0.788155	valid_1's auc: 0.736957
[11]	training's auc: 0.792636	valid_1's a

In [53]:
imp = imp.groupby('col')['imp'].agg(['mean','std'])
imp.columns = ['imp','imp_std']
imp = imp.reset_index(drop=False)
imp.sort_values('imp', ascending=False, ignore_index=True)

Unnamed: 0,col,imp,imp_std
0,Fare,604.666372,290.70052
1,Pclass,286.905244,131.633643


In [59]:
# ベースラインを検証するために検証データを用意する。
x_tr, x_va2, y_tr, y_va2 = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=123)
print(x_tr.shape, y_tr.shape)
print(x_va2.shape, y_va2.shape)

x_tr1, x_va1, y_tr1, y_va1 = train_test_split(x_tr, y_tr, test_size=0.2, shuffle=True, stratify=y_tr, random_state=789)
print(x_tr1.shape, y_tr1.shape)
print(x_va1.shape, y_va1.shape)

(712, 2) (712, 1)
(179, 2) (179, 1)
(569, 2) (569, 1)
(143, 2) (143, 1)


In [60]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metrics': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 16,
    'n_estimators': 100000,
    'random_state': 123,
    'importance_type': 'gain',
}

model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr,y_tr),(x_va1, y_va1)],
          callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True),lgb.log_evaluation(1)]
         )

y_va1_pred = model.predict(x_va1)
y_va2_pred = model.predict(x_va2)

print(accuracy_score(y_va1, y_va1_pred))
print(accuracy_score(y_va2, y_va2_pred))

[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[1]	training's auc: 0.76049	valid_1's auc: 0.742562
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.762184	valid_1's auc: 0.743285
[3]	training's auc: 0.764174	valid_1's auc: 0.745971
[4]	training's auc: 0.77276	valid_1's auc: 0.758368
[5]	training's auc: 0.778075	valid_1's auc: 0.766632
[6]	training's auc: 0.785205	valid_1's auc: 0.769731
[7]	training's auc: 0.784321	valid_1's auc: 0.774483
[8]	training's auc: 0.785389	valid_1's auc: 0.780062
[9]	training's auc: 0.792402	va

In [63]:
print('混合行列：検証用データ')
print(confusion_matrix(y_va1, y_va1_pred))
print(confusion_matrix(y_va1, y_va1_pred, normalize='all'))

print('ベースライン検証用データ')
print(confusion_matrix(y_va2, y_va2_pred))
print(confusion_matrix(y_va2, y_va2_pred, normalize='all'))

混合行列：検証用データ
[[80  8]
 [35 20]]
[[0.55944056 0.05594406]
 [0.24475524 0.13986014]]
ベースライン検証用データ
[[93 17]
 [38 31]]
[[0.51955307 0.09497207]
 [0.2122905  0.17318436]]


In [64]:
x_test = df_test[['Pclass', 'Fare']]
id_test = df_test[['PassengerId']]

y_test_pred = model.predict(x_test)
y_test_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [69]:
df_submit = pd.DataFrame({
    'PassengerId': id_test['PassengerId'],
    'Survived': y_test_pred
})
df_submit.to_csv('submission_baseline.csv', index=None)