In [1]:
import numpy as np 
import pandas as pd
import os
import pickle
import gc

import ydata_profiling as ypf
# 可視化
import matplotlib.pyplot as plt

# モデリング
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
!pip install LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

!pip install japanize-matplotlib
import japanize_matplotlib
%matplotlib inline



In [2]:
# Load the training and test CSV files from the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# Carve the training frame into features, target and row identifiers.
# List-style column selection keeps each piece as a DataFrame.
x_train = df_train[['Pclass', 'Fare']]
y_train = df_train[['Survived']]
id_train = df_train[['PassengerId']]

In [3]:
# Wrap model training and evaluation in a function.

# Hyperparameters forwarded to lgb.LGBMClassifier by train_cv.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.1,
    num_leaves=16,
    n_estimators=100000,
    random_state=123,
    importance_type='gain',
)

def train_cv(input_x, input_y, input_id, params, n_split=5):
    """Train LightGBM classifiers with stratified K-fold CV and summarise the folds.

    For every fold a fresh ``lgb.LGBMClassifier`` is fitted with early stopping
    (10 rounds) on the evaluation sets, and accuracy is measured on both the
    training and the validation part of the fold.

    Args:
        input_x: Feature DataFrame. Rows are selected positionally per fold.
        input_y: Target (DataFrame or Series) aligned row-for-row with ``input_x``.
        input_id: Row identifiers. Not used by the function; kept in the
            signature so existing calls keep working.
        params: Keyword arguments forwarded to ``lgb.LGBMClassifier``.
        n_split: Number of stratified folds.

    Returns:
        tuple: ``(imp, metrics)``.
            ``imp`` is a DataFrame with columns ``col``, ``imp`` and ``imp_std``
            (mean and standard deviation of ``feature_importances_`` across folds).
            ``metrics`` is a NumPy array with one row per fold:
            ``[fold number, train accuracy, validation accuracy]``.
    """
    # Use the arguments (not notebook globals) and honour the requested fold count.
    splitter = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=123)

    fold_metrics = []
    fold_importances = []

    for nfold, (idx_tr, idx_va) in enumerate(splitter.split(input_x, input_y)):
        print('-'*20, nfold, '-'*20)

        # split() yields positional row numbers, so index with iloc; loc would
        # pick the wrong rows (or fail) when the index is not a default RangeIndex.
        x_tr, y_tr = input_x.iloc[idx_tr], input_y.iloc[idx_tr]
        x_va, y_va = input_x.iloc[idx_va], input_y.iloc[idx_va]

        model = lgb.LGBMClassifier(**params, force_row_wise=True)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True), lgb.log_evaluation(1)]
                 )

        # Accuracy on the fold's training and validation rows.
        metric_tr = accuracy_score(y_tr, model.predict(x_tr))
        metric_va = accuracy_score(y_va, model.predict(x_va))
        fold_metrics.append([nfold, metric_tr, metric_va])

        # Collect per-fold importances and concatenate once after the loop.
        fold_importances.append(
            pd.DataFrame({'col': input_x.columns, "imp": model.feature_importances_})
        )

    print('-'*20, 'result', '-'*20)
    metrics = np.array(fold_metrics)
    print(metrics)

    # Column 1 holds train accuracy, column 2 validation accuracy; each summary
    # pairs the mean with the standard deviation of the same column.
    print('[cv ] tr: {:.2f}+-{:.2f}, va:{:.2f}+-{:.2f}'.format(
        metrics[:, 1].mean(), metrics[:, 1].std(),
        metrics[:, 2].mean(), metrics[:, 2].std(),
    ))

    # Aggregate feature importance across folds.
    imp = pd.concat(fold_importances, axis=0, ignore_index=True)
    imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
    imp.columns = ['imp', 'imp_std']
    imp = imp.reset_index(drop=False)

    return imp, metrics

In [4]:
# Baseline cross-validation run on the features prepared when the data was loaded.
imp, metrics = train_cv(
    input_x=x_train,
    input_y=y_train,
    input_id=id_train,
    params=params,
    n_split=5,
)

-------------------- 0 --------------------
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[1]	training's auc: 0.762985	valid_1's auc: 0.729381
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.763607	valid_1's auc: 0.730237
[3]	training's auc: 0.763607	valid_1's auc: 0.730237
[4]	training's auc: 0.777045	valid_1's auc: 0.732411
[5]	training's auc: 0.775702	valid_1's auc: 0.735046
[6]	training's auc: 0.777383	valid_1's auc: 0.72747
[7]	training's auc: 0.774955	valid_1's auc: 0.738274
[8]	training's auc: 0.776974	valid_1's auc: 0.739394
[9]	training's auc: 0.780612	valid_1's auc: 0.729644
[10]	training's auc: 0.788155	valid_1's auc: 0.736957
[11]	training's auc: 0.792636	valid_1's a

In [14]:
# Rebuild the feature frame with Age added as a third column, then repeat
# the same cross-validation run.
x_train = df_train.loc[:, ['Pclass', 'Fare', 'Age']]
imp, metrics = train_cv(
    input_x=x_train,
    input_y=y_train,
    input_id=id_train,
    params=params,
    n_split=5,
)
# Recorded result → tr: 0.76+-0.03, va:0.69+-0.03

-------------------- 0 --------------------
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Total Bins 185
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[1]	training's auc: 0.782093	valid_1's auc: 0.657708
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.795214	valid_1's auc: 0.669236
[3]	training's auc: 0.805598	valid_1's auc: 0.67859
[4]	training's auc: 0.810237	valid_1's auc: 0.674045
[5]	training's auc: 0.809636	valid_1's auc: 0.676943
[6]	training's auc: 0.822132	valid_1's auc: 0.696904
[7]	training's auc: 0.825949	valid_1's auc: 0.704743
[8]	training's auc: 0.82743	valid_1's auc: 0.701383
[9]	training's auc: 0.828135	valid_1's auc: 0.703755
[10]	training's auc: 0.834706	valid_1's auc: 0.700264
[11]	training's auc: 0.839153	valid_1's au

In [5]:
# Summary statistics of the numeric columns, one row per column.
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [6]:
# Summary (count / unique / top / freq) of the non-numeric columns, one row per column.
df_train.describe(exclude='number').transpose()

Unnamed: 0,count,unique,top,freq
Name,891,891,"Braund, Mr. Owen Harris",1
Sex,891,2,male,577
Ticket,891,681,347082,7
Cabin,204,147,B96 B98,4
Embarked,889,3,S,644


In [8]:
# NOTE(review): ydata_profiling is already imported as `ypf` in the first cell;
# this cell imports the same package again under a second alias (`pdp`).
# The recorded run of this cell raised the AttributeError shown below
# (numba has no attribute `generated_jit`) — presumably a version mismatch
# between numba and ydata_profiling; TODO confirm against the environment.
import ydata_profiling as pdp

AttributeError: module 'numba' has no attribute 'generated_jit'

In [13]:
# Shell command asking conda to install numba pinned to 0.11.1.
# NOTE(review): presumably meant to resolve the numba AttributeError raised by
# the ydata_profiling import — TODO confirm that 0.11.1 is the intended version.
# NOTE(review): the recorded output below is a usage error from `numba`, so
# this command does not appear to have installed anything — verify how `conda`
# resolves in this environment.
!conda install numba=0.11.1

numba: error: the following arguments are required: filename
