In [11]:
import japanize_matplotlib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


In [12]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [13]:
data = pd.concat([df_train, df_test], sort=False)


# no yes を 0 と 1 に変更
data['default'].replace(['no', 'yes'], [0, 1], inplace=True)
data['housing'].replace(['no', 'yes'], [0, 1], inplace=True)
data['loan'].replace(['no', 'yes'], [0, 1], inplace=True)

from sklearn.preprocessing import LabelEncoder
job_le = LabelEncoder()
data['job'] = job_le.fit_transform(data['job'])
marital_le = LabelEncoder()
data['marital'] = marital_le.fit_transform(data['marital'])
education_le = LabelEncoder()
data['education'] = education_le.fit_transform(data['education'])
housing_le = LabelEncoder()
data['housing'] = housing_le.fit_transform(data['housing'])
loan_le = LabelEncoder()
data['loan'] = loan_le.fit_transform(data['loan'])
contact_le = LabelEncoder()
data['contact'] = contact_le.fit_transform(data['contact'])
poutcome_le = LabelEncoder()
data['poutcome'] = poutcome_le.fit_transform(data['poutcome'])


data['month'] = data['month'].map({'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}).astype(int)


In [14]:
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'previous']

In [15]:
train = data[:len(df_train)]
test = data[len(df_train):]

y_train = train['y'].astype(int)
X_train = train.drop('y', axis=1)
X_test = test.drop('y', axis=1)

print(X_train.head())

   id  age  job  marital  education  default  balance  housing  loan  contact  \
0   0   31    7        1          1        0    12294        1     0        0   
1   1   29    2        2          2        0    43027        0     0        0   
2   2   35    4        1          2        0    12252        1     0        0   
3   3   31    9        1          1        0    99121        1     1        2   
4   4   48   10        1          0        0    42005        1     0        1   

   day  month  duration  campaign  pdays  previous  poutcome  
0   21     11       101         3    498         0         1  
1   22      8       158         2    702         0         3  
2   11     11       351         1    826         0         0  
3   16      5       658         2    120         0         0  
4    3      4       177         1    273         0         3  


In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [17]:
FOLD = 5
NUM_ROUND = 1000


params = {
    'objective': 'binary', 
}
valid_scores = []
models = []

kf = KFold(n_splits=FOLD, shuffle=True, random_state=0)

for fold, (train_indices, valid_indices) in enumerate(kf.split(X_train)):
    X_t, X_v = X_train.iloc[train_indices], X_train.iloc[valid_indices]
    y_t, y_v = y_train.iloc[train_indices], y_train.iloc[valid_indices]

    lgb_train = lgb.Dataset(X_t, y_t, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_v, y_v, categorical_feature=categorical_features)
    
    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=1000, callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True), lgb.log_evaluation(0)])

    y_valid_pred = model.predict(X_v, num_iteration=model.best_iteration)

    score = roc_auc_score(y_v, y_valid_pred)

    print(f'fold {fold}, AUC: {score}')
    valid_scores.append(score)
    models.append(model)

cv_score = np.mean(valid_scores)
print(f'CV score: {cv_score}')

[LightGBM] [Info] Number of positive: 1178, number of negative: 13998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 15176, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077623 -> initscore=-2.475096
[LightGBM] [Info] Start training from score -2.475096
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[66]	training's binary_logloss: 0.144081	valid_1's binary_logloss: 0.194435
fold 0, AUC: 0.8595163136805953
[LightGBM] [Info] Number of positive: 1178, number of negative: 13998
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGB

In [18]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical_features)

model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], num_boost_round=1000, callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=True), lgb.log_evaluation(0)])

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 1478, number of negative: 17492
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 987
[LightGBM] [Info] Number of data points in the train set: 18970, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077912 -> initscore=-2.471054
[LightGBM] [Info] Start training from score -2.471054
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[46]	training's binary_logloss: 0.165099	valid_1's binary_logloss: 0.20454
