### Context

#### Experiment Tools
- WandB (Weights & Biases)

In [None]:
!pip install wandb



In [None]:
# Sweep definition: Bayesian search over three XGBoost hyperparameters,
# minimizing the "cv_loss" metric logged by each trial.
sweep_config = dict(
    name="mdc_sweep",
    method="bayes",
    parameters=dict(
        # Integer-valued tree depth in [2, 15].
        max_depth=dict(distribution="int_uniform", min=2, max=15),
        # Row sampling ratio in [0.5, 1.0].
        subsample=dict(distribution="uniform", min=0.5, max=1.0),
        # Column sampling ratio per tree in [0.5, 1.0].
        colsample_bytree=dict(distribution="uniform", min=0.5, max=1.0),
    ),
    metric=dict(name="cv_loss", goal="minimize"),
)

In [None]:
import os
from os.path import join

import multiprocessing
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import wandb

# Authenticate against the W&B server. wandb.login() only handles credentials;
# every sweep trial opens (and closes) its own run inside train(), so no run
# is started here.
wandb.login()

# Register the sweep (comparable to an Optuna study) and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="medici wandb test")

# Logical CPU count, used later to size the model's thread pool.
n_cpus = multiprocessing.cpu_count()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ········


wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\cv002/.netrc




Create sweep with ID: 3x3z8ebw
Sweep URL: https://wandb.ai/jeongho/medici%20wandb%20test/sweeps/3x3z8ebw


In [None]:
# Folder that holds the input CSV files.
BASE_DIR = './data'

# Build the path of each input file relative to BASE_DIR.
train_path, test_path, submission_path = (
    join(BASE_DIR, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Load the training and test tables.
data = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Keep the target column in its own variable.
label = data['credit']

In [None]:
# Remove columns that are not used as features
# ('credit' is the target, already stored in `label`).
data = data.drop(columns=['index', 'credit'])
test = test.drop(columns=['index'])

In [None]:
# Group feature names by dtype: object-typed columns are treated as categorical,
# every other column as numeric.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']
num_columns = [name for name in data.columns if name not in cat_columns]

print('Categorical Columns: \n{}\n'.format(cat_columns))
print('Numeric Columns: \n{}'.format(num_columns))

Categorical Columns: 
['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

Numeric Columns: 
['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size', 'begin_month']


#### 라벨 데이터 인코딩

In [None]:
# Cast the target labels to integers.
label = label.astype(int)

#### 전처리 프로세스 함수로 작성

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def _make_dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that works on old and new scikit-learn.

    The ``sparse`` keyword was renamed to ``sparse_output`` in scikit-learn 1.2
    and the old spelling was later removed, so the new name is tried first.
    Categories not seen while fitting are encoded as all zeros instead of
    raising during ``transform``.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn releases only accept the ``sparse`` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def preprocess(x_train, x_valid, x_test):
    """Impute, scale and one-hot encode one train/validation/test split.

    Every transformer is fitted on the training frame only and then applied to
    all three frames. The column lists come from the notebook-level
    ``cat_columns`` and ``num_columns``.

    Args:
        x_train: Training features (pandas DataFrame).
        x_valid: Validation features (pandas DataFrame).
        x_test: Test features (pandas DataFrame).

    Returns:
        Tuple ``(x_train, x_valid, x_test)`` of new DataFrames. Each has a
        fresh 0..n-1 index and holds the non-categorical columns (with
        ``num_columns`` standardized) followed by the one-hot columns, which
        are labelled with integer positions. The inputs are not modified.
    """
    # Work on copies with a clean positional index. All three frames are
    # re-indexed (including the test frame): the one-hot frames built below
    # carry a default RangeIndex and pd.concat(axis=1) aligns on index labels,
    # so a stale index would produce misaligned rows filled with NaN.
    frames = [
        frame.copy().reset_index(drop=True)
        for frame in (x_train, x_valid, x_test)
    ]
    fit_frame = frames[0]

    # Missing values: fill categorical columns with the most frequent training value.
    imputer = SimpleImputer(strategy='most_frequent')
    imputer.fit(fit_frame[cat_columns])
    for frame in frames:
        frame[cat_columns] = imputer.transform(frame[cat_columns])

    # Scaling: standardize numeric columns with training mean / std.
    scaler = StandardScaler()
    scaler.fit(fit_frame[num_columns])
    for frame in frames:
        frame[num_columns] = scaler.transform(frame[num_columns])

    # Encoding: one-hot encode categorical columns using training categories.
    ohe = _make_dense_one_hot_encoder()
    ohe.fit(fit_frame[cat_columns])

    processed = []
    for frame in frames:
        encoded = pd.DataFrame(ohe.transform(frame[cat_columns]))
        remaining = frame.drop(columns=cat_columns)
        processed.append(pd.concat([remaining, encoded], axis=1))

    return tuple(processed)

### Ensemble

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

def train():
    """Run one sweep trial: 5-fold stratified CV of an XGBoost classifier.

    The hyperparameters ``max_depth``, ``subsample`` and ``colsample_bytree``
    are read from ``wandb.config``. For every fold the data is preprocessed
    with ``preprocess``, a model is fitted with early stopping, and the train
    and validation log loss are printed. The mean validation log loss over all
    folds is logged to W&B as ``cv_loss``.

    Relies on the notebook-level ``data``, ``label``, ``test`` and ``n_cpus``.
    """
    with wandb.init() as run:
        params = wandb.config

        val_scores = []
        n_splits = 5
        # Leave one core free; keep at least one thread so that a single-core
        # machine does not end up passing n_jobs=0.
        n_jobs = max(1, n_cpus - 1)

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        for i, (trn_idx, val_idx) in enumerate(skf.split(data, label)):
            x_train, y_train = data.iloc[trn_idx, :], label.iloc[trn_idx]
            x_valid, y_valid = data.iloc[val_idx, :], label.iloc[val_idx]

            # Preprocessing (the transformed test frame is not needed here).
            x_train, x_valid, _ = preprocess(x_train, x_valid, test)

            # Model definition
            model = XGBClassifier(n_estimators=1000,
                                  max_depth=params['max_depth'],
                                  subsample=params['subsample'],
                                  colsample_bytree=params['colsample_bytree'],
                                  # tree_method='gpu_hist',
                                  n_jobs=n_jobs)

            # Model training: stop once the validation metric (last eval_set
            # entry) has not improved for 100 rounds.
            model.fit(x_train, y_train,
                      eval_metric='mlogloss',
                      eval_set=[[x_train, y_train], [x_valid, y_valid]],
                      early_stopping_rounds=100,
                      verbose=100)

            # Log loss on the training and validation parts of this fold.
            trn_logloss = log_loss(y_train, model.predict_proba(x_train))
            val_logloss = log_loss(y_valid, model.predict_proba(x_valid))
            print('{} Fold, train logloss : {:.4f}, validation logloss : {:.4f}'.format(i, trn_logloss, val_logloss))

            val_scores.append(val_logloss)

        # Report the metric the sweep optimizes.
        metrics = {"cv_loss": np.mean(val_scores)}
        run.log(metrics)
# Number of sweep trials this agent will run.
count = 5
# Start the agent: it calls train() once per trial of the sweep identified by sweep_id.
wandb.agent(sweep_id, function=train, count=count)

wandb: Agent Starting Run: ybhpchnr with config:
wandb: 	colsample_bytree: 0.8356307450676174
wandb: 	max_depth: 15
wandb: 	subsample: 0.5760809165624917


[0]	validation_0-mlogloss:0.94563	validation_1-mlogloss:0.97137
[100]	validation_0-mlogloss:0.10372	validation_1-mlogloss:0.90358
[117]	validation_0-mlogloss:0.09063	validation_1-mlogloss:0.93819
0 Fold, train logloss : 0.43064, validation logloss : 0.7266
[0]	validation_0-mlogloss:0.94650	validation_1-mlogloss:0.97288
[100]	validation_0-mlogloss:0.10204	validation_1-mlogloss:0.93990
[116]	validation_0-mlogloss:0.09007	validation_1-mlogloss:0.97461
1 Fold, train logloss : 0.42554, validation logloss : 0.7400
[0]	validation_0-mlogloss:0.96810	validation_1-mlogloss:0.99226
[100]	validation_0-mlogloss:0.10292	validation_1-mlogloss:0.93722
[119]	validation_0-mlogloss:0.08780	validation_1-mlogloss:0.98058
2 Fold, train logloss : 0.40664, validation logloss : 0.7343
[0]	validation_0-mlogloss:0.96698	validation_1-mlogloss:0.99152
[100]	validation_0-mlogloss:0.10260	validation_1-mlogloss:0.92812
[118]	validation_0-mlogloss:0.08848	validation_1-mlogloss:0.96647
3 Fold, train logloss : 0.42504, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73213
_runtime,123.0
_timestamp,1628321004.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: 45vpx00d with config:
wandb: 	colsample_bytree: 0.7496732139764124
wandb: 	max_depth: 5
wandb: 	subsample: 0.9307190923092045
wandb: Currently logged in as: jeongho (use `wandb login --relogin` to force relogin)


[0]	validation_0-mlogloss:0.97674	validation_1-mlogloss:0.97753
[100]	validation_0-mlogloss:0.62547	validation_1-mlogloss:0.74374
[200]	validation_0-mlogloss:0.53258	validation_1-mlogloss:0.72747
[300]	validation_0-mlogloss:0.46227	validation_1-mlogloss:0.72181
[400]	validation_0-mlogloss:0.41016	validation_1-mlogloss:0.72350
[408]	validation_0-mlogloss:0.40650	validation_1-mlogloss:0.72441
0 Fold, train logloss : 0.45744, validation logloss : 0.7204
[0]	validation_0-mlogloss:0.97513	validation_1-mlogloss:0.97931
[100]	validation_0-mlogloss:0.62132	validation_1-mlogloss:0.75743
[200]	validation_0-mlogloss:0.52607	validation_1-mlogloss:0.74419
[300]	validation_0-mlogloss:0.45804	validation_1-mlogloss:0.73991
[380]	validation_0-mlogloss:0.41667	validation_1-mlogloss:0.74125
1 Fold, train logloss : 0.46854, validation logloss : 0.7387
[0]	validation_0-mlogloss:1.00449	validation_1-mlogloss:1.00714
[100]	validation_0-mlogloss:0.62594	validation_1-mlogloss:0.75418
[200]	validation_0-mloglos

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72945
_runtime,137.0
_timestamp,1628321150.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: sngs674y with config:
wandb: 	colsample_bytree: 0.6852873622490906
wandb: 	max_depth: 5
wandb: 	subsample: 0.8267006346573685


[0]	validation_0-mlogloss:0.97657	validation_1-mlogloss:0.97733
[100]	validation_0-mlogloss:0.62933	validation_1-mlogloss:0.74799
[200]	validation_0-mlogloss:0.53065	validation_1-mlogloss:0.72892
[300]	validation_0-mlogloss:0.46152	validation_1-mlogloss:0.72699
[400]	validation_0-mlogloss:0.41044	validation_1-mlogloss:0.73147
[402]	validation_0-mlogloss:0.40960	validation_1-mlogloss:0.73162
0 Fold, train logloss : 0.45954, validation logloss : 0.7268
[0]	validation_0-mlogloss:0.97588	validation_1-mlogloss:0.97986
[100]	validation_0-mlogloss:0.62489	validation_1-mlogloss:0.75904
[200]	validation_0-mlogloss:0.52915	validation_1-mlogloss:0.74420
[300]	validation_0-mlogloss:0.45968	validation_1-mlogloss:0.74337
[400]	validation_0-mlogloss:0.40716	validation_1-mlogloss:0.74683
[407]	validation_0-mlogloss:0.40390	validation_1-mlogloss:0.74677
1 Fold, train logloss : 0.45614, validation logloss : 0.7426
[0]	validation_0-mlogloss:1.00540	validation_1-mlogloss:1.00790
[100]	validation_0-mloglos

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73407
_runtime,136.0
_timestamp,1628321296.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: 0sw2q1k4 with config:
wandb: 	colsample_bytree: 0.8145134280029038
wandb: 	max_depth: 2
wandb: 	subsample: 0.5524820822631327


[0]	validation_0-mlogloss:0.97974	validation_1-mlogloss:0.97843
[100]	validation_0-mlogloss:0.77394	validation_1-mlogloss:0.79296
[200]	validation_0-mlogloss:0.75338	validation_1-mlogloss:0.78651
[300]	validation_0-mlogloss:0.73587	validation_1-mlogloss:0.78100
[400]	validation_0-mlogloss:0.72017	validation_1-mlogloss:0.77706
[500]	validation_0-mlogloss:0.70757	validation_1-mlogloss:0.77160
[600]	validation_0-mlogloss:0.69580	validation_1-mlogloss:0.77036
[700]	validation_0-mlogloss:0.68532	validation_1-mlogloss:0.76794
[800]	validation_0-mlogloss:0.67634	validation_1-mlogloss:0.76784
[900]	validation_0-mlogloss:0.66735	validation_1-mlogloss:0.76681
[999]	validation_0-mlogloss:0.65898	validation_1-mlogloss:0.76450
0 Fold, train logloss : 0.65994, validation logloss : 0.7645
[0]	validation_0-mlogloss:0.97879	validation_1-mlogloss:0.98076
[100]	validation_0-mlogloss:0.77000	validation_1-mlogloss:0.80035
[200]	validation_0-mlogloss:0.74623	validation_1-mlogloss:0.79386
[300]	validation_0-

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.77025
_runtime,114.0
_timestamp,1628321419.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


wandb: Agent Starting Run: 3q8jw5zz with config:
wandb: 	colsample_bytree: 0.6748309527123748
wandb: 	max_depth: 9
wandb: 	subsample: 0.976503047280304


[0]	validation_0-mlogloss:0.96677	validation_1-mlogloss:0.97546
[100]	validation_0-mlogloss:0.38103	validation_1-mlogloss:0.71453
[177]	validation_0-mlogloss:0.26652	validation_1-mlogloss:0.74080
0 Fold, train logloss : 0.42654, validation logloss : 0.7123
[0]	validation_0-mlogloss:0.96582	validation_1-mlogloss:0.97807
[100]	validation_0-mlogloss:0.37537	validation_1-mlogloss:0.73701
[164]	validation_0-mlogloss:0.27001	validation_1-mlogloss:0.76238
1 Fold, train logloss : 0.45754, validation logloss : 0.7340
[0]	validation_0-mlogloss:0.99468	validation_1-mlogloss:1.00290
[100]	validation_0-mlogloss:0.37558	validation_1-mlogloss:0.73315
[174]	validation_0-mlogloss:0.26192	validation_1-mlogloss:0.76132
2 Fold, train logloss : 0.43024, validation logloss : 0.7290
[0]	validation_0-mlogloss:0.99261	validation_1-mlogloss:1.00159
[100]	validation_0-mlogloss:0.36678	validation_1-mlogloss:0.72960
[181]	validation_0-mlogloss:0.25053	validation_1-mlogloss:0.76118
3 Fold, train logloss : 0.41134, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72574
_runtime,108.0
_timestamp,1628321535.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


In [None]:
# submit.to_csv('oof_first_submit.csv', index=False)