In [1]:
# Check GPU version
# Prints the GPU model, driver / CUDA version and current memory usage of the machine running the kernel.
!nvidia-smi

Mon Jun  7 21:42:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX250      WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   48C    P8    N/A /  N/A |     64MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# install packages
!pip install catboost
!pip install optuna
!pip install wandb

In [None]:
# Monitor Colab resources in real time through a Weights & Biases run.
import wandb

# connect with jim107225017/colab/20210526
# `run` is kept so the last cell of the notebook can close it with run.finish().
run = wandb.init(project='colab', entity='jim107225017', name='CPU_GPU', id='20210526')

In [3]:
# import packages
import os
import numpy as np
import pandas as pd

# Optuna
import optuna
from optuna.samplers import TPESampler   # TPE (Tree-structured Parzen Estimator) sampler
from optuna.integration import SkoptSampler   # Scikit-Optimize sampler
from optuna.pruners import SuccessiveHalvingPruner   # ASHA: pruning algorithm, meant to guard against over-fitting

import catboost
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from joblib import load, dump

In [4]:
# Colab alternative (disabled): mount Google Drive and use the Drive folder as `path`.
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
# NOTE(review): machine-specific absolute path; it must be edited before the notebook
# can run anywhere else.
path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
# Switch the working directory so the relative file names used by later cells
# (train.csv, test.csv, sample_submission.csv, saved models) resolve inside `path`.
os.chdir(path)

In [5]:
# load data
# Relative names: resolved against the working directory set by os.chdir(path).
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [6]:
# Column roles: everything except the row id and the label is used as a model feature.
target_col = 'target'
cat_col = [name for name in df_train.columns if name not in ('id', 'target')]

# Feature frames for training and for the test set (same columns, same order).
x, x_test = df_train[cat_col], df_test[cat_col]

# Label Y
# Encode the class labels as integers and keep them as a one-column DataFrame.
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(df_train[target_col]), columns=[target_col])

In [7]:
# Optuna
# PROJECT_NAME is the study name and also the file-name prefix of the saved fold
# models and of the submission CSV.
PROJECT_NAME = 'catboost(optuna)-20210605'
# SEED is shared by the sampler, the hold-out split, CatBoost's random_state and
# the StratifiedKFold splitter.
SEED = 20210605

sampler = TPESampler(seed=SEED)
# sampler = SkoptSampler()

# Handed to optuna.create_study() in the optimization cell.
pruner = SuccessiveHalvingPruner()

def objective(trial, kaggle_metrics='LogLoss', predic_proba=True):
    """Optuna objective: fit one CatBoost model on a fixed hold-out split and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.
        kaggle_metrics: Which validation score to return: 'LogLoss', 'AUC' or 'Acc'.
        predic_proba: If True, score the output of ``predict_proba`` (class
            probabilities); otherwise score the output of ``predict``.

    Returns:
        The selected metric computed on the 20% stratified validation split.

    Raises:
        ValueError: If ``kaggle_metrics`` is not one of the supported names.

    Note:
        The study in this notebook is created with ``direction='minimize'``, which
        suits 'LogLoss'. 'AUC' and 'Acc' are higher-is-better, so a study using them
        needs ``direction='maximize'``.
    """
    # Reject an unknown metric before spending time on training; previously this
    # surfaced only after fitting, as an unbound `result` variable.
    if kaggle_metrics not in ('LogLoss', 'AUC', 'Acc'):
        raise ValueError(
            "kaggle_metrics must be 'LogLoss', 'AUC' or 'Acc', got {!r}".format(kaggle_metrics)
        )

    # Same split for every trial (fixed seed), so trial scores are comparable.
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=SEED, shuffle=True, stratify=y)

    params = {'objective': 'MultiClass',
              'eval_metric': 'MultiClass',
              # Sampled under the name "iterations", so that is the key it gets in trial.params.
              'n_estimators': trial.suggest_int("iterations", 100, 20000),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
              'random_state': SEED,
              'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-3, 10),
              'bootstrap_type': 'Bernoulli',
              # NOTE(review): the lower bound 0 allows sampling rates arbitrarily close
              # to zero -- confirm a small positive floor is not wanted here.
              'subsample': trial.suggest_uniform('subsample', 0, 1),
              'random_strength': trial.suggest_uniform('random_strength', 1, 50),
              'max_depth': trial.suggest_int('max_depth', 1, 15),
              'min_child_samples': trial.suggest_int('min_child_samples', 1, 30),
              # 'num_leaves': trial.suggest_int('num_leaves', 10, 200),
              # 'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0, 1),
              'leaf_estimation_method': 'Newton',
              'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 5),
              'task_type': "GPU",
              'od_wait': trial.suggest_int('od_wait', 100, 1000),
             }

    clf = CatBoostClassifier(**params)
    # NOTE(review): early_stopping_rounds=100 is passed together with a tuned `od_wait`;
    # presumably one of them overrides the other -- confirm against the CatBoost docs.
    clf.fit(train_x, train_y,
              eval_set=[(valid_x, valid_y)],
              early_stopping_rounds=100,
              verbose=False, cat_features=cat_col)

    if predic_proba:
        preds = np.float64(clf.predict_proba(valid_x))
    else:
        preds = clf.predict(valid_x)

    # valid_y is a one-column DataFrame; the metric functions expect 1-D labels.
    y_true = np.ravel(valid_y)

    if kaggle_metrics == 'LogLoss':
        return log_loss(y_true, preds)
    if kaggle_metrics == 'AUC':
        # The model is multiclass, so roc_auc_score needs an explicit multi_class
        # strategy; its default is to raise for multiclass targets.
        return roc_auc_score(y_true, preds, multi_class='ovr')
    # Remaining case is 'Acc'; it needs hard labels, i.e. predic_proba=False.
    return accuracy_score(y_true, preds)


In [None]:
# Flag read by the later cells that plot the study and build the final model.
OPTUNA_OPTIMIZATION = True

# direction='minimize' matches the default LogLoss metric returned by objective().
# NOTE(review): objective() as written never calls trial.report() / trial.should_prune(),
# so the pruner passed here presumably has no effect -- confirm.
study = optuna.create_study(direction='minimize', 
                            sampler=sampler, 
                            pruner=pruner, 
                            study_name=PROJECT_NAME,
                           )

# Two stopping limits are given: a trial count and a wall-clock timeout.
study.optimize(objective, 
               n_trials=100, 
               timeout=3*60*60,   # 3 hours, in seconds
              )

print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

In [8]:
# create best model
# Hard-coded hyperparameters, so this cell does not need the `study` object.
# NOTE(review): presumably copied from an earlier Optuna run -- confirm their origin.
# The line below is not a drop-in replacement: the study's params would only hold the
# sampled values (with the tree count under the name "iterations"), not the fixed
# settings such as objective / task_type -- verify before re-enabling it.
# best_params = study.best_trial.params

OPTUNA_OPTIMIZATION = True
best_params = {
    'objective': 'MultiClass',
    'eval_metric': 'MultiClass',
    'n_estimators': 11270,
    'learning_rate': 0.012538702605499273,
    'random_state': SEED,
    'reg_lambda': 4.505572761446656,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.3309772177208782,
    'random_strength': 33.35555891366271,
    'max_depth': 7,
    'min_child_samples': 16,
    # 'num_leaves': int,
    # 'colsample_bylevel': float,
    'leaf_estimation_method': 'Newton',
    'leaf_estimation_iterations': 2,
    'task_type': 'GPU',
    'od_wait': 620,
}

In [None]:
# Visual diagnostics of the hyperparameter search; needs `study` from the optimization cell.
if OPTUNA_OPTIMIZATION:
    for make_figure in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_slice,
        optuna.visualization.plot_parallel_coordinate,
    ):
        display(make_figure(study))

In [None]:
# Show the per-trial results table; requires `study` from the optimization cell.
if OPTUNA_OPTIMIZATION:
    display(study.trials_dataframe())

In [9]:
# Build the estimator that the cross-validation cell refits on every fold.
if OPTUNA_OPTIMIZATION:
    final_model = CatBoostClassifier(**best_params)
else:
    # NOTE(review): `trial` is not defined at notebook scope in any visible cell, so this
    # branch would raise NameError if OPTUNA_OPTIMIZATION were False -- confirm which
    # parameter dict was intended here.
    final_model = CatBoostClassifier(**trial)

In [10]:
# Stratified K-fold training: refit the model on each fold, save it, report the
# fold's validation log loss and average the test-set probabilities over all folds.
KFOLD = 10
test_preds = None

kf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
for fold, (tr_index, val_index) in enumerate(kf.split(x.values, y.values)):

    print("-" * 50)
    print(f"Fold {fold + 1}")

    # Slice this fold's training and validation rows out of the full arrays.
    x_train, y_train = x.values[tr_index], y.values[tr_index]
    x_val, y_val = x.values[val_index], y.values[val_index]

    # The same estimator object is refit on every fold and dumped right after each fit.
    # NOTE(review): unlike objective(), no cat_features are passed here, so the features
    # are presumably treated as numeric in the final models -- confirm this is intended.
    model = final_model
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
    dump(model, f'{PROJECT_NAME}_{fold+1}.joblib', compress=3)

    print(log_loss(y_val, model.predict_proba(x_val)))

    # Running sum of the per-fold test probabilities; divided by KFOLD after the loop.
    fold_test_preds = model.predict_proba(x_test.values)
    test_preds = fold_test_preds if test_preds is None else test_preds + fold_test_preds

print("-" * 50)
test_preds /= KFOLD

--------------------------------------------------
Fold 1
1.7446199763650154
--------------------------------------------------
Fold 2
1.7455542354685971
--------------------------------------------------
Fold 3
1.7432171049140557
--------------------------------------------------
Fold 4
1.7414502095901634
--------------------------------------------------
Fold 5
1.7507673705642925
--------------------------------------------------
Fold 6
1.7422203750245349
--------------------------------------------------
Fold 7
1.7482563245308804
--------------------------------------------------
Fold 8
1.7509424811203258
--------------------------------------------------
Fold 9
1.7484846351362244
--------------------------------------------------
Fold 10
1.7525987872072306
--------------------------------------------------


In [13]:
# submission
# Start from the sample file so its first column and its column names are reused.
sub = pd.read_csv('sample_submission.csv')
# Overwrite every column after the first with the fold-averaged test probabilities.
# NOTE(review): assumes the sample file's class columns are in the same order as the
# probability columns returned by predict_proba -- confirm.
sub[sub.columns[1:]] = test_preds
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)

In [None]:
# Close the Weights & Biases run opened by wandb.init() near the top of the notebook.
run.finish()