In [8]:
# Check GPU version
!nvidia-smi

Tue Jun  8 11:10:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX250      WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   44C    P8    N/A /  N/A |     64MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Check CUDA/cuDNN Version
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:04_Central_Daylight_Time_2018
Cuda compilation tools, release 10.0, V10.0.130


In [None]:
# install packages
!pip install xgboost
!pip install optuna
!pip install category_encoders
!pip install wandb

In [None]:
# 即時監控colab資源
import wandb

# connect with jim107225017/colab/20210526
wandb.init(project='colab', entity='jim107225017', name='CPU_GPU', id='20210526')

In [10]:
# import packages
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Optuna
import optuna
from optuna.samplers import TPESampler   # TPE (Tree-structured Parzen Estimator) sampler
from optuna.integration import SkoptSampler   # Scikit-Optimize sampler
from optuna.pruners import SuccessiveHalvingPruner   # ASHA : 剪枝演算法，防止over-fitting

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from category_encoders.cat_boost import CatBoostEncoder

from joblib import load, dump

def check_gpu_support():
    try:
        data = np.random.rand(1000, 10)
        label = np.random.randint(2, size=1000)
        train_data = xgb.DMatrix(data, label = label)
        params = {'tree_method': 'gpu_hist', 'max_depth': 3, 'learning_rate': 0.1}
        gbm = xgb.train(params, train_data, evals=[(train_data, "train")])
        return True
    except Exception as e:
        return False

print(check_gpu_support())

[0]	train-rmse:0.49744
[1]	train-rmse:0.49564
[2]	train-rmse:0.49301
[3]	train-rmse:0.49085
[4]	train-rmse:0.48928
[5]	train-rmse:0.48782
[6]	train-rmse:0.48626
[7]	train-rmse:0.48430
[8]	train-rmse:0.48253
[9]	train-rmse:0.48144
True


In [11]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
os.chdir(path)

In [12]:
# load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

all_data = pd.concat([df_train, df_test]).reset_index(drop=True)
all_data.drop(['id'], axis=1, inplace=True)

In [13]:
num_col = []
cat_col = [i for i in all_data.columns if i not in ['id', 'target']]
target_col = 'target'
comb = num_col + cat_col + [target_col]

# Label Y
le = LabelEncoder()
y = le.fit_transform(df_train[target_col])

#  Catboost Target Encoder
ce = CatBoostEncoder(cols=cat_col, random_state=42)
x = df_train[cat_col]
ce.fit(x, y)

all_data[cat_col] = pd.DataFrame(ce.transform(all_data[cat_col]), columns=cat_col)

In [14]:
# split train and test
df_train = all_data[:len(df_train)]
df_train[target_col] = y.astype('int64')
x = df_train[cat_col]
y = pd.DataFrame(df_train[target_col])

df_test = all_data[len(df_train):]
df_test.drop(target_col, axis=1, inplace=True)
x_test = df_test[cat_col]

In [15]:
# Optuna
PROJECT_NAME = 'xgboost(optuna)-20210607'
SEED = 20210607

sampler = TPESampler(seed=SEED)
# sampler = SkoptSampler()

pruner = SuccessiveHalvingPruner()

def objective(trial, kaggle_metrics='LogLoss', predic_proba=True):
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=SEED, shuffle=True, stratify=y)
    
    params = {'n_estimators': trial.suggest_int("n_estimators", 100, 20000),
              'max_depth': trial.suggest_int('max_depth', 1, 25),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
              'objective': 'multi:softmax',
              'booster': 'gbtree',
              'n_jobs': -1,
              'gamma': trial.suggest_uniform('gamma', 0, 1),
              'min_child_weight': trial.suggest_uniform('min_child_weight', 0.001, 10),
              'subsample': trial.suggest_uniform('subsample', 0, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0, 1),
              'reg_alpha': trial.suggest_uniform('reg_alpha', 1e-3, 10),
              'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-3, 10),
              'random_state': SEED,
              'tree_method': 'gpu_hist',
              'predictor': "gpu_predictor",
              'eval_metric': 'mlogloss',
              'num_class': len(y.value_counts()),
             }

    clf = XGBClassifier(**params)
    clf.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=100,
            verbose=False)
    
    if predic_proba:
        preds = clf.predict_proba(valid_x)
        preds = np.float64(preds)
    else:
        preds = clf.predict(valid_x)
    
    if kaggle_metrics == 'LogLoss':
        result = log_loss(valid_y, preds)
    elif kaggle_metrics == 'AUC':
        result = roc_auc_score(valid_y, preds)
    elif kaggle_metrics == 'Acc':
        result = accuracy_score(valid_y, preds)
    
    return result


In [None]:
OPTUNA_OPTIMIZATION = True

study = optuna.create_study(direction='minimize', 
                            sampler=sampler, 
                            pruner=pruner, 
                            study_name=PROJECT_NAME,
                           )

study.optimize(objective, 
               n_trials=100, 
               timeout=3*60*60,   # in seconds
              )

print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

In [16]:
# create best model
# best_params = study.best_trial.params

OPTUNA_OPTIMIZATION = True
best_params = {}
best_params['n_estimators'] = 13901
best_params['max_depth'] = 25
best_params['learning_rate'] = 0.02063032057412173
best_params['objective'] = 'multi:softmax'
best_params['booster'] = 'gbtree'
best_params['n_jobs']  = -1
best_params['gamma'] = 0.7892861960740625
best_params['min_child_weight'] = 7.0254636458448525
best_params['subsample'] = 0.6167324770652026
best_params['colsample_bytree'] = 0.030431658659036964
best_params['reg_alpha'] = 7.245923834298281
best_params['reg_lambda'] = 7.758285811972932
best_params['random_state'] = SEED
best_params['tree_method'] = 'gpu_hist'
best_params['predictor'] = "gpu_predictor"
best_params['eval_metric'] = 'mlogloss'
best_params['num_class'] = len(y.value_counts())

In [None]:
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_optimization_history(study))
    display(optuna.visualization.plot_slice(study))
    display(optuna.visualization.plot_parallel_coordinate(study))

In [None]:
if OPTUNA_OPTIMIZATION:
    display(study.trials_dataframe())

In [17]:
if OPTUNA_OPTIMIZATION:
    final_model = XGBClassifier(**best_params)
else:
    final_model = XGBClassifier(**trial)

In [18]:
test_preds=None
KFOLD = 10

kf = StratifiedKFold(n_splits = KFOLD , shuffle = True , random_state = SEED)
for fold, (tr_index , val_index) in enumerate(kf.split(x.values , y.values)):
    
    print("-" * 50)
    print(f"Fold {fold + 1}")
    
    x_train , x_val = x.values[tr_index] , x.values[val_index]
    y_train , y_val = y.values[tr_index] , y.values[val_index]
        
    eval_set = [(x_val, y_val)]
    
    model = final_model
    model.fit(x_train, y_train, eval_set = eval_set, early_stopping_rounds=100, verbose = False)
    dump(model, f'{PROJECT_NAME}_{fold+1}.joblib', compress = 3)
    
    # train_preds = model.predict(x_train)    
    val_preds = model.predict_proba(x_val)
    
    print(log_loss(y_val, val_preds))
    
    if test_preds is None:
        test_preds = model.predict_proba(x_test.values)
    else:
        test_preds += model.predict_proba(x_test.values)

print("-" * 50)
test_preds /= KFOLD

--------------------------------------------------
Fold 1
1.7481942215166986
--------------------------------------------------
Fold 2
1.7445724634477868
--------------------------------------------------
Fold 3
1.7463170833535493
--------------------------------------------------
Fold 4
1.7493261484146119
--------------------------------------------------
Fold 5
1.7479210376460106
--------------------------------------------------
Fold 6
1.7492149265835062
--------------------------------------------------
Fold 7
1.7463703367885202
--------------------------------------------------
Fold 8
1.7473817618731409
--------------------------------------------------
Fold 9
1.7455244385108353
--------------------------------------------------
Fold 10
1.7447263811446727
--------------------------------------------------


In [21]:
# submission
sub = pd.read_csv('sample_submission.csv')
sub[sub.columns[1:]] = test_preds
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)

In [None]:
run.finish()