In [1]:
# Show the NVIDIA driver version, the CUDA version supported by that driver, and the current GPU status
!nvidia-smi

Mon Jun  7 19:27:18 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX250      WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   54C    P8    N/A /  N/A |     64MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Check the installed CUDA toolkit version (nvcc prints the CUDA compiler release only; it does not report cuDNN)
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:04_Central_Daylight_Time_2018
Cuda compilation tools, release 10.0, V10.0.130


In [None]:
# install packages
!pip install lightgbm
!pip install optuna
!pip install wandb

In [None]:
# Monitor Colab resources in real time
import wandb

# connect with jim107225017/colab/20210526
# Keep the returned run handle so the run can be finished explicitly later on.
run = wandb.init(project='colab', entity='jim107225017', name='CPU_GPU', id='20210526')

In [None]:
# install lightgbm GPU in colab
# Log in to Google first: mounts Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Remove the pip-installed LightGBM, fetch the sources (with submodules), then in one shell chain:
# recreate the build directory, configure with -DUSE_GPU=1, build with 4 parallel jobs and
# install the Python package with the --precompile --gpu flags.
!pip uninstall lightgbm -y
!git clone --recursive https://github.com/Microsoft/LightGBM
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu

In [4]:
# import packages
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Optuna
import optuna
from optuna.samplers import TPESampler   # TPE (Tree-structured Parzen Estimator) sampler
from optuna.integration import SkoptSampler   # Scikit-Optimize sampler
from optuna.pruners import SuccessiveHalvingPruner   # ASHA: pruning algorithm (asynchronous successive halving)

import lightgbm
from lightgbm import LGBMClassifier

from sklearn.base import clone
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from joblib import load, dump

def check_gpu_support():
    """Return True if LightGBM can train with ``device='gpu'`` in this environment.

    A throwaway model is fitted on a small random binary dataset. The probe is
    best-effort: any failure is reported on stdout and mapped to ``False``
    instead of being raised, so the notebook keeps running on CPU-only setups.

    Returns:
        bool: True when the GPU training call completed, False otherwise.
    """
    try:
        data = np.random.rand(1000, 10)
        label = np.random.randint(2, size=1000)
        train_data = lightgbm.Dataset(data, label=label)
        # One boosting round is enough to exercise device setup; the fitted
        # booster itself is not needed.
        lightgbm.train({'device': 'gpu'}, train_set=train_data, num_boost_round=1)
    except Exception as err:
        # Say why the probe failed instead of silently returning False.
        print(f'LightGBM GPU training is unavailable: {err!r}')
        return False
    return True

print(check_gpu_support())

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.01 MB) transferred to GPU in 0.001261 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.512000
True


In [5]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
# The data directory can be overridden with the TPS_JUN_DATA_DIR environment
# variable; when it is not set, the original local path is used.
path = os.environ.get(
    'TPS_JUN_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021',
)
# All later file names (CSV inputs, saved models, submission) are relative to this directory.
os.chdir(path)

In [6]:
# Read the training and test tables from the current working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [7]:
# Column roles: every column except 'id' and 'target' is used as a feature.
num_col = []
cat_col = [i for i in df_train.columns if i not in ['id', 'target']]
target_col = 'target'
comb = num_col + cat_col + [target_col]

# Label Y
le = LabelEncoder()
# astype returns a new array, so the cast is applied on the assigned value
# (a bare `y.astype('int64')` statement leaves y unchanged).
y = le.fit_transform(df_train[target_col]).astype('int64')
y = pd.DataFrame(y, columns=[target_col])

# Feature matrices for training and for the final test-set prediction
x = df_train[cat_col]
x_test = df_test[cat_col]

In [8]:
# Optuna
SEED = 20210607
PROJECT_NAME = 'lightgbm(optuna)-20210607'

# Seeded TPE sampler; the Scikit-Optimize alternative is kept for reference
# sampler = SkoptSampler()
sampler = TPESampler(seed=SEED)

# Successive-halving (ASHA) pruner with its default settings
pruner = SuccessiveHalvingPruner()

def objective(trial, kaggle_metrics='LogLoss', predic_proba=True):
    """Optuna objective: fit one LightGBM candidate and score it on a hold-out split.

    The module-level ``x`` / ``y`` are split 80/20 (stratified, seeded with
    ``SEED``), a ``LGBMClassifier`` is fitted with hyper-parameters sampled
    from ``trial`` and early stopping on the validation part, and the chosen
    metric on that validation part is returned.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        kaggle_metrics: Metric to return: 'LogLoss', 'AUC' or 'Acc'.
        predic_proba: If True, score class probabilities; otherwise score the
            predicted class labels.

    Returns:
        float: Value of the requested metric on the validation split.

    Raises:
        ValueError: If ``kaggle_metrics`` is not a supported metric name.

    Notes:
        'AUC' and 'Acc' are higher-is-better, so they need a study created
        with ``direction='maximize'``. No intermediate value is reported
        through ``trial.report()``, so a pruner attached to the study has
        nothing to act on.
    """
    # Reject an unknown metric before paying for a full model fit.
    supported_metrics = ('LogLoss', 'AUC', 'Acc')
    if kaggle_metrics not in supported_metrics:
        raise ValueError(
            f"Unsupported kaggle_metrics {kaggle_metrics!r}; expected one of {supported_metrics}"
        )

    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=SEED, shuffle=True, stratify=y)
    # Estimators and metrics expect 1-D label vectors, not an (n, 1) column.
    train_y, valid_y = np.ravel(train_y), np.ravel(valid_y)

    params = {'boosting_type': 'gbdt',
              'num_leaves': trial.suggest_int("num_leaves", 10, 200),
              'max_depth': trial.suggest_int('max_depth', 1, 25),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
              'n_estimators': trial.suggest_int("n_estimators", 100, 20000),
              'objective': 'multiclass',
              'class_weight': None,   # 'balanced': adjust class weight by class size.
              # LightGBM requires both fractions to be strictly positive, so the
              # search starts at 0.1 instead of 0.
              'subsample': trial.suggest_uniform('subsample', 0.1, 1),
              # Row subsampling is only applied when subsample_freq > 0.
              'subsample_freq': 1,
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
              'reg_alpha': trial.suggest_uniform('reg_alpha', 1e-3, 10),
              'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-3, 10),
              'random_state': SEED,
              'n_jobs': -1,
              'device_type': 'gpu',
              'metric': 'multi_logloss',
              'cat_smooth': trial.suggest_uniform('cat_smooth', 0.1, 100),
             }

    clf = LGBMClassifier(**params)
    clf.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            feature_name=cat_col,
            early_stopping_rounds=100,
            verbose=False)

    if predic_proba:
        preds = np.float64(clf.predict_proba(valid_x))
    else:
        preds = clf.predict(valid_x)

    if kaggle_metrics == 'LogLoss':
        return log_loss(valid_y, preds)
    if kaggle_metrics == 'AUC':
        # Multiclass probabilities need an explicit averaging scheme.
        return roc_auc_score(valid_y, preds, multi_class='ovr')

    # 'Acc': accuracy is defined on labels, so collapse probabilities first.
    if np.ndim(preds) == 2:
        preds = clf.classes_[np.argmax(preds, axis=1)]
    return accuracy_score(valid_y, preds)


In [None]:
OPTUNA_OPTIMIZATION = True

# Create the study that minimises the value returned by the objective
study = optuna.create_study(
    study_name=PROJECT_NAME,
    direction='minimize',
    sampler=sampler,
    pruner=pruner,
)

# Budget: at most 100 trials and a timeout of three hours (given in seconds)
study.optimize(objective, n_trials=100, timeout=3 * 60 * 60)

print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: score {study.best_trial.value}, params {study.best_trial.params}')

[32m[I 2021-06-07 12:47:44,787][0m A new study created in memory with name: lightgbm(optuna)-20210607[0m


In [9]:
# create best model
# best_params = study.best_trial.params

OPTUNA_OPTIMIZATION = True
# LightGBM settings for the final model, written out as one literal
best_params = {
    'boosting_type': 'gbdt',
    'num_leaves': 47,
    'max_depth': 17,
    'learning_rate': 0.016006157294323333,
    'n_estimators': 4945,
    'objective': 'multiclass',
    'class_weight': None,
    'subsample': 0.43139189367913644,
    'colsample_bytree': 0.1530577924027114,
    'reg_alpha': 7.882756575918996,
    'reg_lambda': 3.331787587893195,
    'random_state': SEED,
    'n_jobs': -1,
    'device_type': 'gpu',
    'metric': 'multi_logloss',
    'cat_smooth': 16.192565310228396,
}

In [None]:
if OPTUNA_OPTIMIZATION:
    # Show the study diagnostics in order: optimisation history, per-parameter
    # slices, parallel coordinates.
    for plot_fn in (
        optuna.visualization.plot_optimization_history,
        optuna.visualization.plot_slice,
        optuna.visualization.plot_parallel_coordinate,
    ):
        display(plot_fn(study))

In [None]:
if OPTUNA_OPTIMIZATION:
    # Tabular view of the study's trials
    trials_table = study.trials_dataframe()
    display(trials_table)

In [10]:
# Build the estimator used for the cross-validated final fit.
if OPTUNA_OPTIMIZATION:
    final_model = LGBMClassifier(**best_params)
else:
    # No tuned parameters requested: fall back to an untuned multiclass
    # baseline with LightGBM defaults and the notebook seed.
    final_model = LGBMClassifier(objective='multiclass', random_state=SEED, n_jobs=-1)

In [11]:
# Stratified K-fold training: fit one model per fold, save it, report its
# validation log-loss and average the test-set probabilities over all folds.
test_preds = None
KFOLD = 10
fold_scores = []   # validation log-loss of every fold

# Materialise the arrays once; labels are flattened to 1-D because the
# splitter, the estimator and the metric all expect a label vector.
x_all, y_all, x_test_all = x.values, np.ravel(y.values), x_test.values

kf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
for fold, (tr_index, val_index) in enumerate(kf.split(x_all, y_all)):

    print("-" * 50)
    print(f"Fold {fold + 1}")

    x_train, x_val = x_all[tr_index], x_all[val_index]
    y_train, y_val = y_all[tr_index], y_all[val_index]

    eval_set = [(x_val, y_val)]

    # Fresh unfitted copy per fold, so fold models never share state with
    # each other or with the final_model template.
    model = clone(final_model)
    model.fit(x_train, y_train, eval_set=eval_set, feature_name=cat_col, early_stopping_rounds=100, verbose=False)
    dump(model, f'{PROJECT_NAME}_{fold+1}.joblib', compress=3)

    # train_preds = model.predict(x_train)
    val_preds = model.predict_proba(x_val)

    fold_score = log_loss(y_val, val_preds)
    fold_scores.append(fold_score)
    print(fold_score)

    # Accumulate test-set probabilities; divided by KFOLD after the loop.
    fold_test_preds = model.predict_proba(x_test_all)
    test_preds = fold_test_preds if test_preds is None else test_preds + fold_test_preds

print("-" * 50)
test_preds /= KFOLD
print(f"Mean CV log-loss: {np.mean(fold_scores)}")

--------------------------------------------------
Fold 1
1.746124621435721
--------------------------------------------------
Fold 2
1.742258148834359
--------------------------------------------------
Fold 3
1.7459103029820238
--------------------------------------------------
Fold 4
1.7460789564366264
--------------------------------------------------
Fold 5
1.7461376297925673
--------------------------------------------------
Fold 6
1.745573683592618
--------------------------------------------------
Fold 7
1.7438839269049153
--------------------------------------------------
Fold 8
1.7447194680475098
--------------------------------------------------
Fold 9
1.7433068769947466
--------------------------------------------------
Fold 10
1.741265495180946
--------------------------------------------------


In [12]:
# submission
sub = pd.read_csv('sample_submission.csv')
sub[sub.columns[1:]] = test_preds
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)

In [None]:
# Close the active Weights & Biases run through the module-level API, which
# does not depend on a run handle variable being in scope.
wandb.finish()