In [1]:
# Check GPU version
!nvidia-smi

Sat Jun  5 18:19:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 462.31       Driver Version: 462.31       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX250      WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   39C    P0    N/A /  N/A |     64MiB /  2048MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Check CUDA/cuDNN Version
!nvcc -V && which nvcc

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:04_Central_Daylight_Time_2018
Cuda compilation tools, release 10.0, V10.0.130


In [None]:
# install packages
!pip install lightgbm
!pip install optuna
!pip install wandb

In [None]:
# 即時監控colab資源
import wandb

# connect with jim107225017/colab/20210526
wandb.init(project='colab', entity='jim107225017', name='CPU_GPU', id='20210526')

In [None]:
# install lightgbm GPU in colab
# 先登入google cloud
from google.colab import drive
drive.mount('/content/drive')

!pip uninstall lightgbm -y
!git clone --recursive https://github.com/Microsoft/LightGBM
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# Optuna
import optuna
from optuna.samplers import TPESampler   # TPE (Tree-structured Parzen Estimator) sampler
from optuna.integration import SkoptSampler   # Scikit-Optimize sampler
from optuna.pruners import SuccessiveHalvingPruner   # ASHA : 剪枝演算法，防止over-fitting

import lightgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from category_encoders.cat_boost import CatBoostEncoder

from joblib import load, dump

def check_gpu_support():
    try:
        data = np.random.rand(1000, 10)
        label = np.random.randint(2, size=1000)
        train_data = lightgbm.Dataset(data, label=label)
        params = {'device': 'gpu'}
        gbm = lightgbm.train(params, train_set=train_data)
        return True
    except Exception as e:
        return False

print(check_gpu_support())

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.01 MB) transferred to GPU in 0.000783 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.489000
True


In [3]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
os.chdir(path)

In [4]:
# load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

all_data = pd.concat([df_train, df_test]).reset_index(drop=True)
all_data.drop(['id'], axis=1, inplace=True)

In [5]:
num_col = []
cat_col = [i for i in all_data.columns if i not in ['id', 'target']]
target_col = 'target'
comb = num_col + cat_col + [target_col]

# Label Y
le = LabelEncoder()
y = le.fit_transform(df_train[target_col])

#  Catboost Target Encoder
ce = CatBoostEncoder(cols=cat_col)
x = df_train[cat_col]
ce.fit(x, y)

all_data[cat_col] = pd.DataFrame(ce.transform(all_data[cat_col]), columns=cat_col)

  elif pd.api.types.is_categorical(cols):


In [6]:
# split train and test
df_train = all_data[:len(df_train)]
df_train[target_col] = y.astype('int64')

df_test = all_data[len(df_train):]
df_test.drop(target_col, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [11]:
# Optuna
PROJECT_NAME = 'lightgbm(optuna)-20210605'
SEED = 20210605

sampler = TPESampler(seed=SEED)
# sampler = SkoptSampler()

pruner = SuccessiveHalvingPruner()

def objective(trial, kaggle_metrics='LogLoss', predic_proba=True):
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, random_state=SEED, shuffle=True, stratify=y)
    
    params = {'boosting_type': 'gbdt',
              'num_leaves': trial.suggest_int("num_leaves", 10, 200),
              'max_depth': trial.suggest_int('max_depth', 1, 25),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
              'n_estimators': trial.suggest_int("n_estimators", 100, 20000),
              'objective': 'multiclass',
              'class_weight': None,   # 'balanced': adjust class weight by class size.
              'subsample': trial.suggest_uniform('subsample', 0, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0, 1),
              'reg_alpha': trial.suggest_uniform('reg_alpha', 1e-3, 100),
              'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-3, 100),
              'random_state': SEED,
              'n_jobs': -1,
              'device_type': 'gpu',
              'metric': 'multi_logloss',
              # 'categorical_feature': "",
              # 'cat_smooth': trial.suggest_uniform('cat_smooth', 0.1, 100),    
             }

    clf = LGBMClassifier(**params)
    clf.fit(train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=100,
            verbose=False)
    
    if predic_proba:
        preds = clf.predict_proba(valid_x)
        preds = np.float64(preds)
    else:
        preds = clf.predict(valid_x)
    
    if kaggle_metrics == 'LogLoss':
        result = log_loss(valid_y, preds)
    elif kaggle_metrics == 'AUC':
        result = roc_auc_score(valid_y, preds)
    elif kaggle_metrics == 'Acc':
        result = accuracy_score(valid_y, preds)
    
    return result


Unnamed: 0,Description,Value
0,session_id,20210602
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8"
4,Original Data,"(200000, 76)"
5,Missing Values,False
6,Numeric Features,75
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


(0         5
 1         5
 2         1
 3         7
 4         1
          ..
 199995    5
 199996    5
 199997    7
 199998    6
 199999    7
 Name: target, Length: 200000, dtype: int32,
 'clf-default-name',
 {'lr': <pycaret.containers.models.classification.LogisticRegressionClassifierContainer at 0x154805e7e88>,
  'knn': <pycaret.containers.models.classification.KNeighborsClassifierContainer at 0x154805e8e88>,
  'nb': <pycaret.containers.models.classification.GaussianNBClassifierContainer at 0x154805e8508>,
  'dt': <pycaret.containers.models.classification.DecisionTreeClassifierContainer at 0x154805e8888>,
  'svm': <pycaret.containers.models.classification.SGDClassifierContainer at 0x154805e8288>,
  'rbfsvm': <pycaret.containers.models.classification.SVCClassifierContainer at 0x154805ed8c8>,
  'gpc': <pycaret.containers.models.classification.GaussianProcessClassifierContainer at 0x154805edf48>,
  'mlp': <pycaret.containers.models.classification.MLPClassifierContainer at 0x154805ed248

In [5]:
OPTUNA_OPTIMIZATION = True

study = optuna.create_study(direction='minimize', 
                            sampler=sampler, 
                            pruner=pruner, 
                            study_name=PROJECT_NAME,
                           )

study.optimize(objective, 
               n_trials=100, 
               timeout=3*60*60,   # in seconds
              )

print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

SyntaxError: invalid syntax (<ipython-input-5-2234eff5c41e>, line 20)

In [None]:
# create best model
best_params = study.best_trial.params
best_params['boosting_type'] = 'gbdt'
best_params['num_leaves'] = int
best_params['max_depth'] = int
best_params['learning_rate'] = float
best_params['n_estimators'] = int
best_params['objective'] = 'multiclass'
best_params['class_weight'] = None
best_params['subsample'] = float
best_params['colsample_bytree'] = float
best_params['reg_alpha'] = float
best_params['reg_lambda'] = float
best_params['random_state'] = SEED
best_params['n_jobs']  = -1
best_params['device_type'] = 'gpu'
best_params['metric'] = 'multi_logloss'

In [None]:
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_optimization_history(study))
    display(optuna.visualization.plot_slice(study))
    display(optuna.visualization.plot_parallel_coordinate(study))

In [None]:
if OPTUNA_OPTIMIZATION:
    display(study.trials_dataframe())

In [None]:
if OPTUNA_OPTIMIZATION:
    final_model = LGBMClassifier(**best_params)
else:
    final_model = LGBMClassifier(**trial)