In [None]:
# Show the GPU model, driver version and current usage reported by nvidia-smi
!nvidia-smi

In [None]:
# Check the CUDA toolkit version (nvcc -V) and where nvcc is installed.
# NOTE(review): nvcc reports the CUDA compiler version only; it does not show the cuDNN version.
!nvcc -V && which nvcc

In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install pycaret[full]
!pip install wandb

In [None]:
# Monitor Colab resources in real time with Weights & Biases
import wandb

# Attach to the run jim107225017/colab/20210526
wandb.init(
    project='colab',
    entity='jim107225017',
    name='CPU_GPU',
    id='20210526',
)

In [None]:
# Build and install the GPU version of LightGBM in Colab
# Log in to Google first (this mounts Google Drive at /content/drive)
from google.colab import drive
drive.mount('/content/drive')

# Remove any installed lightgbm package before installing the GPU build
!pip uninstall lightgbm -y
# Fetch the LightGBM sources together with their git submodules
!git clone --recursive https://github.com/Microsoft/LightGBM
# In a fresh build/ directory: configure with USE_GPU=1, compile with 4 parallel jobs,
# then install the Python package with the --precompile and --gpu options
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# Pycaret
from pycaret.classification import *

import catboost
import lightgbm
import xgboost

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from joblib import load, dump

def check_gpu_support():
    """Report whether the installed LightGBM build can train on a GPU.

    Trains a throw-away model on a small random binary-labelled dataset
    (1000 rows, 10 features) with ``device='gpu'``.

    Returns:
        bool: True if training completes; False if any exception is raised.
        On failure the exception is printed so the cause can be diagnosed.
    """
    try:
        data = np.random.rand(1000, 10)
        label = np.random.randint(2, size=1000)
        train_data = lightgbm.Dataset(data, label=label)
        # A single boosting round is enough to exercise GPU initialisation.
        lightgbm.train({'device': 'gpu'}, train_set=train_data, num_boost_round=1)
    except Exception as exc:
        # Best-effort probe: never crash the notebook, but surface why it failed.
        print(f'LightGBM GPU training unavailable: {exc!r}')
        return False
    return True

# Run the probe once and show the result
gpu_supported = check_gpu_support()
print(gpu_supported)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.01 MB) transferred to GPU in 0.000892 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.487000
True


In [3]:
# # connect with Google Cloud (Colab alternative)
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"

# Working directory from which train.csv, test.csv and sample_submission.csv are read.
# Set the TPS_DATA_DIR environment variable to run on another machine;
# the original local path remains the default.
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021',
)
os.chdir(path)

In [4]:
# Read the train and test tables from the working directory
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Preview the first rows of the training data
df_train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,0,0,0,3,0,1,0,0,3,3,1,0,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11,0,0,0,0,0,9,0,0,0,0,0,0,3,0,1,3,0,0,0,0,0,0,0,1,1,0,0,3,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,0,1,0,0,0,0,0,0,0,0,5,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,5,0,5,0,1,0,0,0,1,0,0,1,0,7,0,1,1,0,3,4,0,0,1,3,0,2,0,0,8,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,1,2,0,5,0,0,4,0,0,22,2,1,0,0,0,0,3,0,37,0,1,3,13,0,10,0,3,1,1,0,7,0,0,2,0,1,0,0,0,0,0,0,10,0,0,25,1,0,1,2,0,2,0,7,0,0,0,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Class_2


In [6]:
# Class imbalance: number of training rows per target class
df_train['target'].value_counts()

Class_6    51811
Class_8    51763
Class_9    25542
Class_2    24431
Class_3    14798
Class_7    14769
Class_1     9118
Class_4     4704
Class_5     3064
Name: target, dtype: int64

In [7]:
# Stack the train and test rows into one frame with a fresh index,
# then drop the id column
all_data = (
    pd.concat([df_train, df_test])
    .reset_index(drop=True)
    .drop(columns=['id'])
)

In [8]:
# # Check missing value
# print(all_data.isnull().sum())

# Descriptive statistics
# print(all_data.describe())

# Check category features or numerical features
# print(all_data.nunique())

In [9]:
# Column roles: every column except id/target is treated as a numeric feature
target_col = 'target'
cat_col = []
num_col = [col for col in all_data.columns if col not in ('id', 'target')]
comb = num_col + cat_col + [target_col]

# Min-max scale the numeric features (fitted on the train and test rows together)
scaler = MinMaxScaler().fit(all_data[num_col])
all_data[num_col] = pd.DataFrame(
    scaler.transform(all_data[num_col]),
    columns=num_col,
)

# Encode the class labels of the training target as integers
le = LabelEncoder()
y = le.fit_transform(df_train[target_col])

In [10]:
# split train and test
# Take explicit copies: assigning a column on a bare slice of all_data
# triggers pandas' SettingWithCopyWarning.
n_train = len(df_train)

df_train = all_data.iloc[:n_train].copy()
df_train[target_col] = y.astype('int64')   # integer-encoded class labels

df_test = all_data.iloc[n_train:].copy()
df_test[target_col] = np.int64(0)   # placeholder value for the target column

In [11]:
# Experiment configuration
PROJECT_NAME = 'pycaret-20210602'
SEED = 20210602

# Initialise the PyCaret classification experiment
setup(
    # --- data ---
    data=df_train[comb],
    target=target_col,
    numeric_features=num_col,
    categorical_features=cat_col,
    imputation_type='iterative',
    # --- train / hold-out split ---
    train_size=1.0,
    test_data=df_test,
    data_split_shuffle=True,
    data_split_stratify=True,
    # --- cross-validation ---
    fold=5,
    fold_strategy='stratifiedkfold',   # or 'kfold', 'stratifiedkfold', 'groupkfold', 'timeseries'
    fold_shuffle=True,
    # --- runtime ---
    n_jobs=-1,
    use_gpu=True,
    session_id=SEED,   # random seed
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,20210602
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8"
4,Original Data,"(200000, 76)"
5,Missing Values,False
6,Numeric Features,75
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


(False,
 None,
         feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
 200000   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
 200001   0.016393   0.039216   0.000000   0.000000   0.000000   0.000000   
 200002   0.000000   0.019608   0.109375   0.014286   0.000000   0.000000   
 200003   0.000000   0.000000   0.000000   0.057143   0.078947   0.013158   
 200004   0.000000   0.000000   0.078125   0.000000   0.000000   0.000000   
 ...           ...        ...        ...        ...        ...        ...   
 299995   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
 299996   0.000000   0.000000   0.000000   0.000000   0.000000   0.013158   
 299997   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
 299998   0.000000   0.000000   0.000000   0.000000   0.052632   0.026316   
 299999   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
 
         feature_6  feature_7  feature_8  feature_9  ...  f

In [12]:
# GPU Enable
# models(internal=True)[['Name', 'GPU Enabled']]

In [13]:
# check all metrics used for model evaluation
# print(get_metrics())

# Register log loss as a custom PyCaret metric, computed on the predicted
# probabilities; lower values are better
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target='pred_proba',
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x000001EF4225CF78>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [14]:
# Compare the candidate models with cross-validation and keep the six best by accuracy
top6 = compare_models(
    exclude=['catboost'],   # catboost cannot be used with tune_model
    sort='Accuracy',   # or ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC']
    n_select=6,
    cross_validation=True,
    budget_time=10*60,   # the budget is given in minutes, i.e. 10 hours
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
gbc,Gradient Boosting Classifier,0.3606,0.6717,0.185,0.2768,0.284,0.1683,0.1794,1.7533,549.052
lightgbm,Light Gradient Boosting Machine,0.3603,0.6691,0.1839,0.2809,0.2846,0.1667,0.178,1.7563,31.406
xgboost,Extreme Gradient Boosting,0.3548,0.6643,0.1828,0.2765,0.2845,0.162,0.1713,1.7694,58.78
ada,Ada Boost Classifier,0.3541,0.6576,0.1831,0.2594,0.2775,0.1619,0.1715,2.1782,15.822
rf,Random Forest Classifier,0.3504,0.6502,0.1773,0.2673,0.2811,0.152,0.1617,2.0229,43.518
et,Extra Trees Classifier,0.3464,0.6447,0.1737,0.2694,0.2761,0.1442,0.1543,2.0284,76.622
lr,Logistic Regression,0.3403,0.6327,0.172,0.2339,0.2641,0.1376,0.1471,1.8285,25.044
lda,Linear Discriminant Analysis,0.333,0.6332,0.1592,0.2372,0.2547,0.1165,0.128,1.8312,2.694
ridge,Ridge Classifier,0.3226,0.0,0.1468,0.2339,0.239,0.0948,0.1077,0.0,0.364
knn,K Neighbors Classifier,0.2601,0.5726,0.1546,0.236,0.2394,0.0858,0.0872,13.8432,746.396


In [15]:
# Tune each of the top 6 models: 10 Optuna iterations using the TPE search algorithm
tuned_top6 = [
    tune_model(model, n_iter=10, search_library="optuna", search_algorithm="tpe")
    for model in top6
]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.3582,0.6618,0.1789,0.2299,0.2788,0.1582,0.1697,1.797
1,0.3577,0.6628,0.1786,0.2292,0.2783,0.1575,0.1689,1.7974
2,0.3567,0.6597,0.178,0.2292,0.2777,0.1561,0.1676,1.7984
3,0.3566,0.6602,0.1783,0.2288,0.2775,0.1558,0.1672,1.797
4,0.3602,0.6614,0.1809,0.2311,0.2805,0.1612,0.1727,1.7955
Mean,0.3579,0.6612,0.1789,0.2296,0.2786,0.1578,0.1692,1.7971
SD,0.0013,0.0011,0.001,0.0008,0.0011,0.0019,0.002,0.0009


In [16]:
# Stacking: the first tuned model is the meta-model,
# the remaining tuned models are the base estimators
stacker = stack_models(
    estimator_list=tuned_top6[1:],
    meta_model=tuned_top6[0],
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.3634,0.6752,0.1841,0.2682,0.2833,0.1686,0.182,1.7472
1,0.3624,0.6762,0.1837,0.2608,0.2825,0.1677,0.1806,1.7451
2,0.3606,0.6725,0.1826,0.2651,0.2815,0.165,0.1779,1.7491
3,0.3626,0.6736,0.1838,0.2467,0.2829,0.1677,0.1805,1.7467
4,0.3643,0.6768,0.1853,0.2559,0.284,0.1703,0.1833,1.7428
Mean,0.3627,0.6748,0.1839,0.2593,0.2828,0.1679,0.1809,1.7462
SD,0.0012,0.0016,0.0009,0.0075,0.0008,0.0017,0.0018,0.0021


In [17]:
# Blending: combine all tuned models into a single ensemble
blender = blend_models(estimator_list=tuned_top6)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.3638,0.6733,0.1844,0.2355,0.2841,0.1695,0.1817,1.7807
1,0.3599,0.6741,0.1825,0.2329,0.281,0.1648,0.1765,1.7808
2,0.3599,0.6698,0.1825,0.2334,0.2811,0.1644,0.1764,1.7834
3,0.3608,0.6712,0.1832,0.2336,0.2817,0.1656,0.1775,1.7807
4,0.3638,0.6741,0.1853,0.2355,0.2841,0.1699,0.1821,1.7785
Mean,0.3616,0.6725,0.1836,0.2342,0.2824,0.1668,0.1788,1.7808
SD,0.0018,0.0017,0.0011,0.0011,0.0014,0.0024,0.0025,0.0015


In [19]:
# Let PyCaret pick the best model of this session by accuracy, then show it
best = automl(optimize='Accuracy')
print(best)

StackingClassifier(cv=StratifiedKFold(n_splits=5, random_state=20210602, shuffle=True),
                   estimators=[('lightgbm',
                                LGBMClassifier(bagging_fraction=0.6562017395042332,
                                               bagging_freq=2,
                                               boosting_type='gbdt',
                                               class_weight=None,
                                               colsample_bytree=1.0,
                                               device='gpu',
                                               feature_fraction=0.4253723720487174,
                                               importance_type='split',
                                               learning_rate=0.05684321254283615,
                                               max_dept...
                                                              min_impurity_decrease=9.817071798033433e-06,
                                                    

In [20]:
# Persist the best, stacking and blending models and the list of tuned top-6 models
save_model(best, model_name=PROJECT_NAME)
save_model(stacker, model_name=f'{PROJECT_NAME}_stacking')
save_model(blender, model_name=f'{PROJECT_NAME}_blending')
save_model(tuned_top6, model_name=f'{PROJECT_NAME}_top6')

Transformation Pipeline and Model Succesfully Saved
Transformation Pipeline and Model Succesfully Saved
Transformation Pipeline and Model Succesfully Saved
Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['feature_0',
                                                           'feature_1',
                                                           'feature_2',
                                                           'feature_3',
                                                           'feature_4',
                                                           'feature_5',
                                                           'feature_6',
                                                           'feature_7',
                                                           'feature_8',
                                   

In [21]:
# Reload the saved best model from disk
best = load_model(model_name=PROJECT_NAME)

Transformation Pipeline and Model Successfully Loaded


In [22]:
# Predict the test rows with the best model, requesting raw scores
result = predict_model(
    estimator=best,
    data=df_test,
    raw_score=True,
)

In [28]:
# submission
sub = pd.read_csv('sample_submission.csv')

# Every column after the first in the sample submission is a class column;
# derive the class count from it instead of hard-coding 9.
class_columns = sub.columns[1:]
n_classes = len(class_columns)

# The class scores are taken from the last n_classes columns of the prediction frame.
sub[class_columns] = result.iloc[:, -n_classes:].values
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)

In [30]:
# Reload the saved blending model from disk
blender = load_model(model_name=f'{PROJECT_NAME}_blending')

Transformation Pipeline and Model Successfully Loaded


In [None]:
# Predict the test rows with the blending model, requesting raw scores
result = predict_model(
    estimator=blender,
    data=df_test,
    raw_score=True,
)

In [None]:
# submission
sub = pd.read_csv('sample_submission.csv')

# Every column after the first in the sample submission is a class column;
# derive the class count from it instead of hard-coding 9.
class_columns = sub.columns[1:]
n_classes = len(class_columns)

# The class scores are taken from the last n_classes columns of the prediction frame.
sub[class_columns] = result.iloc[:, -n_classes:].values
sub.to_csv(f'{PROJECT_NAME}_blending.csv', index=False)