In [None]:
# Check GPU version
!nvidia-smi

In [None]:
# Check CUDA/cuDNN Version
!nvcc -V && which nvcc

In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install pycaret[full]
!pip install wandb

In [None]:
# 即時監控colab資源
import wandb

# connect with jim107225017/colab/20210526
wandb.init(project='colab', entity='jim107225017', name='CPU_GPU', id='20210526')

In [None]:
# install lightgbm GPU in colab
# 先登入google cloud
from google.colab import drive
drive.mount('/content/drive')

!pip uninstall lightgbm -y
!git clone --recursive https://github.com/Microsoft/LightGBM
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# Pycaret
from pycaret.classification import *

import catboost
import lightgbm
import xgboost

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from category_encoders.cat_boost import CatBoostEncoder
from joblib import load, dump

def check_gpu_support():
    try:
        data = np.random.rand(1000, 10)
        label = np.random.randint(2, size=1000)
        train_data = lightgbm.Dataset(data, label=label)
        params = {'device': 'gpu'}
        gbm = lightgbm.train(params, train_set=train_data)
        return True
    except Exception as e:
        return False

print(check_gpu_support())

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.01 MB) transferred to GPU in 0.001006 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.480000
True


In [3]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"
path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
os.chdir(path)

In [4]:
# load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [5]:
df_train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,0,0,0,3,0,1,0,0,3,3,1,0,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11,0,0,0,0,0,9,0,0,0,0,0,0,3,0,1,3,0,0,0,0,0,0,0,1,1,0,0,3,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,0,1,0,0,0,0,0,0,0,0,5,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,5,0,5,0,1,0,0,0,1,0,0,1,0,7,0,1,1,0,3,4,0,0,1,3,0,2,0,0,8,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,1,2,0,5,0,0,4,0,0,22,2,1,0,0,0,0,3,0,37,0,1,3,13,0,10,0,3,1,1,0,7,0,0,2,0,1,0,0,0,0,0,0,10,0,0,25,1,0,1,2,0,2,0,7,0,0,0,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Class_2


In [6]:
# Imbalance
df_train['target'].value_counts()

Class_6    51811
Class_8    51763
Class_9    25542
Class_2    24431
Class_3    14798
Class_7    14769
Class_1     9118
Class_4     4704
Class_5     3064
Name: target, dtype: int64

In [7]:
all_data = pd.concat([df_train, df_test]).reset_index(drop=True)
all_data.drop(['id'], axis=1, inplace=True)

In [8]:
# # Check missing value
# print(all_data.isnull().sum())

# 敘述統計量
# print(all_data.describe())

# Check category features or numerical features
# print(all_data.nunique())

In [9]:
num_col = []
cat_col = [i for i in all_data.columns if i not in ['id', 'target']]
target_col = 'target'
comb = num_col + cat_col + [target_col]

# Label Y
le = LabelEncoder()
y = le.fit_transform(df_train[target_col])

# Target-based Encoder
ce = CatBoostEncoder(cols=cat_col, random_state=42)
x = df_train[cat_col]
ce.fit(x, y)

all_data[cat_col] = all_data[cat_col]
all_data[cat_col] = pd.DataFrame(ce.transform(all_data[cat_col]), columns=cat_col)

In [10]:
# split train and test
df_train = all_data[:len(df_train)]
df_train[target_col] = y.astype('int64')

df_test = all_data[len(df_train):]
df_test[target_col] = np.int64(0)

In [11]:
# Set-Up
PROJECT_NAME = 'pycaret-20210604'
SEED = 20210604

setup(data = df_train[comb],
      categorical_features=None,
      numeric_features=cat_col + num_col,
      imputation_type='iterative',
      target = target_col,
      train_size=1.0,
      test_data=df_test,
      data_split_shuffle=True,
      data_split_stratify=True,
      fold = 5,
      fold_strategy='stratifiedkfold',   # or 'kfold', 'stratifiedkfold', 'groupkfold', 'timeseries'
      fold_shuffle = True,
      n_jobs=-1,
      use_gpu = True,
      session_id=SEED,   # seed
      silent = True)

Unnamed: 0,Description,Value
0,session_id,20210604
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8"
4,Original Data,"(200000, 76)"
5,Missing Values,False
6,Numeric Features,75
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Make_Time_Features' object has no attribute 'list_of_features'

In [None]:
# GPU Enable
# models(internal=True)[['Name', 'GPU Enabled']]

In [18]:
# check all metrics used for model evaluation
# print(get_metrics())

# add Log Loss metric in pycaret
add_metric('logloss', 'LogLoss', log_loss, target='pred_proba', greater_is_better=False)

ValueError: id already present in metrics dataframe.

In [12]:
top6 = compare_models(# exclude=['catboost'],   #catboost不能 tune_model
                      sort='Accuracy',   # or ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC']
                      n_select=6,
                      cross_validation=True,
                      budget_time=10*60,   # in minutes
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.3609,0.6719,0.1852,0.2804,0.2843,0.169,0.1798,594.484
lightgbm,Light Gradient Boosting Machine,0.3604,0.6696,0.1842,0.2791,0.2851,0.1672,0.1783,37.584
lr,Logistic Regression,0.359,0.6688,0.1858,0.2595,0.2818,0.1688,0.1788,132.678
ridge,Ridge Classifier,0.3588,0.0,0.1833,0.253,0.2803,0.165,0.175,0.392
lda,Linear Discriminant Analysis,0.3579,0.6689,0.1863,0.2582,0.2829,0.1691,0.1788,2.978
catboost,CatBoost Classifier,0.3564,0.6662,0.1837,0.2757,0.2878,0.1643,0.1731,36.126
xgboost,Extreme Gradient Boosting,0.3558,0.6658,0.1833,0.2784,0.2852,0.1633,0.1726,56.378
ada,Ada Boost Classifier,0.3542,0.6572,0.1835,0.2604,0.2776,0.1624,0.1719,16.132
rf,Random Forest Classifier,0.3504,0.65,0.1775,0.2715,0.2818,0.1521,0.1616,45.48
et,Extra Trees Classifier,0.3478,0.6481,0.1759,0.2659,0.2815,0.1491,0.1579,83.514


In [13]:
# Tune Top 6 models
tuned_top6 = top6
# tuned_top6 = [tune_model(i, n_iter=10, search_library="optuna", search_algorithm="tpe") for i in top6]

In [14]:
# stacking
stacker = stack_models(estimator_list = tuned_top6[1:], meta_model=tuned_top6[0])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.3604,0.675,0.183,0.2706,0.2829,0.1663,0.1781
1,0.3635,0.6742,0.1853,0.3002,0.2865,0.17,0.1819
2,0.3637,0.6746,0.1852,0.2779,0.2861,0.1703,0.1819
3,0.3605,0.6746,0.1827,0.2754,0.2831,0.1649,0.1765
4,0.361,0.6733,0.1841,0.2903,0.2839,0.1674,0.1789
Mean,0.3618,0.6743,0.184,0.2829,0.2845,0.1678,0.1795
SD,0.0015,0.0006,0.0011,0.0108,0.0015,0.0021,0.0021


In [15]:
# blending
blender = blend_models(tuned_top6)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SD,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# select best model 
best = automl(optimize = 'Accuracy')
print(best)

StackingClassifier(cv=StratifiedKFold(n_splits=5, random_state=20210604, shuffle=True),
                   estimators=[('lightgbm',
                                LGBMClassifier(boosting_type='gbdt',
                                               class_weight=None,
                                               colsample_bytree=1.0,
                                               device='gpu',
                                               importance_type='split',
                                               learning_rate=0.1, max_depth=-1,
                                               min_child_samples=20,
                                               min_child_weight=0.001,
                                               min_split_gain=0.0,
                                               n_estimators=100, n_jobs=-1,
                                               num_leav...
                                                              max_features=None,
                            

In [None]:
# save best, stacking, blending and top6 model
save_model(best, model_name=PROJECT_NAME)
save_model(stacker, model_name=PROJECT_NAME + '_stacking')
save_model(blender, model_name=PROJECT_NAME + '_blending')
save_model(tuned_top6, model_name=PROJECT_NAME + '_top6')

In [None]:
# load best model
best = load_model(PROJECT_NAME)

In [24]:
# predict probability
result = predict_model(best, data=df_test, raw_score=True)

In [32]:
# submission
sub = pd.read_csv('sample_submission.csv')
sub[sub.columns[1:]] = result.iloc[:, -9:].values
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)

In [None]:
# load blending model
blender = load_model(PROJECT_NAME + '_blending')

In [None]:
# predict probability
result = predict_model(blender, data=df_test, raw_score=True)

In [None]:
# submission
sub = pd.read_csv('sample_submission.csv')
sub[sub.columns[1:]] = result.iloc[:, -9:].values
sub.to_csv(f'{PROJECT_NAME}_blending.csv', index=False)