In [None]:
# Show the NVIDIA driver version and the GPU(s) visible to this runtime
!nvidia-smi

In [None]:
# Print the CUDA compiler (nvcc) version, then the path of the nvcc binary in use.
# NOTE(review): this command does not report the cuDNN version.
!nvcc -V && which nvcc

In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install pycaret[full]
!pip install wandb

In [None]:
# Monitor Colab resource usage in real time through Weights & Biases
import wandb

# Attach to the run jim107225017/colab/20210526
wandb.init(
    entity='jim107225017',
    project='colab',
    id='20210526',
    name='CPU_GPU',
)

In [None]:
# Build and install the GPU-enabled LightGBM from source on Colab
# Sign in to Google first: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Remove the installed LightGBM, clone the sources (with submodules), configure
# a clean build directory with -DUSE_GPU=1, compile with 4 parallel jobs, then
# install the Python package with the --precompile --gpu flags.
!pip uninstall lightgbm -y
!git clone --recursive https://github.com/Microsoft/LightGBM
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu

In [3]:
# import packages
import os
import numpy as np
import pandas as pd

# Pycaret
from pycaret.classification import *

import catboost
import lightgbm
import xgboost

from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from joblib import load, dump

def check_gpu_support():
    """Report whether the installed LightGBM can train with ``device='gpu'``.

    Trains a throw-away model on a small random binary dataset
    (1000 rows x 10 features). The check is best-effort: any failure is
    printed (instead of being silently discarded) and mapped to ``False``.

    Returns:
        bool: True if the GPU training run completed, False otherwise.
    """
    try:
        features = np.random.rand(1000, 10)
        labels = np.random.randint(2, size=1000)
        train_data = lightgbm.Dataset(features, label=labels)
        lightgbm.train({'device': 'gpu'}, train_set=train_data)
    except Exception as err:
        # Surface the reason: a bare "False" gives no hint about what went wrong.
        print(f'LightGBM GPU check failed: {err!r}')
        return False
    return True

print(check_gpu_support())

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.01 MB) transferred to GPU in 0.000863 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.517000
True


In [4]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"
# NOTE(review): machine-specific absolute path. The CSV files read by this
# notebook (train.csv, test.csv, sample_submission.csv) are opened relative to
# this working directory, so adjust it before running elsewhere.
path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
os.chdir(path)

In [5]:
# Read the training and test tables from the current working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [6]:
# Preview the first rows of the training table
df_train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,0,0,0,3,0,1,0,0,3,3,1,0,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11,0,0,0,0,0,9,0,0,0,0,0,0,3,0,1,3,0,0,0,0,0,0,0,1,1,0,0,3,0,0,0,0,0,0,2,0,0,Class_6
1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,Class_6
2,2,0,0,0,0,0,1,0,3,0,0,1,0,0,0,0,0,0,0,0,5,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,5,0,5,0,1,0,0,0,1,0,0,1,0,7,0,1,1,0,3,4,0,0,1,3,0,2,0,0,8,0,0,0,0,1,0,0,0,0,Class_2
3,3,0,0,7,0,1,5,2,2,0,1,2,0,5,0,0,4,0,0,22,2,1,0,0,0,0,3,0,37,0,1,3,13,0,10,0,3,1,1,0,7,0,0,2,0,1,0,0,0,0,0,0,10,0,0,25,1,0,1,2,0,2,0,7,0,0,0,0,4,0,2,2,0,4,3,0,Class_8
4,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Class_2


In [7]:
# Row count per target class — used to check how imbalanced the classes are
df_train['target'].value_counts()

Class_6    51811
Class_8    51763
Class_9    25542
Class_2    24431
Class_3    14798
Class_7    14769
Class_1     9118
Class_4     4704
Class_5     3064
Name: target, dtype: int64

In [8]:
# Stack train and test rows into one feature table with a fresh 0..n-1 index,
# leaving out the identifier and label columns
all_data = pd.concat([df_train, df_test], ignore_index=True).drop(columns=['id', 'target'])

In [9]:
# print(all_data.isnull().sum())
# all_data.describe()

In [10]:
# Column roles: every column except the identifier and the label is numeric
num_col = [col for col in df_train.columns if col not in ['id', 'target']]
cat_col = []
target_col = 'target'
comb = num_col + cat_col + [target_col]

# Scale every numeric feature to [0, 1]; the scaler is fitted on the train and
# test rows together (all_data)
scaler = MinMaxScaler()
all_data[num_col] = pd.DataFrame(scaler.fit_transform(all_data[num_col]), columns=num_col)

# Encode the class labels as integers; the fitted encoder stays available as `le`
le = LabelEncoder()
df_train[target_col] = le.fit_transform(df_train[target_col])

# Write the scaled features back to the training frame (its rows come first in all_data)
df_train[num_col] = all_data[num_col][:len(df_train)]

# Scale the test frame with the same fitted scaler. Without this the models are
# trained on scaled features while predictions are made on raw ones.
# scaler.transform returns an ndarray, so the assignment is positional and does
# not depend on how df_test's index lines up with all_data.
df_test[num_col] = scaler.transform(df_test[num_col])

In [11]:
# Experiment set-up
PROJECT_NAME = 'pycaret-20210602'

setup(
    # data and column roles
    data=df_train[comb],
    target=target_col,
    numeric_features=num_col,
    categorical_features=cat_col,
    imputation_type='iterative',
    # hold-out split
    train_size=0.8,
    data_split_shuffle=True,
    data_split_stratify=True,
    # cross-validation
    fold=5,
    fold_strategy='stratifiedkfold',  # alternatives: 'kfold', 'groupkfold', 'timeseries'
    fold_shuffle=True,
    # execution
    n_jobs=-1,
    use_gpu=True,
    session_id=42,  # random seed
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(200000, 76)"
5,Missing Values,False
6,Numeric Features,75
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


([],
 StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
 False,
 184563    1
 110008    6
 10866     7
 88471     8
 148448    6
          ..
 130088    8
 160980    5
 68536     6
 91266     2
 145705    4
 Name: target, Length: 40000, dtype: int32,
 {'lr': <pycaret.containers.models.classification.LogisticRegressionClassifierContainer at 0x20b7a3b5348>,
  'knn': <pycaret.containers.models.classification.KNeighborsClassifierContainer at 0x20b7a40de48>,
  'nb': <pycaret.containers.models.classification.GaussianNBClassifierContainer at 0x20b7a40d088>,
  'dt': <pycaret.containers.models.classification.DecisionTreeClassifierContainer at 0x20b7a411508>,
  'svm': <pycaret.containers.models.classification.SGDClassifierContainer at 0x20b7a411188>,
  'rbfsvm': <pycaret.containers.models.classification.SVCClassifierContainer at 0x20b7a411048>,
  'gpc': <pycaret.containers.models.classification.GaussianProcessClassifierContainer at 0x20b7a40f1c8>,
  'mlp': <pycaret.containers.models.cl

In [12]:
# GPU Enable
# models(internal=True)[['Name', 'GPU Enabled']]

In [13]:
# To list the metrics currently used for model evaluation:
# print(get_metrics())

# Register log loss as an additional pycaret metric; it is scored on the
# predicted probabilities and lower values are better
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target='pred_proba',
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x0000020B697DE5E8>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [None]:
# Train the candidate models without cross-validation and keep the six best,
# ranked by accuracy
top6 = compare_models(
    # include=['xgboost', 'lightgbm', 'catboost'],
    sort='Accuracy',  # or one of 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC'
    n_select=6,
    cross_validation=False,
    budget_time=10 * 60,  # in minutes
)

IntProgress(value=0, description='Processing: ', max=79)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
lr,Logistic Regression,0.3387,0.6314,0.1712,0.2161,0.2629,0.1352,0.1446,1.83,26.67


In [22]:
# Hyper-parameter search for each shortlisted model:
# 50 iterations of Optuna driven through tune-sklearn
tuned_top6 = [
    tune_model(
        candidate,
        n_iter=50,
        search_library="tune-sklearn",
        search_algorithm="optuna",
    )
    for candidate in top6
]

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
# Stacking: combine the tuned models into one stacked ensemble
stacker = stack_models(tuned_top6)

In [None]:
# Blending: combine the tuned models into one blended ensemble
blender = blend_models(tuned_top6)

In [13]:
# Select the best model, optimizing the custom 'logloss' metric
# NOTE(review): the recorded run of this cell failed with
# "ValueError: max() arg is an empty sequence" — presumably no model had been
# trained in that kernel session yet; confirm by running the cells in order.
best = automl(optimize = 'logloss')
print(best)

ValueError: max() arg is an empty sequence

In [None]:
# Persist the best, stacked and blended models plus the tuned top-6 list;
# every artifact name is derived from PROJECT_NAME
save_model(best, model_name=PROJECT_NAME)
save_model(stacker, model_name=PROJECT_NAME + '_stacking')
save_model(blender, model_name=PROJECT_NAME + '_blending')
save_model(tuned_top6, model_name=PROJECT_NAME + '_top6')

In [None]:
# Reload the best model that was saved under PROJECT_NAME
best = load_model(PROJECT_NAME)

In [None]:
# Predict on the test set; raw_score=True requests the per-class probabilities
result = predict_model(best, data=df_test, raw_score=True)

In [None]:
# Build the submission file from the per-class scores
sub = pd.read_csv('sample_submission.csv')
# Every column after the first one (presumably the id) is one class. Take
# exactly that many trailing score columns from the prediction frame: a
# hard-coded width such as 4 does not match this target, which has 9 classes.
class_cols = sub.columns[1:]
sub[class_cols] = result.iloc[:, -len(class_cols):]
sub.to_csv(f'{PROJECT_NAME}.csv', index=False)