# Drive mount

In [1]:
# Attach Google Drive to the Colab filesystem so files under MyDrive can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import data

In [2]:
import pandas as pd

In [3]:
# Read the train and test CSV files from the dataset folder on the mounted Drive.
path = '/content/drive/MyDrive/dataset/'
train, test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

# Installation

### 방법 1

In [4]:
# pycaret에 없는 모델 설치
!pip install xgboost
!pip install --upgrade xgboost
!pip install catboost



In [5]:
# pycaret 설치
!pip install pycaret 
#!pip install category_encoders
# !pip install --upgrade pycaret



In [7]:
# import pycaret
from pycaret.classification import *

### 방법 2

In [5]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-2.3.6-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 5.1 MB/s 
Collecting imbalanced-learn==0.7.0
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 34.2 MB/s 
[?25hCollecting kmodes>=0.10.1
  Downloading kmodes-0.11.1-py2.py3-none-any.whl (19 kB)
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 63.1 MB/s 
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 50.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mlfl

In [4]:
# Switch pycaret into Colab mode; the call prints "Colab mode enabled." on success.
from pycaret.utils import enable_colab
enable_colab()

Colab mode enabled.


In [5]:
!pip install pycaret[full]



In [7]:
import pycaret

from pycaret.classification import *

# Data preprocessing

# Modeling

### setup()

말 그대로 준비 단계. 기본적으로 data와 target을 필수로 지정한 뒤, 필요한 옵션들을 분석 목적에 맞게 설정할 수 있다.

In [8]:
# Initialise the pycaret classification experiment: `train` is the modelling
# data, the "target" column is the label, and cross-validation uses 15 folds.
# session_id fixes the random seed so results are repeatable between runs;
# 228 is the seed reported in this cell's recorded output.
model = setup(
    data = train,
    target = "target",
    fold = 15,
    session_id = 228
)

Unnamed: 0,Description,Value
0,session_id,228
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3000, 24)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,18
8,Ordinal Features,False
9,High Cardinality Features,False


### compare_models()

setup에 맞는 모델 자동으로 적용해 보고 비교
> 직접 모델을 선언해줄 수도 있음: model() -> create_model()

In [9]:
# Train and compare the candidate classifiers, rank them by accuracy and keep
# the five best; the reported metrics are rounded to 4 decimal places.
top5_model = compare_models(
    n_select=5,
    sort="Accuracy",
    round=4,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.93,0.9821,0.9363,0.9379,0.9367,0.8583,0.8593,2.926
xgboost,Extreme Gradient Boosting,0.9252,0.9785,0.9346,0.9314,0.9327,0.8485,0.8493,3.512
lightgbm,Light Gradient Boosting Machine,0.9219,0.981,0.9277,0.9315,0.9293,0.842,0.8427,0.1527
gbc,Gradient Boosting Classifier,0.9152,0.9743,0.9234,0.9238,0.9234,0.8284,0.829,0.51
et,Extra Trees Classifier,0.9142,0.9758,0.9329,0.9146,0.9233,0.826,0.827,0.6247
rf,Random Forest Classifier,0.9095,0.9738,0.9286,0.9105,0.9192,0.8163,0.8172,0.6267
lr,Logistic Regression,0.8938,0.9597,0.9088,0.9007,0.9045,0.7848,0.7854,1.0947
ada,Ada Boost Classifier,0.8928,0.9566,0.8924,0.912,0.9018,0.7838,0.7846,0.208
lda,Linear Discriminant Analysis,0.8904,0.9582,0.9027,0.9003,0.9011,0.7782,0.7791,0.0473
ridge,Ridge Classifier,0.8895,0.0,0.9036,0.8982,0.9004,0.7762,0.7771,0.0187


# Tuning

### tune_model()

모델의 하이퍼 파라미터 최적화

In [19]:
# Tune the hyperparameters of each of the top-5 models, using 10 search iterations per model.
tuned_top5 = [tune_model(base_model, n_iter=10) for base_model in top5_model]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9,0.9644,0.9103,0.9103,0.9103,0.7974,0.7974
1,0.9,0.9578,0.9231,0.9,0.9114,0.7967,0.797
2,0.8929,0.9686,0.8718,0.9315,0.9007,0.7847,0.7867
3,0.8929,0.9671,0.8846,0.92,0.902,0.784,0.7847
4,0.9357,0.9793,0.9359,0.9481,0.9419,0.8699,0.87
5,0.8714,0.9601,0.8718,0.8947,0.8831,0.7403,0.7406
6,0.8929,0.9636,0.8974,0.9091,0.9032,0.7832,0.7833
7,0.8929,0.9665,0.8718,0.9315,0.9007,0.7847,0.7867
8,0.9,0.9731,0.8718,0.9444,0.9067,0.7993,0.8023
9,0.8857,0.9715,0.8718,0.9189,0.8947,0.7699,0.7712


### blend_models()

Voting 알고리즘을 구현

In [20]:
# Build a soft-voting ensemble from the top-5 models.
# choose_better=True: presumably keeps whichever performs better, the blend or
# one of its input models -- confirm against the pycaret documentation.
# NOTE(review): this blends the untuned top5_model list, not tuned_top5 --
# confirm that is intended.
blend5_soft = blend_models(
    estimator_list=top5_model,
    method='soft',
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9357,0.9868,0.9487,0.9367,0.9427,0.8695,0.8696
1,0.9071,0.9758,0.9487,0.8916,0.9193,0.8103,0.8124
2,0.9429,0.9872,0.9487,0.9487,0.9487,0.8842,0.8842
3,0.9143,0.9806,0.9103,0.9342,0.9221,0.8269,0.8272
4,0.95,0.9909,0.9615,0.9494,0.9554,0.8985,0.8986
5,0.9214,0.9853,0.9359,0.9241,0.9299,0.8405,0.8406
6,0.9357,0.9868,0.9487,0.9367,0.9427,0.8695,0.8696
7,0.9071,0.9768,0.8974,0.9333,0.915,0.8128,0.8135
8,0.95,0.9866,0.9359,0.9733,0.9542,0.8992,0.9
9,0.9286,0.9857,0.9487,0.925,0.9367,0.8548,0.8551


### stack_models()

stacking 적용

In [10]:
# Stack the top-5 models; no meta-model is passed, so pycaret's default is used.
blend5_stack = stack_models(top5_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9143,0.9806,0.9221,0.9221,0.9221,0.8268,0.8268
1,0.9571,0.9918,0.961,0.961,0.961,0.9134,0.9134
2,0.95,0.988,0.961,0.9487,0.9548,0.8988,0.8989
3,0.9071,0.9685,0.8701,0.9571,0.9116,0.8143,0.8184
4,0.9143,0.9718,0.9221,0.9221,0.9221,0.8268,0.8268
5,0.95,0.9829,0.9481,0.9605,0.9542,0.8991,0.8992
6,0.8929,0.9639,0.9221,0.8875,0.9045,0.7826,0.7833
7,0.9214,0.9773,0.9359,0.9241,0.9299,0.8405,0.8406
8,0.9143,0.9642,0.8974,0.9459,0.9211,0.8274,0.8288
9,0.9214,0.9744,0.9487,0.9136,0.9308,0.84,0.8408


# Final train

이전까지는 데이터를 train/validation fold로 나누어 분할된 데이터로 학습을 진행했지만, 이제는 예측을 위해 stacking한 모델을 전체 데이터로 마지막으로 한 번 더 학습한다.

In [11]:
# Refit the stacked model on the whole dataset so it can be used for prediction.
# NOTE(review): the recorded output shows solver iteration-limit warnings that
# point at scikit-learn's logistic-regression docs -- consider whether the
# stacking meta-model needs a higher max_iter or scaled inputs.
final_model = finalize_model(estimator=blend5_stack)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

# Prediction

In [12]:
# Generate predictions for the test set with the finalized model.
prediction = predict_model(estimator=final_model, data=test)