In [1]:
%load_ext lab_black

# 목차
## 0. Library
## 1. Loading Data
## 2. Feature Engineering
- Pivot table
- Find Best Model
- Scaling

## 3. Modeling
- Bayesian Optimization

## 4. Make Submission

# 0. Library

In [2]:
## 1. Loading Data
import pandas as pd

path = "data/"

## 2. Feature Engineering

### Find Best Model with Pycaret
from pycaret.classification import *

### Scaling
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## 3. Modeling
from sklearn.metrics import log_loss
import time
from bayes_opt import BayesianOptimization

## 4. Make Submission
from datetime import datetime
import os

## 1. Loading Data

In [3]:
X_train = pd.read_csv(path + "train_features.csv")
y_train = pd.read_csv(path + "train_labels.csv")
X_test = pd.read_csv(path + "test_features.csv")
display(X_train.head(3))
display(y_train.head(3))
display(X_test.head(3))

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.54901,-31.676112
1,0,1,1.287696,-0.198974,-0.182444,0.3031,-39.139103,-24.927216
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629


Unnamed: 0,id,label,label_desc
0,0,37,Shoulder Press (dumbbell)
1,1,26,Non-Exercise
2,2,3,Biceps Curl (band)


Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,3125,0,-0.6281,-0.160155,0.151487,49.665357,88.435961,13.597668
1,3125,1,-0.462548,0.012462,-0.053726,56.953059,96.185341,16.278458
2,3125,2,-0.363481,-0.091789,-0.130004,29.557396,93.836453,13.329043


## 2. Feature Engineering

### 1) Make Pivot Table
현재 데이터가 3125명 * 600개의 log 로 구성되어 있습니다.

Machine Learning Model에 넣기 위해서는 데이터를 집약해서 한명의 id에 데이터를 집약적으로 넣어야 합니다.

따라서 첫번째로 Pandas에서 지원하는 Pivot Table Method를 이용합니다.

In [4]:
X_pivot_train = pd.pivot_table(
    data=X_train,  # X_train의 데이터를 통해서
    values=X_train.columns[2:],  # id와 time을 제외한 피쳐를 대상으로
    index="id",  # id를 기준으로 잡아
    aggfunc=[
        "sum",
        "mean",  # 합, 평균
        "median",
        "min",
        "max",  # 중앙값 최소값, 최대값
        "std",
        "var",  # 베셀 보정 표본 표준편차, 비편향 편차 의 값을 구합니다.
    ],
)

X_pivot_test = pd.pivot_table(
    data=X_test,  #
    values=X_test.columns[2:],
    index="id",  # id를 기준으로 잡아
    aggfunc=["sum", "mean", "median", "min", "max", "std", "var"],
)

display(X_pivot_train.head(3))
display(X_pivot_test.head(3))

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median,min,min,min,min,min,min,max,max,max,max,max,max,std,std,std,std,std,std,var,var,var,var,var,var
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2
0,558.797337,-131.082711,-222.252919,-1119.161589,-2015.703683,709.264425,0.931329,-0.218471,-0.370422,-1.865269,-3.359506,1.182107,0.956149,-0.240638,-0.346749,-1.273569,-2.36223,1.913286,0.59194,-0.624113,-0.786336,-46.254836,-85.887677,-79.930029,1.344268,0.176871,-0.054876,31.644123,69.847244,55.953827,0.191479,0.177131,0.135131,13.284216,24.300479,25.275185,0.036664,0.031375,0.01826,176.470384,590.513292,638.834979
1,-459.948117,-190.354639,-2.534051,6642.960123,1044.284884,835.976169,-0.76658,-0.317258,-0.004223,11.0716,1.740475,1.393294,-0.805767,-0.228905,-0.034583,3.81065,8.043707,-0.655819,-2.156208,-1.295598,-1.019531,-325.328531,-315.096003,-270.980823,1.23402,0.700065,0.888661,286.624363,389.60806,340.170199,0.495528,0.336415,0.499395,79.244561,96.005289,75.545343,0.245548,0.113175,0.249396,6279.700472,9217.015511,5707.098884
2,23.901616,-49.441742,375.607013,-5083.770868,358.725917,1831.974458,0.039836,-0.082403,0.626012,-8.472951,0.597877,3.053291,0.140667,-0.062598,0.634781,-8.112557,19.306132,3.568888,-1.142847,-0.69099,0.073846,-164.779067,-249.953944,-44.192071,1.219836,0.650645,1.332992,73.525082,297.320834,55.642836,0.711972,0.147127,0.248807,25.422926,118.956646,13.920337,0.506904,0.021646,0.061905,646.325142,14150.683677,193.775778


Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median,min,min,min,min,min,min,max,max,max,max,max,max,std,std,std,std,std,std,var,var,var,var,var,var
Unnamed: 0_level_1,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2
3125,-611.23836,-11.744605,-139.355669,-1911.076959,1639.123438,-1200.410049,-1.018731,-0.019574,-0.232259,-3.185128,2.731872,-2.000683,-1.064222,-0.005735,-0.268442,-3.77015,0.108956,-1.607847,-1.564,-0.470937,-0.573836,-50.429364,-81.607713,-35.446915,-0.275446,0.22804,0.286182,56.953059,96.185341,49.981455,0.236232,0.091641,0.174672,12.897967,31.993022,12.251648,0.055806,0.008398,0.03051,166.357553,1023.553453,150.102867
3126,-313.705824,367.296809,-42.655405,-10644.915365,4184.863263,-2162.74715,-0.522843,0.612161,-0.071092,-17.741526,6.974772,-3.604579,-0.677411,0.606215,-0.026089,-14.305258,-0.974696,-10.833508,-1.929033,-0.200678,-1.212052,-273.572486,-97.100707,-147.597574,0.627571,1.708743,0.671876,132.830402,241.240196,169.41765,0.539688,0.333015,0.3838,51.625096,45.706311,61.604867,0.291264,0.110899,0.147302,2665.150566,2089.06682,3795.159662
3127,304.167948,542.291164,-84.658968,-1307.846921,-1350.871152,-235.904841,0.506947,0.903819,-0.141098,-2.179745,-2.251452,-0.393175,0.49737,0.931239,-0.156647,0.476247,-2.319172,-3.087735,-0.792916,0.219008,-0.484614,-99.799971,-98.420987,-154.477074,2.972063,1.94182,0.644154,160.426058,74.530763,97.21173,0.219934,0.191485,0.152077,22.770845,13.467885,23.041463,0.048371,0.036666,0.023128,518.511372,181.38394,530.909012


#### Reset Column Name
- 현재 Column 명이 Multi index로 되어있어, 모델에 넣을 때 에러가 발생합니다.
- Column명의 변환 작업 후 id를 다시 column에 추가합니다.

In [5]:
X_pivot_train.columns[:5]

MultiIndex([('sum', 'acc_x'),
            ('sum', 'acc_y'),
            ('sum', 'acc_z'),
            ('sum',  'gy_x'),
            ('sum',  'gy_y')],
           )

In [6]:
X_columns = [agg + "_" + column for agg, column in X_pivot_train.columns]
X_pivot_train.columns = X_columns
X_pivot_test.columns = X_columns
display(X_pivot_train.head())

Unnamed: 0_level_0,sum_acc_x,sum_acc_y,sum_acc_z,sum_gy_x,sum_gy_y,sum_gy_z,mean_acc_x,mean_acc_y,mean_acc_z,mean_gy_x,mean_gy_y,mean_gy_z,median_acc_x,median_acc_y,median_acc_z,median_gy_x,median_gy_y,median_gy_z,min_acc_x,min_acc_y,min_acc_z,min_gy_x,min_gy_y,min_gy_z,max_acc_x,max_acc_y,max_acc_z,max_gy_x,max_gy_y,max_gy_z,std_acc_x,std_acc_y,std_acc_z,std_gy_x,std_gy_y,std_gy_z,var_acc_x,var_acc_y,var_acc_z,var_gy_x,var_gy_y,var_gy_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
0,558.797337,-131.082711,-222.252919,-1119.161589,-2015.703683,709.264425,0.931329,-0.218471,-0.370422,-1.865269,-3.359506,1.182107,0.956149,-0.240638,-0.346749,-1.273569,-2.36223,1.913286,0.59194,-0.624113,-0.786336,-46.254836,-85.887677,-79.930029,1.344268,0.176871,-0.054876,31.644123,69.847244,55.953827,0.191479,0.177131,0.135131,13.284216,24.300479,25.275185,0.036664,0.031375,0.01826,176.470384,590.513292,638.834979
1,-459.948117,-190.354639,-2.534051,6642.960123,1044.284884,835.976169,-0.76658,-0.317258,-0.004223,11.0716,1.740475,1.393294,-0.805767,-0.228905,-0.034583,3.81065,8.043707,-0.655819,-2.156208,-1.295598,-1.019531,-325.328531,-315.096003,-270.980823,1.23402,0.700065,0.888661,286.624363,389.60806,340.170199,0.495528,0.336415,0.499395,79.244561,96.005289,75.545343,0.245548,0.113175,0.249396,6279.700472,9217.015511,5707.098884
2,23.901616,-49.441742,375.607013,-5083.770868,358.725917,1831.974458,0.039836,-0.082403,0.626012,-8.472951,0.597877,3.053291,0.140667,-0.062598,0.634781,-8.112557,19.306132,3.568888,-1.142847,-0.69099,0.073846,-164.779067,-249.953944,-44.192071,1.219836,0.650645,1.332992,73.525082,297.320834,55.642836,0.711972,0.147127,0.248807,25.422926,118.956646,13.920337,0.506904,0.021646,0.061905,646.325142,14150.683677,193.775778
3,-532.621192,-52.600737,136.413976,10646.500409,2880.558352,-3521.938833,-0.887702,-0.087668,0.227357,17.744167,4.800931,-5.869898,-0.880343,-0.054577,0.231537,8.229938,1.78326,-3.853078,-1.417751,-0.540827,-0.257124,-69.419166,-82.537304,-85.600536,-0.62225,0.283721,0.598814,192.765368,159.083788,56.456908,0.130899,0.194008,0.205882,42.92886,36.953466,23.647153,0.017134,0.037639,0.042387,1842.887012,1365.558625,559.187841
4,-395.410844,-202.240064,121.654507,-2891.782899,5791.027696,2672.029417,-0.659018,-0.337067,0.202758,-4.819638,9.651713,4.453382,-0.941146,-0.168467,0.293556,-1.292194,0.977772,-0.750283,-2.429109,-2.055076,-1.250483,-769.076518,-243.909948,-270.581913,0.59972,1.724782,2.678034,613.9726,284.952954,221.015193,0.49517,0.570305,0.389646,108.258866,60.514531,46.148326,0.245193,0.325247,0.151824,11719.982095,3662.008463,2129.668017


In [7]:
X_pivot_train = X_pivot_train.reset_index()
X_pivot_test = X_pivot_test.reset_index()

display(X_pivot_train.head())
display(X_pivot_test.head())

Unnamed: 0,id,sum_acc_x,sum_acc_y,sum_acc_z,sum_gy_x,sum_gy_y,sum_gy_z,mean_acc_x,mean_acc_y,mean_acc_z,mean_gy_x,mean_gy_y,mean_gy_z,median_acc_x,median_acc_y,median_acc_z,median_gy_x,median_gy_y,median_gy_z,min_acc_x,min_acc_y,min_acc_z,min_gy_x,min_gy_y,min_gy_z,max_acc_x,max_acc_y,max_acc_z,max_gy_x,max_gy_y,max_gy_z,std_acc_x,std_acc_y,std_acc_z,std_gy_x,std_gy_y,std_gy_z,var_acc_x,var_acc_y,var_acc_z,var_gy_x,var_gy_y,var_gy_z
0,0,558.797337,-131.082711,-222.252919,-1119.161589,-2015.703683,709.264425,0.931329,-0.218471,-0.370422,-1.865269,-3.359506,1.182107,0.956149,-0.240638,-0.346749,-1.273569,-2.36223,1.913286,0.59194,-0.624113,-0.786336,-46.254836,-85.887677,-79.930029,1.344268,0.176871,-0.054876,31.644123,69.847244,55.953827,0.191479,0.177131,0.135131,13.284216,24.300479,25.275185,0.036664,0.031375,0.01826,176.470384,590.513292,638.834979
1,1,-459.948117,-190.354639,-2.534051,6642.960123,1044.284884,835.976169,-0.76658,-0.317258,-0.004223,11.0716,1.740475,1.393294,-0.805767,-0.228905,-0.034583,3.81065,8.043707,-0.655819,-2.156208,-1.295598,-1.019531,-325.328531,-315.096003,-270.980823,1.23402,0.700065,0.888661,286.624363,389.60806,340.170199,0.495528,0.336415,0.499395,79.244561,96.005289,75.545343,0.245548,0.113175,0.249396,6279.700472,9217.015511,5707.098884
2,2,23.901616,-49.441742,375.607013,-5083.770868,358.725917,1831.974458,0.039836,-0.082403,0.626012,-8.472951,0.597877,3.053291,0.140667,-0.062598,0.634781,-8.112557,19.306132,3.568888,-1.142847,-0.69099,0.073846,-164.779067,-249.953944,-44.192071,1.219836,0.650645,1.332992,73.525082,297.320834,55.642836,0.711972,0.147127,0.248807,25.422926,118.956646,13.920337,0.506904,0.021646,0.061905,646.325142,14150.683677,193.775778
3,3,-532.621192,-52.600737,136.413976,10646.500409,2880.558352,-3521.938833,-0.887702,-0.087668,0.227357,17.744167,4.800931,-5.869898,-0.880343,-0.054577,0.231537,8.229938,1.78326,-3.853078,-1.417751,-0.540827,-0.257124,-69.419166,-82.537304,-85.600536,-0.62225,0.283721,0.598814,192.765368,159.083788,56.456908,0.130899,0.194008,0.205882,42.92886,36.953466,23.647153,0.017134,0.037639,0.042387,1842.887012,1365.558625,559.187841
4,4,-395.410844,-202.240064,121.654507,-2891.782899,5791.027696,2672.029417,-0.659018,-0.337067,0.202758,-4.819638,9.651713,4.453382,-0.941146,-0.168467,0.293556,-1.292194,0.977772,-0.750283,-2.429109,-2.055076,-1.250483,-769.076518,-243.909948,-270.581913,0.59972,1.724782,2.678034,613.9726,284.952954,221.015193,0.49517,0.570305,0.389646,108.258866,60.514531,46.148326,0.245193,0.325247,0.151824,11719.982095,3662.008463,2129.668017


Unnamed: 0,id,sum_acc_x,sum_acc_y,sum_acc_z,sum_gy_x,sum_gy_y,sum_gy_z,mean_acc_x,mean_acc_y,mean_acc_z,mean_gy_x,mean_gy_y,mean_gy_z,median_acc_x,median_acc_y,median_acc_z,median_gy_x,median_gy_y,median_gy_z,min_acc_x,min_acc_y,min_acc_z,min_gy_x,min_gy_y,min_gy_z,max_acc_x,max_acc_y,max_acc_z,max_gy_x,max_gy_y,max_gy_z,std_acc_x,std_acc_y,std_acc_z,std_gy_x,std_gy_y,std_gy_z,var_acc_x,var_acc_y,var_acc_z,var_gy_x,var_gy_y,var_gy_z
0,3125,-611.23836,-11.744605,-139.355669,-1911.076959,1639.123438,-1200.410049,-1.018731,-0.019574,-0.232259,-3.185128,2.731872,-2.000683,-1.064222,-0.005735,-0.268442,-3.77015,0.108956,-1.607847,-1.564,-0.470937,-0.573836,-50.429364,-81.607713,-35.446915,-0.275446,0.22804,0.286182,56.953059,96.185341,49.981455,0.236232,0.091641,0.174672,12.897967,31.993022,12.251648,0.055806,0.008398,0.03051,166.357553,1023.553453,150.102867
1,3126,-313.705824,367.296809,-42.655405,-10644.915365,4184.863263,-2162.74715,-0.522843,0.612161,-0.071092,-17.741526,6.974772,-3.604579,-0.677411,0.606215,-0.026089,-14.305258,-0.974696,-10.833508,-1.929033,-0.200678,-1.212052,-273.572486,-97.100707,-147.597574,0.627571,1.708743,0.671876,132.830402,241.240196,169.41765,0.539688,0.333015,0.3838,51.625096,45.706311,61.604867,0.291264,0.110899,0.147302,2665.150566,2089.06682,3795.159662
2,3127,304.167948,542.291164,-84.658968,-1307.846921,-1350.871152,-235.904841,0.506947,0.903819,-0.141098,-2.179745,-2.251452,-0.393175,0.49737,0.931239,-0.156647,0.476247,-2.319172,-3.087735,-0.792916,0.219008,-0.484614,-99.799971,-98.420987,-154.477074,2.972063,1.94182,0.644154,160.426058,74.530763,97.21173,0.219934,0.191485,0.152077,22.770845,13.467885,23.041463,0.048371,0.036666,0.023128,518.511372,181.38394,530.909012
3,3128,-346.561617,-366.333946,18.891749,485.147442,-1790.98131,-14.590798,-0.577603,-0.610557,0.031486,0.808579,-2.984969,-0.024318,-0.880541,-0.507927,-0.09286,1.457625,-0.2696,-0.404583,-1.045889,-1.294482,-0.469924,-229.072919,-168.03108,-117.297766,0.337281,-0.258476,0.702574,119.527887,118.268797,167.860762,0.431713,0.233601,0.326569,42.818157,45.069932,37.967372,0.186376,0.054569,0.106647,1833.394532,2031.298793,1441.521366
4,3129,-443.184021,109.52118,240.781103,-858.922755,865.419381,3447.298941,-0.73864,0.182535,0.401302,-1.431538,1.442366,5.745498,-0.703842,0.12228,0.432678,-3.066063,1.631638,1.866352,-2.153047,-0.860883,-0.631258,-345.44724,-223.475411,-125.5986,0.015642,1.562602,1.037876,366.167357,226.728939,138.130133,0.305797,0.314294,0.261848,92.301963,67.911174,43.353007,0.093512,0.098781,0.068565,8519.65235,4611.927587,1879.483194


### 2) FInd Appropriate Model by Pycaret
- lable 데이터를 id에 맞게 merge
- pycaret 모델에 넣어 가장 성능이 좋은 model을 선택
- 해당 모델을 기반으로 모델링 진행

In [8]:
y_train.head()

Unnamed: 0,id,label,label_desc
0,0,37,Shoulder Press (dumbbell)
1,1,26,Non-Exercise
2,2,3,Biceps Curl (band)
3,3,26,Non-Exercise
4,4,26,Non-Exercise


In [9]:
train_data = pd.merge(
    X_pivot_train, y_train.loc[:, ["id", "label"]], on="id"
)  # label_desc는 사용하지 않을 예정입니다.
train_data.label = train_data.label.astype("category")

In [10]:
clf = setup(data=train_data, target="label", silent=True)

Unnamed: 0,Description,Value
0,session_id,5027
1,Target,label
2,Target Type,Multiclass
3,Label Encoded,"0: 0, 1: 1, 10: 2, 11: 3, 12: 4, 13: 5, 14: 6, 15: 7, 16: 8, 17: 9, 18: 10, 19: 11, 2: 12, 20: 13, 21: 14, 22: 15, 23: 16, 24: 17, 25: 18, 26: 19, 27: 20, 28: 21, 29: 22, 3: 23, 30: 24, 31: 25, 32: 26, 33: 27, 34: 28, 35: 29, 36: 30, 37: 31, 38: 32, 39: 33, 4: 34, 40: 35, 41: 36, 42: 37, 43: 38, 44: 39, 45: 40, 46: 41, 47: 42, 48: 43, 49: 44, 5: 45, 50: 46, 51: 47, 52: 48, 53: 49, 54: 50, 55: 51, 56: 52, 57: 53, 58: 54, 59: 55, 6: 56, 60: 57, 7: 58, 8: 59, 9: 60"
4,Original Data,"(3125, 44)"
5,Missing Values,False
6,Numeric Features,43
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [11]:
# Gradient Boosting Classifier, CatboostClassifeir 는 시간이 오래걸려 제외

compare_models(exclude=["gbc", "catboost"], sort="Accuracy", n_select=1, fold=4)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7481,0.9627,0.4808,0.7107,0.7116,0.639,0.6473,1.885
rf,Random Forest Classifier,0.7458,0.9553,0.4771,0.6979,0.6996,0.6296,0.6423,0.35
et,Extra Trees Classifier,0.743,0.9609,0.4654,0.6944,0.692,0.6208,0.6373,0.2275
lda,Linear Discriminant Analysis,0.6959,0.9475,0.4638,0.6792,0.6703,0.5841,0.5858,0.02
lr,Logistic Regression,0.6388,0.9225,0.3406,0.5699,0.5914,0.4865,0.492,1.125
dt,Decision Tree Classifier,0.5921,0.7593,0.3509,0.6178,0.596,0.4643,0.4649,0.2225
knn,K Neighbors Classifier,0.5839,0.8252,0.2393,0.5256,0.5401,0.4064,0.4115,0.2375
ridge,Ridge Classifier,0.5821,0.0,0.1486,0.3992,0.4552,0.2962,0.3542,0.0075
nb,Naive Bayes,0.5482,0.9269,0.486,0.6724,0.5784,0.4549,0.4645,0.1875
qda,Quadratic Discriminant Analysis,0.5112,0.519,0.0267,0.2759,0.3519,0.0503,0.1578,0.0175


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=5027, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### 2) Scaling

In [12]:
best_score = 0
columns = "scaled_" + X_pivot_train.columns
for scaler in [None, MinMaxScaler(), StandardScaler()]:

    if scaler == None:
        scaled_train_x = X_pivot_train.copy()
        scaled_test_x = X_pivot_test.copy()
    else:
        scaled_train_x = scaler.fit_transform(X_pivot_train)
        scaled_train_x = pd.DataFrame(scaled_train_x, columns=columns).astype("float64")
        scaled_test_x = scaler.transform(X_pivot_test)
        scaled_test_x = pd.DataFrame(scaled_test_x, columns=columns).astype("float64")

    skf = StratifiedKFold(n_splits=3, shuffle=True)
    model = LGBMClassifier(n_jobs=-1)
    scores = cross_val_score(
        model,
        X_pivot_train,
        y_train.label.values,
        scoring="accuracy",
        cv=skf,
        n_jobs=-1,
    )

    if scaler == None:
        print("NON_SCALED")
    else:
        print(scaler.__class__.__name__)
    print(f"TOTAL 최대성능: {max(scores)}\n평균성능: {np.mean(scores)}")
    print("-" * 30)

    if np.mean(scores) >= best_score:
        best_score = np.mean(scores)
        best_train_x = scaled_train_x
        best_test_x = scaled_test_x
        best_scaler = scaler

if best_scaler == None:
    print("BEST SCALER IS NON_SCALED")
else:
    print("BEST SCALER : ", best_scaler.__class__.__name__)

NON_SCALED
TOTAL 최대성능: 0.7694524495677233
평균성능: 0.7580836380196954
------------------------------
MinMaxScaler
TOTAL 최대성능: 0.7617675312199808
평균성능: 0.756801589101478
------------------------------
StandardScaler
TOTAL 최대성능: 0.7646493756003843
평균성능: 0.7532836370363404
------------------------------
BEST SCALER IS NON_SCALED


In [13]:
display(best_train_x.head())

Unnamed: 0,id,sum_acc_x,sum_acc_y,sum_acc_z,sum_gy_x,sum_gy_y,sum_gy_z,mean_acc_x,mean_acc_y,mean_acc_z,...,std_acc_z,std_gy_x,std_gy_y,std_gy_z,var_acc_x,var_acc_y,var_acc_z,var_gy_x,var_gy_y,var_gy_z
0,0,558.797337,-131.082711,-222.252919,-1119.161589,-2015.703683,709.264425,0.931329,-0.218471,-0.370422,...,0.135131,13.284216,24.300479,25.275185,0.036664,0.031375,0.01826,176.470384,590.513292,638.834979
1,1,-459.948117,-190.354639,-2.534051,6642.960123,1044.284884,835.976169,-0.76658,-0.317258,-0.004223,...,0.499395,79.244561,96.005289,75.545343,0.245548,0.113175,0.249396,6279.700472,9217.015511,5707.098884
2,2,23.901616,-49.441742,375.607013,-5083.770868,358.725917,1831.974458,0.039836,-0.082403,0.626012,...,0.248807,25.422926,118.956646,13.920337,0.506904,0.021646,0.061905,646.325142,14150.683677,193.775778
3,3,-532.621192,-52.600737,136.413976,10646.500409,2880.558352,-3521.938833,-0.887702,-0.087668,0.227357,...,0.205882,42.92886,36.953466,23.647153,0.017134,0.037639,0.042387,1842.887012,1365.558625,559.187841
4,4,-395.410844,-202.240064,121.654507,-2891.782899,5791.027696,2672.029417,-0.659018,-0.337067,0.202758,...,0.389646,108.258866,60.514531,46.148326,0.245193,0.325247,0.151824,11719.982095,3662.008463,2129.668017


# 3. Modeling

## LGBM + Bayesian Optimization
### Bayesian Optimization(BOA)은 ??
#### init_points에서 지정한 값 만큼 Random Search합니다.
#### init_points에서 나온 값을 기반으로 n_iter에서 지정한 값 만큼 최적의 파라미터를 찾아갑니다.
#### 단, BOA는 target을 Max로 만드는 방법만 있습니다.
#### 반면,  우리의 metrics인 log_loss는 minimum한 값이 더 좋은 값이 됩니다.
#### 따라서 저는 100점에서 loss만큼을 빼주어 더 큰 값을 찾도록 조정했습니다.

In [14]:
lgbm_pbounds = {
    "learning_rate": (0.001, 0.5),
    "max_depth": (5, 20),
    "n_estimators": (100, 300),
}


def lgbm_opt(learning_rate, max_depth, n_estimators):

    params = {
        "learning_rate": learning_rate,
        "max_depth": int(round(max_depth)),
        "n_estimators": int(round(max_depth)),
    }

    lgbm = LGBMClassifier(**params)

    # Cross_val_score의 Metric이 "Neg_log_loss"이기 때문에, Negative한 값들이 나옵니다. 따라서 abs로 절대값을 씌워줍니다.
    # 이 값이 Target 값이 됩니다.
    score = 100 - abs(
        cross_val_score(
            lgbm,
            best_train_x,
            y_train.label.values,
            scoring="neg_log_loss",
            cv=4,
            n_jobs=-1,
        ).mean()
    )

    return score


BO_lgbm = BayesianOptimization(f=lgbm_opt, pbounds=lgbm_pbounds, random_state=1)

# 20번의 Random Search 후 20번의 최적의 Search를 합니다.
BO_lgbm.maximize(init_points=20, n_iter=20)


# Maximize를 통해 나온 Parameter가 저장되어있습니다.
max_params = BO_lgbm.max["params"]
max_params["max_depth"] = int(
    round(max_params["max_depth"])
)  # Max_depth는 int값을 받기에 int로 변환합니다.
max_params["n_estimators"] = int(
    round(max_params["n_estimators"])
)  # n_estimators는 int값을 받기에 int로 변환합니다.
print(max_params)

# Model에 저장된 param을 지정해 저장후, 재 학습을 통해 predict합니다.
model_lgbm = LGBMClassifier(**max_params)
model_lgbm.fit(best_train_x, y_train.label.values, eval_metric="logloss")
prediction = model_lgbm.predict_proba(best_test_x)

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 75.46   [0m | [0m 0.2091  [0m | [0m 15.8    [0m | [0m 100.0   [0m |
| [95m 2       [0m | [95m 93.09   [0m | [95m 0.1519  [0m | [95m 7.201   [0m | [95m 118.5   [0m |
| [95m 3       [0m | [95m 98.56   [0m | [95m 0.09394 [0m | [95m 10.18   [0m | [95m 179.4   [0m |
| [0m 4       [0m | [0m 73.63   [0m | [0m 0.2699  [0m | [0m 11.29   [0m | [0m 237.0   [0m |
| [95m 5       [0m | [95m 98.64   [0m | [95m 0.103   [0m | [95m 18.17   [0m | [95m 105.5   [0m |
| [0m 6       [0m | [0m 71.99   [0m | [0m 0.3356  [0m | [0m 11.26   [0m | [0m 211.7   [0m |
| [0m 7       [0m | [0m 98.51   [0m | [0m 0.07105 [0m | [0m 7.972   [0m | [0m 260.1   [0m |
| [0m 8       [0m | [0m 80.28   [0m | [0m 0.4842  [0m | [0m 9.701   [0m | [0m 238.5   [0m |
| [0m 9       [0m | [0m 73.19   [0m | [0

# 5. Make Submission
- Score에서 나온 144는 loss가 1.44라는 뜻입니다.

In [15]:
os.mkdir(path + "submission")

sample_submission = pd.read_csv(path + "sample_submission.csv")
sample_submission.iloc[:, 1:] = prediction
score = abs(
    cross_val_score(
        model_lgbm,
        best_train_x,
        y_train.label.values,
        scoring="neg_log_loss",
        cv=4,
        n_jobs=-1,
    ).mean()
)
score = str(round(score, 2)).replace(".", "")
print(score)

month = str(datetime.now().date().month).zfill(2)
day = str(datetime.now().date().day).zfill(2)
date = month + day

sample_submission.to_csv(
    path
    + f"submission/{date}_{best_scaler}_{model_lgbm.__class__.__name__}_{score}.csv",
    index=False,
)

144
