# lgbm-cv 데모
- cross-validation 검증 방법: hold-out validation, n-fold CV, stratified n-fold CV, leave-one-out 등

## 라이브러리 import 및 설정

In [27]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing


# Notebook-wide plotting / display settings.
rcParams['figure.figsize'] = (16, 8)             # default figure size
plt.style.use('fivethirtyeight')                 # default plot style
# Use the fully qualified option name: on newer pandas the bare pattern
# 'max_columns' also matches 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', 100)        # show up to 100 columns
pd.set_option("display.precision", 4)            # show 4 decimal places
warnings.simplefilter('ignore')                  # silence all warnings

## 학습데이터 로드

In [28]:
# Locations of the input CSV files (absolute Windows paths).
feature_file = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\feature.csv'
sample_file  = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\sample_submission.csv'
tst_file     = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\testset.csv'
trn_file     = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\trainset.csv'

df= pd.read_csv(feature_file,index_col=0)        # feature table; the first CSV column becomes the index
print(df.shape)                                  # (6113, 32) in the recorded run
df.head()                                        # preview the first 5 rows

(6113, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,188,128,95,114,143,108,88,103,113,85,88,113,87,88,103,87,84,99,104,82,96,100,78,70,79,84,66,70,75,76,63,HI
1,174,112,88,104,119,92,74,79,88,74,67,90,68,71,73,68,71,77,90,67,71,82,65,70,75,89,73,67,71,89,73,PH
2,175,138,106,105,135,109,75,95,113,96,74,112,96,70,87,100,66,83,117,67,88,110,98,67,88,119,98,75,91,110,94,GR
3,176,111,80,106,131,96,76,99,104,85,75,89,75,79,91,75,84,103,109,82,91,96,78,78,91,96,78,82,104,112,85,PH
4,182,144,111,100,151,119,67,106,114,90,76,115,94,68,106,91,68,102,115,71,95,108,88,71,103,113,92,68,107,118,92,EL


In [29]:
# Split the combined feature table into training labels, training features
# and test features by row position.
n_trn = 4280        # the first 4280 rows are the labelled training samples
target_col = 31     # position of the class-label column

y = df.iloc[:n_trn, target_col].values                    # training labels (4280)
df.drop(df.columns[[target_col]], axis=1, inplace=True)   # drop the label so only features remain
trn = df.iloc[:n_trn].values                              # training features (4280 rows)
tst = df.iloc[n_trn:].values                              # test features (1833 rows)
print(y.shape, trn.shape, tst.shape)                      # (4280,) (4280, 31) (1833, 31)

(4280,) (4280, 31) (1833, 31)


In [30]:
# Random seed and number of cross-validation folds used by the cells below.
seed, kfold = 150, 10

## Stratified K-Fold Cross Validation
- **Stratified N-Fold CV**: N-Fold CV에서 각 폴드의 종속변수 분포가 동일하도록 폴드를 나누는 방식.
  현재 사용하는 데이터처럼 분류 학습에서 종속변수의 범주 분포가 균일하지 않을 때 사용된다.

In [31]:
# Stratified splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = StratifiedKFold(
    n_splits=kfold,
    shuffle=True,
    random_state=seed,
)

### [범주형 > 수치형 변환_종속변수]

In [32]:
# Encode the class labels in `y` as integer codes; `le` is kept so that
# predictions can be mapped back to the original labels later.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

label_str = y
le = preprocessing.LabelEncoder()
encoded = le.fit_transform(label_str)
label_int = encoded.astype('int')
label_int

array([3, 5, 2, ..., 1, 5, 1])

In [34]:
# Stratified K-fold training of a DART LightGBM classifier.
#   p_val: out-of-fold class probabilities for every training row
#   p_tst: class probabilities for the test rows, averaged over the folds
n_class = len(le.classes_)          # one probability column per encoded class
n_fold = cv.get_n_splits()          # averaging weight follows the splitter actually used
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             boosting_type='dart',
                             n_estimators=300,
                             learning_rate=0.1,
                             num_leaves=64,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=0.8,
                             random_state=seed,
                             n_jobs=-1)
    # No early stopping here: LightGBM does not apply early stopping in DART
    # mode (the recorded run trains all 300 rounds), so the argument was a
    # no-op. The validation fold is still passed to log the per-round loss.
    clf.fit(trn[i_trn], label_int[i_trn],
            eval_set=[(trn[i_val], label_int[i_val])],
            eval_metric='multiclass')

    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

# Out-of-fold accuracy over the whole training set.
print(f'{accuracy_score(label_int, np.argmax(p_val, axis=1)) * 100:.4f}%')

training model for CV #1
[1]	valid_0's multi_logloss: 1.37974
[2]	valid_0's multi_logloss: 1.16194
[3]	valid_0's multi_logloss: 0.999997
[4]	valid_0's multi_logloss: 0.880055
[5]	valid_0's multi_logloss: 0.780881
[6]	valid_0's multi_logloss: 0.698005
[7]	valid_0's multi_logloss: 0.628851
[8]	valid_0's multi_logloss: 0.570729
[9]	valid_0's multi_logloss: 0.570596
[10]	valid_0's multi_logloss: 0.594457
[11]	valid_0's multi_logloss: 0.633107
[12]	valid_0's multi_logloss: 0.575356
[13]	valid_0's multi_logloss: 0.557734
[14]	valid_0's multi_logloss: 0.574764
[15]	valid_0's multi_logloss: 0.526242
[16]	valid_0's multi_logloss: 0.483204
[17]	valid_0's multi_logloss: 0.513324
[18]	valid_0's multi_logloss: 0.476227
[19]	valid_0's multi_logloss: 0.442837
[20]	valid_0's multi_logloss: 0.454233
[21]	valid_0's multi_logloss: 0.421426
[22]	valid_0's multi_logloss: 0.426746
[23]	valid_0's multi_logloss: 0.437792
[24]	valid_0's multi_logloss: 0.412191
[25]	valid_0's multi_logloss: 0.422945
[26]	valid_

[213]	valid_0's multi_logloss: 0.243952
[214]	valid_0's multi_logloss: 0.244724
[215]	valid_0's multi_logloss: 0.248949
[216]	valid_0's multi_logloss: 0.251292
[217]	valid_0's multi_logloss: 0.250427
[218]	valid_0's multi_logloss: 0.252711
[219]	valid_0's multi_logloss: 0.253513
[220]	valid_0's multi_logloss: 0.255194
[221]	valid_0's multi_logloss: 0.254131
[222]	valid_0's multi_logloss: 0.255084
[223]	valid_0's multi_logloss: 0.2582
[224]	valid_0's multi_logloss: 0.257529
[225]	valid_0's multi_logloss: 0.256794
[226]	valid_0's multi_logloss: 0.258087
[227]	valid_0's multi_logloss: 0.257027
[228]	valid_0's multi_logloss: 0.256181
[229]	valid_0's multi_logloss: 0.255833
[230]	valid_0's multi_logloss: 0.255114
[231]	valid_0's multi_logloss: 0.254026
[232]	valid_0's multi_logloss: 0.256603
[233]	valid_0's multi_logloss: 0.256574
[234]	valid_0's multi_logloss: 0.255794
[235]	valid_0's multi_logloss: 0.25852
[236]	valid_0's multi_logloss: 0.257532
[237]	valid_0's multi_logloss: 0.2604
[238]

[123]	valid_0's multi_logloss: 0.335993
[124]	valid_0's multi_logloss: 0.34017
[125]	valid_0's multi_logloss: 0.339299
[126]	valid_0's multi_logloss: 0.337503
[127]	valid_0's multi_logloss: 0.336289
[128]	valid_0's multi_logloss: 0.33588
[129]	valid_0's multi_logloss: 0.336886
[130]	valid_0's multi_logloss: 0.335921
[131]	valid_0's multi_logloss: 0.339232
[132]	valid_0's multi_logloss: 0.339545
[133]	valid_0's multi_logloss: 0.338025
[134]	valid_0's multi_logloss: 0.336753
[135]	valid_0's multi_logloss: 0.335611
[136]	valid_0's multi_logloss: 0.338917
[137]	valid_0's multi_logloss: 0.337977
[138]	valid_0's multi_logloss: 0.340788
[139]	valid_0's multi_logloss: 0.342483
[140]	valid_0's multi_logloss: 0.342969
[141]	valid_0's multi_logloss: 0.341377
[142]	valid_0's multi_logloss: 0.339737
[143]	valid_0's multi_logloss: 0.33849
[144]	valid_0's multi_logloss: 0.336991
[145]	valid_0's multi_logloss: 0.338685
[146]	valid_0's multi_logloss: 0.337051
[147]	valid_0's multi_logloss: 0.338296
[14

[40]	valid_0's multi_logloss: 0.324552
[41]	valid_0's multi_logloss: 0.31447
[42]	valid_0's multi_logloss: 0.318502
[43]	valid_0's multi_logloss: 0.304876
[44]	valid_0's multi_logloss: 0.308017
[45]	valid_0's multi_logloss: 0.318963
[46]	valid_0's multi_logloss: 0.324524
[47]	valid_0's multi_logloss: 0.326554
[48]	valid_0's multi_logloss: 0.336199
[49]	valid_0's multi_logloss: 0.344403
[50]	valid_0's multi_logloss: 0.347708
[51]	valid_0's multi_logloss: 0.352953
[52]	valid_0's multi_logloss: 0.363047
[53]	valid_0's multi_logloss: 0.371388
[54]	valid_0's multi_logloss: 0.348065
[55]	valid_0's multi_logloss: 0.329979
[56]	valid_0's multi_logloss: 0.312644
[57]	valid_0's multi_logloss: 0.300792
[58]	valid_0's multi_logloss: 0.308176
[59]	valid_0's multi_logloss: 0.312813
[60]	valid_0's multi_logloss: 0.300745
[61]	valid_0's multi_logloss: 0.304705
[62]	valid_0's multi_logloss: 0.312566
[63]	valid_0's multi_logloss: 0.317752
[64]	valid_0's multi_logloss: 0.303751
[65]	valid_0's multi_loglo

[247]	valid_0's multi_logloss: 0.238152
[248]	valid_0's multi_logloss: 0.237528
[249]	valid_0's multi_logloss: 0.236674
[250]	valid_0's multi_logloss: 0.23587
[251]	valid_0's multi_logloss: 0.237946
[252]	valid_0's multi_logloss: 0.237401
[253]	valid_0's multi_logloss: 0.236663
[254]	valid_0's multi_logloss: 0.237523
[255]	valid_0's multi_logloss: 0.237982
[256]	valid_0's multi_logloss: 0.237266
[257]	valid_0's multi_logloss: 0.239785
[258]	valid_0's multi_logloss: 0.238989
[259]	valid_0's multi_logloss: 0.238367
[260]	valid_0's multi_logloss: 0.23919
[261]	valid_0's multi_logloss: 0.23771
[262]	valid_0's multi_logloss: 0.236897
[263]	valid_0's multi_logloss: 0.236468
[264]	valid_0's multi_logloss: 0.236063
[265]	valid_0's multi_logloss: 0.235547
[266]	valid_0's multi_logloss: 0.238725
[267]	valid_0's multi_logloss: 0.237969
[268]	valid_0's multi_logloss: 0.237289
[269]	valid_0's multi_logloss: 0.236798
[270]	valid_0's multi_logloss: 0.23811
[271]	valid_0's multi_logloss: 0.237528
[272

[156]	valid_0's multi_logloss: 0.281704
[157]	valid_0's multi_logloss: 0.284666
[158]	valid_0's multi_logloss: 0.283228
[159]	valid_0's multi_logloss: 0.286762
[160]	valid_0's multi_logloss: 0.288786
[161]	valid_0's multi_logloss: 0.291822
[162]	valid_0's multi_logloss: 0.290786
[163]	valid_0's multi_logloss: 0.28961
[164]	valid_0's multi_logloss: 0.288187
[165]	valid_0's multi_logloss: 0.287173
[166]	valid_0's multi_logloss: 0.288385
[167]	valid_0's multi_logloss: 0.287374
[168]	valid_0's multi_logloss: 0.289344
[169]	valid_0's multi_logloss: 0.290271
[170]	valid_0's multi_logloss: 0.289439
[171]	valid_0's multi_logloss: 0.287934
[172]	valid_0's multi_logloss: 0.29159
[173]	valid_0's multi_logloss: 0.290152
[174]	valid_0's multi_logloss: 0.28882
[175]	valid_0's multi_logloss: 0.289509
[176]	valid_0's multi_logloss: 0.288417
[177]	valid_0's multi_logloss: 0.287505
[178]	valid_0's multi_logloss: 0.286467
[179]	valid_0's multi_logloss: 0.285147
[180]	valid_0's multi_logloss: 0.284321
[18

[72]	valid_0's multi_logloss: 0.296785
[73]	valid_0's multi_logloss: 0.290448
[74]	valid_0's multi_logloss: 0.286034
[75]	valid_0's multi_logloss: 0.288034
[76]	valid_0's multi_logloss: 0.290412
[77]	valid_0's multi_logloss: 0.291538
[78]	valid_0's multi_logloss: 0.293975
[79]	valid_0's multi_logloss: 0.291073
[80]	valid_0's multi_logloss: 0.292671
[81]	valid_0's multi_logloss: 0.28709
[82]	valid_0's multi_logloss: 0.286302
[83]	valid_0's multi_logloss: 0.286681
[84]	valid_0's multi_logloss: 0.288665
[85]	valid_0's multi_logloss: 0.284293
[86]	valid_0's multi_logloss: 0.285679
[87]	valid_0's multi_logloss: 0.287352
[88]	valid_0's multi_logloss: 0.288927
[89]	valid_0's multi_logloss: 0.28509
[90]	valid_0's multi_logloss: 0.286349
[91]	valid_0's multi_logloss: 0.284264
[92]	valid_0's multi_logloss: 0.278032
[93]	valid_0's multi_logloss: 0.274679
[94]	valid_0's multi_logloss: 0.27184
[95]	valid_0's multi_logloss: 0.27292
[96]	valid_0's multi_logloss: 0.273556
[97]	valid_0's multi_logloss:

[280]	valid_0's multi_logloss: 0.25545
[281]	valid_0's multi_logloss: 0.254942
[282]	valid_0's multi_logloss: 0.255715
[283]	valid_0's multi_logloss: 0.25571
[284]	valid_0's multi_logloss: 0.257166
[285]	valid_0's multi_logloss: 0.257868
[286]	valid_0's multi_logloss: 0.256986
[287]	valid_0's multi_logloss: 0.257018
[288]	valid_0's multi_logloss: 0.259528
[289]	valid_0's multi_logloss: 0.262353
[290]	valid_0's multi_logloss: 0.261585
[291]	valid_0's multi_logloss: 0.262772
[292]	valid_0's multi_logloss: 0.262275
[293]	valid_0's multi_logloss: 0.261332
[294]	valid_0's multi_logloss: 0.260345
[295]	valid_0's multi_logloss: 0.260793
[296]	valid_0's multi_logloss: 0.260114
[297]	valid_0's multi_logloss: 0.259253
[298]	valid_0's multi_logloss: 0.25869
[299]	valid_0's multi_logloss: 0.259323
[300]	valid_0's multi_logloss: 0.261344
training model for CV #6
[1]	valid_0's multi_logloss: 1.38975
[2]	valid_0's multi_logloss: 1.17749
[3]	valid_0's multi_logloss: 1.01576
[4]	valid_0's multi_logloss

[189]	valid_0's multi_logloss: 0.226004
[190]	valid_0's multi_logloss: 0.225639
[191]	valid_0's multi_logloss: 0.225303
[192]	valid_0's multi_logloss: 0.224758
[193]	valid_0's multi_logloss: 0.223948
[194]	valid_0's multi_logloss: 0.225814
[195]	valid_0's multi_logloss: 0.22514
[196]	valid_0's multi_logloss: 0.2249
[197]	valid_0's multi_logloss: 0.224256
[198]	valid_0's multi_logloss: 0.224914
[199]	valid_0's multi_logloss: 0.224347
[200]	valid_0's multi_logloss: 0.223752
[201]	valid_0's multi_logloss: 0.223234
[202]	valid_0's multi_logloss: 0.227602
[203]	valid_0's multi_logloss: 0.227387
[204]	valid_0's multi_logloss: 0.229631
[205]	valid_0's multi_logloss: 0.228968
[206]	valid_0's multi_logloss: 0.228568
[207]	valid_0's multi_logloss: 0.229258
[208]	valid_0's multi_logloss: 0.231328
[209]	valid_0's multi_logloss: 0.230492
[210]	valid_0's multi_logloss: 0.232531
[211]	valid_0's multi_logloss: 0.231926
[212]	valid_0's multi_logloss: 0.234619
[213]	valid_0's multi_logloss: 0.235037
[21

[103]	valid_0's multi_logloss: 0.192374
[104]	valid_0's multi_logloss: 0.192683
[105]	valid_0's multi_logloss: 0.189859
[106]	valid_0's multi_logloss: 0.188798
[107]	valid_0's multi_logloss: 0.187559
[108]	valid_0's multi_logloss: 0.187843
[109]	valid_0's multi_logloss: 0.187664
[110]	valid_0's multi_logloss: 0.186393
[111]	valid_0's multi_logloss: 0.185825
[112]	valid_0's multi_logloss: 0.186041
[113]	valid_0's multi_logloss: 0.185345
[114]	valid_0's multi_logloss: 0.183704
[115]	valid_0's multi_logloss: 0.185111
[116]	valid_0's multi_logloss: 0.184133
[117]	valid_0's multi_logloss: 0.184274
[118]	valid_0's multi_logloss: 0.18401
[119]	valid_0's multi_logloss: 0.184273
[120]	valid_0's multi_logloss: 0.184149
[121]	valid_0's multi_logloss: 0.185585
[122]	valid_0's multi_logloss: 0.186665
[123]	valid_0's multi_logloss: 0.186395
[124]	valid_0's multi_logloss: 0.187514
[125]	valid_0's multi_logloss: 0.187282
[126]	valid_0's multi_logloss: 0.187133
[127]	valid_0's multi_logloss: 0.186739
[

[13]	valid_0's multi_logloss: 0.551034
[14]	valid_0's multi_logloss: 0.567508
[15]	valid_0's multi_logloss: 0.518201
[16]	valid_0's multi_logloss: 0.476399
[17]	valid_0's multi_logloss: 0.507756
[18]	valid_0's multi_logloss: 0.469692
[19]	valid_0's multi_logloss: 0.437465
[20]	valid_0's multi_logloss: 0.450627
[21]	valid_0's multi_logloss: 0.419171
[22]	valid_0's multi_logloss: 0.424361
[23]	valid_0's multi_logloss: 0.436001
[24]	valid_0's multi_logloss: 0.407968
[25]	valid_0's multi_logloss: 0.418867
[26]	valid_0's multi_logloss: 0.390727
[27]	valid_0's multi_logloss: 0.367868
[28]	valid_0's multi_logloss: 0.348189
[29]	valid_0's multi_logloss: 0.351501
[30]	valid_0's multi_logloss: 0.331851
[31]	valid_0's multi_logloss: 0.335106
[32]	valid_0's multi_logloss: 0.339227
[33]	valid_0's multi_logloss: 0.354626
[34]	valid_0's multi_logloss: 0.35087
[35]	valid_0's multi_logloss: 0.333359
[36]	valid_0's multi_logloss: 0.319972
[37]	valid_0's multi_logloss: 0.322221
[38]	valid_0's multi_loglo

[224]	valid_0's multi_logloss: 0.232586
[225]	valid_0's multi_logloss: 0.232018
[226]	valid_0's multi_logloss: 0.232904
[227]	valid_0's multi_logloss: 0.232102
[228]	valid_0's multi_logloss: 0.231203
[229]	valid_0's multi_logloss: 0.23121
[230]	valid_0's multi_logloss: 0.230699
[231]	valid_0's multi_logloss: 0.229817
[232]	valid_0's multi_logloss: 0.23158
[233]	valid_0's multi_logloss: 0.231425
[234]	valid_0's multi_logloss: 0.230516
[235]	valid_0's multi_logloss: 0.231968
[236]	valid_0's multi_logloss: 0.231162
[237]	valid_0's multi_logloss: 0.229218
[238]	valid_0's multi_logloss: 0.230214
[239]	valid_0's multi_logloss: 0.229256
[240]	valid_0's multi_logloss: 0.228209
[241]	valid_0's multi_logloss: 0.22745
[242]	valid_0's multi_logloss: 0.226998
[243]	valid_0's multi_logloss: 0.227853
[244]	valid_0's multi_logloss: 0.226916
[245]	valid_0's multi_logloss: 0.226148
[246]	valid_0's multi_logloss: 0.227939
[247]	valid_0's multi_logloss: 0.227321
[248]	valid_0's multi_logloss: 0.226796
[24

[133]	valid_0's multi_logloss: 0.228559
[134]	valid_0's multi_logloss: 0.227933
[135]	valid_0's multi_logloss: 0.227553
[136]	valid_0's multi_logloss: 0.226853
[137]	valid_0's multi_logloss: 0.226895
[138]	valid_0's multi_logloss: 0.227477
[139]	valid_0's multi_logloss: 0.226363
[140]	valid_0's multi_logloss: 0.227384
[141]	valid_0's multi_logloss: 0.226972
[142]	valid_0's multi_logloss: 0.226771
[143]	valid_0's multi_logloss: 0.226528
[144]	valid_0's multi_logloss: 0.226124
[145]	valid_0's multi_logloss: 0.226598
[146]	valid_0's multi_logloss: 0.226514
[147]	valid_0's multi_logloss: 0.227804
[148]	valid_0's multi_logloss: 0.227418
[149]	valid_0's multi_logloss: 0.226818
[150]	valid_0's multi_logloss: 0.226594
[151]	valid_0's multi_logloss: 0.22494
[152]	valid_0's multi_logloss: 0.227332
[153]	valid_0's multi_logloss: 0.227199
[154]	valid_0's multi_logloss: 0.226684
[155]	valid_0's multi_logloss: 0.229367
[156]	valid_0's multi_logloss: 0.228877
[157]	valid_0's multi_logloss: 0.228928
[

[47]	valid_0's multi_logloss: 0.373402
[48]	valid_0's multi_logloss: 0.382307
[49]	valid_0's multi_logloss: 0.390525
[50]	valid_0's multi_logloss: 0.393724
[51]	valid_0's multi_logloss: 0.398123
[52]	valid_0's multi_logloss: 0.40739
[53]	valid_0's multi_logloss: 0.414898
[54]	valid_0's multi_logloss: 0.396905
[55]	valid_0's multi_logloss: 0.378828
[56]	valid_0's multi_logloss: 0.364449
[57]	valid_0's multi_logloss: 0.354152
[58]	valid_0's multi_logloss: 0.36057
[59]	valid_0's multi_logloss: 0.365068
[60]	valid_0's multi_logloss: 0.357331
[61]	valid_0's multi_logloss: 0.360909
[62]	valid_0's multi_logloss: 0.367182
[63]	valid_0's multi_logloss: 0.371998
[64]	valid_0's multi_logloss: 0.358557
[65]	valid_0's multi_logloss: 0.349471
[66]	valid_0's multi_logloss: 0.342649
[67]	valid_0's multi_logloss: 0.336515
[68]	valid_0's multi_logloss: 0.328696
[69]	valid_0's multi_logloss: 0.320272
[70]	valid_0's multi_logloss: 0.322139
[71]	valid_0's multi_logloss: 0.315002
[72]	valid_0's multi_loglos

[256]	valid_0's multi_logloss: 0.319638
[257]	valid_0's multi_logloss: 0.319828
[258]	valid_0's multi_logloss: 0.322074
[259]	valid_0's multi_logloss: 0.320835
[260]	valid_0's multi_logloss: 0.323857
[261]	valid_0's multi_logloss: 0.32637
[262]	valid_0's multi_logloss: 0.325315
[263]	valid_0's multi_logloss: 0.324063
[264]	valid_0's multi_logloss: 0.32287
[265]	valid_0's multi_logloss: 0.321466
[266]	valid_0's multi_logloss: 0.322268
[267]	valid_0's multi_logloss: 0.321312
[268]	valid_0's multi_logloss: 0.320392
[269]	valid_0's multi_logloss: 0.319489
[270]	valid_0's multi_logloss: 0.320725
[271]	valid_0's multi_logloss: 0.319401
[272]	valid_0's multi_logloss: 0.318642
[273]	valid_0's multi_logloss: 0.324698
[274]	valid_0's multi_logloss: 0.327917
[275]	valid_0's multi_logloss: 0.32682
[276]	valid_0's multi_logloss: 0.325436
[277]	valid_0's multi_logloss: 0.324204
[278]	valid_0's multi_logloss: 0.322975
[279]	valid_0's multi_logloss: 0.325162
[280]	valid_0's multi_logloss: 0.323996
[28

## LightGBM 모델 학습

In [8]:
# Stratified K-fold training of a (default GBDT) LightGBM classifier.
#   p_val: out-of-fold class probabilities for every training row
#   p_tst: class probabilities for the test rows, averaged over the folds
n_class = len(le.classes_)          # one probability column per encoded class
n_fold = cv.get_n_splits()          # number of folds actually produced by `cv`
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    # NOTE(review): `subsample` only takes effect when `subsample_freq` > 0;
    # it is left unset here, so row subsampling appears to be inactive.
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=3000,
                             num_leaves=512,
                             learning_rate=0.03,
                             min_child_samples=10,
                             subsample=0.8,
                             colsample_bytree=.6,
                             random_state=42,
                             n_jobs=-1)
    # Stop once the validation loss has not improved for 15 rounds.
    # NOTE(review): LightGBM >= 4.0 no longer accepts this fit() argument;
    # there, pass callbacks=[lgb.early_stopping(15)] instead.
    clf.fit(trn[i_trn], label_int[i_trn],
            eval_set=[(trn[i_val], label_int[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=15)

    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    # Divide by the real fold count (was a hard-coded 15) so that the
    # averaged test probabilities sum to 1 for every row.
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.61628
Training until validation scores don't improve for 15 rounds
[2]	valid_0's multi_logloss: 1.5194
[3]	valid_0's multi_logloss: 1.43807
[4]	valid_0's multi_logloss: 1.36197
[5]	valid_0's multi_logloss: 1.29399
[6]	valid_0's multi_logloss: 1.23262
[7]	valid_0's multi_logloss: 1.17466
[8]	valid_0's multi_logloss: 1.12302
[9]	valid_0's multi_logloss: 1.07591
[10]	valid_0's multi_logloss: 1.03182
[11]	valid_0's multi_logloss: 0.991109
[12]	valid_0's multi_logloss: 0.953947
[13]	valid_0's multi_logloss: 0.919563
[14]	valid_0's multi_logloss: 0.885712
[15]	valid_0's multi_logloss: 0.853216
[16]	valid_0's multi_logloss: 0.823142
[17]	valid_0's multi_logloss: 0.794326
[18]	valid_0's multi_logloss: 0.767764
[19]	valid_0's multi_logloss: 0.743161
[20]	valid_0's multi_logloss: 0.718112
[21]	valid_0's multi_logloss: 0.696529
[22]	valid_0's multi_logloss: 0.67499
[23]	valid_0's multi_logloss: 0.654978
[24]	valid_0's multi_logloss: 0.635404

[78]	valid_0's multi_logloss: 0.349527
[79]	valid_0's multi_logloss: 0.348636
[80]	valid_0's multi_logloss: 0.347738
[81]	valid_0's multi_logloss: 0.346597
[82]	valid_0's multi_logloss: 0.346031
[83]	valid_0's multi_logloss: 0.345204
[84]	valid_0's multi_logloss: 0.344158
[85]	valid_0's multi_logloss: 0.34349
[86]	valid_0's multi_logloss: 0.34296
[87]	valid_0's multi_logloss: 0.342372
[88]	valid_0's multi_logloss: 0.341808
[89]	valid_0's multi_logloss: 0.342048
[90]	valid_0's multi_logloss: 0.342055
[91]	valid_0's multi_logloss: 0.341552
[92]	valid_0's multi_logloss: 0.341405
[93]	valid_0's multi_logloss: 0.341204
[94]	valid_0's multi_logloss: 0.341657
[95]	valid_0's multi_logloss: 0.341866
[96]	valid_0's multi_logloss: 0.342264
[97]	valid_0's multi_logloss: 0.342817
[98]	valid_0's multi_logloss: 0.34313
[99]	valid_0's multi_logloss: 0.343483
[100]	valid_0's multi_logloss: 0.34355
[101]	valid_0's multi_logloss: 0.344105
[102]	valid_0's multi_logloss: 0.344379
[103]	valid_0's multi_logl

[59]	valid_0's multi_logloss: 0.34437
[60]	valid_0's multi_logloss: 0.340879
[61]	valid_0's multi_logloss: 0.337111
[62]	valid_0's multi_logloss: 0.334275
[63]	valid_0's multi_logloss: 0.33107
[64]	valid_0's multi_logloss: 0.327371
[65]	valid_0's multi_logloss: 0.324843
[66]	valid_0's multi_logloss: 0.322657
[67]	valid_0's multi_logloss: 0.319974
[68]	valid_0's multi_logloss: 0.31823
[69]	valid_0's multi_logloss: 0.315878
[70]	valid_0's multi_logloss: 0.313828
[71]	valid_0's multi_logloss: 0.311251
[72]	valid_0's multi_logloss: 0.309504
[73]	valid_0's multi_logloss: 0.307611
[74]	valid_0's multi_logloss: 0.305692
[75]	valid_0's multi_logloss: 0.304097
[76]	valid_0's multi_logloss: 0.302916
[77]	valid_0's multi_logloss: 0.302101
[78]	valid_0's multi_logloss: 0.300506
[79]	valid_0's multi_logloss: 0.298269
[80]	valid_0's multi_logloss: 0.297254
[81]	valid_0's multi_logloss: 0.295223
[82]	valid_0's multi_logloss: 0.294045
[83]	valid_0's multi_logloss: 0.292961
[84]	valid_0's multi_logloss

[101]	valid_0's multi_logloss: 0.192746
[102]	valid_0's multi_logloss: 0.19232
[103]	valid_0's multi_logloss: 0.192503
[104]	valid_0's multi_logloss: 0.191552
[105]	valid_0's multi_logloss: 0.191248
[106]	valid_0's multi_logloss: 0.191718
[107]	valid_0's multi_logloss: 0.19191
[108]	valid_0's multi_logloss: 0.192099
[109]	valid_0's multi_logloss: 0.19162
[110]	valid_0's multi_logloss: 0.191051
[111]	valid_0's multi_logloss: 0.190968
[112]	valid_0's multi_logloss: 0.190716
[113]	valid_0's multi_logloss: 0.190839
[114]	valid_0's multi_logloss: 0.190711
[115]	valid_0's multi_logloss: 0.19037
[116]	valid_0's multi_logloss: 0.189619
[117]	valid_0's multi_logloss: 0.18962
[118]	valid_0's multi_logloss: 0.189467
[119]	valid_0's multi_logloss: 0.18932
[120]	valid_0's multi_logloss: 0.189474
[121]	valid_0's multi_logloss: 0.189355
[122]	valid_0's multi_logloss: 0.189231
[123]	valid_0's multi_logloss: 0.189588
[124]	valid_0's multi_logloss: 0.189696
[125]	valid_0's multi_logloss: 0.189412
[126]	

[35]	valid_0's multi_logloss: 0.47392
[36]	valid_0's multi_logloss: 0.462812
[37]	valid_0's multi_logloss: 0.451906
[38]	valid_0's multi_logloss: 0.441253
[39]	valid_0's multi_logloss: 0.43204
[40]	valid_0's multi_logloss: 0.423735
[41]	valid_0's multi_logloss: 0.415035
[42]	valid_0's multi_logloss: 0.406633
[43]	valid_0's multi_logloss: 0.399457
[44]	valid_0's multi_logloss: 0.391761
[45]	valid_0's multi_logloss: 0.38517
[46]	valid_0's multi_logloss: 0.378468
[47]	valid_0's multi_logloss: 0.371794
[48]	valid_0's multi_logloss: 0.364944
[49]	valid_0's multi_logloss: 0.358898
[50]	valid_0's multi_logloss: 0.352865
[51]	valid_0's multi_logloss: 0.34705
[52]	valid_0's multi_logloss: 0.342464
[53]	valid_0's multi_logloss: 0.336851
[54]	valid_0's multi_logloss: 0.332178
[55]	valid_0's multi_logloss: 0.327901
[56]	valid_0's multi_logloss: 0.323675
[57]	valid_0's multi_logloss: 0.319436
[58]	valid_0's multi_logloss: 0.314953
[59]	valid_0's multi_logloss: 0.311129
[60]	valid_0's multi_logloss:

[88]	valid_0's multi_logloss: 0.310309
[89]	valid_0's multi_logloss: 0.309669
[90]	valid_0's multi_logloss: 0.308981
[91]	valid_0's multi_logloss: 0.308955
[92]	valid_0's multi_logloss: 0.308372
[93]	valid_0's multi_logloss: 0.30809
[94]	valid_0's multi_logloss: 0.307592
[95]	valid_0's multi_logloss: 0.307491
[96]	valid_0's multi_logloss: 0.307126
[97]	valid_0's multi_logloss: 0.306659
[98]	valid_0's multi_logloss: 0.306188
[99]	valid_0's multi_logloss: 0.305966
[100]	valid_0's multi_logloss: 0.306018
[101]	valid_0's multi_logloss: 0.305363
[102]	valid_0's multi_logloss: 0.304871
[103]	valid_0's multi_logloss: 0.304318
[104]	valid_0's multi_logloss: 0.303748
[105]	valid_0's multi_logloss: 0.303637
[106]	valid_0's multi_logloss: 0.303711
[107]	valid_0's multi_logloss: 0.302948
[108]	valid_0's multi_logloss: 0.302891
[109]	valid_0's multi_logloss: 0.303018
[110]	valid_0's multi_logloss: 0.303604
[111]	valid_0's multi_logloss: 0.303617
[112]	valid_0's multi_logloss: 0.304099
[113]	valid_0

**(범주값 출력: `clf.predict()`, 확률값 출력: `clf.predict_proba()`)**

In [9]:
# Accuracy of the out-of-fold predictions: the most probable class per row
# is compared with the encoded training labels.
oof_pred = np.argmax(p_val, axis=1)
oof_acc = accuracy_score(label_int, oof_pred)
print(f'{oof_acc * 100:.4f}%')

91.3318%


#강사 베이스90.7944  #2 90.7710
#hw8lgbm-cv 91.1682   
subsample_freq=1, 제거 _91.0047%  
n_e 500_91.0047%  
learning rate 001 91.1449  
subsample 8->7 91.1449 ->6 91.1449 ->5 91.1449 ->4 91.1449 제거 90.9813  
강사 하이퍼옵 90.8178%   
cv 15 >91.1916   
3000 256 003 10 08 06 42 15번  >>91.2617


#### [stacking 사용을 위해 p_val,p_tst 저장]

In [35]:
# File names under which this model's predictions are stored for stacking:
# one file for the validation predictions, one for the test predictions.
name = 'lgbmcvFeature'
model_name = name

val_dir = Path('C:\\Users\\USER\\Desktop\\Dataset\\val')
tst_dir = Path('C:\\Users\\USER\\Desktop\\Dataset\\tst')

p_val_file = val_dir.joinpath(f'{name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{name}.tst.csv')

In [36]:
# Write both probability matrices as comma-separated text with 6 decimals.
for out_path, probs in ((p_val_file, p_val), (p_tst_file, p_tst)):
    np.savetxt(out_path, probs, fmt='%.6f', delimiter=', ')

## Simple way Cross Validation

In [None]:
# One-call cross-validation of a DART LightGBM classifier with cross_val_score.
dart_params = dict(
    boosting_type='dart',
    objective='multiclass',
    n_estimators=600,
    num_leaves=10,
    learning_rate=0.1,
    # min_child_samples=10 is disabled here, so the library default applies
    subsample=.5,
    subsample_freq=1,
    colsample_bytree=.9,
    random_state=150,
    n_jobs=-1,
)
# NOTE: despite its name, `logreg` is a LightGBM classifier.
logreg = lgb.LGBMClassifier(**dart_params)

# Arguments: model, training features, target, number of folds.
scores = cross_val_score(logreg, trn, y, cv=5)
mean_pct = scores.mean()*100
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(mean_pct))  # earlier runs: 90.911, 91.215

In [None]:
# Cross-validation with parameters taken from a hyper-parameter search.
# Recorded search result:
# {'objective': 'multiclass', 'n_estimators': 1000, 'subsample_freq': 1, 'random_state': 42, 'n_jobs': -1,
#  'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.03108434204266342, 'min_child_samples': 10,
#  'num_leaves': 127, 'subsample': 0.6000000000000001}
# NOTE(review): unlike the recorded result, subsample_freq is not set below,
# so `subsample` presumably has no effect here.
tuned_params = dict(
    objective='multiclass',
    n_estimators=500,
    num_leaves=127,
    learning_rate=0.03108434204266342,
    colsample_bytree=.7000000000000001,
    min_child_samples=10,
    subsample=.6000000000000001,
    random_state=42,
    n_jobs=-1,
)
# NOTE: despite its name, `logreg` is a LightGBM classifier.
logreg = lgb.LGBMClassifier(**tuned_params)

# Arguments: model, training features, target, number of folds.
scores = cross_val_score(logreg, trn, y, cv=12)
mean_pct = scores.mean()*100
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(mean_pct))
# Experiment log (scores in %):
# ~91.4-91.5; instructor baseline 91.776 -> dropped with learning rate .05
# hW8-lightgbm-cv 91.846; n_estimators 500 -> 90.981; "001" (presumably learning rate 0.01) -> 91.192
# subsample .8 -> .7: 91.192, -> .6: 91.192, -> .5: 91.192
# instructor's hyper-parameters: 91.659, 91.706 (cv=12)

In [37]:
# Cross-validate whatever classifier the last training loop left in `clf`.
# Arguments: model, training features, target, number of folds.
scores = cross_val_score(clf, trn, y, cv=5)
mean_pct = scores.mean()*100
print('cross-val-score.mean \n{:.3f}'.format(mean_pct))
# Experiment log (scores in %):
# ~91.4-91.5; instructor baseline 91.776 -> dropped with learning rate .05; 91.659 (1500), 91.893 (500)
# 92.033 (n_estimators 300 - the cut-off). Stop using the hyper-parameter-search
# settings from now on; they perform very poorly.

cross-val-score.mean 
91.472


### [수치형 > 범주형 변환_종속변수]

In [38]:
# Turn the averaged test probabilities into class labels: take the most
# probable class index for every row (no hard-coded row count), then map the
# indices back to the original labels with the fitted encoder.
pt = np.argmax(p_tst, axis=1)
target = le.inverse_transform(pt)
target

array(['HI', 'EL', 'HI', ..., 'PH', 'PH', 'CO'], dtype=object)

## 제출 파일 생성

In [39]:
# Build the submission from the sample-submission layout: overwrite its
# 'class' column with the predicted labels and save it.
sub_path = 'C:\\Users\\USER\\Desktop\\Dataset\\sub\\lgbmcv-sub.csv'
sub = pd.read_csv(sample_file)
sub['class'] = target
sub.to_csv(sub_path, index=False, header=False)  # written without index and without header row