# lgbm-cv 데모
- cross-validation 검증 방법: hold-out validation, n-fold CV, stratified n-fold CV, leave-one-out 등.

## 라이브러리 import 및 설정

In [19]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing


# Notebook-wide plotting / display settings.
rcParams['figure.figsize'] = (16, 8)             # default figure size
plt.style.use('fivethirtyeight')                 # default plot style
# Use the fully qualified option key: on pandas >= 1.4 the bare pattern
# 'max_columns' also matches 'styler.render.max_columns', so set_option
# raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)        # show up to 100 columns
pd.set_option("display.precision", 4)            # show 4 decimal places
warnings.simplefilter('ignore')                  # silence all warnings

## 학습데이터 로드

In [20]:
# Absolute paths of the input files on the author's Windows machine.
# NOTE(review): only feature_file and sample_file are read in the visible cells.
feature_file = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\feature.csv'
sample_file  = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\sample_submission.csv'
tst_file     = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\testset.csv'
trn_file     = 'C:\\Users\\USER\\Desktop\\Dataset\\DataInput\\trainset.csv'

df= pd.read_csv(feature_file,index_col=0)        # train + test rows in one table; first CSV column becomes the index
print(df.shape)                                  # (6113, 32) in the recorded run
df.head()                                        # display the first 5 rows

(6113, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,188,128,95,114,143,108,88,103,113,85,88,113,87,88,103,87,84,99,104,82,96,100,78,70,79,84,66,70,75,76,63,HI
1,174,112,88,104,119,92,74,79,88,74,67,90,68,71,73,68,71,77,90,67,71,82,65,70,75,89,73,67,71,89,73,PH
2,175,138,106,105,135,109,75,95,113,96,74,112,96,70,87,100,66,83,117,67,88,110,98,67,88,119,98,75,91,110,94,GR
3,176,111,80,106,131,96,76,99,104,85,75,89,75,79,91,75,84,103,109,82,91,96,78,78,91,96,78,82,104,112,85,PH
4,182,144,111,100,151,119,67,106,114,90,76,115,94,68,106,91,68,102,115,71,95,108,88,71,103,113,92,68,107,118,92,EL


In [21]:
# Split the combined table into training labels, training features and
# test features. Rows [0, N_TRN) are the training set; the remaining rows
# are the test set.
N_TRN = 4280                                     # number of training rows
TARGET_COL = 31                                  # positional index of the label column

y = df.iloc[:N_TRN, TARGET_COL].values           # training labels, (4280,)
# Build the feature table without mutating df so the cell can be re-run:
# an in-place drop removes column 31, and a second run then fails because
# that position no longer exists.
features = df.drop(columns=df.columns[TARGET_COL])
trn = features.iloc[:N_TRN].values               # training features, (4280, 31)
tst = features.iloc[N_TRN:].values               # test features, (1833, 31)
print(y.shape, trn.shape, tst.shape)             # (4280,) (4280, 31) (1833, 31)

(4280,) (4280, 31) (1833, 31)


In [22]:
# Shared constants for the cross-validation cells below.
seed=150   # random_state for the StratifiedKFold splitter and the first LightGBM model
kfold=10   # number of CV folds; also the divisor when averaging test predictions in the first CV loop

## Stratified K-Fold Cross Validation
- **Stratified N-Fold CV**: N-Fold CV에서 각 폴드의 종속변수 분포가 동일하도록 폴드를 나누는 방식.
  현재 사용하는 데이터처럼 분류 학습에서 종속변수의 범주 분포가 균일하지 않을 때 사용된다.

In [23]:
# Stratified splitter shared by the CV loops: kfold shuffled folds, reproducible via seed.
cv = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=seed)

### [범주형 > 수치형 변환_종속변수]

In [24]:
# Encode the string class labels as integers for LightGBM.
# NOTE(review): both imports repeat the ones in the first cell; they are harmless.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
le = preprocessing.LabelEncoder()   # kept around: le.inverse_transform() is used later to recover label strings

label_str=y                         # alias of the training labels
label_int=le.fit_transform(label_str).astype('int')   # integer-coded training labels
label_int                           # display the encoded labels

array([3, 5, 2, ..., 1, 5, 1])

## LightGBM 모델 학습

In [27]:
# Out-of-fold LightGBM training (GOSS boosting, other hyper-parameters at
# their defaults). For every stratified fold a fresh classifier is fitted on
# the training part, early-stopped on the held-out part, and then used to
#   * fill that fold's rows of p_val (out-of-fold class probabilities), and
#   * add its share of the fold-averaged test probabilities to p_tst.
n_class = np.unique(label_int).size              # number of probability columns (derived, not hard-coded)
n_fold = cv.get_n_splits()                       # averaging weight always matches the splitter
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    # The commented-out arguments are tuning knobs deliberately left at defaults.
    clf = lgb.LGBMClassifier(#objective='multiclass',
                             boosting_type='goss',
                             #n_estimators=300,
                             #learning_rate=0.1,
                             #num_leaves=64,
                             #min_child_samples=10,
                             #subsample=.5,
                             #subsample_freq=1,
                             #colsample_bytree= 0.8,
                             random_state=seed,
                             n_jobs=-1)
    # Early stopping goes through the callback API: the `early_stopping_rounds`
    # argument of fit() was removed in LightGBM 4.0, while the callback is
    # accepted by both older and newer releases.
    clf.fit(trn[i_trn], label_int[i_trn],
            eval_set=[(trn[i_val], label_int[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=15)])

    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold
# Accuracy of the out-of-fold predictions over the whole training set.
print(f'{accuracy_score(label_int, np.argmax(p_val, axis=1)) * 100:.4f}%')

training model for CV #1
[1]	valid_0's multi_logloss: 1.37971
Training until validation scores don't improve for 15 rounds
[2]	valid_0's multi_logloss: 1.16974
[3]	valid_0's multi_logloss: 1.01032
[4]	valid_0's multi_logloss: 0.88624
[5]	valid_0's multi_logloss: 0.790225
[6]	valid_0's multi_logloss: 0.710444
[7]	valid_0's multi_logloss: 0.644967
[8]	valid_0's multi_logloss: 0.590834
[9]	valid_0's multi_logloss: 0.546052
[10]	valid_0's multi_logloss: 0.508738
[11]	valid_0's multi_logloss: 0.477898
[12]	valid_0's multi_logloss: 0.451304
[13]	valid_0's multi_logloss: 0.426572
[14]	valid_0's multi_logloss: 0.404942
[15]	valid_0's multi_logloss: 0.386581
[16]	valid_0's multi_logloss: 0.371111
[17]	valid_0's multi_logloss: 0.356904
[18]	valid_0's multi_logloss: 0.344573
[19]	valid_0's multi_logloss: 0.333316
[20]	valid_0's multi_logloss: 0.324
[21]	valid_0's multi_logloss: 0.31552
[22]	valid_0's multi_logloss: 0.308556
[23]	valid_0's multi_logloss: 0.301295
[24]	valid_0's multi_logloss: 0.29

[45]	valid_0's multi_logloss: 0.287265
[46]	valid_0's multi_logloss: 0.287392
[47]	valid_0's multi_logloss: 0.289812
[48]	valid_0's multi_logloss: 0.289203
[49]	valid_0's multi_logloss: 0.287672
[50]	valid_0's multi_logloss: 0.289037
[51]	valid_0's multi_logloss: 0.288028
[52]	valid_0's multi_logloss: 0.288671
[53]	valid_0's multi_logloss: 0.289908
[54]	valid_0's multi_logloss: 0.291589
[55]	valid_0's multi_logloss: 0.291863
[56]	valid_0's multi_logloss: 0.291038
Early stopping, best iteration is:
[41]	valid_0's multi_logloss: 0.284805
training model for CV #5
[1]	valid_0's multi_logloss: 1.37717
Training until validation scores don't improve for 15 rounds
[2]	valid_0's multi_logloss: 1.16332
[3]	valid_0's multi_logloss: 1.0041
[4]	valid_0's multi_logloss: 0.881229
[5]	valid_0's multi_logloss: 0.784328
[6]	valid_0's multi_logloss: 0.704371
[7]	valid_0's multi_logloss: 0.63838
[8]	valid_0's multi_logloss: 0.580188
[9]	valid_0's multi_logloss: 0.533124
[10]	valid_0's multi_logloss: 0.494

[76]	valid_0's multi_logloss: 0.180062
[77]	valid_0's multi_logloss: 0.180848
[78]	valid_0's multi_logloss: 0.182646
[79]	valid_0's multi_logloss: 0.182146
[80]	valid_0's multi_logloss: 0.182173
[81]	valid_0's multi_logloss: 0.1833
Early stopping, best iteration is:
[66]	valid_0's multi_logloss: 0.176553
training model for CV #8
[1]	valid_0's multi_logloss: 1.37603
Training until validation scores don't improve for 15 rounds
[2]	valid_0's multi_logloss: 1.15465
[3]	valid_0's multi_logloss: 0.992935
[4]	valid_0's multi_logloss: 0.864333
[5]	valid_0's multi_logloss: 0.762683
[6]	valid_0's multi_logloss: 0.680739
[7]	valid_0's multi_logloss: 0.613639
[8]	valid_0's multi_logloss: 0.557434
[9]	valid_0's multi_logloss: 0.511163
[10]	valid_0's multi_logloss: 0.471506
[11]	valid_0's multi_logloss: 0.440839
[12]	valid_0's multi_logloss: 0.414555
[13]	valid_0's multi_logloss: 0.390467
[14]	valid_0's multi_logloss: 0.368942
[15]	valid_0's multi_logloss: 0.353537
[16]	valid_0's multi_logloss: 0.33

In [8]:
# Out-of-fold LightGBM training with the tuned hyper-parameters.
# Produces p_val (out-of-fold class probabilities for the training rows) and
# p_tst (test class probabilities averaged over the folds).
n_class = np.unique(label_int).size              # number of probability columns (derived, not hard-coded)
# Average over the number of folds the splitter really produces. The previous
# hard-coded divisor (15) did not match a 10-fold splitter, which left every
# row of p_tst summing to 10/15 instead of 1.
n_fold = cv.get_n_splits()
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=3000,
                             num_leaves= 512,
                             learning_rate=0.03,
                             min_child_samples=10,
                             subsample=0.8,
                             colsample_bytree=.6,
                             random_state=42,
                             n_jobs=-1)
    # Early stopping goes through the callback API: the `early_stopping_rounds`
    # argument of fit() was removed in LightGBM 4.0, while the callback is
    # accepted by both older and newer releases.
    clf.fit(trn[i_trn], label_int[i_trn],
            eval_set=[(trn[i_val], label_int[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=15)])

    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.61628
Training until validation scores don't improve for 15 rounds
[2]	valid_0's multi_logloss: 1.5194
[3]	valid_0's multi_logloss: 1.43807
[4]	valid_0's multi_logloss: 1.36197
[5]	valid_0's multi_logloss: 1.29399
[6]	valid_0's multi_logloss: 1.23262
[7]	valid_0's multi_logloss: 1.17466
[8]	valid_0's multi_logloss: 1.12302
[9]	valid_0's multi_logloss: 1.07591
[10]	valid_0's multi_logloss: 1.03182
[11]	valid_0's multi_logloss: 0.991109
[12]	valid_0's multi_logloss: 0.953947
[13]	valid_0's multi_logloss: 0.919563
[14]	valid_0's multi_logloss: 0.885712
[15]	valid_0's multi_logloss: 0.853216
[16]	valid_0's multi_logloss: 0.823142
[17]	valid_0's multi_logloss: 0.794326
[18]	valid_0's multi_logloss: 0.767764
[19]	valid_0's multi_logloss: 0.743161
[20]	valid_0's multi_logloss: 0.718112
[21]	valid_0's multi_logloss: 0.696529
[22]	valid_0's multi_logloss: 0.67499
[23]	valid_0's multi_logloss: 0.654978
[24]	valid_0's multi_logloss: 0.635404

[78]	valid_0's multi_logloss: 0.349527
[79]	valid_0's multi_logloss: 0.348636
[80]	valid_0's multi_logloss: 0.347738
[81]	valid_0's multi_logloss: 0.346597
[82]	valid_0's multi_logloss: 0.346031
[83]	valid_0's multi_logloss: 0.345204
[84]	valid_0's multi_logloss: 0.344158
[85]	valid_0's multi_logloss: 0.34349
[86]	valid_0's multi_logloss: 0.34296
[87]	valid_0's multi_logloss: 0.342372
[88]	valid_0's multi_logloss: 0.341808
[89]	valid_0's multi_logloss: 0.342048
[90]	valid_0's multi_logloss: 0.342055
[91]	valid_0's multi_logloss: 0.341552
[92]	valid_0's multi_logloss: 0.341405
[93]	valid_0's multi_logloss: 0.341204
[94]	valid_0's multi_logloss: 0.341657
[95]	valid_0's multi_logloss: 0.341866
[96]	valid_0's multi_logloss: 0.342264
[97]	valid_0's multi_logloss: 0.342817
[98]	valid_0's multi_logloss: 0.34313
[99]	valid_0's multi_logloss: 0.343483
[100]	valid_0's multi_logloss: 0.34355
[101]	valid_0's multi_logloss: 0.344105
[102]	valid_0's multi_logloss: 0.344379
[103]	valid_0's multi_logl

[59]	valid_0's multi_logloss: 0.34437
[60]	valid_0's multi_logloss: 0.340879
[61]	valid_0's multi_logloss: 0.337111
[62]	valid_0's multi_logloss: 0.334275
[63]	valid_0's multi_logloss: 0.33107
[64]	valid_0's multi_logloss: 0.327371
[65]	valid_0's multi_logloss: 0.324843
[66]	valid_0's multi_logloss: 0.322657
[67]	valid_0's multi_logloss: 0.319974
[68]	valid_0's multi_logloss: 0.31823
[69]	valid_0's multi_logloss: 0.315878
[70]	valid_0's multi_logloss: 0.313828
[71]	valid_0's multi_logloss: 0.311251
[72]	valid_0's multi_logloss: 0.309504
[73]	valid_0's multi_logloss: 0.307611
[74]	valid_0's multi_logloss: 0.305692
[75]	valid_0's multi_logloss: 0.304097
[76]	valid_0's multi_logloss: 0.302916
[77]	valid_0's multi_logloss: 0.302101
[78]	valid_0's multi_logloss: 0.300506
[79]	valid_0's multi_logloss: 0.298269
[80]	valid_0's multi_logloss: 0.297254
[81]	valid_0's multi_logloss: 0.295223
[82]	valid_0's multi_logloss: 0.294045
[83]	valid_0's multi_logloss: 0.292961
[84]	valid_0's multi_logloss

[101]	valid_0's multi_logloss: 0.192746
[102]	valid_0's multi_logloss: 0.19232
[103]	valid_0's multi_logloss: 0.192503
[104]	valid_0's multi_logloss: 0.191552
[105]	valid_0's multi_logloss: 0.191248
[106]	valid_0's multi_logloss: 0.191718
[107]	valid_0's multi_logloss: 0.19191
[108]	valid_0's multi_logloss: 0.192099
[109]	valid_0's multi_logloss: 0.19162
[110]	valid_0's multi_logloss: 0.191051
[111]	valid_0's multi_logloss: 0.190968
[112]	valid_0's multi_logloss: 0.190716
[113]	valid_0's multi_logloss: 0.190839
[114]	valid_0's multi_logloss: 0.190711
[115]	valid_0's multi_logloss: 0.19037
[116]	valid_0's multi_logloss: 0.189619
[117]	valid_0's multi_logloss: 0.18962
[118]	valid_0's multi_logloss: 0.189467
[119]	valid_0's multi_logloss: 0.18932
[120]	valid_0's multi_logloss: 0.189474
[121]	valid_0's multi_logloss: 0.189355
[122]	valid_0's multi_logloss: 0.189231
[123]	valid_0's multi_logloss: 0.189588
[124]	valid_0's multi_logloss: 0.189696
[125]	valid_0's multi_logloss: 0.189412
[126]	

[35]	valid_0's multi_logloss: 0.47392
[36]	valid_0's multi_logloss: 0.462812
[37]	valid_0's multi_logloss: 0.451906
[38]	valid_0's multi_logloss: 0.441253
[39]	valid_0's multi_logloss: 0.43204
[40]	valid_0's multi_logloss: 0.423735
[41]	valid_0's multi_logloss: 0.415035
[42]	valid_0's multi_logloss: 0.406633
[43]	valid_0's multi_logloss: 0.399457
[44]	valid_0's multi_logloss: 0.391761
[45]	valid_0's multi_logloss: 0.38517
[46]	valid_0's multi_logloss: 0.378468
[47]	valid_0's multi_logloss: 0.371794
[48]	valid_0's multi_logloss: 0.364944
[49]	valid_0's multi_logloss: 0.358898
[50]	valid_0's multi_logloss: 0.352865
[51]	valid_0's multi_logloss: 0.34705
[52]	valid_0's multi_logloss: 0.342464
[53]	valid_0's multi_logloss: 0.336851
[54]	valid_0's multi_logloss: 0.332178
[55]	valid_0's multi_logloss: 0.327901
[56]	valid_0's multi_logloss: 0.323675
[57]	valid_0's multi_logloss: 0.319436
[58]	valid_0's multi_logloss: 0.314953
[59]	valid_0's multi_logloss: 0.311129
[60]	valid_0's multi_logloss:

[88]	valid_0's multi_logloss: 0.310309
[89]	valid_0's multi_logloss: 0.309669
[90]	valid_0's multi_logloss: 0.308981
[91]	valid_0's multi_logloss: 0.308955
[92]	valid_0's multi_logloss: 0.308372
[93]	valid_0's multi_logloss: 0.30809
[94]	valid_0's multi_logloss: 0.307592
[95]	valid_0's multi_logloss: 0.307491
[96]	valid_0's multi_logloss: 0.307126
[97]	valid_0's multi_logloss: 0.306659
[98]	valid_0's multi_logloss: 0.306188
[99]	valid_0's multi_logloss: 0.305966
[100]	valid_0's multi_logloss: 0.306018
[101]	valid_0's multi_logloss: 0.305363
[102]	valid_0's multi_logloss: 0.304871
[103]	valid_0's multi_logloss: 0.304318
[104]	valid_0's multi_logloss: 0.303748
[105]	valid_0's multi_logloss: 0.303637
[106]	valid_0's multi_logloss: 0.303711
[107]	valid_0's multi_logloss: 0.302948
[108]	valid_0's multi_logloss: 0.302891
[109]	valid_0's multi_logloss: 0.303018
[110]	valid_0's multi_logloss: 0.303604
[111]	valid_0's multi_logloss: 0.303617
[112]	valid_0's multi_logloss: 0.304099
[113]	valid_0

**(범주값 출력: `clf.predict()`, 확률값 출력: `clf.predict_proba()`)**

In [9]:
# Accuracy (in percent) of the out-of-fold predictions: argmax over p_val's class columns vs. the encoded labels.
print(f'{accuracy_score(label_int, np.argmax(p_val, axis=1)) * 100:.4f}%')

91.3318%


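**튜닝 기록 (CV 정확도, %)**

- 베이스: 90.7944 / #2: 90.7710
- hw8lgbm-cv: 91.1682
- subsample_freq=1 제거: 91.0047
- n_estimators=500: 91.0047
- learning_rate=0.01: 91.1449
- subsample 0.8 → 0.7: 91.1449 → 0.6: 91.1449 → 0.5: 91.1449 → 0.4: 91.1449, 제거 시: 90.9813
- 하이퍼옵(hyperopt): 90.8178
- cv 15: 91.1916
- n_estimators=3000, num_leaves=256, learning_rate=0.03, min_child_samples=10, subsample=0.8, colsample_bytree=0.6, random_state=42, cv 15번: 91.2617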


#### [stacking 사용을 위해 p_val,p_tst 저장]

In [35]:
# Destination files for this model's stacking inputs:
# <val_dir>/<name>.val.csv and <tst_dir>/<name>.tst.csv.
name = 'lgbmcvFeature'
model_name = f'{name}'
val_dir, tst_dir = (Path('C:\\Users\\USER\\Desktop\\Dataset\\val'),
                    Path('C:\\Users\\USER\\Desktop\\Dataset\\tst'))
p_val_file = val_dir.joinpath(f'{name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{name}.tst.csv')

In [36]:
# Write the probability matrices as text with 6 decimals per value and ', ' between columns.
# NOTE(review): the target directories are not created here; presumably they already exist.
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=', ')   # out-of-fold predictions for the training rows
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=', ')   # fold-averaged predictions for the test rows

## Simple way Cross Validation

In [18]:
# One-call alternative to the manual loop: 5-fold cross_val_score on whatever
# estimator `clf` currently refers to (the last one built by a CV cell above),
# printing the mean score as a percentage.
scores = cross_val_score(clf, trn,y, cv=5) # model, train, target, cross validation
print('cross-val-score.mean \n{:.4f}%'.format(scores.mean()*100))
# Experiment log: 91.4-91.5; instructor's baseline 91.776 -> dropped with learning rate = .05;
# 91.659 (1500), 91.893 (500)  -- the numbers in parentheses are presumably n_estimators
# 92.033 (n_estimators 300, the cut-off). Stop using the hyperopt-tuned settings: they perform very poorly.

cross-val-score.mean 
90.5374%


### [수치형 > 범주형 변환_종속변수]

In [38]:
# Turn the averaged test probabilities into class labels.
# A single vectorised argmax over the class axis replaces the per-row loop
# and works for any number of test rows (no hard-coded 1833).
pt = np.argmax(p_tst, axis=1).astype(np.int64)   # predicted integer class per test row
target = le.inverse_transform(pt)                # map integer codes back to the original label strings
target                                           # display the decoded labels

array(['HI', 'EL', 'HI', ..., 'PH', 'PH', 'CO'], dtype=object)

## 제출 파일 생성

In [39]:
# Build the submission: load the sample submission, overwrite its 'class'
# column with the predicted labels, and write it out without index or header row.
sub = pd.read_csv(sample_file)
sub['class'] = target
sub.to_csv('C:\\Users\\USER\\Desktop\\Dataset\\sub\\lgbmcv-sub.csv',index=False,header=False)

In [None]:
# dataset
#logreg =lgb.LGBMClassifier(boosting_type='dart',
#                          objective='multiclass',
#                           n_estimators=600,
##                           num_leaves=10,
#                           learning_rate=0.1,
#                           #min_child_samples=10,
#                           subsample=.5,
#                           subsample_freq=1,
#                           colsample_bytree=.9,
#                           random_state=150,
#                           n_jobs=-1)

#scores = cross_val_score(logreg, trn,y, cv=5) # model, train, target, cross validation
#print('cross-val-score \n{}'.format(scores))
#print('cross-val-score.mean \n{:.3f}'.format(scores.mean()*100))#90.911 91.215

In [None]:
#{'objective': 'multiclass', 'n_estimators': 1000, 'subsample_freq': 1, 'random_state': 42, 'n_jobs': -1, 
#'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.03108434204266342, 'min_child_samples': 10, 
#'num_leaves': 127, 'subsample': 0.6000000000000001}
#logreg =lgb.LGBMClassifier(objective='multiclass',
#                           n_estimators=500,
#                           num_leaves=127,
#                           learning_rate=0.03108434204266342,
#                           colsample_bytree=.7000000000000001,  
#                           min_child_samples=10,
#                           subsample=.6000000000000001,
#                           random_state=42,
#                           n_jobs=-1)

#scores = cross_val_score(logreg, trn,y, cv=12) # model, train, target, cross validation
#print('cross-val-score \n{}'.format(scores))
#print('cross-val-score.mean \n{:.3f}'.format(scores.mean()*100))
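# Experiment log: 91.4-91.5; instructor's baseline 91.776 -> dropped with learning rate = .05
# hW8-lightgbm-cv 91.846; n_estimators 500 -> 90.981; learning rate 0.01 -> 91.192
# subsample 0.8 -> 0.7: 91.192 -> 0.6: 91.192 -> 0.5: 91.192
# instructor's hyperopt settings: 91.659, 91.706 (cv=12)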