In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Read the four competition CSV files in a fixed order and bind each result to
# its notebook-level name: labels, train features, test features, sample submission.
train_label, train_feature, test_feature, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_labels.csv',
        './data/train_features.csv',
        './data/test_features.csv',
        './data/sample_submission.csv',
    )
)

In [3]:
# Columns to keep: the 'id' grouping key plus the six sensor channels.
features = ['id', 'acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z']

# Collapse every id's rows into a single row of per-channel summary statistics.
# Train and test run through the very same expression, so both frames end up
# with the same (channel, statistic) columns.
X_data, X_exam = (
    frame[features].groupby('id').agg(['sum', 'median', 'max', 'min', 'mean'])
    for frame in (train_feature, test_feature)
)

In [4]:
# X_data is produced by groupby('id'), so its rows follow sorted id order, while
# train_label keeps the CSV's file order. train_test_split pairs X and y purely
# by position, so align the labels to X_data's id order explicitly instead of
# relying on the CSV already being sorted by id.
# NOTE(review): assumes train_label carries an 'id' column with one row per id
# — confirm against train_labels.csv. When the column is absent, the original
# positional behaviour is kept unchanged.
if 'id' in train_label.columns:
    y_data = (
        train_label.set_index('id')
        .loc[X_data.index, 'label']
        .reset_index(drop=True)  # back to a positional index, as before
    )
else:
    y_data = train_label['label']

In [22]:
from sklearn.model_selection import train_test_split

# Split the full dataset into 80% for training and 20% held out for evaluation.
# - stratify=y_data keeps the label distribution the same in both parts, so no
#   class ends up concentrated on one side of the split.
# - random_state fixes the random seed so the split is reproducible; any
#   integer works.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=26,
)

# XGBoost
XGBoost는 loss함수가 최소가 되는 최적값(optimal output value)을 찾아
잔차 residual을 줄여 나간다

특징
- gbm보다 빠르다, 병렬 처리를 사용하기에 학습과 분류가 빠르다.
- 과적합 방지가 가능한 규제가 포함되어 있다.
- Tree pruning(트리 가지치기): 긍정 이득이 없는 분할을 가지치기해서 분할 수를 줄임
- 내장 교차검증 함수(`xgb.cv`)를 제공한다: 부스팅 반복(round)마다 교차검증 성능을 평가해 최적의 반복 수행 횟수를 찾을 수 있음
- 조기 종료(early stopping)을 제공한다
- CART(Classification And Regression Tree)를 기반으로 한다. 즉, 분류와 회귀 둘 다 가능하다
- 앙상블 부스팅(ensemble boosting)의 특징인 가중치 부여를 경사하강법(gradient descent)으로 한다.

In [23]:
import xgboost as xgb
from xgboost import XGBClassifier

In [24]:
# Baseline booster: many shallow trees combined with a small step size.
xgb_clf = XGBClassifier(
    max_depth=3,         # maximum tree depth; 0 means no limit, large values overfit (commonly 3-10)
    learning_rate=0.01,  # how strongly each boosting step's result is applied (usually between 0 and 1)
    n_estimators=400,    # number of decision trees (boosting rounds)
)

- early_stopping_rounds: 조기 중단을 활성화함. 평가 지표가 지정한 반복 횟수 동안 연속으로 개선되지 않으면 학습을 멈춤 (예: 100이면 100회 동안 개선이 없을 때 중단)
- eval_set: 성능평가를 위한 평가용 데이터 세트를 설정

- eval_metric: 평가 세트에 적용할 성능 평가 방법 (반복마다 eval_set으로 지정된 데이터 세트에서 eval_metric의 지정된 평가 지표로 예측 오류를 측정)
- mlogloss: Multiclass logloss
- logloss: negative log-likelihood

- verbose=True: 학습 로그 찍기
- predict_proba: 클래스별 예측 확률을 반환 (logloss/mlogloss 같은 cross entropy 계열 지표는 예측 클래스가 아니라 확률값으로 계산하기 때문에 필요)

In [25]:
%%time

evals = [(X_test, y_test)] # A list of (X, y) tuple pairs
xgb_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_set=evals, eval_metric='mlogloss', verbose=True)
xgb_pred = xgb_clf.predict(X_test)
xgb_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]


[0]	validation_0-mlogloss:3.99773
Will train until validation_0-mlogloss hasn't improved in 100 rounds.
[1]	validation_0-mlogloss:3.90022
[2]	validation_0-mlogloss:3.8163
[3]	validation_0-mlogloss:3.74108
[4]	validation_0-mlogloss:3.6735
[5]	validation_0-mlogloss:3.61092
[6]	validation_0-mlogloss:3.55276
[7]	validation_0-mlogloss:3.49878
[8]	validation_0-mlogloss:3.44786
[9]	validation_0-mlogloss:3.39967
[10]	validation_0-mlogloss:3.3546
[11]	validation_0-mlogloss:3.31259
[12]	validation_0-mlogloss:3.27276
[13]	validation_0-mlogloss:3.23367
[14]	validation_0-mlogloss:3.19717
[15]	validation_0-mlogloss:3.16192
[16]	validation_0-mlogloss:3.1272
[17]	validation_0-mlogloss:3.09494
[18]	validation_0-mlogloss:3.06339
[19]	validation_0-mlogloss:3.03293
[20]	validation_0-mlogloss:3.00264
[21]	validation_0-mlogloss:2.97441
[22]	validation_0-mlogloss:2.94711
[23]	validation_0-mlogloss:2.92057
[24]	validation_0-mlogloss:2.89463
[25]	validation_0-mlogloss:2.86853
[26]	validation_0-mlogloss:2.8443


[230]	validation_0-mlogloss:1.352
[231]	validation_0-mlogloss:1.34968
[232]	validation_0-mlogloss:1.34741
[233]	validation_0-mlogloss:1.34496
[234]	validation_0-mlogloss:1.34282
[235]	validation_0-mlogloss:1.34063
[236]	validation_0-mlogloss:1.33845
[237]	validation_0-mlogloss:1.33619
[238]	validation_0-mlogloss:1.33402
[239]	validation_0-mlogloss:1.33194
[240]	validation_0-mlogloss:1.32985
[241]	validation_0-mlogloss:1.32752
[242]	validation_0-mlogloss:1.32523
[243]	validation_0-mlogloss:1.32312
[244]	validation_0-mlogloss:1.3211
[245]	validation_0-mlogloss:1.31896
[246]	validation_0-mlogloss:1.31686
[247]	validation_0-mlogloss:1.31472
[248]	validation_0-mlogloss:1.3126
[249]	validation_0-mlogloss:1.31058
[250]	validation_0-mlogloss:1.30849
[251]	validation_0-mlogloss:1.30648
[252]	validation_0-mlogloss:1.30457
[253]	validation_0-mlogloss:1.30254
[254]	validation_0-mlogloss:1.30067
[255]	validation_0-mlogloss:1.2986
[256]	validation_0-mlogloss:1.29657
[257]	validation_0-mlogloss:1.294

In [21]:
# Fraction of hold-out samples whose predicted class equals the true label.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(f'XGBM 정확도 : {xgb_accuracy:.4f}')

XGBM 정확도 : 0.7392


In [None]:
# random_state=42 => 0.68

# 

In [29]:
%%time
xgb_clf = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth=3)

evals = [(X_test, y_test)] 

xgb_clf.fit(X_train, y_train, eval_set=evals, eval_metric='mlogloss', verbose=True)
xgb_pred = xgb_clf.predict(X_test)
xgb_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

[0]	validation_0-mlogloss:3.03421
[1]	validation_0-mlogloss:2.7644
[2]	validation_0-mlogloss:2.56405
[3]	validation_0-mlogloss:2.40533
[4]	validation_0-mlogloss:2.26807
[5]	validation_0-mlogloss:2.15344
[6]	validation_0-mlogloss:2.05636
[7]	validation_0-mlogloss:1.97
[8]	validation_0-mlogloss:1.89353
[9]	validation_0-mlogloss:1.82905
[10]	validation_0-mlogloss:1.7661
[11]	validation_0-mlogloss:1.71308
[12]	validation_0-mlogloss:1.6626
[13]	validation_0-mlogloss:1.61609
[14]	validation_0-mlogloss:1.57543
[15]	validation_0-mlogloss:1.53703
[16]	validation_0-mlogloss:1.50043
[17]	validation_0-mlogloss:1.46843
[18]	validation_0-mlogloss:1.43941
[19]	validation_0-mlogloss:1.41156
[20]	validation_0-mlogloss:1.38745
[21]	validation_0-mlogloss:1.36327
[22]	validation_0-mlogloss:1.34011
[23]	validation_0-mlogloss:1.31939
[24]	validation_0-mlogloss:1.29983
[25]	validation_0-mlogloss:1.2825
[26]	validation_0-mlogloss:1.26482
[27]	validation_0-mlogloss:1.24911
[28]	validation_0-mlogloss:1.2339
[29

[229]	validation_0-mlogloss:1.01555
[230]	validation_0-mlogloss:1.01546
[231]	validation_0-mlogloss:1.01611
[232]	validation_0-mlogloss:1.01659
[233]	validation_0-mlogloss:1.01649
[234]	validation_0-mlogloss:1.01682
[235]	validation_0-mlogloss:1.01691
[236]	validation_0-mlogloss:1.01713
[237]	validation_0-mlogloss:1.01727
[238]	validation_0-mlogloss:1.01751
[239]	validation_0-mlogloss:1.01792
[240]	validation_0-mlogloss:1.01852
[241]	validation_0-mlogloss:1.01822
[242]	validation_0-mlogloss:1.01886
[243]	validation_0-mlogloss:1.01936
[244]	validation_0-mlogloss:1.01968
[245]	validation_0-mlogloss:1.02004
[246]	validation_0-mlogloss:1.02018
[247]	validation_0-mlogloss:1.02038
[248]	validation_0-mlogloss:1.0204
[249]	validation_0-mlogloss:1.02079
[250]	validation_0-mlogloss:1.02137
[251]	validation_0-mlogloss:1.02146
[252]	validation_0-mlogloss:1.0218
[253]	validation_0-mlogloss:1.02229
[254]	validation_0-mlogloss:1.02263
[255]	validation_0-mlogloss:1.02317
[256]	validation_0-mlogloss:1.

In [None]:
# Score the learning_rate=0.1 model: share of hold-out rows classified correctly.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(f'XGBM 정확도 : {xgb_accuracy:.4f}')

## 과적합 제어
- eta(learning_rate) 값을 낮춥니다 (0.01 ~ 0.1). → eta 값을 낮추면 num_boost_round(n_estimators)는 반대로 높여주어야 합니다.
- max_depth 값을 낮춥니다.
- min_child_weight 값을 높입니다.
- gamma 값을 높입니다.
- subsample과 colsample_bytree를 낮춥니다.

## 41위
RandomForest, lgbm, xgb, LogisticRegression 중 xgb가 가장 성능이 좋아서 파라미터 튜닝 후 사용
https://dacon.io/competitions/official/235689/codeshare/2389

In [37]:
# Tuned configuration: deeper trees than the baseline (max_depth 6 instead of 3),
# combined with the overfitting controls min_child_weight, gamma and
# row/column subsampling.
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=3,
    gamma=0.4,
    subsample=0.7,
    colsample_bytree=0.6,
    random_state=22,
    # GPU training options, left disabled:
    # tree_method='gpu_hist',
    # predictor='gpu_predictor',
)

min_child_weight:
- 자식 노드(leaf)에 필요한 관측치 가중치 합의 최솟값을 말한다. 분할 후 가중치 합이 이 값보다 작아지면 더 이상 분할하지 않는다.
- 값이 높을수록 과적합이 방지된다.

gamma: 
- 분할로 얻는 손실 감소량이 이 값보다 클 때만 노드를 분리
- 값이 클수록 과적합 감소효과

subsample:
- training 데이터셋에서 subset을 만들지 전부를 사용할지 정하는 파라미터
- 나무를 만들 때(iteration) 적용하며 과적합 문제를 방지하려고 사용
- 일반적으로 0.5~1 사이의 값을 사용

colsample_bytree:
- 나무를 만들 때 컬럼, 즉 변수를 샘플링해서 쓸지에 대한 파라미터
- 피처가 많을 때 과적합 조절에 사용
- 범위: 0 ~ 1

In [39]:
# Train the tuned model, logging validation mlogloss every round. `evals` is the
# [(X_test, y_test)] list defined in an earlier cell; no early stopping is used,
# so every configured round is trained.
xgb_clf.fit(X_train, y_train, eval_set=evals, eval_metric='mlogloss', verbose=True)

# Hard class predictions for the hold-out rows.
xgb_pred = xgb_clf.predict(X_test)

# Multiclass model (metric: mlogloss): keep the complete per-class probability
# matrix. The previous `[:, 1]` slice is the binary-classification idiom and
# kept only a single class's probability.
xgb_pred_proba = xgb_clf.predict_proba(X_test)

[0]	validation_0-mlogloss:3.09962
[1]	validation_0-mlogloss:2.88785
[2]	validation_0-mlogloss:2.7101
[3]	validation_0-mlogloss:2.5634
[4]	validation_0-mlogloss:2.43892
[5]	validation_0-mlogloss:2.3302
[6]	validation_0-mlogloss:2.23001
[7]	validation_0-mlogloss:2.14183
[8]	validation_0-mlogloss:2.05989
[9]	validation_0-mlogloss:1.98629
[10]	validation_0-mlogloss:1.91627
[11]	validation_0-mlogloss:1.85952
[12]	validation_0-mlogloss:1.80623
[13]	validation_0-mlogloss:1.75493
[14]	validation_0-mlogloss:1.70441
[15]	validation_0-mlogloss:1.66114
[16]	validation_0-mlogloss:1.62516
[17]	validation_0-mlogloss:1.59072
[18]	validation_0-mlogloss:1.55386
[19]	validation_0-mlogloss:1.52061
[20]	validation_0-mlogloss:1.49042
[21]	validation_0-mlogloss:1.46116
[22]	validation_0-mlogloss:1.43518
[23]	validation_0-mlogloss:1.40784
[24]	validation_0-mlogloss:1.38871
[25]	validation_0-mlogloss:1.36547
[26]	validation_0-mlogloss:1.34457
[27]	validation_0-mlogloss:1.32433
[28]	validation_0-mlogloss:1.3074

[228]	validation_0-mlogloss:0.961785
[229]	validation_0-mlogloss:0.961786
[230]	validation_0-mlogloss:0.961882
[231]	validation_0-mlogloss:0.961591
[232]	validation_0-mlogloss:0.961765
[233]	validation_0-mlogloss:0.96181
[234]	validation_0-mlogloss:0.962124
[235]	validation_0-mlogloss:0.961685
[236]	validation_0-mlogloss:0.961515
[237]	validation_0-mlogloss:0.961868
[238]	validation_0-mlogloss:0.961834
[239]	validation_0-mlogloss:0.962425
[240]	validation_0-mlogloss:0.962403
[241]	validation_0-mlogloss:0.96248
[242]	validation_0-mlogloss:0.962239
[243]	validation_0-mlogloss:0.962117
[244]	validation_0-mlogloss:0.962112
[245]	validation_0-mlogloss:0.962024
[246]	validation_0-mlogloss:0.961977
[247]	validation_0-mlogloss:0.961781
[248]	validation_0-mlogloss:0.96195
[249]	validation_0-mlogloss:0.962057
[250]	validation_0-mlogloss:0.96218
[251]	validation_0-mlogloss:0.962499
[252]	validation_0-mlogloss:0.962541
[253]	validation_0-mlogloss:0.962555
[254]	validation_0-mlogloss:0.9623
[255]	v

In [40]:
# Score the tuned model: share of hold-out rows classified correctly.
xgb_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(f'XGBM 정확도 : {xgb_accuracy:.4f}')

XGBM 정확도 : 0.7728


## 42위

https://dacon.io/competitions/official/235689/codeshare/2412

In [None]:
from xgboost import XGBClassifier

# Untuned reference model: XGBClassifier with its default hyperparameters.
# NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` module alias
# from the earlier import cell — consider renaming if the module alias is still
# needed after this cell.
xgb = XGBClassifier().fit(X_train, y_train)  # fit() hands back the fitted estimator
xgbpred = xgb.predict_proba(X_test)  # class probabilities for the hold-out rows

## 44위
파라미터 변경하면서 계속 실행
https://dacon.io/competitions/official/235689/codeshare/2406

In [None]:
# Keep retrying while reducing the number of variables.
xgb_clf = XGBClassifier(random_state=0, n_jobs=-1, n_estimators=1000)

# Up to 1000 rounds, stopping early when validation mlogloss has not improved
# for 400 rounds. `evals` is the [(X_test, y_test)] list from an earlier cell.
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=evals,
    eval_metric='mlogloss',
    early_stopping_rounds=400,
)