### 데이터 로드 및 전처리

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("./data/otto_train.csv")
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [3]:
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [4]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 95


In [5]:
data = data.drop(['id'], axis = 1) # id 제거

In [6]:
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

In [7]:
feature_columns = list(data.columns.difference(['target'])) # target을 제외한 모든 행
X = data[feature_columns] # 설명변수
y = after_mapping_target # 타겟변수
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # 학습데이터와 평가데이터의 비율을 8:2 로 분할| 
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(49502, 93) (12376, 93) (49502,) (12376,)


# AdaBoost 적합

In [8]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree_model = DecisionTreeClassifier(max_depth=5)
clf = AdaBoostClassifier(base_estimator=tree_model, n_estimators=10, random_state=0)
clf.fit(train_x, train_y)
pred1 = clf.predict(test_x)
print(accuracy_score(test_y, pred1))

0.6853587588881707


* 추정 횟수를 증가

In [9]:
tree_model = DecisionTreeClassifier(max_depth=5)
clf = AdaBoostClassifier(base_estimator=tree_model, n_estimators=100, random_state=0)
clf.fit(train_x, train_y)
pred1 = clf.predict(test_x)
print(accuracy_score(test_y, pred1))

0.6105365223012282


* 트리의 깊이를 증가

In [10]:
tree_model = DecisionTreeClassifier(max_depth=20)
clf = AdaBoostClassifier(base_estimator=tree_model, n_estimators=10, random_state=0)
clf.fit(train_x, train_y)
pred1 = clf.predict(test_x)
print(accuracy_score(test_y, pred1))

0.7389301874595993


* 트리의 깊이를 최대로 증가

In [11]:
tree_model = DecisionTreeClassifier(max_depth=100)
clf = AdaBoostClassifier(base_estimator=tree_model, n_estimators=10, random_state=0)
clf.fit(train_x, train_y)
pred1 = clf.predict(test_x)
print(accuracy_score(test_y, pred1))

0.710245636716225


# Gradient Boosting

In [12]:
import xgboost as xgb
import time
start = time.time()
xgb_dtrain = xgb.DMatrix(data=train_x, label=train_y)
xgb_dtest = xgb.DMatrix(data=test_x)
xgb_param = {'max_depth': 10,
            'learning_rate': 0.01,
            'objective': 'multi:softmax', # 목적 함수
            'eval_metric': 'mlogloss',
            'num_class': len(set(train_y)) + 1}  # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params=xgb_param, dtrain=xgb_dtrain)
xgb_model_predict = xgb_model.predict(xgb_dtest)
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

Accuracy: 76.67 %
Time: 15.99 seconds


In [13]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

# LightGBM 적합

In [14]:
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data=train_x, label=train_y)
lgb_param = {'max_depth': 10,
            'n_estimators': 100,
            'learning_rate': 0.01,
            'objective': 'multiclass',
            'num_class': len(set(train_y)) + 1}
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis=1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 7.93 seconds


성능에 영향을 주는 parameter : max_depth, min_data_in_leaf  
lambda_l1, lambda_l2 : regularization 효과를 조정  
-> 여러 parameter 값들을 조정해보며 최적의 설정을 찾아야 한다

In [15]:
lgb_model.predict(test_x)

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

# Catboost

In [16]:
import catboost as cb
start = time.time()
cb_dtrain = cb.Pool(data=train_x, label=train_y)
cb_param = {'max_depth': 10,
           'learning_rate': 0.01,
           'n_estimators': 100,
           'eval_metric': 'Accuracy',
           'loss_function': 'MultiClass'}
cb_model = cb.train(pool=cb_dtrain, params=cb_param)
cb_model_predict = np.argmax(cb_model.predict(test_x), axis=1) + 1
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

0:	learn: 0.5907034	total: 1.59s	remaining: 2m 37s
1:	learn: 0.6356107	total: 2.93s	remaining: 2m 23s
2:	learn: 0.6411256	total: 4.2s	remaining: 2m 15s
3:	learn: 0.6480344	total: 5.63s	remaining: 2m 15s
4:	learn: 0.6508222	total: 6.89s	remaining: 2m 10s
5:	learn: 0.6499939	total: 8.15s	remaining: 2m 7s
6:	learn: 0.6507818	total: 9.54s	remaining: 2m 6s
7:	learn: 0.6548422	total: 10.9s	remaining: 2m 5s
8:	learn: 0.6559533	total: 12.2s	remaining: 2m 3s
9:	learn: 0.6560947	total: 13.5s	remaining: 2m 1s
10:	learn: 0.6568421	total: 14.8s	remaining: 1m 59s
11:	learn: 0.6588219	total: 16.2s	remaining: 1m 59s
12:	learn: 0.6592259	total: 17.6s	remaining: 1m 57s
13:	learn: 0.6611248	total: 18.9s	remaining: 1m 55s
14:	learn: 0.6625591	total: 20.3s	remaining: 1m 54s
15:	learn: 0.6631853	total: 21.6s	remaining: 1m 53s
16:	learn: 0.6639328	total: 23s	remaining: 1m 52s
17:	learn: 0.6668821	total: 24.3s	remaining: 1m 50s
18:	learn: 0.6669630	total: 25.6s	remaining: 1m 49s
19:	learn: 0.6675286	total: 27

In [17]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

# Bagging과 LightBGM 비교

In [18]:
data = pd.read_csv("./data/kc_house_data.csv") 
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1)
feature_columns = list(data.columns.difference(['price'])) # Price를 제외한 모든 행
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(15129, 8) (6484, 8) (15129,) (6484,)


In [19]:
start = time.time()
lgb_dtrain = lgb.Dataset(data=train_x, label=train_y)
lgb_param = {'max_depth': 10,
            'n_estimators': 500,
            'learning_rate': 0.01,
            'objective': 'regression'}
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)
lgb_model.predict(test_x)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666






array([501716.63620816, 632581.56353778, 947111.30341027, ...,
       341921.48670391, 923907.86981542, 457235.91311423])

In [20]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

210904.17249451784

기본적으로 라이브러리들에 random_state가 지정되어 있기 때문에, 여러번 돌려도 같은 결과가 나온다  