참고 코드 : https://dacon.io/competitions/official/235869/codeshare/4450


In [2]:
import numpy as np
import pandas as pd
import random
import os
from pandas import read_csv, set_option

from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import resample, shuffle

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
DATA_PATH = 'D:\\Data\\LGAI_AutoDriveSensors\\'

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# DATA
train_df = pd.read_csv(DATA_PATH + 'train.csv')
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

test_x = pd.read_csv(DATA_PATH + 'test.csv').drop(columns=['ID'])
submit = pd.read_csv(DATA_PATH +'sample_submission.csv')


In [5]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1,14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt.iloc[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

from sklearn.metrics import make_scorer

nmae_score = make_scorer(lg_nrmse, greater_is_better=False)

In [27]:
def lg_nrmse_2(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1,14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[idx], preds[idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

nmae_score2 = make_scorer(lg_nrmse_2, greater_is_better=False)


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

---

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor, Pool
from ngboost import NGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.multioutput import MultiOutputRegressor


In [70]:
etr=ExtraTreesRegressor(n_estimators=200)
etr.fit(X_train,y_train)
y_etr=etr.predict(X_valid)
score_etr=lg_nrmse(y_valid, y_etr)
print(score_etr)
one_etr_pred = etr.predict(test_x)

1.6451451964111956


In [24]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_lr=lr.predict(X_valid)
score_lr=lg_nrmse(y_valid, y_lr)
print(score_lr)

1.665266095049671


In [26]:
rg=Ridge()
rg.fit(X_train,y_train)
y_rg=rg.predict(X_valid)
score_rg=lg_nrmse(y_valid, y_rg)
print(score_rg)

1.6652407246433516


In [27]:
rf=RandomForestRegressor(random_state = 42, criterion = 'mae')
rf.fit(X_train,y_train)
y_rf=rf.predict(X_valid)
score_rf=lg_nrmse(y_valid, y_rf)
print(score_rf)

In [71]:
gbr=MultiOutputRegressor(GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate=0.09, loss='huber', n_estimators=130))
gbr.fit(X_train,y_train)
y_gbr=gbr.predict(X_valid)
score_gbr=lg_nrmse(y_valid, y_gbr)
print(score_gbr)
one_gbr_pred = gbr.predict(test_x)

1.6497996449537005


In [72]:
xgb=MultiOutputRegressor(XGBRegressor(random_state = 42, learning_rate=0.05, n_estimators=200))
xgb.fit(X_train,y_train)
y_xgb=xgb.predict(X_valid)
score_xgb=lg_nrmse(y_valid, y_xgb)
print(score_xgb)
one_xgb_pred = xgb.predict(test_x)

1.6418925051151068


In [73]:
lgb=MultiOutputRegressor(LGBMRegressor(random_state = 42, learning_rate=0.05, n_estimators=200))
lgb.fit(X_train,y_train)
y_lgb=lgb.predict(X_valid)
score_lgb=lg_nrmse(y_valid, y_lgb)
print(score_lgb)
one_lgb_pred = lgb.predict(test_x)

1.6352221801084492


In [15]:
ada=MultiOutputRegressor(AdaBoostRegressor(random_state = 42, learning_rate=0.03, n_estimators=100))
ada.fit(X_train,y_train)
y_ada=ada.predict(X_valid)
score_ada=lg_nrmse(y_valid, y_ada)
print(score_ada)

1.6945845632061969


In [74]:
X_train_npy = X_train.to_numpy()
y_train_npy = y_train.to_numpy()
# hgb=MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42, scoring=nmae_score, learning_rate=0.09))
hgb=MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42, learning_rate=0.09))
hgb.fit(X_train, y_train)
y_hgb=hgb.predict(X_valid)
score_hgb=lg_nrmse(y_valid, y_hgb)
print(score_hgb)
one_hgbr_pred = hgb.predict(test_x)

1.6394618872778197


In [30]:
ngb=MultiOutputRegressor(NGBRegressor(random_state = 42, n_estimators = 500, verbose = 0, learning_rate = 0.03))
ngb.fit(X_train,y_train)
y_ngb=ngb.predict(X_valid)
score_ngb=lg_nrmse(y_valid, y_ngb)
print(score_ngb)

1.6484624958175929


In [69]:
cat_reg=MultiOutputRegressor(CatBoostRegressor(depth = 4, random_state = 42, loss_function = 'MAE', n_estimators = 3000, learning_rate = 0.03, verbose = 0))
cat_reg.fit(X_train,y_train)
y_cat=cat_reg.predict(X_valid)
score_cat=lg_nrmse(y_valid, y_cat)
print(score_cat)

1.6523897837660741


---

성능이 잘 나온 모델들만 KFold 적용해본다      


ExtraTreesRegressor 1.645279494427436 ***         
LinearRegression 1.665266095049671       
Ridge 1.6652407246433516       
RandomForestRegressor      
GradientBoostingRegressor 1.6497996449537005 ***           
XGBRegressor 1.6418925051151068 ***        
LGBMRegressor 1.6352221801084492 ***      
AdaBoostRegressor 1.6945845632061969   
HistGradientBoostingRegressor 1.6394618872778197  ***         
NGBRegressor 1.6484624958175929 ***       
CatBoostRegressor 1.6523897837660741       

In [33]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [37]:
# ExtraTreesRegression
etr_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
etr_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    etr = ExtraTreesRegressor(n_estimators=200)
    etr.fit(tr_x, tr_y)
    
    val_pred = etr.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    etr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
    
    fold_pred = etr.predict(test_x) / 10
    etr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(etr_val)}')

1 FOLD NMAE = 1.629603929164366
2 FOLD NMAE = 1.6571055218576294
3 FOLD NMAE = 1.6989656726897222
4 FOLD NMAE = 1.6572800592747863
5 FOLD NMAE = 1.6542910915218725
6 FOLD NMAE = 1.6656784892693195
7 FOLD NMAE = 1.680446085080938
8 FOLD NMAE = 1.624012426831225
9 FOLD NMAE = 1.6834269492328882
10 FOLD NMAE = 1.636172683274552
10FOLD Mean of NMAE = 1.6586982908197299


In [38]:
# GradientBoostingRegressor
gbr_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
gbr_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate=0.09, loss='huber', n_estimators=130))
    gbr.fit(tr_x, tr_y)
    
    val_pred = gbr.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    gbr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
    
    fold_pred = gbr.predict(test_x) / 10
    gbr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(gbr_val)}')

1 FOLD NMAE = 1.6391646363058967
2 FOLD NMAE = 1.6610711009793402
3 FOLD NMAE = 1.7118887411308636
4 FOLD NMAE = 1.6666550526892283
5 FOLD NMAE = 1.6522677944754105
6 FOLD NMAE = 1.6703466218753105
7 FOLD NMAE = 1.6861689217447793
8 FOLD NMAE = 1.6308138007022441
9 FOLD NMAE = 1.696032472376147
10 FOLD NMAE = 1.6411534465263025
10FOLD Mean of NMAE = 1.6655562588805524


In [42]:
# xgbr
xgb_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
xgb_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    eval_set = [(val_x, val_y)]
    xgb = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7, eval_set=eval_set) )
    xgb.fit(tr_x, tr_y)
    
    val_pred = xgb.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    xgb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
     
    fold_pred = xgb.predict(test_x) / 10
    xgb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(xgb_val)}')

Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This c

In [43]:
# lgbr
lgb_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
lgb_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    lgb = MultiOutputRegressor(LGBMRegressor(random_state = 42, learning_rate=0.05, n_estimators=200))
    lgb.fit(tr_x, tr_y)
    
    val_pred = lgb.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    lgb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
    
    fold_pred = lgb.predict(test_x) / 10
    lgb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(lgb_val)}')

1 FOLD NMAE = 1.6341002234682507
2 FOLD NMAE = 1.636868897077238
3 FOLD NMAE = 1.6872176395947003
4 FOLD NMAE = 1.651096001302942
5 FOLD NMAE = 1.6494997835747032
6 FOLD NMAE = 1.6427430180008227
7 FOLD NMAE = 1.6764849923835894
8 FOLD NMAE = 1.618472195548039
9 FOLD NMAE = 1.6729821819728596
10 FOLD NMAE = 1.6221218760550047
10FOLD Mean of NMAE = 1.649158680897815


In [45]:
# HGBRegressor
hgbr_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
hgbr_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    hgbr = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=42, early_stopping=20))
    hgbr.fit(tr_x, tr_y)
    
    val_pred = hgbr.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    hgbr_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
    
    fold_pred = hgbr.predict(test_x) / 10
    hgbr_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(hgbr_val)}')

1 FOLD NMAE = 1.6345493085182163
2 FOLD NMAE = 1.644164015874683
3 FOLD NMAE = 1.6937652543094563
4 FOLD NMAE = 1.6537856573063918
5 FOLD NMAE = 1.654039579390386
6 FOLD NMAE = 1.6526343833338741
7 FOLD NMAE = 1.683082853558101
8 FOLD NMAE = 1.62198249989197
9 FOLD NMAE = 1.6793520669673858
10 FOLD NMAE = 1.6241478142740347
10FOLD Mean of NMAE = 1.6541503433424498


In [77]:
# NGBRegressor
ngb_pred = np.zeros([test_x.shape[0], train_y.shape[1]])
ngb_val = []

for n, (tr_idx, val_idx) in enumerate(kf.split(train_x, train_y)) :
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx]
    
    ngb = MultiOutputRegressor(NGBRegressor(random_state = 42, n_estimators = 500, verbose = 0, learning_rate = 0.03))
    ngb.fit(X_train,y_train)
    
    val_pred = ngb.predict(val_x)
    val_nmae = lg_nrmse(val_y, val_pred)
    ngb_val.append(val_nmae)
    print(f'{n + 1} FOLD NMAE = {val_nmae}')
    
    fold_pred = ngb.predict(test_x) / 10
    ngb_pred += fold_pred
print(f'10FOLD Mean of NMAE = {np.mean(ngb_val)}')

1 FOLD NMAE = 1.6374000410828387
2 FOLD NMAE = 1.6591095316886828
3 FOLD NMAE = 1.6365080629553284
4 FOLD NMAE = 1.6081978291592867
5 FOLD NMAE = 1.607713876230047
6 FOLD NMAE = 1.61248525231605
7 FOLD NMAE = 1.632369764111269
8 FOLD NMAE = 1.5737748849356306
9 FOLD NMAE = 1.6318012384876421
10 FOLD NMAE = 1.5910062634892046
10FOLD Mean of NMAE = 1.619036674445598


In [79]:
# 검증 성능 확인하기
val_list = [etr_val, gbr_val, xgb_val, lgb_val, hgbr_val, ngb_val]
for val in val_list :
  print("{:.8f}".format(np.mean(val))) 

1.65869829
1.66555626
1.65594080
1.64915868
1.65415034
1.61903667


---

In [66]:
# Submit
submit = pd.read_csv(DATA_PATH +'sample_submission.csv')
submit_pred = (etr_pred + gbr_pred + xgb_pred + 2*lgb_pred + hgbr_pred ) / 6

for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = submit_pred[:,idx-1]
print('Done.')

print(submit.head())
submit.to_csv(DATA_PATH + 'submit/multimodel_1.csv', index=False)

Done.
           ID      Y_01      Y_02      Y_03       Y_04       Y_05       Y_06  \
0  TEST_00001  1.421823  1.199289  1.118757  13.813216  31.262919  16.444118   
1  TEST_00002  1.484784  1.200973  1.141142  13.601456  31.130048  16.533546   
2  TEST_00003  1.425335  1.128164  1.076842  14.703900  32.020931  16.735256   
3  TEST_00004  1.439476  1.123849  1.052510  15.067139  32.440514  17.051939   
4  TEST_00005  1.326475  1.012024  0.958455  14.983642  31.736868  16.905560   

       Y_07       Y_08       Y_09       Y_10       Y_11       Y_12       Y_13  \
0  3.136132 -26.139966 -26.170184 -22.289191  24.451025 -26.094035 -26.070648   
1  3.147978 -26.186631 -26.195718 -22.308649  24.356020 -26.125816 -26.134185   
2  3.083407 -25.924131 -25.931979 -22.099016  24.588286 -25.854101 -25.846501   
3  3.068025 -25.686741 -25.695697 -21.800683  24.879417 -25.673866 -25.631074   
4  3.113377 -25.735997 -25.712771 -21.979755  24.795480 -25.647313 -25.672335   

        Y_14  
0 -26.10454

In [67]:
# multimodel_1.csv : score 1.9405257712

---

In [76]:
# Submit
# KFold 안 한 모델도 합쳐보자.

submit = pd.read_csv(DATA_PATH +'sample_submission.csv')
submit_pred = (etr_pred + gbr_pred + xgb_pred + 2*lgb_pred + hgbr_pred + one_etr_pred + one_gbr_pred + one_xgb_pred + 2*one_lgb_pred + one_hgbr_pred ) / 12

for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = submit_pred[:,idx-1]
print('Done.')

print(submit.head())
submit.to_csv(DATA_PATH + 'submit/multimodel_2.csv', index=False)

Done.
           ID      Y_01      Y_02      Y_03       Y_04       Y_05       Y_06  \
0  TEST_00001  1.411699  1.185183  1.106036  13.779403  31.313434  16.427175   
1  TEST_00002  1.476481  1.202281  1.135625  13.605003  31.143306  16.526610   
2  TEST_00003  1.441634  1.126433  1.054352  14.411322  31.885802  16.845717   
3  TEST_00004  1.437995  1.114545  1.048068  15.110330  32.341183  17.035992   
4  TEST_00005  1.353400  1.011366  0.974485  15.044207  31.720960  16.948335   

       Y_07       Y_08       Y_09       Y_10       Y_11       Y_12       Y_13  \
0  3.130484 -26.137900 -26.192287 -22.266059  24.420294 -26.091331 -26.076072   
1  3.141728 -26.180310 -26.180191 -22.291007  24.387692 -26.109267 -26.115644   
2  3.087740 -25.973161 -25.988678 -22.070209  24.507536 -25.919344 -25.837495   
3  3.080290 -25.705266 -25.693753 -21.784924  24.858571 -25.665595 -25.627581   
4  3.092334 -25.722490 -25.687504 -21.957834  24.801930 -25.642454 -25.642563   

        Y_14  
0 -26.09972

In [None]:
# multimodel_2.csv : score 1.9420249572

---

NGBRegressor 모델 결과 확인 -> 좋으면 합치기 -> 1.61 좋은 score


In [80]:
# Submit
submit = pd.read_csv(DATA_PATH +'sample_submission.csv')
submit_pred = (etr_pred + gbr_pred + xgb_pred + lgb_pred + hgbr_pred + 2*ngb_pred) / 7

for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = submit_pred[:,idx-1]
print('Done.')

print(submit.head())
submit.to_csv(DATA_PATH + 'submit/multimodel_3.csv', index=False)

Done.
           ID      Y_01      Y_02      Y_03       Y_04       Y_05       Y_06  \
0  TEST_00001  1.415879  1.182184  1.109609  13.730886  31.221967  16.433666   
1  TEST_00002  1.482662  1.200889  1.136801  13.598308  31.193445  16.527668   
2  TEST_00003  1.419215  1.117357  1.073230  14.339821  31.866489  16.750808   
3  TEST_00004  1.441641  1.119637  1.051996  15.107137  32.400547  16.961207   
4  TEST_00005  1.335441  1.005687  0.947809  15.023913  31.707833  16.895842   

       Y_07       Y_08       Y_09       Y_10       Y_11       Y_12       Y_13  \
0  3.143991 -26.142866 -26.180048 -22.272428  24.441211 -26.096777 -26.075345   
1  3.143267 -26.173184 -26.187798 -22.293179  24.358728 -26.117567 -26.123770   
2  3.066173 -25.938760 -25.936729 -22.065767  24.567287 -25.856680 -25.847655   
3  3.054633 -25.684965 -25.697044 -21.817222  24.848295 -25.672968 -25.637338   
4  3.100658 -25.756515 -25.713426 -21.997531  24.786606 -25.647218 -25.678778   

        Y_14  
0 -26.11971

In [None]:
# multimodel_3.csv : score 1.9433628177

---

In [84]:
# Submit
# KFold 안 한 모델도 합쳐보자. 하나씩만

submit = pd.read_csv(DATA_PATH +'sample_submission.csv')
submit_pred = (etr_pred + gbr_pred + xgb_pred + lgb_pred + hgbr_pred + one_etr_pred + one_gbr_pred + one_xgb_pred + one_lgb_pred + one_hgbr_pred ) / 10

for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = submit_pred[:,idx-1]
print('Done.')

print(submit.head())
submit.to_csv(DATA_PATH + 'submit/multimodel_4.csv', index=False)

Done.
           ID      Y_01      Y_02      Y_03       Y_04       Y_05       Y_06  \
0  TEST_00001  1.406865  1.177955  1.105111  13.720372  31.315833  16.381356   
1  TEST_00002  1.472500  1.202714  1.135320  13.611027  31.139283  16.544184   
2  TEST_00003  1.437114  1.124916  1.055616  14.319598  31.846189  16.851182   
3  TEST_00004  1.435978  1.114588  1.046672  15.103421  32.358982  17.029645   
4  TEST_00005  1.358921  1.012609  0.973907  15.039456  31.734480  16.937491   

       Y_07       Y_08       Y_09       Y_10       Y_11       Y_12       Y_13  \
0  3.128806 -26.150570 -26.199356 -22.280714  24.409863 -26.095168 -26.085812   
1  3.142770 -26.177999 -26.178334 -22.292308  24.383437 -26.106388 -26.111887   
2  3.084916 -25.968191 -25.973158 -22.064928  24.532671 -25.894770 -25.828137   
3  3.080163 -25.709695 -25.701379 -21.792593  24.849838 -25.667125 -25.631189   
4  3.096894 -25.718843 -25.689608 -21.955929  24.779313 -25.640890 -25.647326   

        Y_14  
0 -26.10805

score 1.9424072278

---
상관계수 높은 피쳐 X와 Y 따로 훈련

In [86]:
###############################################
# X2 : 스크류 삽입 깊이&방열 재료 1 무게
# Y2 : 신호대 잡음비 2 ~ 7
# 상관계수가 높은 피쳐들끼리 따로 훈련
train_x_2 = train_x.loc[:,['X_19','X_20','X_21','X_22','X_03']]
train_y_2 = train_y.loc[:,['Y_08','Y_09','Y_10','Y_12','Y_13','Y_14']]
test_x_2 = test_x.loc[:,['X_19','X_20','X_21','X_22','X_03']]
###############################################
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(train_x_2, train_y_2, test_size=0.2, random_state=42)


In [87]:
# GradientBoostingRegressor
gbr_pred2 = np.zeros([test_x.shape[0], train_y_2.shape[1]])
gbr_val_pred2 = np.zeros([X_valid2.shape[0], train_y_2.shape[1]])

for n, (tr_idx, val_idx) in enumerate(kf.split(X_train2, y_train2)) :
    tr_x, tr_y = X_train2.iloc[tr_idx], y_train2.iloc[tr_idx]
    val_x, val_y = X_train2.iloc[val_idx], y_train2.iloc[val_idx]
    
    gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state = 42, max_depth = 4, learning_rate=0.09, loss='huber', n_estimators=130))
    gbr.fit(tr_x, tr_y)
        
    fold_pred = gbr.predict(test_x_2) / 10
    gbr_pred2 += fold_pred

In [88]:
# lgbr
lgb_pred2 = np.zeros([test_x.shape[0], train_y_2.shape[1]])
lgb_val_pred2 = np.zeros([X_valid2.shape[0], train_y_2.shape[1]])

for n, (tr_idx, val_idx) in enumerate(kf.split(X_train2, y_train2)) :
    tr_x, tr_y = X_train2.iloc[tr_idx], y_train2.iloc[tr_idx]
    val_x, val_y = X_train2.iloc[val_idx], y_train2.iloc[val_idx]
    
    lgb = MultiOutputRegressor(LGBMRegressor(random_state = 42, learning_rate=0.05, n_estimators=200))
    lgb.fit(tr_x, tr_y)
    
    fold_pred = lgb.predict(test_x_2) / 10
    lgb_pred2 += fold_pred

In [90]:
# xgbr
xgb_pred2 = np.zeros([test_x.shape[0], train_y_2.shape[1]])
xgb_val_pred2 = np.zeros([X_valid2.shape[0], train_y_2.shape[1]])

for n, (tr_idx, val_idx) in enumerate(kf.split(X_train2, y_train2)) :
    tr_x, tr_y = X_train2.iloc[tr_idx], y_train2.iloc[tr_idx]
    val_x, val_y = X_train2.iloc[val_idx], y_train2.iloc[val_idx]
    
    eval_set = [(val_x, val_y)]
    xgb = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7, eval_set=eval_set) )
    xgb.fit(tr_x, tr_y)
    
    fold_pred = xgb.predict(test_x_2) / 10
    xgb_pred2 += fold_pred

Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "eval_set" } might not be used.

  This c

In [92]:
# ExtraTreesRegression
etr_pred2 = np.zeros([test_x.shape[0], train_y_2.shape[1]])
etr_val_pred2 = np.zeros([X_valid2.shape[0], train_y_2.shape[1]])

for n, (tr_idx, val_idx) in enumerate(kf.split(X_train2, y_train2)) :
    tr_x, tr_y = X_train2.iloc[tr_idx], y_train2.iloc[tr_idx]
    val_x, val_y = X_train2.iloc[val_idx], y_train2.iloc[val_idx]
    
    etr = ExtraTreesRegressor(n_estimators=200)
    etr.fit(tr_x, tr_y) 
    
    fold_pred = etr.predict(test_x_2) / 10
    etr_pred2 += fold_pred
    

In [94]:
pred2 = (gbr_pred2 + lgb_pred2 + xgb_pred2 + etr_pred2) / 4
pred2.shape

(39608, 6)

In [96]:
# Submit
submit = pd.read_csv(DATA_PATH +'sample_submission.csv')
submit_pred = (etr_pred + gbr_pred + xgb_pred + 2*lgb_pred + hgbr_pred ) / 6

# data 1에 있던 값들을 data 2 값으로 바꾼다.
submit_pred[:,7:10] = pred2[:,:3].copy()
submit_pred[:,11:] = pred2[:,3:].copy()

for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = submit_pred[:,idx-1]
print('Done.')

print(submit.head())
submit.to_csv(DATA_PATH + 'submit/multimodel_5.csv', index=False)

Done.
           ID      Y_01      Y_02      Y_03       Y_04       Y_05       Y_06  \
0  TEST_00001  1.421823  1.199289  1.118757  13.813216  31.262919  16.444118   
1  TEST_00002  1.484784  1.200973  1.141142  13.601456  31.130048  16.533546   
2  TEST_00003  1.425335  1.128164  1.076842  14.703900  32.020931  16.735256   
3  TEST_00004  1.439476  1.123849  1.052510  15.067139  32.440514  17.051939   
4  TEST_00005  1.326475  1.012024  0.958455  14.983642  31.736868  16.905560   

       Y_07       Y_08       Y_09       Y_10       Y_11       Y_12       Y_13  \
0  3.136132 -26.168706 -26.180301 -22.284578  24.451025 -26.106640 -26.101171   
1  3.147978 -26.213424 -26.248511 -22.295752  24.356020 -26.170891 -26.155865   
2  3.083407 -26.210328 -26.213248 -22.379520  24.588286 -26.121032 -26.135101   
3  3.068025 -26.231208 -26.254199 -22.197295  24.879417 -26.179749 -26.177516   
4  3.113377 -26.359606 -26.358690 -22.378162  24.795480 -26.295532 -26.296187   

        Y_14  
0 -26.11795