In [1]:
import gc
import math
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from xgboost import XGBRegressor

# Load the raw competition files.
# NOTE: the outcome column in this notebook is 'INVC_CONT' in the training
# file (it is split off from the features in the next cell), not 'target'.
# The train/test tables are decoded as EUC-KR; the sample submission is read
# with pandas' default encoding.
train = pd.read_csv('data/train_df.csv', encoding='euc-kr')
test = pd.read_csv('data/test_df.csv', encoding='euc-kr')
submission = pd.read_csv('data/sample_submission.csv')

In [2]:
# Separate the training table into the regression target and the feature
# columns; the 'index' column is removed from both the train and test features.
train_target = train['INVC_CONT']
train_data = train.drop(columns=['index', 'INVC_CONT'])
test_data = test.drop(columns=['index'])

In [3]:
# Stack the train features on top of the test features so both parts are
# encoded with one shared set of columns in the next step. Row labels are
# kept as they are (ignore_index=False), so labels repeat across the two parts.
data = pd.concat([train_data, test_data], axis=0, ignore_index=False)
data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM,DL_GD_MCLS_NM
0,1129000014045300,5011000220046300,패션의류,상의
1,1135000009051200,5011000178037300,생활/건강,반려동물
2,1135000030093100,5011000265091400,패션의류,기타패션의류
3,1154500002014200,5011000315087400,식품,농산물
4,1165000021008300,5011000177051200,식품,가공식품
...,...,...,...,...
4635,5013000858004400,4725000719072200,식품,농산물
4636,5013000870018300,2826000106075300,식품,농산물
4637,5013000897086300,4311100034004300,식품,농산물
4638,5013000902065100,4145000013011200,식품,농산물


In [4]:
# One-hot encode the non-numeric columns of the combined train+test frame so
# that both parts end up with the same dummy columns.
data_one = pd.get_dummies(data)

# Rescale the two numeric columns to [0, 1]. MinMaxScaler scales each feature
# independently, so a single fit over both columns matches fitting one scaler
# per column.
# NOTE(review): `data` is train+test concatenated, so the min/max used here
# are computed over test rows as well as train rows.
cols = ['SEND_SPG_INNB', 'REC_SPG_INNB']
scaled = MinMaxScaler().fit_transform(data_one[cols])
data_one = data_one.assign(**dict(zip(cols, scaled.T)))

# Split back into the train and test parts by the number of training rows.
# Slicing with -len(test_data) breaks when the test part is empty
# (iloc[:-0] is an empty frame and iloc[-0:] is the whole frame).
# NOTE(review): this rebinds train_data/test_data to the encoded frames, so
# the cell that builds `data` should only be re-run after the raw split cell.
n_train = len(train_data)
train_data, test_data = data_one.iloc[:n_train], data_one.iloc[n_train:]

In [5]:
# Display the encoded and scaled training features (pandas renders a
# truncated view of large frames).
train_data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,0.004613,0.999487,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0.006151,0.999487,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0.006151,0.999487,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.011148,0.999487,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.013839,0.999487,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31995,0.999487,0.390569,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31996,0.999487,0.014608,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31997,0.999487,0.771066,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31998,0.999487,0.005382,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Display the encoded and scaled test features (pandas renders a truncated
# view of large frames).
test_data

Unnamed: 0,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_LCLS_NM_디지털/가전,DL_GD_LCLS_NM_생활/건강,DL_GD_LCLS_NM_식품,DL_GD_LCLS_NM_여행/문화,DL_GD_LCLS_NM_패션의류,DL_GD_LCLS_NM_화장품/미용,DL_GD_MCLS_NM_가공식품,DL_GD_MCLS_NM_건강식품,...,DL_GD_MCLS_NM_문화컨텐츠,DL_GD_MCLS_NM_반려동물,DL_GD_MCLS_NM_상의,DL_GD_MCLS_NM_생활용품,DL_GD_MCLS_NM_수산,DL_GD_MCLS_NM_스킨케어,DL_GD_MCLS_NM_음료,DL_GD_MCLS_NM_음반,DL_GD_MCLS_NM_주방용품,DL_GD_MCLS_NM_축산
0,1.0,0.013839,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.011148,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.776012,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.797027,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.413890,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,1.0,0.926192,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4636,1.0,0.439518,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4637,1.0,0.820118,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4638,1.0,0.777550,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Average CV score on the training set was: -5.2776756339554325
# Stages are listed in execution order: two StackingEstimator-wrapped
# regressors, a RobustScaler, and the final RandomForestRegressor.
pipeline_stages = [
    StackingEstimator(
        estimator=XGBRegressor(
            learning_rate=0.1,
            max_depth=3,
            min_child_weight=17,
            n_estimators=100,
            n_jobs=-1,
            objective="reg:squarederror",
            subsample=0.6,
            verbosity=0,
        )
    ),
    StackingEstimator(
        estimator=AdaBoostRegressor(learning_rate=1.0, loss="linear", n_estimators=100)
    ),
    RobustScaler(),
    RandomForestRegressor(
        bootstrap=False,
        max_features=0.05,
        min_samples_leaf=7,
        min_samples_split=18,
        n_estimators=100,
        n_jobs=-1,
    ),
]
exported_pipeline = make_pipeline(*pipeline_stages)

# Set random_state to 42 on every step of the exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

In [51]:
# Train the pipeline on the full training set, then predict the test set.
# NOTE(review): the output stored under this cell lists per-fold RMSE lines
# that these statements do not print, and the execution counts jump from 6 to
# 50 - the saved output appears to come from an earlier version of this cell.
# Re-run from a fresh kernel before relying on those scores.
prediction = exported_pipeline.fit(train_data, train_target).predict(test_data)

Fold: 1
(25600, 28) (6400, 28)
Fold 1 | rmse Score: 5.442106316762841
Fold: 2
(25600, 28) (6400, 28)
Fold 2 | rmse Score: 5.207007920102432
Fold: 3
(25600, 28) (6400, 28)
Fold 3 | rmse Score: 4.858355876731301
Fold: 4
(25600, 28) (6400, 28)
Fold 4 | rmse Score: 5.84428813866861
Fold: 5
(25600, 28) (6400, 28)
Fold 5 | rmse Score: 4.8509988276021
rmse Score = 5.240551415973457
OOF rmse Score = 5.253969317403912


In [52]:
# Put the model output into the sample-submission frame's target column.
submission['INVC_CONT'] = prediction

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
submission_path = Path('submission/final_pred.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)