In [51]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, ensemble
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import lightgbm as lgb

%matplotlib inline
#경고문구 무시
import warnings
warnings.filterwarnings('ignore')

matplotlib.rcParams['axes.unicode_minus']=False
matplotlib.rc('font',family='Malgun Gothic')

In [52]:
train = pd.read_csv('train4.csv')

In [53]:
train.rename(columns={"방송일시": "datetime", "노출(분)": "ExposureTime",
                      "마더코드": "MotherCode", "상품코드": "ItemCode",
                      "상품명": "ItemName", "상품군": "ItemCat",
                      "판매단가": "Price", "취급액": "SellingPrice",
                     '평균기온': 'mean_temp',
                     '최고기온': 'high_temp',
                     '최저기온': 'low_temp',
                     '강수량(mm)': 'rain',
                     '시간': 'time',
                     '판매량': 'sales'}, inplace=True)

In [54]:
train=train.drop('일시',axis=1)

## 1. 상품명 상품군과 같은 object 컬럼또한 category값으로 변경
### (2. 마더코드와 상품코드는 int가 아니라 category 값으로 변경)

In [55]:
for c in train.columns:
    col_type = train[c].dtype
    if col_type == 'object':
        train[c] = train[c].astype('category')

In [56]:
#train['MotherCode'] = train['MotherCode'].astype('category')
#train['ItemCode'] = train['ItemCode'].astype('category')

## 의류

In [167]:
train_clothes=train[train['ItemCat']=='의류']

Y_train_clothes = train_clothes['sales']
X_train_clothes = train_clothes[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price"]]

#train,valid 나누기
train_x_clothes,valid_x_clothes,train_y_clothes,valid_y_clothes=train_test_split(X_train_clothes,Y_train_clothes,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_clothes.shape))
print('train data y size:{}'.format(train_y_clothes.shape))
print('valid data x size:{}'.format(valid_x_clothes.shape))
print('valid data y size:{}\n'.format(valid_y_clothes.shape))

#dataset 
train_clothes_ds = lgb.Dataset(train_x_clothes, label = train_y_clothes) 
valid_clothes_ds = lgb.Dataset(valid_x_clothes, label = valid_y_clothes)

train data x size:(3031, 6)
train data y size:(3031,)
valid data x size:(1300, 6)
valid data y size:(1300,)



## 주방

In [58]:
train_kitchen=train[train['ItemCat']=='주방']

Y_train_kitchen = train_kitchen['sales']
X_train_kitchen = train_kitchen[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_kitchen,valid_x_kitchen,train_y_kitchen,valid_y_kitchen=train_test_split(X_train_kitchen,Y_train_kitchen,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_kitchen.shape))
print('train data y size:{}'.format(train_y_kitchen.shape))
print('valid data x size:{}'.format(valid_x_kitchen.shape))
print('valid data y size:{}\n'.format(valid_y_kitchen.shape))

#dataset 
train_kitchen_ds = lgb.Dataset(train_x_kitchen, label = train_y_kitchen) 
valid_kitchen_ds = lgb.Dataset(valid_x_kitchen, label = valid_y_kitchen)

train data x size:(4596, 7)
train data y size:(4596,)
valid data x size:(1971, 7)
valid data y size:(1971,)



## 가전

In [59]:
train_electronic=train[train['ItemCat']=='가전']

Y_train_electronic = train_electronic['sales']
X_train_electronic = train_electronic[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_electronic,valid_x_electronic,train_y_electronic,valid_y_electronic=train_test_split(X_train_electronic,Y_train_electronic,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_electronic.shape))
print('train data y size:{}'.format(train_y_electronic.shape))
print('valid data x size:{}'.format(valid_x_electronic.shape))
print('valid data y size:{}\n'.format(valid_y_electronic.shape))

#dataset 
train_electronic_ds = lgb.Dataset(train_x_electronic, label = train_y_electronic) 
valid_electronic_ds = lgb.Dataset(valid_x_electronic, label = valid_y_electronic)

train data x size:(3614, 7)
train data y size:(3614,)
valid data x size:(1549, 7)
valid data y size:(1549,)



## 속옷

In [60]:
train_underwear=train[train['ItemCat']=='속옷']

Y_train_underwear = train_underwear['sales']
X_train_underwear = train_underwear[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_underwear,valid_x_underwear,train_y_underwear,valid_y_underwear=train_test_split(X_train_underwear,Y_train_underwear,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_underwear.shape))
print('train data y size:{}'.format(train_y_underwear.shape))
print('valid data x size:{}'.format(valid_x_underwear.shape))
print('valid data y size:{}\n'.format(valid_y_underwear.shape))

#dataset 
train_underwear_ds = lgb.Dataset(train_x_underwear, label = train_y_underwear) 
valid_underwear_ds = lgb.Dataset(valid_x_underwear, label = valid_y_underwear)

train data x size:(2737, 7)
train data y size:(2737,)
valid data x size:(1173, 7)
valid data y size:(1173,)



## 농수축

In [61]:
train_farm=train[train['ItemCat']=='농수축']

Y_train_farm = train_farm['sales']
X_train_farm = train_farm[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_farm,valid_x_farm,train_y_farm,valid_y_farm=train_test_split(X_train_farm,Y_train_farm,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_farm.shape))
print('train data y size:{}'.format(train_y_farm.shape))
print('valid data x size:{}'.format(valid_x_farm.shape))
print('valid data y size:{}\n'.format(valid_y_farm.shape))

#dataset 
train_farm_ds = lgb.Dataset(train_x_farm, label = train_y_farm) 
valid_farm_ds = lgb.Dataset(valid_x_farm, label = valid_y_farm)

train data x size:(2718, 7)
train data y size:(2718,)
valid data x size:(1166, 7)
valid data y size:(1166,)



## 잡화

In [62]:
train_goods=train[train['ItemCat']=='잡화']

Y_train_goods = train_goods['sales']
X_train_goods = train_goods[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_goods,valid_x_goods,train_y_goods,valid_y_goods=train_test_split(X_train_goods,Y_train_goods,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_goods.shape))
print('train data y size:{}'.format(train_y_goods.shape))
print('valid data x size:{}'.format(valid_x_goods.shape))
print('valid data y size:{}\n'.format(valid_y_goods.shape))

#dataset 
train_goods_ds = lgb.Dataset(train_x_goods, label = train_y_goods) 
valid_goods_ds = lgb.Dataset(valid_x_goods, label = valid_y_goods)

train data x size:(2585, 7)
train data y size:(2585,)
valid data x size:(1109, 7)
valid data y size:(1109,)



## 생활용품

In [63]:
train_dailygoods=train[train['ItemCat']=='생활용품']

Y_train_dailygoods = train_dailygoods['sales']
X_train_dailygoods = train_dailygoods[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_dailygoods,valid_x_dailygoods,train_y_dailygoods,valid_y_dailygoods=train_test_split(X_train_dailygoods,Y_train_dailygoods,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_dailygoods.shape))
print('train data y size:{}'.format(train_y_dailygoods.shape))
print('valid data x size:{}'.format(valid_x_dailygoods.shape))
print('valid data y size:{}\n'.format(valid_y_dailygoods.shape))

#dataset 
train_dailygoods_ds = lgb.Dataset(train_x_dailygoods, label = train_y_dailygoods) 
valid_dailygoods_ds = lgb.Dataset(valid_x_dailygoods, label = valid_y_dailygoods)

train data x size:(1938, 7)
train data y size:(1938,)
valid data x size:(831, 7)
valid data y size:(831,)



## 가구

In [64]:
train_funiture=train[train['ItemCat']=='가구']

Y_train_funiture = train_funiture['sales']
X_train_funiture = train_funiture[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_funiture,valid_x_funiture,train_y_funiture,valid_y_funiture=train_test_split(X_train_funiture,Y_train_funiture,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_funiture.shape))
print('train data y size:{}'.format(train_y_funiture.shape))
print('valid data x size:{}'.format(valid_x_funiture.shape))
print('valid data y size:{}\n'.format(valid_y_funiture.shape))

#dataset 
train_funiture_ds = lgb.Dataset(train_x_funiture, label = train_y_funiture) 
valid_funiture_ds = lgb.Dataset(valid_x_funiture, label = valid_y_funiture)

train data x size:(1611, 7)
train data y size:(1611,)
valid data x size:(691, 7)
valid data y size:(691,)



## 이미용

In [65]:
train_hair=train[train['ItemCat']=='주방']

Y_train_hair = train_hair['sales']
X_train_hair = train_hair[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_hair,valid_x_hair,train_y_hair,valid_y_hair=train_test_split(X_train_hair,Y_train_hair,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_hair.shape))
print('train data y size:{}'.format(train_y_hair.shape))
print('valid data x size:{}'.format(valid_x_hair.shape))
print('valid data y size:{}\n'.format(valid_y_hair.shape))

#dataset 
train_hair_ds = lgb.Dataset(train_x_hair, label = train_y_hair) 
valid_hair_ds = lgb.Dataset(valid_x_hair, label = valid_y_hair)

train data x size:(4596, 7)
train data y size:(4596,)
valid data x size:(1971, 7)
valid data y size:(1971,)



## 건강기능

In [66]:
train_health=train[train['ItemCat']=='주방']

Y_train_health = train_health['sales']
X_train_health = train_health[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_health,valid_x_health,train_y_health,valid_y_health=train_test_split(X_train_health,Y_train_health,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_health.shape))
print('train data y size:{}'.format(train_y_health.shape))
print('valid data x size:{}'.format(valid_x_health.shape))
print('valid data y size:{}\n'.format(valid_y_health.shape))

#dataset 
train_health_ds = lgb.Dataset(train_x_health, label = train_y_health) 
valid_health_ds = lgb.Dataset(valid_x_health, label = valid_y_health)

train data x size:(4596, 7)
train data y size:(4596,)
valid data x size:(1971, 7)
valid data y size:(1971,)



## 침구

In [67]:
train_bed=train[train['ItemCat']=='주방']

Y_train_bed = train_bed['sales']
X_train_bed = train_bed[[ "ExposureTime", "MotherCode", "ItemCode","ItemName", "ItemCat","Price","SellingPrice"]]

#train,valid 나누기
train_x_bed,valid_x_bed,train_y_bed,valid_y_bed=train_test_split(X_train_bed,Y_train_bed,test_size=0.3,random_state=1234)
print('train data x size:{}'.format(train_x_bed.shape))
print('train data y size:{}'.format(train_y_bed.shape))
print('valid data x size:{}'.format(valid_x_bed.shape))
print('valid data y size:{}\n'.format(valid_y_bed.shape))

#dataset 
train_bed_ds = lgb.Dataset(train_x_bed, label = train_y_bed) 
valid_bed_ds = lgb.Dataset(valid_x_bed, label = valid_y_bed)

train data x size:(4596, 7)
train data y size:(4596,)
valid data x size:(1971, 7)
valid data y size:(1971,)



### 모델 생성

* early stopping을 위한 설정

In [168]:
params_clothes={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_clothes = lgb.train(params_clothes, train_clothes_ds, 1000, valid_clothes_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3031, number of used features: 5
[LightGBM] [Info] Start training from score 377.141788
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 131029
[200]	valid_0's l2: 114033
[300]	valid_0's l2: 108247
[400]	valid_0's l2: 104843
[500]	valid_0's l2: 103176
[600]	valid_0's l2: 102542
[700]	valid_0's l2: 101631
[800]	valid_0's l2: 100845
[900]	valid_0's l2: 100533
[1000]	valid_0's l2: 99871.7
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 99871.7


In [69]:
params_kitchen={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_kitchen = lgb.train(params_kitchen, train_kitchen_ds, 1000, valid_kitchen_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 4596, number of used features: 6
[LightGBM] [Info] Start training from score 190.678568
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15297.5
[200]	valid_0's l2: 5600.76
[300]	valid_0's l2: 3232.05
[400]	valid_0's l2: 2488.02
[500]	valid_0's l2: 2214.84
[600]	valid_0's l2: 2087.33
[700]	valid_0's l2: 2018.63
[800]	valid_0's l2: 1972.25
[900]	valid_0's l2: 1940.84
[1000]	valid_0's l2: 1934.55
Did not meet early stopping. Best iteration is:
[975]	valid_0's l2: 1929.61


In [70]:
params_electronic={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_electronic = lgb.train(params_electronic, train_electronic_ds, 1000, valid_electronic_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 695
[LightGBM] [Info] Number of data points in the train set: 3614, number of used features: 6
[LightGBM] [Info] Start training from score 19.331006
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 192.959
[200]	valid_0's l2: 72.3379
[300]	valid_0's l2: 44.6137
[400]	valid_0's l2: 35.3847
[500]	valid_0's l2: 30.404
[600]	valid_0's l2: 27.8036
[700]	valid_0's l2: 25.9447
[800]	valid_0's l2: 24.2977
[900]	valid_0's l2: 23.4866
[1000]	valid_0's l2: 22.5864
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 22.5864


In [71]:
params_underwear={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_underwear = lgb.train(params_underwear, train_underwear_ds, 1000, valid_underwear_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 749
[LightGBM] [Info] Number of data points in the train set: 2737, number of used features: 6
[LightGBM] [Info] Start training from score 361.862463
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 24712.9
[200]	valid_0's l2: 7607.01
[300]	valid_0's l2: 3456.13
[400]	valid_0's l2: 2256.02
[500]	valid_0's l2: 1884.96
[600]	valid_0's l2: 1654.14
[700]	valid_0's l2: 1556.88
[800]	valid_0's l2: 1461.07
[900]	valid_0's l2: 1411.19
[1000]	valid_0's l2: 1372.04
Did not meet early stopping. Best iteration is:
[990]	valid_0's l2: 1371.5


In [72]:
params_farm={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_farm = lgb.train(params_farm, train_farm_ds, 1000, valid_farm_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 628
[LightGBM] [Info] Number of data points in the train set: 2718, number of used features: 6
[LightGBM] [Info] Start training from score 973.056921
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 75181.8
[200]	valid_0's l2: 27563.7
[300]	valid_0's l2: 14792.6
[400]	valid_0's l2: 11149.6
[500]	valid_0's l2: 10233.2
[600]	valid_0's l2: 9504.91
[700]	valid_0's l2: 9242.05
Early stopping, best iteration is:
[695]	valid_0's l2: 9213.89


In [73]:
params_goods={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_goods = lgb.train(params_goods, train_goods_ds, 1000, valid_goods_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 906
[LightGBM] [Info] Number of data points in the train set: 2585, number of used features: 6
[LightGBM] [Info] Start training from score 246.262908
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 27570.9
[200]	valid_0's l2: 7516.18
[300]	valid_0's l2: 3025.18
[400]	valid_0's l2: 1868.95
[500]	valid_0's l2: 1488.05
[600]	valid_0's l2: 1302.79
[700]	valid_0's l2: 1176.24
[800]	valid_0's l2: 1098.67
[900]	valid_0's l2: 1045.28
[1000]	valid_0's l2: 990.032
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 990.032


In [74]:
params_dailygoods={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_dailygoods = lgb.train(params_dailygoods, train_dailygoods_ds, 1000, valid_dailygoods_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 696
[LightGBM] [Info] Number of data points in the train set: 1938, number of used features: 6
[LightGBM] [Info] Start training from score 182.043029
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15403.1
[200]	valid_0's l2: 5898.15
[300]	valid_0's l2: 3438.76
[400]	valid_0's l2: 2520.3
[500]	valid_0's l2: 2144.26
[600]	valid_0's l2: 1877.83
[700]	valid_0's l2: 1677.42
[800]	valid_0's l2: 1576.08
[900]	valid_0's l2: 1458.41
[1000]	valid_0's l2: 1362.57
Did not meet early stopping. Best iteration is:
[995]	valid_0's l2: 1361.09


In [75]:
params_funiture={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_funiture = lgb.train(params_funiture, train_funiture_ds, 1000, valid_funiture_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 451
[LightGBM] [Info] Number of data points in the train set: 1611, number of used features: 6
[LightGBM] [Info] Start training from score 19.955384
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 214.425
[200]	valid_0's l2: 146.213
[300]	valid_0's l2: 123.199
[400]	valid_0's l2: 107.318
[500]	valid_0's l2: 95.3518
[600]	valid_0's l2: 85.9674
[700]	valid_0's l2: 79.7711
[800]	valid_0's l2: 74.4774
[900]	valid_0's l2: 69.7496
[1000]	valid_0's l2: 65.6292
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 65.6292


In [76]:
params_hair={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_hair = lgb.train(params_hair, train_hair_ds, 1000, valid_hair_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 4596, number of used features: 6
[LightGBM] [Info] Start training from score 190.678568
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15297.5
[200]	valid_0's l2: 5600.76
[300]	valid_0's l2: 3232.05
[400]	valid_0's l2: 2488.02
[500]	valid_0's l2: 2214.84
[600]	valid_0's l2: 2087.33
[700]	valid_0's l2: 2018.63
[800]	valid_0's l2: 1972.25
[900]	valid_0's l2: 1940.84
[1000]	valid_0's l2: 1934.55
Did not meet early stopping. Best iteration is:
[975]	valid_0's l2: 1929.61


In [77]:
params_health={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_health = lgb.train(params_health, train_health_ds, 1000, valid_health_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 4596, number of used features: 6
[LightGBM] [Info] Start training from score 190.678568
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15297.5
[200]	valid_0's l2: 5600.76
[300]	valid_0's l2: 3232.05
[400]	valid_0's l2: 2488.02
[500]	valid_0's l2: 2214.84
[600]	valid_0's l2: 2087.33
[700]	valid_0's l2: 2018.63
[800]	valid_0's l2: 1972.25
[900]	valid_0's l2: 1940.84
[1000]	valid_0's l2: 1934.55
Did not meet early stopping. Best iteration is:
[975]	valid_0's l2: 1929.61


In [78]:
params_bed={'learning_rate': 0.01, 
          'max_depth': 10, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 10, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2020}

model_bed = lgb.train(params_bed, train_bed_ds, 1000, valid_bed_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 807
[LightGBM] [Info] Number of data points in the train set: 4596, number of used features: 6
[LightGBM] [Info] Start training from score 190.678568
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 15297.5
[200]	valid_0's l2: 5600.76
[300]	valid_0's l2: 3232.05
[400]	valid_0's l2: 2488.02
[500]	valid_0's l2: 2214.84
[600]	valid_0's l2: 2087.33
[700]	valid_0's l2: 2018.63
[800]	valid_0's l2: 1972.25
[900]	valid_0's l2: 1940.84
[1000]	valid_0's l2: 1934.55
Did not meet early stopping. Best iteration is:
[975]	valid_0's l2: 1929.61


# 성능평가

In [79]:
def MAPE(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / (actual))) * 100

In [80]:
def SMAPE(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs(pred-actual) / (np.abs(actual)+np.abs(pred))/2) * 100

In [81]:
valid_y2=valid_y[valid_y>=1]

NameError: name 'valid_y' is not defined

In [None]:
predict_train = model.predict(train_x)
predict_valid = model.predict(valid_x)

mape_train = MAPE(train_y, predict_train)
mape_valid = MAPE(valid_y, predict_valid)

print('train mape :', mape_train)
print('valid mape :',mape_valid)

In [None]:
smape_train = SMAPE(train_y, predict_train)
smape_valid = SMAPE(valid_y, predict_valid)

print('train smape :', smape_train)
print('valid smape :',smape_valid)

In [None]:
v_feature_names=train_x.columns

df_importance=pd.DataFrame()
df_importance['feature']=v_feature_names
df_importance['importance']=model.feature_importance()
df_importance.sort_values('importance',ascending=False,inplace=True)
display(df_importance.round(3))

df_importance.sort_values('importance',ascending=True,inplace=True)
coordinates=range(len(df_importance))
plt.barh(y=coordinates,width=df_importance['importance'])
plt.yticks(coordinates,df_importance['feature'])
plt.xlabel('변수 중요도')
plt.ylabel('변수')


# test

In [82]:
test=pd.read_excel('testdata.xlsx',header=1)  

In [83]:
test.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
0,2020-06-01 06:20:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
1,2020-06-01 06:40:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
2,2020-06-01 07:00:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,
3,2020-06-01 07:20:00,20.0,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,
4,2020-06-01 07:40:00,20.0,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,


In [149]:
test.rename(columns={"방송일시": "datetime", "노출(분)": "ExposureTime",
                      "마더코드": "MotherCode", "상품코드": "ItemCode",
                      "상품명": "ItemName", "상품군": "ItemCat",
                      "판매단가": "Price", "취급액": "SellingPrice"}, inplace=True)

In [161]:
test['sales']=np.nan

In [162]:
test.head()

Unnamed: 0,datetime,ExposureTime,MotherCode,ItemCode,ItemName,ItemCat,Price,SellingPrice,sales
0,2020-06-01 06:20:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,
1,2020-06-01 06:40:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,
2,2020-06-01 07:00:00,20.0,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,,
3,2020-06-01 07:20:00,20.0,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,
4,2020-06-01 07:40:00,20.0,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,,


In [165]:
test_y = test['sales']
test_x = test.drop(['sales','SellingPrice'],axis=1)

In [166]:
test_x

Unnamed: 0,datetime,ExposureTime,MotherCode,ItemCode,ItemName,ItemCat,Price
0,2020-06-01 06:20:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800
1,2020-06-01 06:40:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800
2,2020-06-01 07:00:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800
3,2020-06-01 07:20:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900
4,2020-06-01 07:40:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900
...,...,...,...,...,...,...,...
2886,2020-07-01 00:20:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0
2887,2020-07-01 00:40:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0
2888,2020-07-01 01:00:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0
2889,2020-07-01 01:20:00,20.000000,100261,200875,아놀드파마 티셔츠레깅스세트,의류,69900


In [170]:

for i in test_x.index:
    if (test_x.loc[i]['ItemCat']=='의류')==True:
        y_pred=model_clothes.predict(test_x.loc[i])
        test_y.loc[i]['sales']=y_pred
        #test_x.loc[i]['SellingPrice']=y_pred*test.loc[i]['Price']
        print('의류')

TypeError: Cannot predict data for type Series

In [89]:
i=0
for i in test.iloc[[i]]:
    if test.iloc[[i]]['ItemCat']=='의류':
        y_pred=model_clothes.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='주방':
        y_pred=model_kitchen.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='가전':
        y_pred=model_electronic.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='속옷':
        y_pred=model_underwear.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='농수축':
        y_pred=model_farm.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='잡화':
        y_pred=model_goods.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='생활용품':
        y_pred=model_dailygoods.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='가구':
        y_pred=model_funiture.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='이미용':
        y_pred=model_hair.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    elif test.iloc[[i]]['ItemCat']=='건강기능':
        y_pred=model_health.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
        
    else :
        y_pred=model_bed.predict(test_x)
        test.iloc[[i]]['sales']=y_pred
        test.iloc[[i]]['SellingPrice']=y_pred*test.iloc[[i]]['Price']
    i=i+1   


TypeError: Cannot index by location index with a non-integer key