In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
from datetime import date, timedelta
import warnings 
warnings.filterwarnings('ignore')

In [2]:
path = './input/'
df_train = pd.read_csv(path+'train.csv',
converters={'unit_sales':lambda u: np.log1p(float(u)) if float(u) > 0 else 0},parse_dates=["date"])
df_test  = pd.read_csv(path + "test.csv",parse_dates=["date"])
items = pd.read_csv(path+'items.csv')
stores = pd.read_csv(path+'stores.csv')
# 类型转换
df_train['onpromotion'] = df_train['onpromotion'].astype(bool)
df_test['onpromotion'] = df_test['onpromotion'].astype(bool)

In [3]:
df_2017 = df_train.loc[df_train.date>=pd.datetime(2015,12,1)] 
del df_train

df_2017 = df_2017.merge(items, on='item_nbr', how='left')
df_2017 = df_2017.merge(stores, on='store_nbr', how='left')

In [4]:
tmp = df_2017[df_2017['date']=='2016-12-26']
tmp['date'] = '2016-12-25'
df_2017 = pd.concat([df_2017, tmp], axis=0, ignore_index=True)

In [5]:
promo_2017_train = df_2017.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

promo_2017_test = df_test.set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
promo_2017 = promo_2017.astype('int')

In [6]:
df_2017 = df_2017.set_index(["store_nbr", "item_nbr", "city", "class", "date"])[["unit_sales"]].unstack(level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1) 

# df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()

In [7]:
def get_date_range(df, dt, forward_steps, periods, freq='D'):
    return df[pd.date_range(start=dt-timedelta(days=forward_steps), periods=periods, freq=freq)]

In [8]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        # 视点前 N日促销次数
        'promo_3_2017': get_date_range(promo_2017, t2017, 3, 3).sum(axis=1).values,
        'promo_7_2017': get_date_range(promo_2017, t2017, 7, 7).sum(axis=1).values,
        'promo_14_2017': get_date_range(promo_2017, t2017, 14, 14).sum(axis=1).values,
        # 预测集一年前的16日统计销量
        "last_year_mean": get_date_range(df_2017, t2017, 365, 16).mean(axis=1).values,
        "last_year_meidan": get_date_range(df_2017, t2017, 365, 16).median(axis=1).values,
        "last_year_max": get_date_range(df_2017, t2017, 365, 16).max(axis=1).values,
        "last_year_min": get_date_range(df_2017, t2017, 365, 16).min(axis=1).values,
        # 预测集一年前的16日0销次数
        "last_year_count0": (get_date_range(df_2017, t2017, 365, 16)==0).sum(axis=1).values,
        # 预测集一年前的16日促销次数
        "last_year_promo": get_date_range(promo_2017, t2017, 365, 16).sum(axis=1).values
    })
    
    for i in range(1,8):
        # 历史平移，前 N天的销量
        X["day_{}_hist".format(i)] = get_date_range(df_2017, t2017, i, 1).values.ravel()
        X["day_{}_hist_haspromo".format(i)] = get_date_range(df_2017, t2017, i, 1)[get_date_range(promo_2017, t2017, i, i)==1].values.ravel()
        X["day_{}_hist_nopromo".format(i)] = get_date_range(df_2017, t2017, i, 1)[get_date_range(promo_2017, t2017, i, i)==0].values.ravel()
        
    
    for i in [3,5,7,14,21,30,60,90,150,365]:
        for d in [0,7,14]:
            # 窗口统计，销量 diff/mean/meidan/max/min/std
            X['before_diff_{}_day_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_day_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(-1,axis=1).mean(axis=1).values
            
            X['before_diff_{}_day_decay_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(1,axis=1).apply(lambda x: x * 0.9 ** np.arange(len(x))[::-1]).mean(axis=1).values
            X['after_diff_{}_day_decay_mean'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).diff(-1,axis=1).apply(lambda x: x * 0.9 ** np.arange(len(x))[::-1]).mean(axis=1).values
            
            X['mean_%s_decay_1' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_%s_decay_2' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.7, np.arange(i)[::-1])).sum(axis=1).values
            X['mean_%s_decay_3' % i] = (get_date_range(df_2017, t2017-timedelta(days=d), i, i) * np.power(0.5, np.arange(i)[::-1])).sum(axis=1).values
            
            X['mean_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).mean(axis=1).values
            X['median_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).median(axis=1).values
            X['max_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).max(axis=1).values
            X['min_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).min(axis=1).values
            X['std_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).std(axis=1).values
            X['sum_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).sum(axis=1).values
            X['skew_{}_day'.format(i)] = get_date_range(df_2017, t2017-timedelta(days=d), i, i).skew(axis=1).values

        # 有/无促销的时间，销量统计
        X['mean_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].mean(axis=1).values
        X['median_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].median(axis=1).values
        X['max_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].max(axis=1).values
        X['min_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].min(axis=1).values
        X['std_{}_day_hasnopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].std(axis=1).values
        X['sum_{}_day_haspromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].sum(axis=1).values
        X['skew_{}_day_hasnopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==1].skew(axis=1).values
        
        X['mean_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].mean(axis=1).values
        X['median_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].median(axis=1).values
        X['max_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].max(axis=1).values
        X['min_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].min(axis=1).values
        X['std_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].std(axis=1).values
        X['sum_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].sum(axis=1).values
        X['skew_{}_day_nopromo'.format(i)] = get_date_range(df_2017, t2017, i, i)[get_date_range(promo_2017, t2017, i, i)==0].skew(axis=1).values

        # 无销量次数与促销次数
        X['nopromo_counts_{}_2017'.format(i)] = (get_date_range(df_2017, t2017, i, i)==0).sum(axis=1).values
        X['promo_counts_{}_2017'.format(i)] = get_date_range(promo_2017, t2017, i, i).sum(axis=1).values
        
        
        # 预先按行聚合进行统计
        type_mean = get_date_range(df_2017, t2017, i, i).mean(axis=1)
#         type_median = get_date_range(df_2017, t2017, i, i).median(axis=1)
#         type_max = get_date_range(df_2017, t2017, i, i).max(axis=1)
#         type_min = get_date_range(df_2017, t2017, i, i).min(axis=1)
#         type_std = get_date_range(df_2017, t2017, i, i).std(axis=1)
        
        # 统计类型映射
        type_map = {0:'mean',1:'median',2:'max',3:'min',4:'std'}
        # 不同粒度的统计
        for keys in [['item_nbr'],['store_nbr'],['city'],['class'],['store_nbr','class'],['city','class']]:
            colname = '_'.join(keys)
            for m, tmp_type in enumerate([type_mean]):
                tmp = tmp_type.groupby(keys).mean().to_frame('{}_{}_mean'.format(colname,m))
                X['{}_{}_{}_mean'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_mean'.format(colname,m)].values   
                tmp = tmp_type.groupby(keys).median().to_frame('{}_{}_median'.format(colname,m))
                X['{}_{}_{}_median'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_median'.format(colname,m)].values   
                tmp = tmp_type.groupby(keys).max().to_frame('{}_{}_max'.format(colname,m))
                X['{}_{}_{}_max'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_max'.format(colname,m)].values   
                tmp = tmp_type.groupby(keys).min().to_frame('{}_{}_min'.format(colname,m))
                X['{}_{}_{}_min'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_min'.format(colname,m)].values   
                tmp = tmp_type.groupby(keys).std().to_frame('{}_{}_std'.format(colname,m))
                X['{}_{}_{}_std'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_std'.format(colname,m)].values
                tmp = tmp_type.groupby(keys).sum().to_frame('{}_{}_sum'.format(colname,m))
                X['{}_{}_{}_sum'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_sum'.format(colname,m)].values   
                tmp = tmp_type.groupby(keys).skew().to_frame('{}_{}_skew'.format(colname,m))
                X['{}_{}_{}_skew'.format(colname,i,m)] = df_2017.join(tmp)['{}_{}_skew'.format(colname,m)].values
                               
        
    for i in range(7):
        # 前 N 周平均每周 i 的销量
        for periods in [5,10,15,20]:
            steps = periods * 7
            X['before_diff_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').diff(-1,axis=1).mean(axis=1).values
            X['mean_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').mean(axis=1).values
            X['median_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').median(axis=1).values
            X['max_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').max(axis=1).values
            X['min_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').min(axis=1).values
            X['std_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').std(axis=1).values
            X['sum_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').sum(axis=1).values
            X['skew_{}_dow{}'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D').skew(axis=1).values
            
            X['before_diff_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].diff(-1,axis=1).mean(axis=1).values
            X['mean_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].mean(axis=1).values
            X['median_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].median(axis=1).values
            X['max_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].max(axis=1).values
            X['min_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].min(axis=1).values
            X['std_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].std(axis=1).values
            X['sum_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].sum(axis=1).values
            X['skew_{}_dow{}_hasnopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==1].skew(axis=1).values
            
            X['before_diff_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].diff(1,axis=1).mean(axis=1).values
            X['after_diff_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].diff(-1,axis=1).mean(axis=1).values
            X['mean_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].mean(axis=1).values
            X['median_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].median(axis=1).values
            X['max_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].max(axis=1).values
            X['min_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].min(axis=1).values
            X['std_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].std(axis=1).values
            X['sum_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].sum(axis=1).values
            X['skew_{}_dow{}_nopromo'.format(periods,i)] = get_date_range(df_2017, t2017, steps-i, periods, freq='7D')[get_date_range(promo_2017, t2017, i, i)==0].skew(axis=1).values
            
            
    for i in range(16):
        # 未来16天是否促销日
        X["promo_{}".format(i)] = promo_2017[str(t2017 + timedelta(days=i))].values.astype(np.uint8)
        X["promo_bef_{}".format(i)] = promo_2017[str(t2017 + timedelta(days=-i))].values.astype(np.uint8)
    
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [9]:
X_l, y_l = [], []
t2017 = date(2017, 7, 5)
n_range = 25
for i in tqdm(range(n_range)):
    
    X_tmp, y_tmp = prepare_dataset(t2017 - timedelta(days=7 * i))
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

#【验证集】7.26
X_val, y_val = prepare_dataset(date(2017, 7, 26))
#【测试集】8.16
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

100%|██████████| 25/25 [7:52:58<00:00, 1236.42s/it]


In [10]:
# X_train.to_csv('X_train.csv', index=False)
# y_df = pd.DataFrame(y_train)
# y_df.to_csv('y_train.csv', index=False)

# X_train = pd.read_csv('X_train.csv')
# y_train = pd.read_csv('y_train.csv')
# y_train = y_train.values

In [11]:
del promo_2017_train
del promo_2017_test
del promo_2017
del X_tmp, y_tmp
gc.collect()

0

In [None]:
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'n_jobs': 24,
    'num_threads': -1
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []

item_perishable_dict = dict(zip(items['item_nbr'],items['perishable'].values))
train_weight = []
val_weight = []

items_ = df_2017.reset_index()['item_nbr'].tolist() * n_range
for item in items_:
    train_weight.append(item_perishable_dict[item] * 0.25 + 1)

items_ = df_2017.reset_index()['item_nbr'].values
for item in items_:
    val_weight.append(item_perishable_dict[item] * 0.25 + 1)

del df_2017
gc.collect()

for i in range(16):

    print("====== Step %d ======" % (i+1))
    
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i], weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain, weight=val_weight
    )
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], verbose_eval=100)
    
    val_pred.append(bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))
    
    if i == 0:
        imps = sorted(zip(X_train.columns, bst.feature_importance("gain")), key=lambda x: x[1], reverse=True)
        print(list(sorted(zip(X_train.columns, bst.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
        
# 0.279288 0.310069
# 0.278647 0.309754
# 0.27667  0.310175
# 0.276117 0.309686
# 0.274491 0.307200 sale_lgb_9
# 0.272708 0.305561 sale_lgb_10



In [None]:
# top_500 = [items[0] for items in imps[:500]]

# for i in range(1):

#     print("====== Step %d ======" % (i+1))
    
#     dtrain = lgb.Dataset(
#         X_train[top_500], label=y_train[:, i], weight=train_weight
#     )
#     dval = lgb.Dataset(
#         X_val[top_500], label=y_val[:, i], reference=dtrain, weight=val_weight
#     )
#     bst = lgb.train(
#         params, dtrain, num_boost_round=MAX_ROUNDS,
#         valid_sets=[dtrain, dval], verbose_eval=100)

In [None]:
# top_1000 = [items[0] for items in imps[:1000]]

# for i in range(1):

#     print("====== Step %d ======" % (i+1))
    
#     dtrain = lgb.Dataset(
#         X_train[top_1000], label=y_train[:, i], weight=train_weight
#     )
#     dval = lgb.Dataset(
#         X_val[top_1000], label=y_val[:, i], reference=dtrain, weight=val_weight
#     )
#     bst = lgb.train(
#         params, dtrain, num_boost_round=MAX_ROUNDS,
#         valid_sets=[dtrain, dval], verbose_eval=100)

In [None]:
# top_2000 = [items[0] for items in imps[:2000]]

# for i in range(1):

#     print("====== Step %d ======" % (i+1))
    
#     dtrain = lgb.Dataset(
#         X_train[top_2000], label=y_train[:, i], weight=train_weight
#     )
#     dval = lgb.Dataset(
#         X_val[top_2000], label=y_val[:, i], reference=dtrain, weight=val_weight
#     )
#     bst = lgb.train(
#         params, dtrain, num_boost_round=MAX_ROUNDS,
#         valid_sets=[dtrain, dval], verbose_eval=100)

In [None]:
# tail_2000 = [items[0] for items in imps[-2000:]]

# for i in range(1):

#     print("====== Step %d ======" % (i+1))
    
#     dtrain = lgb.Dataset(
#         X_train[tail_2000], label=y_train[:, i], weight=train_weight
#     )
#     dval = lgb.Dataset(
#         X_val[tail_2000], label=y_val[:, i], reference=dtrain, weight=val_weight
#     )
#     bst = lgb.train(
#         params, dtrain, num_boost_round=MAX_ROUNDS,
#         valid_sets=[dtrain, dval], verbose_eval=100)

In [None]:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(15,10))
# lgb.plot_importance(bst, max_num_features=20, ax=ax,importance_type='gain')
# plt.yticks(fontsize=8)
# plt.xlabel('Feature importance',fontsize=14)
# plt.ylabel('Features',fontsize=14)
# plt.savefig("12_feature_importance.svg", format="svg")

In [25]:
print("验证集 mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame('unit_sales')
df_preds = df_preds.reset_index()
df_preds.columns = ['store_nbr', 'item_nbr', 'city', 'class', 'date', 'unit_sales']

submission = df_test[['id','date','store_nbr','item_nbr']].merge(df_preds, on=['date','store_nbr','item_nbr'], how='left').fillna(0)
submission['unit_sales'] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission[['id','unit_sales']].to_csv('sale_lgb_10.csv', float_format='%.4f', index=None)

# 0.3479347767794617
# 0.3436085951363862
# 0.3420840578048175
# 0.3417190536788496
# 0.3388075841917157 sale_lgb_8.csv
# 0.3383982506391189 sale_lgb_10

验证集 mse: 0.3383982506391189


In [18]:
print("验证集 mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

验证集 mse: 0.3383982506391189


In [None]:
sale_lgb1 = pd.read_csv('sale_lgb_98.csv')
sale_lgb2 = pd.read_csv('sale_lgb.csv')

In [6]:
df_train

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,
...,...,...,...,...,...,...
125497035,125497035,2017-08-15,54,2089339,4.0,False
125497036,125497036,2017-08-15,54,2106464,1.0,True
125497037,125497037,2017-08-15,54,2110456,192.0,False
125497038,125497038,2017-08-15,54,2113914,198.0,True
