In [1]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from time import time
from datetime import date
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# Allow up to 1000 columns to be rendered when a wide DataFrame is displayed.
pd.options.display.max_columns = 1000

# 1 数据预处理

In [3]:
# Load the raw tables: orders, distribution, inventory and the item -> category map.
# Date columns are parsed on read; two columns are renamed to the names used below.
order = pd.read_csv("../../data/level2/m111-order-sku.csv", parse_dates=['order_date'])
dis = pd.read_csv("../../data/level2/m111-dis-sku.csv", parse_dates=['dis_date'])

inv = pd.read_csv("../../data/level2/m111-inv-sku.csv", parse_dates=['period_wid'])
inv = inv.rename(columns={'period_wid': 'inv_date'})

category = pd.read_csv("../../data/level2/m111-item-category.csv")
category = category.rename(columns={'sales_segment1_code': 'category'})

## 1.1 处理订单数据

In [4]:
# Keep only orders dated 2017-01-01 through 2018-12-31 (both bounds inclusive).
order = order.loc[order['order_date'].between('2017-01-01', '2018-12-31')]

In [5]:
# Monthly order quantity per item as a wide table: one row per item_code,
# one column per month-end from 2017-01 to 2018-12.
# An offset object is used instead of the 'M' alias, which newer pandas deprecates.
order_month_ends = pd.date_range('2017-01-31', '2018-12-31', freq=pd.offsets.MonthEnd())

order_month = order.copy()
order_month['month'] = order_month['order_date'].astype('str').str[:7]  # 'YYYY-MM'
order_month = order_month.groupby(['item_code', 'month'])['qty'].sum().unstack(level=-1)

# Align on the expected 'YYYY-MM' labels before relabelling the columns, so a
# month with no rows becomes an all-zero column instead of causing a length
# mismatch when the month-end timestamps are assigned positionally.
order_month = order_month.reindex(columns=order_month_ends.strftime('%Y-%m')).fillna(0)
order_month.columns = order_month_ends

## 1.2 处理分销数据

In [6]:
# Keep distribution records from 2017-04-01 through 2018-12-31. Per the original
# author's note, the distribution data only starts in March and March is
# incomplete, hence the April start.
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
dis = dis.loc[(dis['dis_date'] >= '2017-04-01') & (dis['dis_date'] <= '2018-12-31')].copy()

# Negative distribution quantities are replaced by their absolute value.
dis['qty'] = dis['qty'].abs()

In [7]:
# Monthly distribution quantity per item as a wide table: one row per item_code,
# one column per month-end from 2017-04 to 2018-12.
# An offset object is used instead of the 'M' alias, which newer pandas deprecates.
dis_month_ends = pd.date_range('2017-04-30', '2018-12-31', freq=pd.offsets.MonthEnd())

dis_month = dis.copy()
dis_month['month'] = dis_month['dis_date'].astype('str').str[:7]  # 'YYYY-MM'
dis_month = dis_month.groupby(['item_code', 'month'])['qty'].sum().unstack(level=-1)

# Align on the expected 'YYYY-MM' labels before relabelling the columns, so a
# month with no rows becomes an all-zero column instead of causing a length
# mismatch when the month-end timestamps are assigned positionally.
dis_month = dis_month.reindex(columns=dis_month_ends.strftime('%Y-%m')).fillna(0)
dis_month.columns = dis_month_ends

# Use the same rows, in the same order, as order_month; items without any
# distribution record get zeros.
dis_month = dis_month.reindex(order_month.index).fillna(0)

## 1.3 处理库存数据

In [8]:
# Keep inventory records from 2017-06-01 through 2018-12-31 (both bounds inclusive).
# Per the original author's note, inventory data only starts in June 2017.
inv = inv.loc[inv['inv_date'].between('2017-06-01', '2018-12-31')]

In [9]:
# Drop outliers: rows whose quantity exceeds 1,000,000.
# The mask is written as "not greater than" so rows with a missing qty are kept.
inv = inv[~inv['qty'].gt(1000000)]

In [10]:
# Use the snapshot taken on the last day of each month as that month's inventory:
# keep only rows whose inv_date is one of the month-end dates in the range.
is_month_end_snapshot = inv['inv_date'].isin(pd.date_range('2017-06-30', '2018-12-31', freq='M'))
inv = inv.loc[is_month_end_snapshot]

In [11]:
# Month-end inventory per item as a wide table: one row per item_code,
# one column per month-end from 2017-06 to 2018-12.
# An offset object is used instead of the 'M' alias, which newer pandas deprecates.
inv_month_ends = pd.date_range('2017-06-30', '2018-12-31', freq=pd.offsets.MonthEnd())

inv_month = inv.copy()
inv_month['month'] = inv_month['inv_date'].astype('str').str[:7]  # 'YYYY-MM'
inv_month = inv_month.groupby(['item_code', 'month'])['qty'].sum().unstack(level=-1)

# Align on the expected 'YYYY-MM' labels before relabelling the columns, so a
# month without a month-end snapshot becomes an all-zero column instead of
# causing a length mismatch when the timestamps are assigned positionally.
inv_month = inv_month.reindex(columns=inv_month_ends.strftime('%Y-%m')).fillna(0)
inv_month.columns = inv_month_ends

# Use the same rows, in the same order, as order_month; items without any
# inventory record get zeros.
inv_month = inv_month.reindex(order_month.index).fillna(0)

## 1.4 处理品类信息

In [12]:
# Read the item -> category mapping and rename the category column.
category = pd.read_csv("../../data/level2/m111-item-category.csv")
category = category.rename(columns={'sales_segment1_code': 'category'})

In [13]:
# Pandas的bug：'21054110000024', '21054110000025', '21054110000084', '21054110000085' 这4个字符串居然是重复的，取消注释即可看到
# category.loc[category.item_code.duplicated()]

In [14]:
# Stop-gap for the duplicated item codes noted above: keep the first row per
# item_code so that the reindex below does not fail on a non-unique index.
category = category.drop_duplicates(subset=['item_code'], keep='first')

In [15]:
# Index the mapping by item_code, then line its rows up with order_month
# (items absent from the mapping end up with a missing category).
category = category.set_index('item_code')
category = category.reindex(order_month.index)

In [16]:
# Replace the category codes with integer labels; the fitted encoder is kept in
# `encoder` so the original codes can be recovered from the labels.
encoder = LabelEncoder()
category = category.assign(category=encoder.fit_transform(category['category']))

## 1.5 得到每个品类每个月的提货数据

In [17]:
# Monthly order quantity per category: attach each item's category label
# (positionally, since category shares order_month's row order) and sum the
# monthly columns within each category.
order_cate_month = (
    order_month.reset_index()
    .assign(category=category['category'].values)
    .groupby('category')[order_month.columns]
    .sum()
)

## 1.6 得到每个品类每个月的分销数据

In [18]:
# Monthly distribution quantity per category: attach each item's category label
# (positionally, since dis_month was reindexed to order_month's rows) and sum
# the monthly columns within each category.
dis_cate_month = (
    dis_month.reset_index()
    .assign(category=category['category'].values)
    .groupby('category')[dis_month.columns]
    .sum()
)

## 1.7 得到每个品类每个月的库存数据

In [19]:
# Month-end inventory per category: attach each item's category label
# (positionally, since inv_month was reindexed to order_month's rows) and sum
# the monthly columns within each category.
inv_cate_month = (
    inv_month.reset_index()
    .assign(category=category['category'].values)
    .groupby('category')[inv_month.columns]
    .sum()
)

# 2 特征工程

In [20]:
def prepare_dataset(order, dis, inv, year, month, is_train=True, name_prefix=None):
    """Build the feature matrix (and optionally the targets) for one forecast month.

    Every feature is computed from the months strictly before ``year``-``month``.
    The targets are the order quantities of that month and the two months after it.

    Parameters
    ----------
    order, dis, inv : pandas.DataFrame
        Wide tables of order, distribution and inventory quantities whose
        columns are month-end timestamps. Features are taken with ``.values``
        and combined positionally, so the three frames must have the same
        number of rows in the same order. ``order`` must contain the 12 months
        before the forecast month, ``dis`` the previous 9 and ``inv`` the
        previous 6.
    year, month : int
        First month of the forecast horizon.
    is_train : bool, default True
        When True, also return the target matrix.
    name_prefix : str or None, default None
        When given, every feature column is renamed to ``'<prefix>_<name>'``.
        The prefix is applied whether or not targets are returned.

    Returns
    -------
    X : pandas.DataFrame
        One row per input row. Columns, in order: rolling statistics
        (ord, dis, inv), activity counts (ord, dis, inv), then single-month
        lags (ord, dis, inv).
    y : numpy.ndarray
        Only when ``is_train`` is True: shape ``(n_rows, 3)``, the order
        quantities for the forecast month and the next two months.
    """
    # An offset object is used instead of the 'M' alias: it is equivalent,
    # and unlike the alias it is not deprecated in newer pandas versions.
    month_end = pd.offsets.MonthEnd()
    cutoff = date(year, month, 1)

    def _history(frame, n_months):
        # Columns for the n_months month-ends immediately before the forecast month.
        return frame[pd.date_range(end=cutoff, periods=n_months, freq=month_end)]

    # (feature tag, source table, rolling-window lengths, number of single-month lags)
    sources = (
        ('ord', order, [3, 6, 9, 12], 12),
        ('dis', dis, [3, 6, 9], 9),
        ('inv', inv, [3, 6], 6),
    )

    X = {}

    # Rolling statistics over the previous i months: 7 features per window,
    # i.e. 28 for orders, 21 for distribution and 14 for inventory.
    for tag, frame, windows, _ in sources:
        for i in windows:
            tmp = _history(frame, i)
            # Decay weights: the most recent month gets 1, each older month 0.9x less.
            decay = np.power(0.9, np.arange(i)[::-1])
            X['%s_diff_mean_pre_%s' % (tag, i)] = tmp.diff(axis=1).mean(axis=1).values  # mean month-over-month change
            X['%s_sum_decay_pre_%s' % (tag, i)] = (tmp * decay).sum(axis=1).values  # decay-weighted sum
            X['%s_mean_pre_%s' % (tag, i)] = tmp.mean(axis=1).values
            X['%s_median_pre_%s' % (tag, i)] = tmp.median(axis=1).values
            X['%s_max_pre_%s' % (tag, i)] = tmp.max(axis=1).values
            X['%s_min_pre_%s' % (tag, i)] = tmp.min(axis=1).values
            X['%s_std_pre_%s' % (tag, i)] = tmp.std(axis=1).values

    # Activity features over the previous i months, counted in months:
    # 3 features per window, i.e. 12 for orders, 9 for distribution, 6 for inventory.
    for tag, frame, windows, _ in sources:
        for i in windows:
            active = _history(frame, i) > 0
            # Number of months with a positive quantity.
            X['has_%s_pre_%s' % (tag, i)] = active.sum(axis=1).values
            # Months since the most recent active month (1 = the previous month).
            # NOTE: the value i is produced both when only the oldest month in
            # the window is active and when no month is active at all.
            X['last_%s_pre_%s' % (tag, i)] = i - (active * np.arange(i)).max(axis=1).values
            # How many months back the earliest active month lies (0 = none active).
            X['first_%s_pre_%s' % (tag, i)] = (active * np.arange(i, 0, -1)).max(axis=1).values

    # Raw quantity of each single previous month: 12 order lags,
    # 9 distribution lags, 6 inventory lags.
    for tag, frame, _, n_lags in sources:
        for i in range(1, n_lags + 1):
            # The first of the i month-ends before the cutoff is the month i steps back.
            lag_col = pd.date_range(end=cutoff, periods=i, freq=month_end)[:1]
            X['%s_pre_%s' % (tag, i)] = frame[lag_col].values.ravel()

    X = pd.DataFrame(X)

    # Apply the prefix before any return, so it is honoured in training mode too.
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]

    if is_train:
        # Targets: order quantity of the forecast month and the following two months.
        y = order[pd.date_range(cutoff, periods=3, freq=month_end)].values
        return X, y

    return X

## 2.1 准备训练集

In [21]:
# Forecast months used for training; each contributes one block of rows.
train_month = ['2018-01', '2018-02', '2018-03', '2018-04']

X_l, y_l = [], []
for month in train_month:
    yr, mo = (int(part) for part in month.split('-'))

    # Item-level features and targets.
    X_item, y_item = prepare_dataset(order_month, dis_month, inv_month, yr, mo)

    # Category-level features, computed once per category and then repeated
    # for every item by looking up the item's category label.
    X_cate = prepare_dataset(
        order_cate_month, dis_cate_month, inv_cate_month, yr, mo,
        is_train=False, name_prefix='cate'
    )
    X_cate.index = order_cate_month.index
    X_cate = X_cate.reindex(category['category']).reset_index(drop=True)

    # Item features + category features + the category label itself.
    X_l.append(pd.concat([X_item, X_cate, category.reset_index(drop=True)], axis=1))
    y_l.append(y_item)

    del X_item, y_item, X_cate
    gc.collect()

X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

## 2.2 准备验证集

In [26]:
# Validation set: forecast month 2018-07.
X_val_item, y_val = prepare_dataset(order_month, dis_month, inv_month, 2018, 7)

# Category-level features, repeated for every item via its category label.
X_val_cate = prepare_dataset(
    order_cate_month, dis_cate_month, inv_cate_month, 2018, 7,
    is_train=False, name_prefix='cate'
)
X_val_cate.index = order_cate_month.index
X_val_cate = X_val_cate.reindex(category['category']).reset_index(drop=True)

# Item features + category features + the category label itself.
X_val = pd.concat([X_val_item, X_val_cate, category.reset_index(drop=True)], axis=1)

del X_val_item, X_val_cate
gc.collect()

## 2.3 准备测试集

In [27]:
# Test set: forecast month 2018-10.
X_test_item, y_test = prepare_dataset(order_month, dis_month, inv_month, 2018, 10)

# Category-level features, repeated for every item via its category label.
X_test_cate = prepare_dataset(
    order_cate_month, dis_cate_month, inv_cate_month, 2018, 10,
    is_train=False, name_prefix='cate'
)
X_test_cate.index = order_cate_month.index
X_test_cate = X_test_cate.reindex(category['category']).reset_index(drop=True)

# Item features + category features + the category label itself.
X_test = pd.concat([X_test_item, X_test_cate, category.reset_index(drop=True)], axis=1)

del X_test_item, X_test_cate
gc.collect()

# 3 训练和预测

In [30]:
# Announce the start of the training phase and record the start time.
# t0 is read by the training cell's final "Done in ... seconds" message, so the
# reported duration is measured from the moment this cell was run.
print("[INFO] Start training and predicting...")
t0 = time()

[INFO] Start training and predicting...


In [35]:
# LightGBM settings shared by all three per-horizon models.
params = dict(
    num_leaves=80,
    objective='regression',
    min_data_in_leaf=200,
    learning_rate=0.02,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=1,
    metric='l2',
    num_threads=16,
)

MAX_ROUNDS = 5000
pred_val, pred_test = [], []
cate_vars = []  # no column is passed to LightGBM as categorical

# One model per target column, i.e. per month of the 3-month horizon.
for i in range(3):
    banner = '=' * 50
    print(banner)
    print("Step %d" % (i + 1))
    print(banner)

    dtrain = lgb.Dataset(X_train, label=y_train[:, i], categorical_feature=cate_vars)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, categorical_feature=cate_vars)

    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )

    # Features ranked by total gain, highest first.
    ranked = sorted(
        zip(X_train.columns, bst.feature_importance('gain')),
        key=lambda pair: pair[1],
        reverse=True,
    )
    feat_imp = ["%s: %.2f" % (feat_name, gain) for feat_name, gain in ranked]
    print('\n'.join(feat_imp))

    # Predict with the early-stopped iteration; fall back to all rounds if none was recorded.
    n_iter = bst.best_iteration or MAX_ROUNDS
    pred_val.append(bst.predict(X_val, num_iteration=n_iter))
    pred_test.append(bst.predict(X_test, num_iteration=n_iter))

print("[INFO] Finished! ( ^ _ ^ ) V")
print("[INFO] Done in %f seconds." % (time() - t0))

Step 1
Training until validation scores don't improve for 125 rounds.
[50]	training's l2: 1.49004e+07	valid_1's l2: 4.94339e+06
[100]	training's l2: 1.31539e+07	valid_1's l2: 4.29263e+06
[150]	training's l2: 1.24224e+07	valid_1's l2: 4.13895e+06




[200]	training's l2: 1.2026e+07	valid_1's l2: 4.08059e+06
[250]	training's l2: 1.17823e+07	valid_1's l2: 4.05905e+06
[300]	training's l2: 1.15978e+07	valid_1's l2: 4.05203e+06
[350]	training's l2: 1.14472e+07	valid_1's l2: 4.09448e+06
[400]	training's l2: 1.13261e+07	valid_1's l2: 4.07144e+06
Early stopping, best iteration is:
[289]	training's l2: 1.16393e+07	valid_1's l2: 4.02805e+06
ord_sum_decay_pre_3: 350461174974.59
ord_pre_2: 305572057171.00
ord_pre_4: 143435768983.00
ord_mean_pre_3: 138232301429.80
ord_pre_1: 75743007038.00
ord_min_pre_3: 56662486431.00
ord_pre_7: 56361873764.00
ord_max_pre_9: 54734720754.90
ord_sum_decay_pre_9: 52295064771.61
ord_max_pre_6: 49004801445.83
ord_sum_decay_pre_6: 42270128618.50
ord_mean_pre_6: 31456503104.54
ord_max_pre_3: 30778020526.70
ord_mean_pre_9: 29905669758.11
ord_median_pre_6: 25284898095.00
ord_max_pre_12: 24266505673.24
ord_pre_12: 24154193449.31
ord_std_pre_6: 22855236127.31
ord_pre_10: 22342258583.54
ord_std_pre_12: 20779408712.71
ord_

In [36]:
# pred_val holds one prediction vector per horizon step; stack them as columns
# so the matrix lines up with y_val, then report the MSE.
print("Validation mse:", mean_squared_error(y_val, np.column_stack(pred_val)))

Validation mse: 7888494.379996519
