In [1]:
# Import necessary libraries
import gc
import numpy as np
import pandas as pd
from time import time
from datetime import date
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 1000)

# 1 数据预处理

In [3]:
# 载入数据
order = pd.read_csv("../../data/level2/m111-sku-order.csv", sep=',', parse_dates=['order_date'])
dis = pd.read_csv("../../data/level2/m111-sku-dis.csv", sep=',', parse_dates=['dis_date'])
inv = pd.read_csv(
    "../../data/level2/m111-sku-inv.csv", sep=',', parse_dates=['period_wid']
).rename(columns={'period_wid': 'inv_date'})
category = pd.read_csv(
    "../../data/level2/item2category-from-3.csv", sep=','
).rename(columns={'sales_segment1_code': 'category'})

In [4]:
# 考虑的品类有：消毒柜、洗碗机、烟机、灶具、电热、净水机、燃热、饮水机
cates_considered = ['CRXDG', 'CRXWJ', 'CRYJ', 'CRZJ', 'DR', 'JSJ', 'RR', 'YSJ']

## 1.1 处理订单数据

In [5]:
# 取2017年至2018年区间中的数据（2019/03/01取数有164945条记录）
order = order.loc[(order.order_date >= '2017-01-01') & (order.order_date <= '2018-12-31')]

In [6]:
# 删除其他品类的数据（剩余记录数为161064）
order = order.join(category.set_index('item_code'), on='item_code', how='left')
order = order.loc[order.category.isin(cates_considered)]

In [7]:
# 划分数据集
df_test = order.loc[order.order_date >= '2018-11-01']  # 测试集
order = order.loc[order.order_date <= '2018-10-31']  # 训练和验证集

In [8]:
df_test['month'] = df_test.order_date.astype('str').apply(lambda x: x[:7])
df_test = df_test.groupby(['category', 'month'])[['qty']].sum()

In [9]:
# 每个品类每个月的提货量
order_cate_month = order.copy()
order_cate_month['month'] = order_cate_month.order_date.astype('str').apply(lambda x: x[:7])
order_cate_month = order_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [10]:
# 取对数
# order_cate_month['qty'] = np.log1p(order_cate_month.qty)

In [11]:
order_cate_month = order_cate_month.unstack(level=-1).fillna(0)
order_cate_month.columns = pd.date_range('2017-01-31', '2018-10-31', freq='M')

## 1.2 处理分销数据

In [12]:
# 取2017年至2018年区间中的数据（分销数据从2017年6月开始才可用，记录数为358247）
dis = dis.loc[(dis.dis_date >= '2017-06-01') & (dis.dis_date <= '2018-10-31')]

In [13]:
# 删除其他品类的数据（剩余记录数为343637）
dis = dis.join(category.set_index('item_code'), on='item_code', how='left')
dis = dis.loc[dis.category.isin(cates_considered)]

In [14]:
# 每个品类每个月的分销量
dis_cate_month = dis.copy()
dis_cate_month['month'] = dis_cate_month.dis_date.astype('str').apply(lambda x: x[:7])
dis_cate_month = dis_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [15]:
# 取对数
# dis_cate_month['qty'] = np.log1p(dis_cate_month.qty)

In [16]:
dis_cate_month = dis_cate_month.unstack(level=-1).fillna(0)
dis_cate_month.columns = pd.date_range('2017-06-30', '2018-10-31', freq='M')

In [17]:
dis_cate_month = dis_cate_month.reindex(order_cate_month.index).fillna(0)

## 1.3 处理库存数据

In [18]:
# 取2017年至2018年区间中的数据（库存数据从2017年12月开始可用，记录数为138437）
inv = inv.loc[(inv.inv_date >= '2017-12-01') & (inv.inv_date <= '2018-10-31')]

In [19]:
# 删除其他品类的数据（剩余记录数为121249）
inv = inv.join(category.set_index('item_code'), on='item_code', how='left')
inv = inv.loc[inv.category.isin(cates_considered)]

In [20]:
# 取每月的最后一天作为当月的库存（剩余记录数为33698）
inv_lastday = inv.loc[inv.inv_date.isin(pd.date_range('2017-12-31', '2018-10-31', freq='M'))]

In [21]:
# 每个品类每月的库存量
inv_cate_month = inv_lastday.copy()
inv_cate_month['month'] = inv_cate_month.inv_date.astype('str').apply(lambda x: x[:7])
inv_cate_month = inv_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [22]:
# 取对数
# inv_cate_month['qty'] = np.log1p(inv_cate_month.qty)

In [23]:
inv_cate_month = inv_cate_month.unstack(level=-1).fillna(0)
inv_cate_month.columns = pd.date_range('2017-12-31', '2018-10-31', freq='M')

In [24]:
inv_cate_month = inv_cate_month.reindex(order_cate_month.index).fillna(0)

## 2.4 处理品类特征

In [25]:
cates = pd.DataFrame(cates_considered, columns=['category'])

encoder = LabelEncoder()
cates['cate_enc'] = encoder.fit_transform(cates.category)

cates = cates.set_index('category').reindex(order_cate_month.index)

# 2 特征工程

In [26]:
def prepare_dataset(order, dis, inv, year, month, is_train=True, name_prefix=None):
    X = {}

    # 提货的统计特征
    for i in [3]:
        dt = date(year, month, 1)
        tmp = order[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月提货量
        X['ord_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月提货量的平均一阶差分
        X['ord_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月提货量的和（带衰减）
        X['ord_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月提货量的平均值
        X['ord_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月提货量的中位数
        X['ord_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月提货量的最大值
        X['ord_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月提货量的最小值
        X['ord_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月提货量的标准差

    # 分销的统计特征
#     for i in [3]:
#         dt = date(year, month, 1)
#         tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月分销量
#         X['dis_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月分销量的平均一阶差分
#         X['dis_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月分销量的和（带衰减）
#         X['dis_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月分销量的均值
#         X['dis_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月分销量的中位数
#         X['dis_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月分销量的最大值
#         X['dis_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月分销量的最小值
#         X['dis_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月分销量的标准差
        
    # 库存的统计特征
#     for i in [3, 6]:
#         dt = date(year, month, 1)
#         tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月库存量
#         X['inv_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月库存量的平均一阶差分
#         X['inv_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月库存量的和（带衰减）
#         X['inv_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月库存量的均值
#         X['inv_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月库存量的中位数
#         X['inv_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月库存量的最大值
#         X['inv_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月库存量的最小值
#         X['inv_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月库存量的标准差

    # 前3个月的提货量
    for i in range(1, 4):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['ord_pre_%s' % i] = order[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()

    # 前3个月分销量
#     for i in range(1, 4):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['dis_pre_%s' % i] = dis[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
       
    # 前6个月的库存量
#     for i in range(1, 7):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['inv_pre_%s' % i] = inv[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    X = pd.DataFrame(X)
    
    if is_train:
        start_dt = date(year, month, 1)
        y = order[pd.date_range(start_dt, periods=2, freq='M')].values
        return X, y
    
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
        
    return X

In [27]:
def get_pre_10_days(order, dis, inv, index, year, month):
    X = {} 
    start_dt, end_dt = date(year, month, 1), date(year, month, 10)
    
    # 每个品类M月前10天的提货量
    ord_tmp = order.loc[order.order_date.isin(pd.date_range(start_dt, end_dt, freq='D'))]
    ord_tmp = ord_tmp.groupby('category')[['qty']].sum()
    ord_tmp = ord_tmp.reindex(index).fillna(0)
#     ord_tmp['qty'] = ord_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
    X['ord_pre_10_days'] = ord_tmp.values.ravel()
    
    # 每个品类M月前10天的分销量
#     dis_tmp = dis.loc[dis.dis_date.isin(pd.date_range(start_dt, end_dt, freq='D'))]
#     dis_tmp = dis_tmp.groupby('category')[['qty']].sum()
#     dis_tmp = dis_tmp.reindex(index).fillna(0)
#     dis_tmp['qty'] = dis_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
#     X['dis_pre_10_days'] = dis_tmp.values.ravel()
    
    # 每个品类M月前10天的库存
#     inv_tmp = inv.loc[inv.inv_date.isin(pd.date_range(end=end_dt, periods=1, freq='D'))]
#     inv_tmp = inv_tmp.groupby('category')[['qty']].sum()
#     inv_tmp = inv_tmp.reindex(index).fillna(0)
#     inv_tmp['qty'] = inv_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
#     X['inv_pre_10_days'] = inv_tmp.values.ravel()
    
    X = pd.DataFrame(X)
    
    return X

## 2.1 准备训练集

In [28]:
train_month = [ 
    '2017-05', 
    '2017-06', 
    '2017-07', 
    '2017-08', 
    '2017-09', 
    '2017-10', 
    '2017-11', 
    '2017-12', 
    '2018-01', 
    '2018-02', 
    '2018-03', 
    '2018-04', 
    '2018-05', 
    '2018-06', 
    '2018-07'
]

X_l, y_l = [], []
for month in train_month:
    y, m = int(month.split('-')[0]), int(month.split('-')[1]) 
    pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, y, m)   
    X_tmp, y_tmp = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, y, m)
    X_tmp = pd.concat([X_tmp, cates.reset_index(drop=True)], axis=1)
#     X_tmp = pd.concat([X_tmp, pre_10_days, cates.reset_index(drop=True)], axis=1)
    X_tmp['pred_month'] = m
    X_tmp['is_spring_fest'] = X_tmp.pred_month.apply(lambda x: 1 if (y == 2017 and x == 1) or (y == 2018 and x == 2) else 0)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
    del X_tmp, y_tmp
    gc.collect()
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [29]:
X_train.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_sum_decay_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_max_pre_3,ord_min_pre_3,ord_std_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,cate_enc,pred_month,is_spring_fest
0,-9782.5,122536.48,45950.666667,42618.0,72181.0,23053.0,24732.975889,23053.0,72181.0,42618.0,0,5,0
1,6738.5,46424.23,16658.666667,16873.0,23290.0,9813.0,6741.056025,23290.0,16873.0,9813.0,1,5,0
2,13806.5,555500.95,204336.666667,130678.0,379267.0,103065.0,152121.943132,130678.0,379267.0,103065.0,2,5,0
3,-7098.0,453921.59,168264.0,102839.0,313310.0,88643.0,125813.903012,88643.0,313310.0,102839.0,3,5,0
4,-39795.0,1285382.08,477148.666667,503100.0,503968.0,424378.0,45702.798616,424378.0,503100.0,503968.0,4,5,0


In [30]:
X_train.shape

(120, 13)

## 2.2 准备验证集

In [31]:
pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, 2018, 9)
X_val, y_val = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 9)
X_val = pd.concat([X_val, cates.reset_index(drop=True)], axis=1)
# X_val = pd.concat([X_val, pre_10_days, cates.reset_index(drop=True)], axis=1)
X_val['pred_month'] = 9
X_val['is_spring_fest'] = X_val.pred_month.apply(lambda x: 1 if (y == 2017 and x == 1) or (y == 2018 and x == 2) else 0)

In [32]:
X_val.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_sum_decay_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_max_pre_3,ord_min_pre_3,ord_std_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,cate_enc,pred_month,is_spring_fest
0,11554.5,52202.94,18436.0,11484.0,34593.0,9231.0,14037.645422,34593.0,9231.0,11484.0,0,9,0
1,-3077.5,29574.57,11122.0,9912.0,16067.0,7387.0,4464.714437,9912.0,7387.0,16067.0,1,9,0
2,39864.0,245800.51,87807.0,74881.0,154609.0,33931.0,61368.607838,154609.0,33931.0,74881.0,2,9,0
3,36426.0,192307.59,68335.666667,51609.0,124461.0,28937.0,49910.366432,124461.0,28937.0,51609.0,3,9,0
4,186786.0,1252545.1,449072.0,434944.0,642922.0,269350.0,187186.2978,642922.0,434944.0,269350.0,4,9,0


In [33]:
X_val.shape

(8, 13)

## 2.3 准备测试集

In [34]:
pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, 2018, 11)
X_test = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 11, is_train=False)
X_test = pd.concat([X_test, cates.reset_index(drop=True)], axis=1)
# X_test = pd.concat([X_test, pre_10_days, cates.reset_index(drop=True)], axis=1)
X_test['pred_month'] = 11
X_test['is_spring_fest'] = X_test.pred_month.apply(lambda x: 1 if (y == 2017 and x == 1) or (y == 2018 and x == 2) else 0)

In [35]:
X_test.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_sum_decay_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_max_pre_3,ord_min_pre_3,ord_std_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,cate_enc,pred_month,is_spring_fest
0,1697.5,109112.03,40158.0,37988.0,47893.0,34593.0,6910.439566,37988.0,47893.0,34593.0,0,11,0
1,12416.5,57135.92,20205.0,15958.0,34745.0,9912.0,12949.796485,34745.0,15958.0,9912.0,1,11,0
2,40298.5,525424.59,191044.0,183317.0,235206.0,154609.0,40850.324344,235206.0,183317.0,154609.0,2,11,0
3,60818.0,506646.91,182681.0,177485.0,246097.0,124461.0,60984.243342,246097.0,177485.0,124461.0,3,11,0
4,139556.0,2278011.62,830989.333333,922034.0,928012.0,642922.0,162898.51295,922034.0,928012.0,642922.0,4,11,0


In [36]:
X_test.shape

(8, 13)

## 2.4 将 Categorical Feature 转化成 OneHot 向量

**备注**：
对于树模型而言，没有必要把 Categorical Feature 转化成 OneHot 向量，反而会使得准确率略微下降。

In [37]:
# train_len = len(X_train)
# val_len = len(X_val)

# tmp = pd.concat([X_train, X_val, X_test], axis=0)
# tmp = pd.get_dummies(tmp, columns=['cate_enc', 'pred_month'])

# X_train = tmp.iloc[:train_len]
# X_val = tmp.iloc[train_len:(train_len + val_len)]
# X_test = tmp.iloc[(train_len + val_len):]

# del tmp
# gc.collect()

# 3 训练和预测

In [38]:
def error(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)

In [39]:
def acc(y_true, y_pred):
    return 1 - error(y_true, y_pred)

In [40]:
print("[INFO] Start training and predicting...")
t0 = time()

regr = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=8), 
    n_estimators=1200, random_state=np.random.RandomState(1)
)

preds_train, preds_val, preds_test = [], [], []
accs_train, accs_val = [], []
mses_train, mses_val = [], []

for i in range(2):
    print()
    print('# ' + '=' * 100 + ' #')
    print('# ' + 'Step %d' % i + ' ' * (100 - len('Step %d' % i)) + ' #')
    print('# ' + '=' * 100 + ' #')
    
    # Add previous predictions as a new feature
    if preds_train:
        X_train['m%s' % (i - 1)] = pd.Series(preds_train[i - 1])
        X_val['m%s' % (i - 1)] = pd.Series(preds_val[i - 1])
        X_test['m%s' % (i - 1)] = pd.Series(preds_test[i - 1])
    
    # Adjust the month predicted
    if i != 0:
        X_train['pred_month'] = X_train.pred_month.apply(lambda x: x - 11 if x + 1 > 12 else x + 1)
        X_val['pred_month'] = X_val.pred_month.apply(lambda x: x - 11 if x + 1 > 12 else x + 1)
        X_test['pred_month'] = X_test.pred_month.apply(lambda x: x - 11 if x + 1 > 12 else x + 1)
    
    print("[INFO] Fit the model...")
    regr.fit(X_train.values, y_train[:, i])
    
    # Predict
    pred_train = regr.predict(X_train.values)
    pred_val = regr.predict(X_val.values)
    pred_test = regr.predict(X_test.values)
    
    # Calculate accuracy
    acc_train = acc(y_train[:, i], pred_train)
    acc_val = acc(y_val[:, i], pred_val)
    print("[INFO] The accuracy of training set is: %.2f%%\t The accuracy of validation set is: %.2f%%" % (acc_train * 100, acc_val * 100))
    
    # Calculate MSE
    mse_train = mean_squared_error(y_train[:, i], pred_train)
    mse_val = mean_squared_error(y_val[:, i], pred_val)
    print("[INFO] The MSE of training set is: %.2f%%\t The MSE of validation set is: %.2f%%" % (acc_train * 100, acc_val * 100))
    
    # Store the intermediate results
    preds_train.append(pred_train)
    preds_val.append(pred_val)
    preds_test.append(pred_test)
    accs_train.append(acc_train)
    accs_val.append(acc_val)
    mses_train.append(mse_train)
    mses_val.append(mse_val)

print()
print("[INFO] Finished! ( ^ _ ^ ) V")
print("[INFO] Done in %f seconds." % (time() - t0))

[INFO] Start training and predicting...

# Step 0                                                                                               #
[INFO] Fit the model...
[INFO] The accuracy of training set is: 97.14%	 The accuracy of validation set is: 55.22%
[INFO] The MSE of training set is: 97.14%	 The MSE of validation set is: 55.22%

# Step 1                                                                                               #
[INFO] Fit the model...
[INFO] The accuracy of training set is: 96.42%	 The accuracy of validation set is: 56.86%
[INFO] The MSE of training set is: 96.42%	 The MSE of validation set is: 56.86%

[INFO] Finished! ( ^ _ ^ ) V
[INFO] Done in 2.993553 seconds.


In [41]:
print("The MSE error of validation set is:", mean_squared_error(y_val, np.array(preds_val).transpose()))

The MSE error of validation set is: 45826516996.703125


# 4 结果评估

## 4.1 技术指标

评价指标为**归一化加权均方根对数误差（Normalized Weighted Root Mean Squared Logarithmic Error, NWRMSLE）**，表达式如下：

$$
\text{NWRMSLE} = \sqrt{ \frac{\sum_{i=1}^{m}\omega_i \sum_{j=1}^{t}\left( \ln \left(\widehat{y}_j^{(i)} + 1 \right) -\ln \left(y_j^{(i)} + 1 \right)\right)^2}{t\cdot \sum_{i=1}^{m}\omega_i} }
$$

In [42]:
# 归一化加权均方根对数误差
err = (y_val - np.array(preds_val).transpose()) ** 2
err = err.sum(axis=1)
err = np.sqrt(err.sum() / 2 / len(y_val))
print("The NWRMSLE error of validation set is:", err)

The NWRMSLE error of validation set is: 214071.2895198773


## 4.2 业务指标

业务指标的表达式如下：

$$
E(t, y) = \frac{\sum_{i=0}^{n} | y^{(i)} - t^{(i)} |}{\sum_{i=0}^{n} t^{(i)}}
$$

### 4.2.1 验证集

In [43]:
def expm1_with_clip(x, l=0, r=100000):
    return np.clip(np.expm1(x), l, r)

In [44]:
df_val = pd.DataFrame(
    np.array(y_val), index=order_cate_month.index, 
    columns=['2018-09', '2018-10']
)

df_pred_val = pd.DataFrame(
    np.array(preds_val).transpose(), index=order_cate_month.index, 
    columns=['2018-09', '2018-10']
)

In [45]:
# df_val = pd.DataFrame(
#     np.array(y_val), index=order_cate_month.index, 
#     columns=['2018-09', '2018-10']
# ).apply(np.expm1)

# df_pred_val = pd.DataFrame(
#     np.array(preds_val).transpose(), index=order_cate_month.index, 
#     columns=['2018-09', '2018-10']
# ).apply(np.expm1)

In [46]:
# 验证集（9-10月）准确率（品类）
m_acc = acc(df_val['2018-09'], df_pred_val['2018-09'])
m1_acc = acc(df_val['2018-10'], df_pred_val['2018-10'])
print("The accuracy of 'M' order amount is: %.2f%%" % (m_acc * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % (m1_acc * 100))

The accuracy of 'M' order amount is: 55.22%
The accuracy of 'M+1' order amount is: 56.86%


### 4.2.2 测试集

In [47]:
df_pred_test = pd.DataFrame(
    np.array(preds_test).transpose(), index=order_cate_month.index, 
    columns=['2018-11', '2018-12']
).stack().to_frame('pred_qty')
df_pred_test.index.set_names(['category', 'month'], inplace=True)

In [48]:
# df_pred_test = pd.DataFrame(
#     np.array(preds_test).transpose(), index=order_cate_month.index, 
#     columns=['2018-11', '2018-12']
# ).stack().to_frame('pred_qty')
# df_pred_test.index.set_names(['category', 'month'], inplace=True)
# df_pred_test['pred_qty'] = np.floor(np.expm1(df_pred_test.pred_qty))

In [49]:
comp = df_test.join(df_pred_test, how='left').fillna(0).reset_index()

m_comp = comp.loc[comp['month'] == '2018-11']
m1_comp = comp.loc[comp['month'] == '2018-12']

In [50]:
# 测试集（11-12月）准确率（品类）
m_acc = acc(m_comp['qty'], m_comp['pred_qty'])
m1_acc = acc(m1_comp['qty'], m1_comp['pred_qty'])
print("The accuracy of 'M' order amount is: %.2f%%" % (m_acc * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % (m1_acc * 100))

The accuracy of 'M' order amount is: 50.54%
The accuracy of 'M+1' order amount is: 63.98%
