In [1]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from time import time
from datetime import date
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 1000)

# 1 数据预处理

In [44]:
# 载入数据
order = pd.read_csv("../../data/level2/m111-sku-order.csv", sep=',', parse_dates=['order_date'])
dis = pd.read_csv("../../data/level2/m111-sku-dis.csv", sep=',', parse_dates=['dis_date'])
inv = pd.read_csv(
    "../../data/level2/m111-sku-inv.csv", sep=',', parse_dates=['period_wid']
).rename(columns={'period_wid': 'inv_date'})
category = pd.read_csv(
    "../../data/level2/item2category-from-3.csv", sep=','
).rename(columns={'sales_segment1_code': 'category'})

In [45]:
# 考虑的品类有：消毒柜、洗碗机、烟机、灶具、电热、净水机、燃热、饮水机
cates_considered = ['CRXDG', 'CRXWJ', 'CRYJ', 'CRZJ', 'DR', 'JSJ', 'RR', 'YSJ']

## 1.1 处理订单数据

In [46]:
# 取2017年至2018年区间中的数据（2019/03/01取数有164945条记录）
order = order.loc[(order.order_date >= '2017-01-01') & (order.order_date <= '2018-12-31')]

In [47]:
# 删除其他品类的数据（剩余记录数为161064）
order = order.join(category.set_index('item_code'), on='item_code', how='left')
order = order.loc[order.category.isin(cates_considered)]

In [48]:
# 划分数据集
df_test = order.loc[order.order_date >= '2018-11-01']  # 测试集
order = order.loc[order.order_date <= '2018-10-31']  # 训练和验证集

In [49]:
df_test['month'] = df_test.order_date.astype('str').apply(lambda x: x[:7])
df_test = df_test.groupby(['category', 'month'])[['qty']].sum()

In [50]:
# 每个品类每个月的提货量
order_cate_month = order.copy()
order_cate_month['month'] = order_cate_month.order_date.astype('str').apply(lambda x: x[:7])
order_cate_month = order_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [51]:
# 取对数
order_cate_month['qty'] = np.log1p(order_cate_month.qty)

In [52]:
order_cate_month = order_cate_month.unstack(level=-1).fillna(0)
order_cate_month.columns = pd.date_range('2017-01-31', '2018-10-31', freq='M')

## 1.2 处理分销数据

In [53]:
# 取2017年至2018年区间中的数据（分销数据从2017年6月开始才可用，记录数为358247）
dis = dis.loc[(dis.dis_date >= '2017-06-01') & (dis.dis_date <= '2018-10-31')]

In [54]:
# 删除其他品类的数据（剩余记录数为343637）
dis = dis.join(category.set_index('item_code'), on='item_code', how='left')
dis = dis.loc[dis.category.isin(cates_considered)]

In [55]:
# 每个品类每个月的分销量
dis_cate_month = dis.copy()
dis_cate_month['month'] = dis_cate_month.dis_date.astype('str').apply(lambda x: x[:7])
dis_cate_month = dis_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [56]:
# 取对数
dis_cate_month['qty'] = np.log1p(dis_cate_month.qty)

In [57]:
dis_cate_month = dis_cate_month.unstack(level=-1).fillna(0)
dis_cate_month.columns = pd.date_range('2017-06-30', '2018-10-31', freq='M')

In [62]:
dis_cate_month = dis_cate_month.reindex(order_cate_month.index).fillna(0)

## 1.3 处理库存数据

In [64]:
# 取2017年至2018年区间中的数据（库存数据从2017年12月开始可用，记录数为138437）
inv = inv.loc[(inv.inv_date >= '2017-12-01') & (inv.inv_date <= '2018-10-31')]

In [65]:
# 删除其他品类的数据（剩余记录数为121249）
inv = inv.join(category.set_index('item_code'), on='item_code', how='left')
inv = inv.loc[inv.category.isin(cates_considered)]

In [66]:
# 取每月的最后一天作为当月的库存（剩余记录数为33698）
inv_lastday = inv.loc[inv.inv_date.isin(pd.date_range('2017-12-31', '2018-10-31', freq='M'))]

In [70]:
# 每个品类每月的库存量
inv_cate_month = inv_lastday.copy()
inv_cate_month['month'] = inv_cate_month.inv_date.astype('str').apply(lambda x: x[:7])
inv_cate_month = inv_cate_month.groupby(['category', 'month'])[['qty']].sum()

In [72]:
# 取对数
inv_cate_month['qty'] = np.log1p(inv_cate_month.qty)

In [73]:
inv_cate_month = inv_cate_month.unstack(level=-1).fillna(0)
inv_cate_month.columns = pd.date_range('2017-12-31', '2018-10-31', freq='M')

In [77]:
inv_cate_month = inv_cate_month.reindex(order_cate_month.index).fillna(0)

## 2.4 处理品类特征

In [122]:
cates = pd.DataFrame(cates_considered, columns=['category'])

encoder = LabelEncoder()
cates['cate_enc'] = encoder.fit_transform(cates.category)

cates = cates.set_index('category').reindex(order_cate_month.index)

# 2 特征工程

In [123]:
def prepare_dataset(order, dis, inv, year, month, is_train=True, name_prefix=None):
    X = {}

    # 提货的统计特征
    for i in [3]:
        dt = date(year, month, 1)
        tmp = order[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月提货量
        X['ord_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月提货量的平均一阶差分
        X['ord_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月提货量的和（带衰减）
        X['ord_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月提货量的平均值
        X['ord_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月提货量的中位数
        X['ord_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月提货量的最大值
        X['ord_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月提货量的最小值
        X['ord_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月提货量的标准差

    # 分销的统计特征
#     for i in [3]:
#         dt = date(year, month, 1)
#         tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月分销量
#         X['dis_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月分销量的平均一阶差分
#         X['dis_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月分销量的和（带衰减）
#         X['dis_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月分销量的均值
#         X['dis_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月分销量的中位数
#         X['dis_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月分销量的最大值
#         X['dis_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月分销量的最小值
#         X['dis_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月分销量的标准差
        
    # 库存的统计特征
#     for i in [3, 6]:
#         dt = date(year, month, 1)
#         tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月库存量
#         X['inv_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月库存量的平均一阶差分
#         X['inv_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月库存量的和（带衰减）
#         X['inv_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月库存量的均值
#         X['inv_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月库存量的中位数
#         X['inv_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月库存量的最大值
#         X['inv_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月库存量的最小值
#         X['inv_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月库存量的标准差

    # 前3个月的提货量
    for i in range(1, 4):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['ord_pre_%s' % i] = order[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()

    # 前3个月分销量
#     for i in range(1, 4):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['dis_pre_%s' % i] = dis[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
       
    # 前6个月的库存量
#     for i in range(1, 7):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['inv_pre_%s' % i] = inv[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    X = pd.DataFrame(X)
    
    if is_train:
        start_dt = date(year, month, 1)
        y = order[pd.date_range(start_dt, periods=2, freq='M')].values
        return X, y
    
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
        
    return X

In [124]:
def get_pre_10_days(order, dis, inv, index, year, month):
    X = {} 
    start_dt, end_dt = date(year, month, 1), date(year, month, 10)
    
    # 每个品类M月前10天的提货量
    ord_tmp = order.loc[order.order_date.isin(pd.date_range(start_dt, end_dt, freq='D'))]
    ord_tmp = ord_tmp.groupby('category')[['qty']].sum()
    ord_tmp = ord_tmp.reindex(index).fillna(0)
    ord_tmp['qty'] = ord_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
    X['ord_pre_10_days'] = ord_tmp.values.ravel()
    
    # 每个品类M月前10天的分销量
#     dis_tmp = dis.loc[dis.dis_date.isin(pd.date_range(start_dt, end_dt, freq='D'))]
#     dis_tmp = dis_tmp.groupby('category')[['qty']].sum()
#     dis_tmp = dis_tmp.reindex(index).fillna(0)
#     dis_tmp['qty'] = dis_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
#     X['dis_pre_10_days'] = dis_tmp.values.ravel()
    
    # 每个品类M月前10天的库存
#     inv_tmp = inv.loc[inv.inv_date.isin(pd.date_range(end=end_dt, periods=1, freq='D'))]
#     inv_tmp = inv_tmp.groupby('category')[['qty']].sum()
#     inv_tmp = inv_tmp.reindex(index).fillna(0)
#     inv_tmp['qty'] = inv_tmp.qty.apply(lambda x: np.log1p(x) if x > 0 else 0)
#     X['inv_pre_10_days'] = inv_tmp.values.ravel()
    
    X = pd.DataFrame(X)
    
    return X

## 2.1 准备训练集

In [131]:
train_month = [ 
    '2017-05', 
    '2017-06', 
    '2017-07', 
    '2017-08', 
    '2017-09', 
    '2017-10', 
    '2017-11', 
    '2017-12', 
    '2018-01', 
    '2018-03', 
    '2018-04', 
    '2018-05', 
    '2018-06', 
    '2018-07'
]

X_l, y_l = [], []
category = pd.DataFrame(order_cate_month.index, columns=['category'])
for month in train_month:
    y, m = int(month.split('-')[0]), int(month.split('-')[1]) 
    pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, y, m)   
    X_tmp, y_tmp = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, y, m)
    X_tmp = pd.concat([X_tmp, pre_10_days, cates.reset_index(drop=True)], axis=1)
    X_tmp['pred_month'] = m
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
    del X_tmp, y_tmp
    gc.collect()
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

In [132]:
X_train.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_max_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_min_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,ord_std_pre_3,ord_sum_decay_pre_3,ord_pre_10_days,cate_enc,pred_month
0,-0.30723,11.186946,10.630865,10.660055,10.045595,10.045595,11.186946,10.660055,0.571235,28.748491,9.974458,0,5
1,0.432129,10.055822,9.660306,9.733529,9.191565,10.055822,9.733529,9.191565,0.436757,26.261166,9.188299,1,5
2,0.118687,12.845998,12.056541,11.780499,11.543125,11.780499,12.845998,11.543125,0.693916,32.691829,11.278999,2,5
3,-0.074273,12.654952,11.862755,11.54093,11.392384,11.392384,12.654952,11.54093,0.690071,32.129993,11.174217,3,5
4,-0.085944,13.13027,13.072399,13.128546,12.958382,12.958382,13.128546,13.13027,0.098746,35.409593,12.121473,4,5


In [133]:
X_train.shape

(112, 13)

## 2.2 准备验证集

In [134]:
pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, 2018, 9)
X_val, y_val = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 9)
X_val = pd.concat([X_val, pre_10_days, cates.reset_index(drop=True)], axis=1)
X_val['pred_month'] = 9

In [135]:
X_val.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_max_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_min_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,ord_std_pre_3,ord_sum_decay_pre_3,ord_pre_10_days,cate_enc,pred_month
0,0.551319,10.451436,9.643555,9.348797,9.130431,10.451436,9.130431,9.348797,0.708113,26.241349,9.354354,0,9
1,-0.241491,9.684585,9.2646,9.201602,8.907612,9.201602,8.907612,9.684585,0.392299,25.062967,8.471568,1,9
2,0.362496,11.948661,11.201481,11.223669,10.432114,11.948661,10.432114,11.223669,0.758517,30.428735,10.957416,2,9
3,0.440142,11.731756,10.952046,10.851471,10.272911,11.731756,10.272911,10.851471,0.734604,29.767067,10.852071,3,9
4,0.435005,13.37378,12.953509,12.982975,12.503771,13.37378,12.982975,12.503771,0.435753,35.186512,12.634974,4,9


In [136]:
X_val.shape

(8, 13)

## 2.3 准备测试集

In [139]:
pre_10_days = get_pre_10_days(order, dis, inv, order_cate_month.index, 2018, 11)
X_test = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 11, is_train=False)
X_test = pd.concat([X_test, pre_10_days, cates.reset_index(drop=True)], axis=1)
X_test['pred_month'] = 11

In [143]:
X_test.head()

Unnamed: 0,ord_diff_mean_pre_3,ord_max_pre_3,ord_mean_pre_3,ord_median_pre_3,ord_min_pre_3,ord_pre_1,ord_pre_2,ord_pre_3,ord_std_pre_3,ord_sum_decay_pre_3,ord_pre_10_days,cate_enc,pred_month
0,0.046808,10.776746,10.591078,10.545052,10.451436,10.545052,10.776746,10.451436,0.167468,28.709786,0,0,11
1,0.627109,10.45582,9.7784,9.677778,9.201602,10.45582,9.677778,9.201602,0.633134,26.619118,0,1,11
2,0.20978,12.368221,12.145287,12.118978,11.948661,12.368221,12.118978,11.948661,0.211014,32.953717,0,2,11
3,0.340865,12.413485,12.077296,12.086647,11.731756,12.413485,12.086647,11.731756,0.340961,32.79419,0,3,11
4,0.180279,13.740801,13.616307,13.734338,13.37378,13.734338,13.740801,13.37378,0.210059,36.933821,0,4,11


In [144]:
X_test.shape

(8, 13)

## 2.4 将 Categorical Feature 转化成 OneHot 向量

**备注**：
对于树模型而言，没有必要把 Categorical Feature 转化成 OneHot 向量，反而会使得准确率略微下降。

In [145]:
# train_len = len(X_train)
# val_len = len(X_val)

# tmp = pd.concat([X_train, X_val, X_test], axis=0)
# tmp = pd.get_dummies(tmp, columns=['category', 'pred_month'])

# X_train = tmp.iloc[:train_len]
# X_val = tmp.iloc[train_len:(train_len + val_len)]
# X_test = tmp.iloc[(train_len + val_len):]

# del tmp
# gc.collect()

# 3 训练和预测

In [148]:
print("[INFO] Start training and predicting...")
t0 = time()

params = {
    'num_leaves': 80, 
    'objective': 'regression', 
    'min_data_in_leaf': 200, 
    'learning_rate': 0.02, 
    'feature_fraction': 0.9, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 1, 
    'metric': 'l2', 
    'num_threads': 16
}

MAX_ROUNDS = 5000
pred_val = []
pred_test = []
cate_vars = []

for i in range(2):
    print('=' * 50)
    print("Step %d" % (i + 1))
    print('=' * 50)
    
    dtrain = lgb.Dataset(X_train, label=y_train[:, i], categorical_feature=cate_vars)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, categorical_feature=cate_vars)
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS, 
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )
    
    feat_imp = [("%s: %.2f" % x) for x in sorted(zip(X_train.columns, bst.feature_importance('gain')), key=lambda x: x[1], reverse=True)]
    print('\n'.join(feat_imp))
    pred_val.append(
        bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    pred_test.append(
        bst.predict(X_test, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    
print("[INFO] Finished! ( ^ _ ^ ) V")
print("[INFO] Done in %f seconds." % (time() - t0))

[INFO] Start training and predicting...
Step 1




LightGBMError: Cannot construct Dataset since there are not useful features.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise please make sure you are using the right dataset

In [47]:
print("The MSE error of validation set is:", mean_squared_error(y_val, np.array(pred_val).transpose()))

The MSE error of validation set is: 3.0398676206330864


# 4 结果评估

## 4.1 技术指标

评价指标为**归一化加权均方根对数误差（Normalized Weighted Root Mean Squared Logarithmic Error, NWRMSLE）**，表达式如下：

$$
\text{NWRMSLE} = \sqrt{ \frac{\sum_{i=1}^{m}\omega_i \sum_{j=1}^{t}\left( \ln \left(\widehat{y}_j^{(i)} + 1 \right) -\ln \left(y_j^{(i)} + 1 \right)\right)^2}{t\cdot \sum_{i=1}^{m}\omega_i} }
$$

In [248]:
# 归一化加权均方根对数误差
err = (y_val - np.array(pred_val).transpose()) ** 2
err = err.sum(axis=1)
err = np.sqrt(err.sum() / 2 / len(y_val))
print("The NWRMSLE error of validation set is:", err)

The NWRMSLE error of validation set is: 1.7435216146159722


## 4.2 业务指标

业务指标的表达式如下：

$$
E(t, y) = \frac{\sum_{i=0}^{n} | y^{(i)} - t^{(i)} |}{\sum_{i=0}^{n} t^{(i)}}
$$

In [249]:
def error(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)

In [250]:
def expm1_with_clip(x, l=0, r=35000):
    return np.clip(np.expm1(x), l, r)

### 4.2.1 验证集

In [251]:
df_val = pd.DataFrame(
    np.array(y_val), index=order_month.index, 
    columns=['2018-09', '2018-10']
).apply(np.expm1)

df_pred_val = pd.DataFrame(
    np.array(pred_val).transpose(), index=order_month.index, 
    columns=['2018-09', '2018-10']
).apply(expm1_with_clip).apply(np.floor)

In [252]:
# 验证集（9-10月）准确率（SKU）
m_error = error(df_val['2018-09'], df_pred_val['2018-09'])
m1_error = error(df_val['2018-10'], df_pred_val['2018-10'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 60.00%
The accuracy of 'M+1' order amount is: 33.86%


In [253]:
df_val = df_val.join(category, how='left')
df_pred_val = df_pred_val.join(category, how='left')

df_val = df_val.groupby('category')[['2018-09', '2018-10']].sum()
df_pred_val = df_pred_val.groupby('category')[['2018-09', '2018-10']].sum()

In [254]:
# 验证集（9-10月）准确率（品类）
m_error = error(df_val['2018-09'], df_pred_val['2018-09'])
m1_error = error(df_val['2018-10'], df_pred_val['2018-10'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 75.67%
The accuracy of 'M+1' order amount is: 41.73%


### 4.2.2 测试集

In [264]:
df_pred_test = pd.DataFrame(
    np.array(pred_test).transpose(), index=order_month.index, 
    columns=['2018-11', '2018-12']
).stack().to_frame('pred_qty')
df_pred_test.index.set_names(['item_code', 'month'], inplace=True)
df_pred_test['pred_qty'] = np.floor(expm1_with_clip(df_pred_test.pred_qty))

In [265]:
comp = df_test.join(df_pred_test, how='left').fillna(0).reset_index()
comp = comp.join(category, on='item_code', how='left')
comp.category.fillna('-1', inplace=True)

m_comp = comp.loc[comp['month'] == '2018-11']
m1_comp = comp.loc[comp['month'] == '2018-12']

m_comp_without_new = m_comp.loc[m_comp.category != '-1']
m1_comp_without_new = m1_comp.loc[m1_comp.category != '-1']

In [266]:
# 测试集（11-12月）准确率（带新品）（SKU）
# 备注：总共1653条提货记录，有120条提货记录是新品
m_error = error(m_comp['qty'], m_comp['pred_qty'])
m1_error = error(m1_comp['qty'], m1_comp['pred_qty'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 34.89%
The accuracy of 'M+1' order amount is: 18.48%


In [267]:
# 测试集（11-12月）准确率（不带新品）（SKU）
# 备注：总共1653条提货记录，有120条提货记录是新品
m_error = error(m_comp_without_new['qty'], m_comp['pred_qty'])
m1_error = error(m1_comp_without_new['qty'], m1_comp['pred_qty'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 35.85%
The accuracy of 'M+1' order amount is: 20.79%


In [268]:
comp_cate = comp.groupby(['category', 'month'])[['qty', 'pred_qty']].sum().reset_index()

m_comp = comp_cate.loc[comp_cate['month'] == '2018-11']
m1_comp = comp_cate.loc[comp_cate['month'] == '2018-12']

m_comp_without_new = m_comp.loc[m_comp.category != '-1']
m1_comp_without_new = m1_comp.loc[m1_comp.category != '-1']

In [269]:
# 测试集（11-12月）准确率（带新品）（品类）
m_error = error(m_comp['qty'], m_comp['pred_qty'])
m1_error = error(m1_comp['qty'], m1_comp['pred_qty'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 51.43%
The accuracy of 'M+1' order amount is: 23.73%


In [270]:
# 测试集（11-12月）准确率（不带新品）（品类）
m_error = error(m_comp_without_new['qty'], m_comp['pred_qty'])
m1_error = error(m1_comp_without_new['qty'], m1_comp['pred_qty'])
print("The accuracy of 'M' order amount is: %.2f%%" % ((1 - m_error) * 100))
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - m1_error) * 100))

The accuracy of 'M' order amount is: 52.85%
The accuracy of 'M+1' order amount is: 26.69%


In [271]:
m_comp['acc'] = 1 - (np.abs(m_comp.qty - m_comp.pred_qty) / m_comp.qty)
m1_comp['acc'] = 1 - (np.abs(m1_comp.qty - m1_comp.pred_qty) / m1_comp.qty)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [273]:
m_comp.sort_values(by='acc', ascending=False)

Unnamed: 0,category,month,qty,pred_qty,acc
10,5,2018-11,373660.0,344541.0,0.922071
2,1,2018-11,13369.0,9384.0,0.701922
0,0,2018-11,17996.0,11753.0,0.65309
14,7,2018-11,144644.0,93196.0,0.644313
6,3,2018-11,128296.0,60867.0,0.474426
8,4,2018-11,979037.0,416864.0,0.42579
4,2,2018-11,146590.0,56985.0,0.388737
12,6,2018-11,267108.0,100790.0,0.377338
16,-1,2018-11,57027.0,0.0,0.0


In [272]:
m1_comp.sort_values(by='acc', ascending=False)

Unnamed: 0,category,month,qty,pred_qty,acc
11,5,2018-12,479641.0,218327.0,0.455188
1,0,2018-12,37870.0,14475.0,0.382229
15,7,2018-12,166659.0,63274.0,0.379661
9,4,2018-12,1295903.0,318142.0,0.245498
5,2,2018-12,313670.0,69742.0,0.222342
7,3,2018-12,316960.0,70098.0,0.221157
13,6,2018-12,531017.0,88310.0,0.166304
3,1,2018-12,24690.0,2889.0,0.117011
17,-1,2018-12,396033.0,0.0,0.0
