In [1]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from time import time
from datetime import date
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 1000)

# 1 数据预处理

In [147]:
# 载入数据
order = pd.read_csv("../../data/level2/m111-order-sku.csv", sep=',', parse_dates=['order_date'])
dis = pd.read_csv("../../data/level2/m111-dis-sku.csv", sep=',', parse_dates=['dis_date'])
inv = pd.read_csv(
    "../../data/level2/m111-inv-sku.csv", sep=',', parse_dates=['period_wid']
).rename(columns={'period_wid': 'inv_date'})
category = pd.read_csv(
    "../../data/level2/m111-item-category.csv", sep=','
).rename(columns={'sales_segment1_code': 'category'})

## 1.1 处理订单数据

In [148]:
# 取2017年至2018年区间中的数据
order = order.loc[(order['order_date'] >= '2017-01-01') & (order['order_date'] <= '2018-12-31')]

In [149]:
# 删除订单量为0的记录
order = order.loc[~(order.qty == 0)]

In [150]:
# 删除异常值
order = order.join(category.set_index('item_code'), on='item_code', how='left').fillna('-1')
order = order.loc[~(order.category == 'TGWL')]
order = order.loc[~(order.category == -1)]
order = order.loc[~((order.category == 'CRYJ') & (order.qty > 10000))]
order = order.loc[~((order.category == 'CRZJ') & (order.qty > 10000))]
order = order.loc[~((order.category == 'CRZTCF') & (order.qty > 10000))]
order = order.loc[~((order.category == 'RR') & (order.qty > 10000))]

In [151]:
# 划分数据集
df_test = order.loc[order['order_date'] >= '2018-10-01']
order = order.loc[order['order_date'] <= '2018-09-30']

In [152]:
# 取对数
order['qty'] = np.log1p(order['qty'])

In [143]:
df_test['month'] = df_test['order_date'].astype('str').apply(lambda x: x[:7])
df_test = df_test.groupby(['item_code', 'month'])[['qty']].sum()

In [144]:
order_month = order.copy()
order_month['month'] = order_month['order_date'].astype('str').apply(lambda x: x[:7])
order_month = order_month.groupby(['item_code', 'month'])[['qty']].sum()
order_month = order_month.unstack(level=-1).fillna(0)
order_month.columns = pd.date_range('2017-01-31', '2018-09-30', freq='M')

In [146]:
order_month.max()

2017-01-31    134.011422
2017-02-28    183.566300
2017-03-31    218.085962
2017-04-30    197.334593
2017-05-31    202.509418
2017-06-30    211.804348
2017-07-31    193.135912
2017-08-31    214.188567
2017-09-30    197.569899
2017-10-31    178.609036
2017-11-30    210.801347
2017-12-31    203.466181
2018-01-31    225.432875
2018-02-28    112.873987
2018-03-31    218.139181
2018-04-30    207.971889
2018-05-31    181.807937
2018-06-30    188.515586
2018-07-31    215.171512
2018-08-31    237.144476
2018-09-30    225.412598
Freq: M, dtype: float64

## 1.2 处理分销数据

In [124]:
# 取2017年至2018年区间中的数据（分销数据只有从3月份开始的数据且3月份不完整）>_<|||
dis = dis.loc[(dis['dis_date'] >= '2017-04-01') & (dis['dis_date'] <= '2018-09-30')]

In [125]:
# 处理分销量为负数的情况
dis['qty'] = dis['qty'].apply(lambda x: -x if x < 0 else x)

In [126]:
# 删除分销量为0的记录
dis = dis.loc[~(dis.qty == 0)]

In [127]:
# 删除异常值
dis = dis.join(category.set_index('item_code'), on='item_code', how='left').fillna('-1')
dis = dis.loc[~(dis.category == 'TGWL')]
dis = dis.loc[~(dis.category == '-1')]
dis = dis.loc[~((dis.category == 'CRYJ') & (dis.item_code == '12173000000014'))]
dis = dis.loc[~((dis.category == 'CRZTCF') & (dis.qty > 10000))]
dis = dis.loc[~((dis.category == 'RR') & (dis.item_code == '21099910000123'))]

In [131]:
dis['qty'] = np.log1p(dis['qty'])

In [128]:
dis_month = dis.copy()
dis_month['month'] = dis_month['dis_date'].astype('str').apply(lambda x: x[:7])
dis_month = dis_month.groupby(['item_code', 'month'])[['qty']].sum()
dis_month = dis_month.unstack(level=-1).fillna(0)
dis_month.columns = pd.date_range('2017-04-30', '2018-09-30', freq='M')
dis_month = dis_month.reindex(order_month.index).fillna(0)

## 1.3 处理库存数据

In [13]:
# 取2017年至2018年区间中的数据（库存数据只有从2017年6月份开始的数据）>_<|||
inv = inv.loc[(inv['inv_date'] >= '2017-06-01') & (inv['inv_date'] <= '2018-09-30')]

In [14]:
# 处理分销量为负数的情况
inv['qty'] = inv['qty'].apply(lambda x: -x if x < 0 else x)

In [15]:
# 删除异常值
inv = inv.loc[~(inv.qty > 1000000)]

In [16]:
# 取每月的最后一天作为当月的库存
inv = inv.loc[inv['inv_date'].isin(pd.date_range('2017-06-30', '2018-09-30', freq='M'))]

In [17]:
inv['qty'] = np.log1p(inv['qty'])

In [18]:
inv_month = inv.copy()
inv_month['month'] = inv_month['inv_date'].astype('str').apply(lambda x: x[:7])
inv_month = inv_month.groupby(['item_code', 'month'])[['qty']].sum()
inv_month = inv_month.unstack(level=-1).fillna(0)
inv_month.columns = pd.date_range('2017-06-30', '2018-09-30', freq='M')
inv_month = inv_month.reindex(order_month.index).fillna(0)

## 1.4 处理品类信息

In [19]:
# Pandas的bug：'21054110000024', '21054110000025', '21054110000084', '21054110000085' 这4个字符串居然是重复的，取消注释即可看到
# category.loc[category.item_code.duplicated()]

In [20]:
# 解决上面bug的下策，避免下面reindex报错
category.drop_duplicates(['item_code'], keep='first', inplace=True)

In [21]:
category = category.set_index('item_code').reindex(order_month.index)

In [22]:
encoder = LabelEncoder()
category['category'] = encoder.fit_transform(category['category'])

## 1.5 得到每个品类每个月的提货数据

In [23]:
order_cate_month = order_month.reset_index()
order_cate_month['category'] = category['category'].values
order_cate_month = order_cate_month.groupby('category')[order_month.columns].sum()

## 1.6 得到每个品类每个月的分销数据

In [24]:
dis_cate_month = dis_month.reset_index()
dis_cate_month['category'] = category['category'].values
dis_cate_month = dis_cate_month.groupby('category')[dis_month.columns].sum()

## 1.7 得到每个品类每个月的库存数据

In [25]:
inv_cate_month = inv_month.reset_index()
inv_cate_month['category'] = category['category'].values
inv_cate_month = inv_cate_month.groupby('category')[inv_month.columns].sum()

# 2 特征工程

In [26]:
def prepare_dataset(order, dis, inv, year, month, is_train=True, name_prefix=None):
    X = {}
    
    # 提货的统计特征（28个特征）
    for i in [3, 6, 9, 12]:
        dt = date(year, month, 1)
        tmp = order[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月提货量
        X['ord_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月提货量的平均一阶差分
        X['ord_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月提货量的和（带衰减）
        X['ord_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月提货量的平均值
        X['ord_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月提货量的中位数
        X['ord_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月提货量的最大值
        X['ord_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月提货量的最小值
        X['ord_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月提货量的标准差
        
    # 分销的统计特征（21个特征）>_<|||
    for i in [3, 6, 9]:
        dt = date(year, month, 1)
        tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月分销量
        X['dis_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月分销量的平均一阶差分
        X['dis_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月分销量的和（带衰减）
        X['dis_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月分销量的均值
        X['dis_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月分销量的中位数
        X['dis_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月分销量的最大值
        X['dis_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月分销量的最小值
        X['dis_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月分销量的标准差
        
    # 库存的统计特征（14个）>_<|||
    for i in [3, 6]:
        dt = date(year, month, 1)
        tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月库存量
        X['inv_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月库存量的平均一阶差分
        X['inv_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月库存量的和（带衰减）
        X['inv_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月库存量的均值
        X['inv_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月库存量的中位数
        X['inv_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月库存量的最大值
        X['inv_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月库存量的最小值
        X['inv_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月库存量的标准差
        
    # 提货天数特征（12个特征）
    for i in [3, 6, 9, 12]:
        dt = date(year, month, 1)
        tmp = order[pd.date_range(end=dt, periods=i, freq='M')]
        X['has_ord_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有提货的天数
        X['last_ord_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有提货的天数
        X['first_ord_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有提货的天数
        
    # 分销天数特征（9个特征）>_<|||
    for i in [3, 6, 9]:
        dt = date(year, month, 1)
        tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]
        X['has_dis_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有分销的天数
        X['last_dis_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有分销的天数
        X['first_dis_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有分销的天数
        
    # 库存天数特征（6个特征）>_<|||
    for i in [3, 6]:
        dt = date(year, month, 1)
        tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]
        X['has_inv_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有库存的天数
        X['last_inv_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有库存的天数
        X['first_inv_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有库存的天数
        
    # 前12个月的提货量
    for i in range(1, 13):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['ord_pre_%s' % i] = order[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    # 前9个月的分销量>_<|||
    for i in range(1, 10):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['dis_pre_%s' % i] = dis[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
       
    # 前6个月的库存量>_<|||
    for i in range(1, 7):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['inv_pre_%s' % i] = inv[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    X = pd.DataFrame(X)
    
    if is_train:
        start_dt = date(year, month, 1)
        y = order[pd.date_range(start_dt, periods=3, freq='M')].values
        return X, y
    
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
        
    return X

## 2.1 准备训练集

In [27]:
train_month = [ 
    '2018-01', 
    '2018-02', 
    '2018-03', 
    '2018-04'
]

X_l, y_l = [], []
for month in train_month:
    y, m = int(month.split('-')[0]), int(month.split('-')[1])
    
    X_tmp, y_tmp = prepare_dataset(order_month, dis_month, inv_month, y, m)
    
    X_tmp2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, y, m, is_train=False, name_prefix='cate')
    X_tmp2.index = order_cate_month.index
    X_tmp2 = X_tmp2.reindex(category['category']).reset_index(drop=True)
    
    X_tmp = pd.concat([X_tmp, X_tmp2, category.reset_index(drop=True)], axis=1)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
    del X_tmp, y_tmp, X_tmp2
    gc.collect()
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

## 2.2 准备验证集

In [28]:
X_val, y_val = prepare_dataset(order_month, dis_month, inv_month, 2018, 7)

X_val2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 7, is_train=False, name_prefix='cate')
X_val2.index = order_cate_month.index
X_val2 = X_val2.reindex(category['category']).reset_index(drop=True)

X_val = pd.concat([X_val, X_val2, category.reset_index(drop=True)], axis=1)

del X_val2
gc.collect()

0

## 2.3 准备测试集

In [29]:
X_test = prepare_dataset(order_month, dis_month, inv_month, 2018, 10, is_train=False)

X_test2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 10, is_train=False, name_prefix='cate')
X_test2.index = order_cate_month.index
X_test2 = X_test2.reindex(category['category']).reset_index(drop=True)

X_test = pd.concat([X_test, X_test2, category.reset_index(drop=True)], axis=1)

del X_test2
gc.collect()

0

# 3 训练和预测

In [30]:
print("[INFO] Start training and predicting...")
t0 = time()

params = {
    'num_leaves': 80, 
    'objective': 'regression', 
    'min_data_in_leaf': 200, 
    'learning_rate': 0.02, 
    'feature_fraction': 0.9, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 1, 
    'metric': 'l2', 
    'num_threads': 16
}

MAX_ROUNDS = 5000
pred_val = []
pred_test = []
cate_vars = []

for i in range(3):
    print('=' * 50)
    print("Step %d" % (i + 1))
    print('=' * 50)
    
    dtrain = lgb.Dataset(X_train, label=y_train[:,i], categorical_feature=cate_vars)
    dval = lgb.Dataset(X_val, label=y_val[:,i], reference=dtrain, categorical_feature=cate_vars)
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS, 
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )
    
    feat_imp = [("%s: %.2f" % x) for x in sorted(zip(X_train.columns, bst.feature_importance('gain')), key=lambda x: x[1], reverse=True)]
    print('\n'.join(feat_imp))
    pred_val.append(
        bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    pred_test.append(
        bst.predict(X_test, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    
print("[INFO] Finished! ( ^ _ ^ ) V")
print("[INFO] Done in %f seconds." % (time() - t0))

[INFO] Start training and predicting...
Step 1




Training until validation scores don't improve for 125 rounds.
[50]	training's l2: 350.745	valid_1's l2: 312.35
[100]	training's l2: 242.89	valid_1's l2: 242.813
[150]	training's l2: 213.273	valid_1's l2: 226.491
[200]	training's l2: 193.402	valid_1's l2: 222.741
[250]	training's l2: 179.55	valid_1's l2: 220.825
[300]	training's l2: 169.712	valid_1's l2: 219.021
[350]	training's l2: 160.247	valid_1's l2: 215.871
[400]	training's l2: 153.346	valid_1's l2: 214.828
[450]	training's l2: 147.156	valid_1's l2: 213.204
[500]	training's l2: 142.724	valid_1's l2: 211.291
[550]	training's l2: 138.707	valid_1's l2: 210.396
[600]	training's l2: 134.746	valid_1's l2: 209.556
[650]	training's l2: 131.63	valid_1's l2: 208.161
[700]	training's l2: 128.568	valid_1's l2: 206.921
[750]	training's l2: 125.519	valid_1's l2: 206.61
[800]	training's l2: 122.848	valid_1's l2: 206.239
[850]	training's l2: 120.28	valid_1's l2: 205.514
[900]	training's l2: 117.719	valid_1's l2: 204.99
[950]	training's l2: 115.42

[300]	training's l2: 175.65	valid_1's l2: 418.917
[350]	training's l2: 170.053	valid_1's l2: 418.312
[400]	training's l2: 164.732	valid_1's l2: 414.17
[450]	training's l2: 159.925	valid_1's l2: 413.816
[500]	training's l2: 155.786	valid_1's l2: 412.572
[550]	training's l2: 152.21	valid_1's l2: 412.256
[600]	training's l2: 148.964	valid_1's l2: 413.417
Early stopping, best iteration is:
[519]	training's l2: 154.359	valid_1's l2: 411.828
ord_pre_1: 36061499.32
ord_sum_decay_pre_3: 17595123.68
ord_max_pre_9: 3599780.17
ord_max_pre_3: 3112470.08
ord_max_pre_6: 1989475.11
ord_pre_6: 1795395.32
cate_dis_pre_8: 1057288.90
cate_ord_diff_mean_pre_6: 707145.71
dis_pre_8: 666940.16
ord_max_pre_12: 652897.44
ord_median_pre_3: 572356.35
ord_pre_12: 509877.49
ord_min_pre_12: 430515.26
ord_std_pre_6: 419294.60
ord_pre_2: 378576.13
inv_diff_mean_pre_3: 372339.55
inv_pre_2: 339828.94
ord_diff_mean_pre_3: 331691.39
ord_diff_mean_pre_9: 316938.79
ord_min_pre_9: 314047.51
inv_std_pre_3: 299491.35
ord_pre_

[1450]	training's l2: 153.448	valid_1's l2: 548.958
[1500]	training's l2: 151.909	valid_1's l2: 545.89
[1550]	training's l2: 150.31	valid_1's l2: 543.684
[1600]	training's l2: 148.927	valid_1's l2: 542.73
[1650]	training's l2: 147.399	valid_1's l2: 541.981
[1700]	training's l2: 145.976	valid_1's l2: 540.759
[1750]	training's l2: 144.591	valid_1's l2: 538.796
[1800]	training's l2: 143.334	valid_1's l2: 536.483
[1850]	training's l2: 142.111	valid_1's l2: 535.905
[1900]	training's l2: 140.897	valid_1's l2: 536.021
[1950]	training's l2: 139.698	valid_1's l2: 535.362
[2000]	training's l2: 138.577	valid_1's l2: 534.798
[2050]	training's l2: 137.496	valid_1's l2: 533.755
[2100]	training's l2: 136.47	valid_1's l2: 531.31
[2150]	training's l2: 135.447	valid_1's l2: 531.255
[2200]	training's l2: 134.473	valid_1's l2: 531.285
[2250]	training's l2: 133.517	valid_1's l2: 530.321
[2300]	training's l2: 132.596	valid_1's l2: 528.365
[2350]	training's l2: 131.649	valid_1's l2: 527.157
[2400]	training's

In [31]:
print("The MSE error of validation set is:", mean_squared_error(y_val, np.array(pred_val).transpose()))

The MSE error of validation set is: 378.50449238176674


评价指标为**归一化加权均方根对数误差（Normalized Weighted Root Mean Squared Logarithmic Error, NWRMSLE）**，表达式如下：

$$
\text{NWRMSLE} = \sqrt{ \frac{\sum_{i=1}^{m}\omega_i \sum_{j=1}^{t}\left( \ln \left(\widehat{y}_j^{(i)} + 1 \right) -\ln \left(y_j^{(i)} + 1 \right)\right)^2}{t\cdot \sum_{i=1}^{m}\omega_i} }
$$

In [32]:
# 归一化加权均方根对数误差
err = (y_val - np.array(pred_val).transpose()) ** 2
err = err.sum(axis=1)
err = np.sqrt(err.sum() / 3 / len(y_val))
print("The NWRMSLE error of validation set is:", err)

The NWRMSLE error of validation set is: 19.45519191325973


业务指标的表达式如下：

$$
E(t, y) = \frac{\sum_{i=0}^{n} | y^{(i)} - t^{(i)} |}{\sum_{i=0}^{n} t^{(i)}}
$$

In [33]:
def error(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)

In [34]:
def expm1_with_clip(x, l=0, r=100000):
    return np.clip(np.expm1(x), l, r)

In [35]:
df_pred_val = pd.DataFrame(
    np.array(pred_val).transpose(), index=order_month.index, 
    columns=['2018-07', '2018-08', '2018-09']
)

In [36]:
df_val = pd.DataFrame(
    np.array(y_val), index=order_month.index, 
    columns=['2018-07', '2018-08', '2018-09']
)

In [37]:
# 验证集准确率
first_month_error = error(expm1_with_clip(df_val['2018-07']), expm1_with_clip(df_pred_val['2018-07']))
second_month_error = error(expm1_with_clip(df_val['2018-08']), expm1_with_clip(df_pred_val['2018-08']))
third_month_error = error(expm1_with_clip(df_val['2018-09']), expm1_with_clip(df_pred_val['2018-09']))

In [38]:
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - first_month_error) * 100))
print("The accuracy of 'M+2' order amount is: %.2f%%" % ((1 - second_month_error) * 100))
print("The accuracy of 'M+3' order amount is: %.2f%%" % ((1 - third_month_error) * 100))

The accuracy of 'M+1' order amount is: 50.96%
The accuracy of 'M+2' order amount is: 54.34%
The accuracy of 'M+3' order amount is: 48.82%


In [39]:
df_pred_test = pd.DataFrame(
    np.array(pred_test).transpose(), index=order_month.index, 
    columns=['2018-10', '2018-11', '2018-12']
).stack().to_frame('pred_qty')
df_pred_test.index.set_names(['item_code', 'month'], inplace=True)
df_pred_test['pred_qty'] = np.clip(np.expm1(df_pred_test['pred_qty']), 0, 100000)

In [40]:
comp = df_test.join(df_pred_test, how='left').fillna(0).reset_index()
first_month_comp = comp.loc[comp['month'] == '2018-10']
second_month_comp = comp.loc[comp['month'] == '2018-11']
third_month_comp = comp.loc[comp['month'] == '2018-12']

In [41]:
first_month_error = error(first_month_comp['qty'], first_month_comp['pred_qty'])
second_month_error = error(second_month_comp['qty'], second_month_comp['pred_qty'])
third_month_error = error(third_month_comp['qty'], third_month_comp['pred_qty'])

In [42]:
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - first_month_error) * 100))
print("The accuracy of 'M+2' order amount is: %.2f%%" % ((1 - second_month_error) * 100))
print("The accuracy of 'M+3' order amount is: %.2f%%" % ((1 - third_month_error) * 100))

The accuracy of 'M+1' order amount is: -2232.79%
The accuracy of 'M+2' order amount is: -2010.57%
The accuracy of 'M+3' order amount is: -1206.30%


In [43]:
test_items = set(df_test.index.get_level_values(0))

In [44]:
len(test_items)

1034

In [45]:
pred_test_items = set(df_pred_test.index.get_level_values(0))

In [46]:
len(pred_test_items)

1845

In [47]:
len(test_items & pred_test_items)

884