**问题**：
1. 按月进行统计的话，提货、分销、库存的数值很大，只要拟合不准，误差将是巨大的

In [1]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
from time import time
from datetime import date
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 1000)

# 1 数据预处理

In [3]:
# 载入数据
order = pd.read_csv("../../data/level2/m111-sku-order.csv", sep=',', parse_dates=['order_date'])
dis = pd.read_csv("../../data/level2/m111-sku-dis.csv", sep=',', parse_dates=['dis_date'])
inv = pd.read_csv(
    "../../data/level2/m111-sku-inv.csv", sep=',', parse_dates=['period_wid']
).rename(columns={'period_wid': 'inv_date'})
category = pd.read_csv(
    "../../data/level2/item2category-from-3.csv", sep=','
).rename(columns={'sales_segment1_code': 'category'})

In [4]:
# 考虑的品类有：消毒柜、洗碗机、烟机、灶具、电热、净水机、燃热、饮水机
cates_considered = ['CRXDG', 'CRXWJ', 'CRYJ', 'CRZJ', 'DR', 'JSJ', 'RR', 'YSJ']

## 1.1 处理订单数据

In [5]:
# 取2017年至2018年区间中的数据（2019/03/01取数有164945条记录）
order = order.loc[(order.order_date >= '2017-01-01') & (order.order_date <= '2018-12-31')]

In [6]:
# 删除其他品类的数据（剩余记录数为161064）
order = order.join(category.set_index('item_code'), on='item_code', how='left')
order = order.loc[order.category.isin(cates_considered)]

In [7]:
# 划分数据集
df_test = order.loc[order.order_date >= '2018-11-01']  # 测试集
order = order.loc[order.order_date <= '2018-10-31']  # 训练和验证集

In [8]:
df_test['month'] = df_test.order_date.astype('str').apply(lambda x: x[:7])
df_test = df_test.groupby(['item_code', 'month'])[['qty']].sum()

In [9]:
# 每个产品每个月的提货量
order_month = order.copy()
order_month['month'] = order_month.order_date.astype('str').apply(lambda x: x[:7])
order_month = order_month.groupby(['item_code', 'month'])[['qty']].sum()

In [10]:
# 取对数
order_month['qty'] = np.log1p(order_month.qty)

In [11]:
order_month = order_month.unstack(level=-1).fillna(0)
order_month.columns = pd.date_range('2017-01-31', '2018-10-31', freq='M')

## 1.2 处理分销数据

In [12]:
# 取2017年至2018年区间中的数据（分销数据从2017年6月开始才可用，记录数为358247）
dis = dis.loc[(dis.dis_date >= '2017-06-01') & (dis.dis_date <= '2018-10-31')]

In [13]:
# 删除其他品类的数据（剩余记录数为343637）
dis = dis.join(category.set_index('item_code'), on='item_code', how='left')
dis = dis.loc[dis.category.isin(cates_considered)]

In [14]:
# 处理分销量为负数的情况，其中负数记录数为8766（备注：负数是有意义的，存在退货的情况）
# dis['qty'] = dis.qty.apply(lambda x: -x if x < 0 else x)
# dis = dis.loc[~(dis.qty < 0)]

In [15]:
# 每个产品每个月的分销量
dis_month = dis.copy()
dis_month['month'] = dis_month.dis_date.astype('str').apply(lambda x: x[:7])
dis_month = dis_month.groupby(['item_code', 'month'])[['qty']].sum()

In [16]:
# 删除分销为负数的记录
dis_month = dis_month.loc[~(dis_month.qty < 0)]

In [17]:
# 取对数
dis_month['qty'] = np.log1p(dis_month.qty)

In [18]:
dis_month = dis_month.unstack(level=-1).fillna(0)
dis_month.columns = pd.date_range('2017-06-30', '2018-10-31', freq='M')

In [19]:
dis_month = dis_month.reindex(order_month.index).fillna(0)

## 1.3 处理库存数据

In [20]:
# 取2017年至2018年区间中的数据（库存数据从2017年12月开始可用，记录数为138437）
inv = inv.loc[(inv.inv_date >= '2017-12-01') & (inv.inv_date <= '2018-10-31')]

In [21]:
# 删除其他品类的数据（剩余记录数为121249）
inv = inv.join(category.set_index('item_code'), on='item_code', how='left')
inv = inv.loc[inv.category.isin(cates_considered)]

In [22]:
# 取每月的最后一天作为当月的库存（剩余记录数为33698）
inv_lastday = inv.loc[inv.inv_date.isin(pd.date_range('2017-12-31', '2018-10-31', freq='M'))]

In [23]:
# 每个产品每月的库存量
inv_month = inv_lastday.copy()
inv_month['month'] = inv_month.inv_date.astype('str').apply(lambda x: x[:7])
inv_month = inv_month.groupby(['item_code', 'month'])[['qty']].sum()

In [24]:
# 删除库存为负数的记录
inv_month = inv_month.loc[~(inv_month.qty < 0)]

In [25]:
# 取对数
inv_month['qty'] = np.log1p(inv_month.qty)

In [26]:
inv_month = inv_month.unstack(level=-1).fillna(0)
inv_month.columns = pd.date_range('2017-12-31', '2018-10-31', freq='M')

In [27]:
inv_month = inv_month.reindex(order_month.index).fillna(0)

## 1.4 处理品类信息

In [28]:
category = category.set_index('item_code').reindex(order_month.index)

In [29]:
label_enc = LabelEncoder()
category['category'] = label_enc.fit_transform(category.category)

In [30]:
onehot_enc = OneHotEncoder()
category_onehot = onehot_enc.fit_transform(category).toarray()  # 不调用toarray方法前的结果是csr_matrix对象

## 1.5 得到每个品类每个月的提货数据

In [31]:
order_cate_month = order_month.reset_index()
order_cate_month['category'] = category.category.values
order_cate_month = order_cate_month.groupby('category')[order_month.columns].sum()

## 1.6 得到每个品类每个月的分销数据

In [32]:
dis_cate_month = dis_month.reset_index()
dis_cate_month['category'] = category['category'].values
dis_cate_month = dis_cate_month.groupby('category')[dis_month.columns].sum()

## 1.7 得到每个品类每个月的库存数据

In [33]:
inv_cate_month = inv_month.reset_index()
inv_cate_month['category'] = category['category'].values
inv_cate_month = inv_cate_month.groupby('category')[inv_month.columns].sum()

# 2 特征工程

In [34]:
def prepare_dataset(order, dis, inv, year, month, is_train=True, name_prefix=None):
    X = {}
    
    # 提货的统计特征（28个特征）
#     for i in [3, 6, 9, 12]:
#         dt = date(year, month, 1)
#         tmp = order[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月提货量
#         X['ord_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月提货量的平均一阶差分
#         X['ord_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月提货量的和（带衰减）
#         X['ord_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月提货量的平均值
#         X['ord_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月提货量的中位数
#         X['ord_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月提货量的最大值
#         X['ord_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月提货量的最小值
#         X['ord_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月提货量的标准差

    # 提货的统计特征
    for i in [3]:
        dt = date(year, month, 1)
        tmp = order[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月提货量
        X['ord_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月提货量的平均一阶差分
        X['ord_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月提货量的和（带衰减）
        X['ord_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月提货量的平均值
        X['ord_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月提货量的中位数
        X['ord_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月提货量的最大值
        X['ord_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月提货量的最小值
        X['ord_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月提货量的标准差
        
    # 分销的统计特征（21个特征）>_<|||
#     for i in [3, 6, 9]:
#         dt = date(year, month, 1)
#         tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月分销量
#         X['dis_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月分销量的平均一阶差分
#         X['dis_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月分销量的和（带衰减）
#         X['dis_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月分销量的均值
#         X['dis_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月分销量的中位数
#         X['dis_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月分销量的最大值
#         X['dis_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月分销量的最小值
#         X['dis_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月分销量的标准差

    # 分销的统计特征
#     for i in [3]:
#         dt = date(year, month, 1)
#         tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月分销量
#         X['dis_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月分销量的平均一阶差分
#         X['dis_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月分销量的和（带衰减）
#         X['dis_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月分销量的均值
#         X['dis_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月分销量的中位数
#         X['dis_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月分销量的最大值
#         X['dis_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月分销量的最小值
#         X['dis_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月分销量的标准差
        
    # 库存的统计特征（14个）>_<|||
#     for i in [3, 6]:
#         dt = date(year, month, 1)
#         tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]  # 前i个月库存量
#         X['inv_diff_mean_pre_%s' % i] = tmp.diff(axis=1).mean(axis=1).values  # 前i个月库存量的平均一阶差分
#         X['inv_sum_decay_pre_%s' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values  # 前i个月库存量的和（带衰减）
#         X['inv_mean_pre_%s' % i] = tmp.mean(axis=1).values  # 前i个月库存量的均值
#         X['inv_median_pre_%s' % i] = tmp.median(axis=1).values  # 前i个月库存量的中位数
#         X['inv_max_pre_%s' % i] = tmp.max(axis=1).values  # 前i个月库存量的最大值
#         X['inv_min_pre_%s' % i] = tmp.min(axis=1).values  # 前i个月库存量的最小值
#         X['inv_std_pre_%s' % i] = tmp.std(axis=1).values  # 前i个月库存量的标准差
        
    # 提货天数特征（12个特征）
#     for i in [3, 6, 9, 12]:
#         dt = date(year, month, 1)
#         tmp = order[pd.date_range(end=dt, periods=i, freq='M')]
#         X['has_ord_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有提货的天数
#         X['last_ord_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有提货的天数
#         X['first_ord_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有提货的天数
        
    # 分销天数特征（9个特征）>_<|||
#     for i in [3, 6, 9]:
#         dt = date(year, month, 1)
#         tmp = dis[pd.date_range(end=dt, periods=i, freq='M')]
#         X['has_dis_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有分销的天数
#         X['last_dis_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有分销的天数
#         X['first_dis_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有分销的天数
        
    # 库存天数特征（6个特征）>_<|||
#     for i in [3, 6]:
#         dt = date(year, month, 1)
#         tmp = inv[pd.date_range(end=dt, periods=i, freq='M')]
#         X['has_inv_pre_%s' % i] = (tmp > 0).sum(axis=1).values  # 前i个月有库存的天数
#         X['last_inv_pre_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values  # 前i个月距离上一次有库存的天数
#         X['first_inv_pre_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values  # 前i个月距离第一次有库存的天数
        
    # 前12个月的提货量
#     for i in range(1, 13):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['ord_pre_%s' % i] = order[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()

    # 前3个月的提货量
    for i in range(1, 4):
        if month - i <= 0:
            start_dt = date(year - 1, month + 12 - i, 1)
        else:
            start_dt = date(year, month - i, 1)
        X['ord_pre_%s' % i] = order[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    # 前9个月的分销量>_<|||
#     for i in range(1, 10):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['dis_pre_%s' % i] = dis[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
       
    # 前6个月的库存量>_<|||
#     for i in range(1, 7):
#         if month - i <= 0:
#             start_dt = date(year - 1, month + 12 - i, 1)
#         else:
#             start_dt = date(year, month - i, 1)
#         X['inv_pre_%s' % i] = inv[pd.date_range(start_dt, periods=1, freq='M')].values.ravel()
        
    X = pd.DataFrame(X)
    
    if is_train:
        start_dt = date(year, month, 1)
        y = order[pd.date_range(start_dt, periods=2, freq='M')].values
        return X, y
    
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
        
    return X

## 2.1 准备训练集

In [48]:
train_month = [ 
    '2018-03', 
    '2018-04', 
    '2018-05', 
    '2018-06', 
    '2018-07'
]

X_l, y_l = [], []
for month in train_month:
    y, m = int(month.split('-')[0]), int(month.split('-')[1])
    
    X_tmp, y_tmp = prepare_dataset(order_month, dis_month, inv_month, y, m)
    
    X_tmp2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, y, m, is_train=False, name_prefix='cate')
    X_tmp2.index = order_cate_month.index
    X_tmp2 = X_tmp2.reindex(category.category).reset_index(drop=True)
    
    X_tmp = pd.concat([X_tmp, X_tmp2, category.reset_index(drop=True)], axis=1)
    X_tmp['pred_month'] = m
    X_l.append(X_tmp)
    y_l.append(y_tmp)
    
    del X_tmp, y_tmp, X_tmp2
    gc.collect()
    
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

## 2.2 准备验证集

In [49]:
X_val, y_val = prepare_dataset(order_month, dis_month, inv_month, 2018, 9)

X_val2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 9, is_train=False, name_prefix='cate')
X_val2.index = order_cate_month.index
X_val2 = X_val2.reindex(category.category).reset_index(drop=True)

X_val = pd.concat([X_val, X_val2, category.reset_index(drop=True)], axis=1)
X_val['pred_month'] = 9

del X_val2
gc.collect()

7

## 2.3 准备测试集

In [50]:
X_test = prepare_dataset(order_month, dis_month, inv_month, 2018, 11, is_train=False)

X_test2 = prepare_dataset(order_cate_month, dis_cate_month, inv_cate_month, 2018, 11, is_train=False, name_prefix='cate')
X_test2.index = order_cate_month.index
X_test2 = X_test2.reindex(category['category']).reset_index(drop=True)

X_test = pd.concat([X_test, X_test2, category.reset_index(drop=True)], axis=1)
X_test['pred_month'] = 11

del X_test2
gc.collect()

7

# 3 训练和预测

In [51]:
print("[INFO] Start training and predicting...")
t0 = time()

params = {
    'num_leaves': 80, 
    'objective': 'regression', 
    'min_data_in_leaf': 200, 
    'learning_rate': 0.02, 
    'feature_fraction': 0.9, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 1, 
    'metric': 'l2', 
    'num_threads': 16
}

MAX_ROUNDS = 5000
pred_val = []
pred_test = []
cate_vars = []

for i in range(2):
    print('=' * 50)
    print("Step %d" % (i + 1))
    print('=' * 50)
    
    dtrain = lgb.Dataset(X_train, label=y_train[:, i], categorical_feature=cate_vars)
    dval = lgb.Dataset(X_val, label=y_val[:, i], reference=dtrain, categorical_feature=cate_vars)
    
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS, 
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )
    
    feat_imp = [("%s: %.2f" % x) for x in sorted(zip(X_train.columns, bst.feature_importance('gain')), key=lambda x: x[1], reverse=True)]
    print('\n'.join(feat_imp))
    pred_val.append(
        bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    pred_test.append(
        bst.predict(X_test, num_iteration=bst.best_iteration or MAX_ROUNDS)
    )
    
print("[INFO] Finished! ( ^ _ ^ ) V")
print("[INFO] Done in %f seconds." % (time() - t0))

[INFO] Start training and predicting...
Step 1




Training until validation scores don't improve for 125 rounds.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "D:\Anaconda3\envs\py3_for_prac\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-51-983741157d16>", line 31, in <module>
    valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
  File "D:\Anaconda3\envs\py3_for_prac\lib\site-packages\lightgbm\engine.py", line 204, in train
    booster.update(fobj=fobj)
  File "D:\Anaconda3\envs\py3_for_prac\lib\site-packages\lightgbm\basic.py", line 1528, in update
    ctypes.byref(is_finished)))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Anaconda3\envs\py3_for_prac\lib\site-packages\IPython\core\interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, 

KeyboardInterrupt: 

In [None]:
print("The MSE error of validation set is:", mean_squared_error(y_val, np.array(pred_val).transpose()))

评价指标为**归一化加权均方根对数误差（Normalized Weighted Root Mean Squared Logarithmic Error, NWRMSLE）**，表达式如下：

$$
\text{NWRMSLE} = \sqrt{ \frac{\sum_{i=1}^{m}\omega_i \sum_{j=1}^{t}\left( \ln \left(\widehat{y}_j^{(i)} + 1 \right) -\ln \left(y_j^{(i)} + 1 \right)\right)^2}{t\cdot \sum_{i=1}^{m}\omega_i} }
$$

In [41]:
# 归一化加权均方根对数误差
err = (y_val - np.array(pred_val).transpose()) ** 2
err = err.sum(axis=1)
err = np.sqrt(err.sum() / 2 / len(y_val))
print("The NWRMSLE error of validation set is:", err)

The NWRMSLE error of validation set is: 2.027037392389006


业务指标的表达式如下：

$$
E(t, y) = \frac{\sum_{i=0}^{n} | y^{(i)} - t^{(i)} |}{\sum_{i=0}^{n} t^{(i)}}
$$

In [38]:
def error(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)

In [39]:
def expm1_with_clip(x, l=0, r=100000):
    return np.clip(np.expm1(x), l, r)

In [40]:
df_pred_val = pd.DataFrame(
    np.array(pred_val).transpose(), index=order_month.index, 
    columns=['2018-07', '2018-08', '2018-09']
)

In [41]:
df_val = pd.DataFrame(
    np.array(y_val), index=order_month.index, 
    columns=['2018-07', '2018-08', '2018-09']
)

In [37]:
# 验证集准确率
first_month_error = error(expm1_with_clip(df_val['2018-07']), expm1_with_clip(df_pred_val['2018-07']))
second_month_error = error(expm1_with_clip(df_val['2018-08']), expm1_with_clip(df_pred_val['2018-08']))
third_month_error = error(expm1_with_clip(df_val['2018-09']), expm1_with_clip(df_pred_val['2018-09']))

In [38]:
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - first_month_error) * 100))
print("The accuracy of 'M+2' order amount is: %.2f%%" % ((1 - second_month_error) * 100))
print("The accuracy of 'M+3' order amount is: %.2f%%" % ((1 - third_month_error) * 100))

The accuracy of 'M+1' order amount is: 50.96%
The accuracy of 'M+2' order amount is: 54.34%
The accuracy of 'M+3' order amount is: 48.82%


In [42]:
df_pred_test = pd.DataFrame(
    np.array(pred_test).transpose(), index=order_month.index, 
    columns=['2018-10', '2018-11', '2018-12']
)

In [43]:
df_pred_test = pd.DataFrame(
    np.array(pred_test).transpose(), index=order_month.index, 
    columns=['2018-10', '2018-11', '2018-12']
).stack().to_frame('pred_qty')
df_pred_test.index.set_names(['item_code', 'month'], inplace=True)
# df_pred_test['pred_qty'] = np.clip(np.expm1(df_pred_test['pred_qty']), 0, 100000)

In [44]:
comp = df_test.join(df_pred_test, how='left').fillna(0).reset_index()
first_month_comp = comp.loc[comp['month'] == '2018-10']
second_month_comp = comp.loc[comp['month'] == '2018-11']
third_month_comp = comp.loc[comp['month'] == '2018-12']

In [45]:
first_month_error = error(first_month_comp['qty'], first_month_comp['pred_qty'])
second_month_error = error(second_month_comp['qty'], second_month_comp['pred_qty'])
third_month_error = error(third_month_comp['qty'], third_month_comp['pred_qty'])

In [46]:
print("The accuracy of 'M+1' order amount is: %.2f%%" % ((1 - first_month_error) * 100))
print("The accuracy of 'M+2' order amount is: %.2f%%" % ((1 - second_month_error) * 100))
print("The accuracy of 'M+3' order amount is: %.2f%%" % ((1 - third_month_error) * 100))

The accuracy of 'M+1' order amount is: 36.63%
The accuracy of 'M+2' order amount is: 29.72%
The accuracy of 'M+3' order amount is: 19.48%


In [49]:
test_items = set(df_test.index.get_level_values(0))

In [50]:
len(test_items)

1010

In [51]:
pred_test_items = set(df_pred_test.index.get_level_values(0))

In [52]:
len(pred_test_items)

1558

In [53]:
len(test_items & pred_test_items)

860