# Load cleaned data with basic features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import training_data_cleaning_joining as clean
import Build_data_grid as grid

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the three input frames through the project's `training_data_cleaning_joining`
# helpers (imported as `clean`).
# NOTE(review): the loaders' implementations are not visible in this notebook; the
# column notes below are taken from how the frames are used/displayed further down.
# Later cells read date_block_num, shop_id, item_id and item_cnt_day from this frame.
sales_train = clean.clean_sales_data()
# Displayed below (after the merge) with columns ID, shop_id, item_id.
test_data = clean.load_test_data()
# Displayed below with columns ID, item_cnt_month.
sample_submission = clean.load_submission_file()

# Benchmark model: predict previous month sales for next month
This is an ad-hoc model: for each shop and item in the test set, we predict next month's sales to be equal to that shop-item pair's sales in the previous month.
$$\hat{y}_{M+1, s, i} = y_{M, s, i}$$
where $m = 1, \dots, M$ indexes the months in our data set, $s$ indexes shops, and $i$ indexes items.

In [3]:
# Monthly sales per (shop_id, item_id) for month block 33, which this notebook
# uses as the "previous month" for the benchmark prediction.
is_last_month = sales_train['date_block_num'] == 33

# Sum the daily counts within each shop-item pair and expose the total under
# the name the submission format expects.
sales_last_month = (
    sales_train.loc[is_last_month]
    .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

In [39]:
# Preview the first rows of the per-(shop_id, item_id) monthly totals.
sales_last_month.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,31,1
1,2,486,3
2,2,787,1
3,2,794,1
4,2,968,1


In [40]:
# (rows, columns): one row per shop-item pair that has sales rows in month block 33.
sales_last_month.shape

(31531, 3)

In [41]:
# How sparse is last month's data? Compare the shop-item pairs that actually
# occur with the full shops x items cross product.
n_shops = sales_last_month['shop_id'].nunique()
n_items = sales_last_month['item_id'].nunique()
n_pairs = len(sales_last_month)

print('number of shops with sales in last month:', n_shops)
print('number of items sold in last month:', n_items)
print('number of shop-item-combinations: %d, out of %d possible' % (n_pairs, n_shops * n_items))

number of shops with sales in last month: 44
number of items sold in last month: 5413
number of shop-item-combinations: 31531, out of 238172 possible


In [7]:
# Left-join last month's sales onto the test rows: every (shop_id, item_id) row of
# test_data is kept, and pairs with no sales rows last month get NaN in item_cnt_month.
submit = pd.merge(test_data, sales_last_month, how='left', on=['shop_id', 'item_id'])

# sales_last_month has one row per (shop_id, item_id), so the join must not add rows.
assert len(submit) == len(test_data), 'left merge changed the number of test rows'

# Predict 0 for shop-item pairs without sales last month. Only the target column is
# filled, so a missing value in any other column is not silently turned into 0, and
# the frame is reassigned instead of being mutated in place.
submit = submit.fillna({'item_cnt_month': 0})

In [8]:
# Preview the merged frame: test rows plus the item_cnt_month prediction column.
submit.head()

Unnamed: 0,ID,shop_id,item_id,item_cnt_month
0,0,5,5037,0.0
1,1,5,5320,0.0
2,2,5,5233,1.0
3,3,5,5232,0.0
4,4,5,5268,0.0


In [9]:
# Show the sample submission's layout for comparison with `submit`.
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [10]:
# Share of predictions that fall outside the [0, 20] range, as a percentage of all rows.
n_rows = len(submit)
n_above_cap = int((submit['item_cnt_month'] > 20).sum())
n_negative = int((submit['item_cnt_month'] < 0).sum())

print('predicted item_cnt_month > 20: %.2f%%' % np.round(100 * n_above_cap / n_rows, 2))

print('predicted item_cnt_month < 0: %.4f%%' % np.round(100 * n_negative / n_rows, 4))

predicted item_cnt_month > 20: 0.09%
predicted item_cnt_month < 0: 0.0037%


In [11]:
# Truncate predicted item_cnt_month into [0, 20] to match the test data:
# values above 20 become 20, values below 0 become 0, everything else is unchanged.
submit['item_cnt_month'] = submit['item_cnt_month'].clip(lower=0, upper=20)

In [12]:
# Sanity check: after the truncation above, min should be >= 0 and max <= 20.
submit['item_cnt_month'].describe()

count    214200.000000
mean          0.255649
std           1.089856
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          20.000000
Name: item_cnt_month, dtype: float64

In [13]:
# (rows, columns) of the submission frame built by the left merge onto test_data.
submit.shape

(214200, 4)

In [14]:
# Writing the submission file is disabled; uncomment the next line to export it.
# submit[['ID','item_cnt_month']].to_csv('submission_last_month_sales.csv', index=False)

### Compare to benchmark of previous month sales, for each shop-item pair

In [56]:
# Take shop_id, item_id and target_lag_1 from the rows of `all_data` at date_block_num 32
# (copied so later changes do not touch `all_data`).
# NOTE(review): `all_data` is not defined in any cell visible above, and the execution
# counter jumps from 14 to 56 -- as shown, this section will not survive
# Restart Kernel -> Run All. Confirm where `all_data` is built.
# NOTE(review): if `target_lag_1` is the target shifted back by one month, its values on
# the month-32 rows are month-31 sales. Verify which month X_val covers and that this is
# really the "previous month" for it -- TODO confirm.
previous_month = all_data.loc[all_data['date_block_num']==32, 
    ['shop_id', 'item_id', 'target_lag_1']].copy()

In [57]:
# (rows, columns) of the month-32 slice.
previous_month.shape

(218655, 3)

In [79]:
# For every row of X_val, look up its (shop_id, item_id) pair in previous_month.
# The left join keeps all X_val rows; pairs missing from previous_month get 0.
# NOTE(review): X_val is not defined in any cell visible above -- confirm its origin.
pair_keys = ['shop_id', 'item_id']
pred_pm = (
    X_val[pair_keys]
    .merge(previous_month, how='left', on=pair_keys)
    .fillna(0)
)

In [85]:
# (rows, columns) of the joined benchmark frame.
pred_pm.shape

(238172, 3)

In [86]:
# Inspect the last rows of the joined benchmark frame.
pred_pm.tail()

Unnamed: 0,shop_id,item_id,target_lag_1
238167,21,7635,0.0
238168,21,7638,0.0
238169,21,7640,0.0
238170,21,7632,0.0
238171,21,7440,0.0


In [89]:
# Reduce the benchmark frame to a plain array of predictions for scoring.
# NOTE(review): this rebinds `pred_pm` from a DataFrame to an array, so running this
# cell a second time without re-running the merge cell will fail on the string index.
pred_pm = pred_pm['target_lag_1'].values

In [91]:
# Score the previous-month benchmark against y_val.
# NOTE(review): the printed labels say "Test", but the scores are computed on y_val.
r2_pm = r2_score(y_val, pred_pm)
rmse_pm = np.sqrt(mean_squared_error(y_val, pred_pm))

print('Test R-squared for previous month is %f' % r2_pm)
print('Test RMSE for previous month is %f' % rmse_pm)

Test R-squared for previous month is 0.154960
Test RMSE for previous month is 4.911370
