In [2]:
import pandas as pd
import pandas_profiling
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from pylab import rcParams
import statsmodels.api as sm
import itertools
from fbprophet import Prophet

In [4]:
# Import different files
item_categories = pd.read_csv('item_categories.csv')
items = pd.read_csv('items.csv')
sales_train = pd.read_csv('sales_train_v2.csv')
sample_submission = pd.read_csv('sample_submission.csv')
shops = pd.read_csv('shops.csv')
test = pd.read_csv('test.csv')




In [5]:
# Conversion to date type
sales_train2 = sales_train.copy()
sales_train2 ['date'] = pd.to_datetime(sales_train['date'])

In [10]:
# Resampling per month
sales_train22 = sales_train2.copy()
sales_train22.set_index('date',inplace=True)
sales_train3 = sales_train22['item_cnt_day'].resample('MS').sum()

In [12]:
sales_train3

date
2013-01-01    116950.0
2013-02-01    133607.0
2013-03-01    140586.0
2013-04-01    112185.0
2013-05-01    116808.0
2013-06-01    127421.0
2013-07-01    128686.0
2013-08-01    131676.0
2013-09-01    133110.0
2013-10-01    119723.0
2013-11-01    117422.0
2013-12-01    184559.0
2014-01-01    101419.0
2014-02-01    110954.0
2014-03-01    111778.0
2014-04-01    103499.0
2014-05-01    108501.0
2014-06-01    100143.0
2014-07-01     96758.0
2014-08-01    110037.0
2014-09-01    101819.0
2014-10-01     89813.0
2014-11-01    116223.0
2014-12-01    169945.0
2015-01-01     78235.0
2015-02-01     80989.0
2015-03-01     76119.0
2015-04-01     78601.0
2015-05-01     73815.0
2015-06-01     63703.0
2015-07-01     64967.0
2015-08-01     69375.0
2015-09-01     65933.0
2015-10-01     66373.0
2015-11-01     24990.0
2015-12-01     21484.0
Freq: MS, Name: item_cnt_day, dtype: float64

In [11]:
# Getting some info on Periodicity and residual elements
rcParams['figure.figsize'] = 18, 8
decomposition = sm.tsa.seasonal_decompose(sales_train3, model='additive')
fig = decomposition.plot()
plt.show()

TypeError: float() argument must be a string or a number, not 'Period'

In [13]:
# Defining some Sarimax coefficients
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))


Examples of parameter combinations for Seasonal ARIMA...
SARIMAX: (0, 0, 1) x (0, 0, 1, 12)
SARIMAX: (0, 0, 1) x (0, 1, 0, 12)
SARIMAX: (0, 1, 0) x (0, 1, 1, 12)
SARIMAX: (0, 1, 0) x (1, 0, 0, 12)


In [None]:
sales

In [14]:
# Identifying the good ARIMA coefficient
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(sales_train3,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except:
            continue


Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.



ARIMA(0, 0, 0)x(0, 0, 0, 12)12 - AIC:911.6243509249304
ARIMA(0, 0, 0)x(0, 1, 0, 12)12 - AIC:562.7414879549407
ARIMA(0, 0, 0)x(1, 0, 0, 12)12 - AIC:563.1323756155819
ARIMA(0, 0, 0)x(1, 1, 0, 12)12 - AIC:295.82930770622437
ARIMA(0, 0, 1)x(0, 0, 0, 12)12 - AIC:864.9879330705736
ARIMA(0, 0, 1)x(0, 1, 0, 12)12 - AIC:528.6055449721812
ARIMA(0, 0, 1)x(1, 0, 0, 12)12 - AIC:605.8498858975081
ARIMA(0, 0, 1)x(1, 1, 0, 12)12 - AIC:290.07056459188686
ARIMA(0, 1, 0)x(0, 0, 0, 12)12 - AIC:795.4993039032414
ARIMA(0, 1, 0)x(0, 1, 0, 12)12 - AIC:504.5252546058446
ARIMA(0, 1, 0)x(1, 0, 0, 12)12 - AIC:521.8221762539556
ARIMA(0, 1, 0)x(1, 1, 0, 12)12 - AIC:259.6281447327351
ARIMA(0, 1, 1)x(0, 0, 0, 12)12 - AIC:764.4665365822208
ARIMA(0, 1, 1)x(0, 1, 0, 12)12 - AIC:482.1504014396044
ARIMA(0, 1, 1)x(1, 0, 0, 12)12 - AIC:526.8527776645592
ARIMA(0, 1, 1)x(1, 1, 0, 12)12 - AIC:258.69618696717225
ARIMA(1, 0, 0)x(0, 0, 0, 12)12 - AIC:818.8293420172054
ARIMA(1, 0, 0)x(0, 1, 0, 12)12 - AIC:525.1470582388156
ARIMA(1

In [15]:
# Finding coefficient
mod = sm.tsa.statespace.SARIMAX(sales_train3,
                                order=(1, 1, 0),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])

                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6392      0.336      1.900      0.057      -0.020       1.299
ar.S.L12      -0.4093      0.287     -1.425      0.154      -0.972       0.154
sigma2      4.384e+08   7.55e-10   5.81e+17      0.000    4.38e+08    4.38e+08


In [145]:
# Working on plotting (not working)
# results.plot_diagnostics(figsize=(16, 8))
# plt.show()

ValueError: operands could not be broadcast together with shapes (9,) (8,) (9,) 

In [128]:
sales_train4 = sales_train3.copy()
sales_train4['Date']=sales_train3.index

In [16]:
sales_train4 = sales_train3.to_frame()

In [17]:
sales_train4['Date']=sales_train3.index

In [None]:
sales_train4.head()
sales = sales_train4.rename(columns = {'Date':'ds', 'item_cnt_day':'y'})
# sales

In [24]:
sales.dtypes

y            float64
ds    datetime64[ns]
dtype: object

In [28]:
# Use the Prophet package from Facebook
sales_model = Prophet(interval_width=0.95)
sales_model.fit(sales)

sales_forecast = sales_model.make_future_dataframe(periods=6, freq='MS')
sales_forecast = sales_model.predict(sales_forecast)
plt.figure(figsize=(18, 6))
sales_model.plot(sales_forecast, xlabel = 'Date', ylabel = 'Sales')
plt.title('Sales')
plt.show()

INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



In [144]:
plt.show()

In [64]:
sales_train21 = sales_train2.copy()
sales_train21['date']=sales_train2['date'].dt.to_period('M')

In [66]:
sales_train3 = sales_train21.set_index('date')

In [70]:
sales_train3.index

PeriodIndex(['2013-02', '2013-03', '2013-05', '2013-06', '2013-01', '2013-10',
             '2013-02', '2013-04', '2013-11', '2013-03',
             ...
             '2015-10', '2015-10', '2015-11', '2015-10', '2015-09', '2015-10',
             '2015-09', '2015-10', '2015-10', '2015-03'],
            dtype='period[M]', name='date', length=2935849, freq='M')

In [75]:
sales_train4=sales_train3['item_cnt_day'].groupby([(sales_train3.index)]).sum()

In [76]:
sales_train4

date
2013-01    116950.0
2013-02    133607.0
2013-03    140586.0
2013-04    112185.0
2013-05    116808.0
2013-06    127421.0
2013-07    128686.0
2013-08    131676.0
2013-09    133110.0
2013-10    119723.0
2013-11    117422.0
2013-12    184559.0
2014-01    101419.0
2014-02    110954.0
2014-03    111778.0
2014-04    103499.0
2014-05    108501.0
2014-06    100143.0
2014-07     96758.0
2014-08    110037.0
2014-09    101819.0
2014-10     89813.0
2014-11    116223.0
2014-12    169945.0
2015-01     78235.0
2015-02     80989.0
2015-03     76119.0
2015-04     78601.0
2015-05     73815.0
2015-06     63703.0
2015-07     64967.0
2015-08     69375.0
2015-09     65933.0
2015-10     66373.0
2015-11     24990.0
2015-12     21484.0
Freq: M, Name: item_cnt_day, dtype: float64

In [77]:
type(sales_train4)

pandas.core.series.Series

In [88]:
sales_train22

Unnamed: 0_level_0,date_block_num,shop_id,item_id,item_price,item_cnt_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-02-01,0,59,22154,999.00,1.0
2013-03-01,0,25,2552,899.00,1.0
2013-05-01,0,25,2552,899.00,-1.0
2013-06-01,0,25,2554,1709.05,1.0
2013-01-15,0,25,2555,1099.00,1.0
2013-10-01,0,25,2564,349.00,1.0
2013-02-01,0,25,2565,549.00,1.0
2013-04-01,0,25,2572,239.00,1.0
2013-11-01,0,25,2572,299.00,1.0
2013-03-01,0,25,2573,299.00,3.0


In [90]:
rcParams['figure.figsize'] = 18, 8
decomposition = sm.tsa.seasonal_decompose(sales_train3, model='additive')
fig = decomposition.plot()
plt.show()

In [50]:
sales_train4.plot()
plt.show()

In [34]:
monthly_sales

date_block_num
0     131479.0
1     128090.0
2     147142.0
3     107190.0
4     106970.0
5     125381.0
6     116966.0
7     125291.0
8     133332.0
9     127541.0
10    130009.0
11    183342.0
12    116899.0
13    109687.0
14    115297.0
15     96556.0
16     97790.0
17     97429.0
18     91280.0
19    102721.0
20     99208.0
21    107422.0
22    117845.0
23    168755.0
24    110971.0
25     84198.0
26     82014.0
27     77827.0
28     72295.0
29     64114.0
30     63187.0
31     66079.0
32     72843.0
33     71056.0
Name: item_cnt_day, dtype: float64

In [33]:
monthly_sales.plot()
plt.show()

In [None]:
sales_train.groupby(['date', 'item_id']).sum()

In [32]:
sales_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
5,10.01.2013,0,25,2564,349.00,1.0
6,02.01.2013,0,25,2565,549.00,1.0
7,04.01.2013,0,25,2572,239.00,1.0
8,11.01.2013,0,25,2572,299.00,1.0
9,03.01.2013,0,25,2573,299.00,3.0


In [30]:
sample_submission

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
5,5,0.5
6,6,0.5
7,7,0.5
8,8,0.5
9,9,0.5


In [58]:
sales_train2.dtypes

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object

In [10]:
type(sales_train['date'][:5])

pandas.core.series.Series

In [60]:
sales_train2 = sales_train.copy()
sales_train2 ['date'] = pd.to_datetime(sales_train['date'])

In [65]:
sales_train2.set_index('date',inplace=True)

In [67]:
sales_train2.item_cnt_day.resample('M')

DatetimeIndexResampler [freq=<MonthEnd>, axis=0, closed=right, label=right, convention=start, base=0]

In [34]:
sales_train.groupby(['shop_id', 'item_id']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_block_num,item_price,item_cnt_day
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,30,9,2385.00,31.0
0,31,7,3038.00,11.0
0,32,7,2431.00,16.0
0,33,3,2082.00,6.0
0,35,11,2964.00,15.0
0,36,1,357.00,1.0
0,40,1,127.00,1.0
0,42,1,127.00,1.0
0,43,0,221.00,1.0
0,49,2,254.00,2.0


In [28]:
pandas_profiling.ProfileReport(sales_train)

0,1
Number of variables,6
Number of observations,2935849
Total Missing (%),0.0%
Total size in memory,134.4 MiB
Average record size in memory,48.0 B

0,1
Numeric,5
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1034
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
28.12.2013,9434
29.12.2013,9335
30.12.2014,9324
Other values (1031),2907756

Value,Count,Frequency (%),Unnamed: 3
28.12.2013,9434,0.3%,
29.12.2013,9335,0.3%,
30.12.2014,9324,0.3%,
30.12.2013,9138,0.3%,
31.12.2014,8347,0.3%,
27.12.2014,8041,0.3%,
31.12.2013,7765,0.3%,
23.02.2013,7577,0.3%,
28.12.2014,7370,0.3%,
21.12.2013,6773,0.2%,

0,1
Distinct count,34
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,14.57
Minimum,0
Maximum,33
Zeros (%),3.9%

0,1
Minimum,0
5-th percentile,1
Q1,7
Median,14
Q3,23
95-th percentile,31
Maximum,33
Range,33
Interquartile range,16

0,1
Standard deviation,9.423
Coef of variation,0.64674
Kurtosis,-1.0829
Mean,14.57
MAD,8.119
Skewness,0.20386
Sum,42775060
Variance,88.793
Memory size,22.4 MiB

Value,Count,Frequency (%),Unnamed: 3
11,143246,4.9%,
23,130786,4.5%,
2,121347,4.1%,
0,115690,3.9%,
1,108613,3.7%,
7,104772,3.6%,
6,100548,3.4%,
5,100403,3.4%,
12,99349,3.4%,
10,96736,3.3%,

Value,Count,Frequency (%),Unnamed: 3
0,115690,3.9%,
1,108613,3.7%,
2,121347,4.1%,
3,94109,3.2%,
4,91759,3.1%,

Value,Count,Frequency (%),Unnamed: 3
29,54617,1.9%,
30,55549,1.9%,
31,57029,1.9%,
32,50588,1.7%,
33,53514,1.8%,

0,1
Distinct count,198
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.2426
Minimum,-22
Maximum,2169
Zeros (%),0.0%

0,1
Minimum,-22
5-th percentile,1
Q1,1
Median,1
Q3,1
95-th percentile,2
Maximum,2169
Range,2191
Interquartile range,0

0,1
Standard deviation,2.6188
Coef of variation,2.1075
Kurtosis,177480
Mean,1.2426
MAD,0.44599
Skewness,272.83
Sum,3648200
Variance,6.8583
Memory size,22.4 MiB

Value,Count,Frequency (%),Unnamed: 3
1.0,2629372,89.6%,
2.0,194201,6.6%,
3.0,47350,1.6%,
4.0,19685,0.7%,
5.0,10474,0.4%,
-1.0,7252,0.2%,
6.0,6338,0.2%,
7.0,4057,0.1%,
8.0,2903,0.1%,
9.0,2177,0.1%,

Value,Count,Frequency (%),Unnamed: 3
-22.0,1,0.0%,
-16.0,1,0.0%,
-9.0,1,0.0%,
-6.0,2,0.0%,
-5.0,4,0.0%,

Value,Count,Frequency (%),Unnamed: 3
624.0,1,0.0%,
637.0,1,0.0%,
669.0,1,0.0%,
1000.0,1,0.0%,
2169.0,1,0.0%,

0,1
Distinct count,21807
Unique (%),0.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,10197
Minimum,0
Maximum,22169
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,1540
Q1,4476
Median,9343
Q3,15684
95-th percentile,20949
Maximum,22169
Range,22169
Interquartile range,11208

0,1
Standard deviation,6324.3
Coef of variation,0.6202
Kurtosis,-1.2252
Mean,10197
MAD,5579.7
Skewness,0.25717
Sum,29937518858
Variance,39997000
Memory size,22.4 MiB

Value,Count,Frequency (%),Unnamed: 3
20949,31340,1.1%,
5822,9408,0.3%,
17717,9067,0.3%,
2808,7479,0.3%,
4181,6853,0.2%,
7856,6602,0.2%,
3732,6475,0.2%,
2308,6320,0.2%,
4870,5811,0.2%,
3734,5805,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,6,0.0%,
2,2,0.0%,
3,2,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
22165,2,0.0%,
22166,270,0.0%,
22167,1114,0.0%,
22168,6,0.0%,
22169,1,0.0%,

0,1
Distinct count,19993
Unique (%),0.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,890.85
Minimum,-1
Maximum,307980
Zeros (%),0.0%

0,1
Minimum,-1
5-th percentile,99
Q1,249
Median,399
Q3,999
95-th percentile,2690
Maximum,307980
Range,307980
Interquartile range,750

0,1
Standard deviation,1729.8
Coef of variation,1.9417
Kurtosis,445.53
Mean,890.85
MAD,769.95
Skewness,10.75
Sum,2615400000
Variance,2992200
Memory size,22.4 MiB

Value,Count,Frequency (%),Unnamed: 3
299.0,291352,9.9%,
399.0,242603,8.3%,
149.0,218432,7.4%,
199.0,184044,6.3%,
349.0,101461,3.5%,
599.0,95673,3.3%,
999.0,82784,2.8%,
799.0,77882,2.7%,
249.0,77685,2.6%,
699.0,76493,2.6%,

Value,Count,Frequency (%),Unnamed: 3
-1.0,1,0.0%,
0.07,2,0.0%,
0.0875,1,0.0%,
0.09,1,0.0%,
0.1,2932,0.1%,

Value,Count,Frequency (%),Unnamed: 3
42990.0,4,0.0%,
49782.0,1,0.0%,
50999.0,1,0.0%,
59200.0,1,0.0%,
307980.0,1,0.0%,

0,1
Distinct count,60
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,33.002
Minimum,0
Maximum,59
Zeros (%),0.3%

0,1
Minimum,0
5-th percentile,6
Q1,22
Median,31
Q3,47
95-th percentile,57
Maximum,59
Range,59
Interquartile range,25

0,1
Standard deviation,16.227
Coef of variation,0.4917
Kurtosis,-1.0254
Mean,33.002
MAD,13.83
Skewness,-0.072361
Sum,96888091
Variance,263.31
Memory size,22.4 MiB

Value,Count,Frequency (%),Unnamed: 3
31,235636,8.0%,
25,186104,6.3%,
54,143480,4.9%,
28,142234,4.8%,
57,117428,4.0%,
42,109253,3.7%,
27,105366,3.6%,
6,82663,2.8%,
58,71441,2.4%,
56,69573,2.4%,

Value,Count,Frequency (%),Unnamed: 3
0,9857,0.3%,
1,5678,0.2%,
2,25991,0.9%,
3,25532,0.9%,
4,38242,1.3%,

Value,Count,Frequency (%),Unnamed: 3
55,34769,1.2%,
56,69573,2.4%,
57,117428,4.0%,
58,71441,2.4%,
59,42108,1.4%,

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [29]:
sales_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
5,10.01.2013,0,25,2564,349.00,1.0
6,02.01.2013,0,25,2565,549.00,1.0
7,04.01.2013,0,25,2572,239.00,1.0
8,11.01.2013,0,25,2572,299.00,1.0
9,03.01.2013,0,25,2573,299.00,3.0
