In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import Series
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from statsmodels.tsa.stattools import adfuller 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition files are read from one directory; keep that path in a
# single place so it only has to be changed once.
DATA_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

sales_train = pd.read_csv(os.path.join(DATA_DIR, "sales_train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
items = pd.read_csv(os.path.join(DATA_DIR, "items.csv"))
item_cat = pd.read_csv(os.path.join(DATA_DIR, "item_categories.csv"))
shops = pd.read_csv(os.path.join(DATA_DIR, "shops.csv"))

In [None]:
# Display the raw sales table read from sales_train.csv.
sales_train

**trainデータを月単位に集計**

In [None]:
# Collapse the sales records to one row per (date_block_num, shop_id, item_id):
# first/last date, mean price and total units sold.
# The columns are selected with a *list* ([[...]]): indexing a GroupBy with a
# bare tuple of names was deprecated and is rejected by pandas >= 2.0.
# NOTE(review): `date` is read by read_csv without date parsing, so min/max here
# compare strings lexicographically — confirm whether it should be converted
# with pd.to_datetime first.
monthly_sales_train = (
    sales_train
    .groupby(["date_block_num", "shop_id", "item_id"])[["date", "item_price", "item_cnt_day"]]
    .agg({"date": ["min", "max"], "item_price": ["mean"], "item_cnt_day": ["sum"]})
)

In [None]:
# Show the first 20 rows of the aggregated table.
monthly_sales_train.head(20)

**itemデータの確認**

In [None]:
# Display the items table read from items.csv.
items

In [None]:
# Count rows per item category, order by the item_id count (largest first),
# and keep the ten biggest categories as a flat table.
x = (
    items
    .groupby(["item_category_id"])
    .count()
    .sort_values(by="item_id", ascending=False)
    .iloc[0:10]
    .reset_index()
)
x

In [None]:
# Bar chart of the item_id counts held in `x`, one bar per category id.
plt.figure(figsize=(10, 5))
sns.barplot(data=x, x="item_category_id", y="item_id")
plt.title("Items per Category")
plt.xlabel("item_category_id", fontsize=12)
# An empty string replaces the automatic y-axis label.
plt.ylabel("", fontsize=12)

**月ごとの売り上げ総数**

In [None]:
# Sum item_cnt_day within each date_block_num and cast the totals to float.
ts = (
    sales_train
    .groupby(["date_block_num"])["item_cnt_day"]
    .sum()
    .astype("float")
)

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Total Sales of the company")
ax.set_xlabel("Time")
ax.set_ylabel("sales")
ax.plot(ts)

In [None]:
# Build the 12-observation rolling window once and draw both statistics from it.
rolling_12 = ts.rolling(window=12, center=False)

plt.figure(figsize=(10, 5))
for label, stat in (("Rolling mean", rolling_12.mean()), ("Rolling std", rolling_12.std())):
    plt.plot(stat, label=label)
plt.legend()

In [None]:
# Rolling mean of `ts` over a trailing window of 12 observations (center=False).
ts.rolling(window=12,center=False).mean()

In [None]:
# Decompose the series with a seasonal period of 12 observations.
decomposition = sm.tsa.seasonal_decompose(ts.values, period=12)

# DecomposeResult.plot() creates its own figure, so a preceding plt.figure()
# would only produce a stray empty figure. Resize the figure that plot()
# returns instead; assigning it also avoids echoing the Figure as cell output.
fig = decomposition.plot()
fig.set_size_inches(10, 5)

**時系列データの定常性の確認(ADF)**

In [None]:
def test_stationarity(timeseries):
    dftest=adfuller(timeseries,autolag="AIC")
    dfoutput=pd.Series(dftest[0:4],index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput["Critical Value{}".format(key)]=value
    print(dfoutput)

In [None]:
# Run the stationarity (ADF) check on the monthly totals series `ts`.
test_stationarity(ts)

**差分による定常化(トレンド・季節性の除去)**

In [None]:
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a new Series.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``,
    taken by *position*. Positional access means a pandas Series gives the
    same answer whatever its index labels are; plain ``dataset[i]`` on a
    Series is a label lookup and only works when the index is 0..n-1.

    Parameters
    ----------
    dataset : sequence, numpy array or pandas Series
        One-dimensional numeric data.
    interval : int, default 1
        Lag between the two observations being subtracted.

    Returns
    -------
    pandas.Series
        ``max(len(dataset) - interval, 0)`` values with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``interval`` is negative.
    """
    if interval < 0:
        raise ValueError("interval must be non-negative")
    values = np.asarray(dataset)
    # Clamp at zero so an interval longer than the data yields an empty result
    # instead of a negative slice bound.
    n_out = max(len(values) - interval, 0)
    return Series(values[interval:interval + n_out] - values[:n_out])

In [None]:
# Plot the undifferenced series on the current axes.
ax = plt.gca()
ax.set(title="original", xlabel="time", ylabel="sales")
ax.plot(ts)

In [None]:
# Lag-1 differencing of the series.
new_ts = difference(ts, 1)

ax = plt.gca()
ax.set(title="after de-trend", xlabel="time", ylabel="sales")
ax.plot(new_ts)

In [None]:
# Stationarity (ADF) check on the lag-1 differenced series.
test_stationarity(new_ts)

In [None]:
# Lag-12 differencing of the series; note this rebinds `new_ts`.
new_ts = difference(ts, 12)

ax = plt.gca()
ax.set(title='After De-seasonalization', xlabel='Time', ylabel='Sales')
ax.plot(new_ts)

In [None]:
# Stationarity (ADF) check on the lag-12 differenced series.
test_stationarity(new_ts)

**AR,MA,ARMAモデル**

In [None]:
def tsplot(y, lags=None, figsize=(10, 8), style="bmh", title=None):
    """Draw a series with its ACF and PACF plots in one figure.

    The figure is a 2x2 grid: the series spans the top row, the
    autocorrelation plot sits bottom-left and the partial-autocorrelation
    plot bottom-right. The grid has exactly the two rows that are used, so
    no part of the figure is left blank.

    Parameters
    ----------
    y : array-like
        Data to plot; it is wrapped in a ``pd.Series``.
    lags : int or None, default None
        Passed as ``lags`` to ``plot_acf`` and ``plot_pacf``.
    figsize : tuple, default (10, 8)
        Figure size handed to ``plt.figure``.
    style : str, default "bmh"
        Matplotlib style applied only while this figure is drawn.
    title : str or None, default None
        Title of the top (series) panel; omitted when ``None``.

    Returns
    -------
    None
        The figure is drawn through pyplot.
    """
    y = pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))

        y.plot(ax=ts_ax)
        if title is not None:
            ts_ax.set_title(title)

        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)

        plt.tight_layout()

AR(1)

In [None]:
# Simulate x[t] = a * x[t-1] + w[t] with Gaussian noise w.
np.random.seed(1)
n_samples = 1000
a = 0.6

# Keep the noise and the simulated series in separate arrays: binding both
# names to one array (x = w = ...) means writing x also overwrites w.
w = np.random.normal(size=n_samples)
x = np.zeros(n_samples)

# Start the recursion at t=1. At t=0, x[t-1] would be x[-1], i.e. the *last*
# element of the array, not a previous observation.
x[0] = w[0]
for t in range(1, n_samples):
    x[t] = a * x[t - 1] + w[t]

tsplot(x, 12, (10, 8), "bmh", "AR(1)process")

AR(2)

In [None]:
# Generate 1000 samples from an ARMA process with two AR coefficients and a
# single zero MA coefficient, then plot the result.
n = 1000
alphas = np.array([0.444, 0.333])
betes = np.array([0.0])

# Both coefficient vectors get a leading 1; the AR coefficients are negated.
ar = np.concatenate(([1], -alphas))
ma = np.concatenate(([1], betes))

ar2 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
tsplot(ar2, 12, (10, 8), "bmh", "AR(2)process")