In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import date
import calendar
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'

plt.rcParams['axes.unicode_minus'] = False

matplotlib.rc('font', family='NanumGothic')

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw wine sales records; the `date` column is parsed to datetime on read.
csv_path = '~/Downloads/1조_Wine/wine_data_sales.csv'
df = pd.read_csv(csv_path, parse_dates=['date'], engine='python')
df.head()

In [None]:
# Drop the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df.columns

In [None]:
from collections import Counter

# Columns whose distinct values are tallied below.
scale_var_char = ['gender', 'age', 'job', 'marital', 'province', 'city', 'amount']

# Print one Counter per column so placeholder values stand out.
for column_name in scale_var_char:
    value_tally = Counter(df[column_name])
    print(column_name, '변수', value_tally)

In [None]:
# Mark the placeholder gender value as missing in one vectorised assignment
# (replaces a per-row .loc loop; np.nan is used because the np.NaN alias was
# removed in NumPy 2.0).
# NOTE(review): the sentinel is spelled 'UNKNOW' here - confirm it matches the raw data.
df.loc[df['gender'] == 'UNKNOW', 'gender'] = np.nan

In [None]:
# Mark the placeholder age value as missing in one vectorised assignment
# (replaces a per-row .loc loop; np.nan is used because the np.NaN alias was
# removed in NumPy 2.0).
df.loc[df['age'] == '미상', 'age'] = np.nan

In [None]:
# Mark the placeholder job value as missing in one vectorised assignment
# (replaces a per-row .loc loop; np.nan is used because the np.NaN alias was
# removed in NumPy 2.0).
df.loc[df['job'] == '미상', 'job'] = np.nan

In [None]:
# Mark the placeholder marital value as missing in one vectorised assignment
# (replaces a per-row .loc loop; np.nan is used because the np.NaN alias was
# removed in NumPy 2.0).
df.loc[df['marital'] == 'UNKNOWN', 'marital'] = np.nan

In [None]:
df.isnull().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
df_drop = df.dropna(axis=0, how='any')

In [None]:
df_drop.isnull().sum()

In [None]:
df_drop.head()

In [None]:
df_drop.columns

In [None]:
df_drop.info()

In [None]:
# df_drop.set_index('date', inplace = False)

In [None]:
# df_drop = df_drop.drop(['gender', 'age', 'job', 'marital', 'province', 'city',
#        'purchase'], axis = 1, inplace = False)

# 일별 총매출액 

In [None]:
# Daily totals of the numeric columns.
# numeric_only=True pins the behaviour across pandas versions: a bare .sum()
# silently dropped string columns on older releases but concatenates them on
# pandas >= 2.0, which would leave non-numeric columns in the frame that is
# later passed to the ARIMA model.
trends_amount_sum = df_drop.groupby('date').sum(numeric_only=True)
trends_amount_sum.head()

In [None]:
# trends_amount_sum.plot(figsize=(15,8))

In [None]:
# plot_acf(trends_amount_sum['amount'])
# plot_pacf(trends_amount_sum['amount'])
# plt.show()

In [None]:
# df_train = df_count[mask]
# df_test = df_count[~mask]
# print("shape of train data: {}".format(df_train.shape))
# print("shape of test data: {}".format(df_test.shape))

In [None]:
# Rows dated up to and including 2016-09-30 form the training set; later rows are held out.
in_train = trends_amount_sum.index <= "2016-09-30"
df_train = trends_amount_sum[in_train]
df_test = trends_amount_sum[~in_train]
for split_name, split_frame in (("train", df_train), ("test", df_test)):
    print(f"shape of {split_name} data: {split_frame.shape}")

# 시계열분석 차분 

> 시계열 분석에서는 차분을 통해 데이터를 정상성(stationary)을 갖는 형태로 변환한다. 평균이 일정하지 않은 자료에는 차분(현시점 자료에서 전 시점 자료를 빼는 것, 즉 변화량)을 적용한다.

In [None]:
# First-order difference: each value minus the value one row earlier.
df_diff1 = df_train.diff(periods=1)
df_diff1.plot(figsize=(12, 8))

In [None]:
# Second-order difference written out as y[t] - 2*y[t-1] + y[t-2].
lag_one = df_train.shift(1)
lag_two = df_train.shift(2)
df_diff2 = df_train - 2 * lag_one + lag_two
df_diff2.plot(figsize=(12, 8))

> 1차 차분과 2차 차분을 비교해 본 결과, 2차 차분이 조금 더 정상성을 보여 차분 차수를 2로 채택했다.

In [None]:
# ACF (top panel) and PACF (bottom panel) of the training series.

lag_size = 60  # maximum lag shown on both plots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
fig = plot_acf(df_train, lags=lag_size, ax=ax1)
fig = plot_pacf(df_train, lags=lag_size, ax=ax2)

> ACF를 보면 20의 Time lag을 기준으로 자기상관이 양에서 음으로 변동한다. 
또한 PACF는 1의 Time lag에서 약 0.7을 보이고 이후에 급격히 감소한다.  
p=0, q=1이 적당하다고 추측할 수 있다.


In [None]:
# pacf = sm.tsa.pacf(df_drop['date'])
# pacf

# 데이터에는 ARIMA(0,2,1)을 사용하기로 함

In [None]:
# ARIMA model on the training data with AR order 0, differencing order 2, MA order 1.
# NOTE(review): ARIMA here is imported from statsmodels.tsa.arima_model, the legacy
# implementation; newer statsmodels releases reportedly no longer provide it and the
# fit keywords below belong to that legacy API - confirm the installed version.
ts_model = ARIMA(df_train, order = (0,2,1))
# Fit the model to the data.
# Per the original author's note: trend="nc" leaves the constant out, full_output shows
# all output, and disp prints convergence information.
ts_result = ts_model.fit(trend = "nc", full_output = True, disp = 1)
print(ts_result.summary())

> MA.1의 계수는 통계적으로 유의하지만 모형의 상수항(constant)은 유의하지 않았다. 그래서 상수항을 제외하고(trend="nc") 모형을 적합했다.

In [None]:
# Forecast window: from the earliest to the latest date present in the test split.
start_time = df_test.index.min() 
end_time = df_test.index.max() 
fig, ax = plt.subplots(figsize = (12,8))
# Data observed before the forecast window.
ax = df_train.plot(ax = ax)
# plot_predict draws the forecast on the same axes. Per the original author's note:
# start/end bound the forecast, and plot_insample controls whether held data that
# overlaps the forecast window is drawn.
fig = ts_result.plot_predict(start = start_time, end = end_time, ax = ax, plot_insample = False)
plt.show()

In [None]:
# Overlay train, test and the model forecast on a single set of axes.
start_time, end_time = df_test.index.min(), df_test.index.max()
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(df_train, linestyle="-", label="train")
ax.plot(df_test, linestyle="--", label="test")
fig = ts_result.plot_predict(
    start=start_time,
    end=end_time,
    ax=ax,
    plot_insample=False,
)
plt.show()

In [None]:
# Predict over the test window (typ='levels') and draw train, test and prediction together.
start_time, end_time = df_test.index.min(), df_test.index.max()
y_pred = ts_result.predict(start=start_time, end=end_time, typ='levels')
plt.subplots(figsize=(12, 8))
curves = (
    (df_train, "-", "train"),
    (df_test, "--", "test"),
    (y_pred, "--", "pred"),
)
for curve, dash_style, curve_label in curves:
    plt.plot(curve, linestyle=dash_style, label=curve_label)
plt.legend()

In [None]:
# Second split: rows dated up to and including 2016-10-31 are used for the final fit.
in_train_f = trends_amount_sum.index <= "2016-10-31"
df_train_f = trends_amount_sum[in_train_f]
df_test_f = trends_amount_sum[~in_train_f]
for split_name, split_frame in (("train", df_train_f), ("test", df_test_f)):
    print(f"shape of {split_name} data: {split_frame.shape}")

In [None]:
# ARIMA model on the extended training data with AR order 0, differencing order 2, MA order 1.
# NOTE(review): ARIMA here is imported from statsmodels.tsa.arima_model, the legacy
# implementation; newer statsmodels releases reportedly no longer provide it and the
# fit keywords below belong to that legacy API - confirm the installed version.
ts_model_f = ARIMA(df_train_f, order = (0,2,1))
# Fit the model to the data.
# Per the original author's note: trend="nc" leaves the constant out, full_output shows
# all output, and disp prints convergence information.
ts_result_f = ts_model_f.fit(trend = "nc", full_output = True, disp = 1)
print(ts_result_f.summary())

In [None]:
# Forecast demand for November and December.
start_time_f = '2016-11-01'
end_time_f = '2016-12-31' 
fig, ax = plt.subplots(figsize = (12,8))
# Data observed before the forecast window.
ax = df_train_f.plot(ax = ax)
# plot_predict draws the forecast on the same axes. Per the original author's note:
# start/end bound the forecast, and plot_insample controls whether held data that
# overlaps the forecast window is drawn.
fig = ts_result_f.plot_predict(start = start_time_f, end = end_time_f, ax = ax, plot_insample = False)

In [None]:
# Predict 2016-11-01 through 2016-12-31 (typ='levels') and plot it after the training series.
start_time_f, end_time_f = '2016-11-01', '2016-12-31'
y_pred_f = ts_result_f.predict(start=start_time_f, end=end_time_f, typ='levels')
plt.subplots(figsize=(12, 8))
for curve, dash_style, curve_label in ((df_train_f, "-", "train"), (y_pred_f, "--", "pred")):
    plt.plot(curve, linestyle=dash_style, label=curve_label)
plt.legend()

# 주별 월별 분기별 해보기!!

In [None]:
# df=pd.read_csv('~/Downloads/1조_Wine/wine_data_sales.csv', engine='python', parse_dates=['date'], index_col = 'date')
# df.head()

In [None]:
# df.columns

In [None]:
# df = df.drop(['Unnamed: 0', 'gender', 'age', 'job', 'marital', 'province', 'city',
#        'purchase'], axis = 1, inplace = False)

In [None]:
# Daily totals used by the weekly / weekday / quarterly / decomposition cells below.
# This definition had been commented out while later cells still read `data`,
# so a fresh "Restart & Run All" failed with NameError.
# numeric_only=True keeps string columns out of the sums regardless of pandas version.
data = df_drop.groupby('date').sum(numeric_only=True)
# resample() and the .dayofweek / .quarter accessors used later need a sorted DatetimeIndex.
data.index = pd.to_datetime(data.index)
data = data.sort_index()
data.head()

In [None]:
# data.info()

In [None]:
# data.index = pd.to_datetime(data.index)
# print(data.index)

In [None]:
# data = data.sort_index()
# data.head()

In [None]:
# Mean of the daily values within each 'W' resampling bin.
weekly = data.resample('W').mean()
weekly_ax = weekly.plot(figsize=(20, 8))
weekly_ax.set_ylabel('Weekly amount')

In [None]:
# weekly_train = weekly[weekly.index <= "2016-09-30"]
# weekly_test = weekly[weekly.index > "2016-09-30"]
# weekly_test.head()
# print("shape of train data: {}".format(weekly_train.shape))
# print("shape of test data: {}".format(weekly_test.shape))

In [None]:
# weekly_diff1 = weekly_train - weekly_train.shift(1)
# weekly_diff1.plot(figsize = (12,8))

In [None]:
# weekly_diff2 = weekly_train - 2 * (weekly_train.shift(1)) + (weekly_train.shift(2))
# weekly_diff2.plot(figsize = (12,8))

In [None]:
# lag_size = 21 # 최대 p기간 지정
# fig = plt.figure(figsize = (12,8))
# ax1 = fig.add_subplot(211)
# fig = plot_acf(df_train, lags=lag_size, ax=ax1)
# ax2 = fig.add_subplot(212)
# fig = plot_pacf(df_train, lags=lag_size, ax=ax2)

In [None]:
# # train 데이터 이용, AR(0), I(2, 차분), MA(1)인 ARIMA 모델
# ts_model = ARIMA(weekly_train, order = (0,2,1))
# # 데이터 적합
# # trend: 상수 포함 여부 "nc"이면 상수 미포함, full _output : 모든 출력 결과 표시, disp : 수렴 정보 출력
# ts_result = ts_model.fit(trend = "nc", full_output = True, disp = 1)
# print(ts_result.summary())

In [None]:
# start_time = weekly_test.index.min() 
# end_time = weekly_test.index.max() 
# fig, ax = plt.subplots(figsize = (12,8))
# # 예측 전까지 데이터
# ax = weekly_train.plot(ax = ax)
# # 예측 그래프 생성 함수, start: 예측 시작, end: 예측 종료, plot_insample : 가지고 있는 데이터 중 예측 구간과 겹치는 구간 표시
# fig = ts_result.plot_predict(start = start_time, end = end_time, ax = ax, plot_insample = False)
# plt.show()

In [None]:
# weekly_train_f = weekly[weekly.index <= "2016-10-30"]
# weekly_test_f = weekly[weekly.index > "2016-10-30"]
# df_test_f.head()
# print("shape of train data: {}".format(weekly_train_f.shape))
# print("shape of test data: {}".format(weekly_test_f.shape))

In [None]:
# # train 데이터 이용, AR(0), I(1, 차분), MA(2)인 ARIMA 모델
# ts_model_f = ARIMA(weekly_train_f, order = (0,2,1))
# # 데이터 적합
# # trend: 상수 포함 여부 "nc"이면 상수 미포함, full _output : 모든 출력 결과 표시, disp : 수렴 정보 출력
# ts_result_f = ts_model_f.fit(trend = "c", full_output = True, disp = 1)
# print(ts_result_f.summary())

In [None]:
# # 11월, 12월 수요량 예측
# start_time_w = '2016-11-06'
# end_time_w = '2017-01-01' 
# fig, ax = plt.subplots(figsize = (12,8))
# # 예측 전까지 데이터
# ax = weekly_train_f.plot(ax = ax)

# # 예측 그래프 생성 함수, start: 예측 시작, end: 예측 종료, plot_insample : 가지고 있는 데이터 중 예측 구간과 겹치는 구간 표시
# fig = ts_result_f.plot_predict(start = start_time_w, end = end_time_w, ax = ax, plot_insample = False)

# 요일마다해보기

In [None]:
# Mean of the daily values for each day of the week (0 = Monday ... 6 = Sunday).
weekday_labels = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday = data.groupby(data.index.dayofweek).mean()
# Map each weekday number to its label instead of assigning a fixed 7-item list:
# the fixed list raises a length-mismatch error if a weekday never occurs in the data.
by_weekday.index = [weekday_labels[day_number] for day_number in by_weekday.index]
by_weekday.plot(figsize = (20,8))


# 일마다해보기

In [None]:
# by_day = data.groupby(data.index.dayofyear).mean()
# by_day.plot(figsize = (20,8))

# 월마다해보기

In [None]:
# by_month = data.groupby(data.index.month).mean()
# by_month.plot(figsize = (20,8))

In [None]:
# by_month

# 분기마다 해보기

In [None]:
# Average the daily values within each calendar quarter and plot the result.
quarter_key = data.index.quarter
by_quarter = data.groupby(quarter_key).mean()
by_quarter.plot(figsize=(20, 8))

# 2016년 7월말, 9월초, 매분기말

In [None]:
# Rebind by_quarter to the grouped object itself (no aggregation) and call plot on it.
quarter_key = data.index.quarter
by_quarter = data.groupby(quarter_key)
by_quarter.plot(figsize=(20, 8))

In [None]:
# from statsmodels.tsa.stattools import adfuller
# def test_for_stationary(timeseries):
    
#     #Determing rolling statistics
#     rolmean = data.rolling(window=12).mean()
#     rolstd = data.rolling(window=12).std()

#     #Plot rolling statistics:
#     orig = plt.plot(timeseries, color='blue',label='Original')
#     mean = plt.plot(rolmean, color='red', label='Rolling Mean')
#     std = plt.plot(rolstd, color='black', label = 'Rolling Std')
#     plt.legend(loc='best')
#     plt.title('Rolling Mean & Standard Deviation')
#     plt.show(block=False)
    
# # #     Perform Dickey-Fuller test:
# #     print ('Results of Dickey-Fuller Test:')
# #     dftest = adfuller(timeseries, autolag='AIC')
# #     dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
# #     for key,value in dftest[4].items():
# #         dfoutput['Critical Value (%s)'%key] = value
# #     print (dfoutput)

In [None]:
ts = data

In [None]:
ts_logtransformed = np.log(ts)
plt.plot(ts_logtransformed)

In [None]:
ts_logtransformed.head(10)

In [None]:
# Decompose the log-transformed series and keep its three components.
decomposition = seasonal_decompose(ts_logtransformed)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per series: original, trend, seasonality, residuals.
panels = (
    (411, ts_logtransformed, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
)
for position, component, caption in panels:
    plt.subplot(position)
    plt.plot(component, label=caption)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Decompose the untransformed daily totals with a seasonal length of 12 observations.
# NOTE(review): newer statsmodels releases name this keyword `period` instead of
# `freq` - confirm against the installed version before changing it.
decomposition = seasonal_decompose(data, freq=12)
# decomposition.plot() builds its own figure, so the previous plt.figure() call
# only produced an extra empty figure and has been removed.
fig = decomposition.plot()

In [None]:
# 7-observation trailing rolling mean of the log series, drawn over the series itself.
seven_obs_window = ts_logtransformed.rolling(7, center=False)
Rolling_average = seven_obs_window.mean()
plt.plot(ts_logtransformed, label='Log Transformed')
plt.plot(Rolling_average, label='Rolling Average', color='red')
plt.legend(loc='best')

In [None]:
Rolling_average.head(10)

In [None]:


# Difference between the log-transformed series and Rolling_average.
log_Rolling_difference = ts_logtransformed - Rolling_average
# NOTE(review): a notebook cell only displays its last expression, so the result of
# this head(10) call is discarded; only the tail(10) output below is shown.
log_Rolling_difference.head(10)
log_Rolling_difference.tail(10)



In [None]:


# Remove the rows that contain NaN, then plot what remains.
log_Rolling_difference = log_Rolling_difference.dropna()
plt.plot(log_Rolling_difference)



In [None]:
test_for_stationary(log_Rolling_difference)

In [None]:
ts_logtransformed = np.log(data)

In [None]:
# decomposition = seasonal_decompose(ts_logtransformed)

# trend = decomposition.trend
# seasonal = decomposition.seasonal
# residual = decomposition.resid

# plt.subplot(411)
# plt.plot(ts_logtransformed, label='Original')
# plt.legend(loc='best')
# plt.subplot(412)
# plt.plot(trend, label='Trend')
# plt.legend(loc='best')
# plt.subplot(413)
# plt.plot(seasonal,label='Seasonality')
# plt.legend(loc='best')
# plt.subplot(414)
# plt.plot(residual, label='Residuals')
# plt.legend(loc='best')
# plt.tight_layout()

In [None]:
# trends_amount_sum = df_drop.groupby('date').sum()
# trends_amount_sum.head()

# 일별 총매출액 - 남성

In [None]:
# Daily totals for male customers.
# The mask is now built from df_drop itself; it was previously built from `df`,
# a different frame with more rows, so pandas had to realign the boolean Series
# by index (with a reindexing warning) before filtering.
male_rows = df_drop[df_drop['gender'] == 'MALE']
# numeric_only=True keeps string columns out of the sums regardless of pandas version.
trends_sum_male = male_rows.groupby('date').sum(numeric_only=True)
trends_sum_male.head()

In [None]:
trends_sum_male.plot(figsize=(15,8))

# 일별 총매출액 - 여성

In [None]:
# Daily totals for female customers.
# The mask is now built from df_drop itself; it was previously built from `df`,
# a different frame with more rows, so pandas had to realign the boolean Series
# by index (with a reindexing warning) before filtering.
female_rows = df_drop[df_drop['gender'] == 'FEMALE']
# numeric_only=True keeps string columns out of the sums regardless of pandas version.
trends_sum_female = female_rows.groupby('date').sum(numeric_only=True)
trends_sum_female.head()

In [None]:
trends_sum_female.plot(figsize=(15,8))

# 일별 구매빈도

In [None]:
# NOTE(review): this binds trends_amount_count to df_drop itself (no copy); the name
# is reassigned to a groupby count in the following cell, so this line appears to
# have no lasting effect.
trends_amount_count = df_drop

In [None]:
# Count the non-null entries of every column for each date, then plot and preview.
rows_per_day = df_drop.groupby('date')
trends_amount_count = rows_per_day.count()
trends_amount_count.plot(figsize=(15, 8))
trends_amount_count.head()

In [None]:
# Male customers' rows grouped by date (no aggregation applied yet).
# The mask is now built from df_drop itself; it was previously built from `df`,
# a different frame with more rows, so pandas had to realign the boolean Series
# by index (with a reindexing warning) before filtering.
trends_amount_male = df_drop[df_drop['gender'] == 'MALE'].groupby('date')
# On a grouped object, head() returns the first rows of every group.
trends_amount_male.head()