In [None]:
# data manipulation 
import numpy as np
import pandas as pd

from datetime import datetime
import itertools as it

from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AR

from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics

import math

# data visualization 
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import tsa_acquire_all
from acquire import peekatdata
from acquire import missing_values_col
from prepare import data_prepped
from prepare import missing_values_col

In [None]:
# NOTE(review): `df` is not created in any cell shown here; it is presumably built
# with the imported tsa_acquire_all / data_prepped helpers — confirm and add that
# loading cell so the notebook survives Restart & Run All.
df.head()

In [None]:
# Summary statistics for df (pandas describes the numeric columns by default).
df.describe()

### Train/test split, then resample and aggregate:

In [None]:
# Split on calendar year first, then roll each side up to one value per day:
# rows through the end of 2016 feed `train`, rows from 2017 onward feed `test`.
aggregation = 'sum'
train = df.loc[:'2016', 'sale_total'].resample('D').agg(aggregation)
test = df.loc['2017':, 'sale_total'].resample('D').agg(aggregation)

#### This is a gut check to make sure our split makes sense, in that we're looking at all years up to and including 2016, and then 2017 and beyond.

In [None]:
# Report how many daily observations landed on each side of the year-based split.
n_train, n_test = len(train), len(test)
print('Observations: %d' % (n_train + n_test))
print('Training: %d' % n_train)
print('Testing: %d' % n_test)

In [None]:
# NOTE(review): presumably a back-of-envelope day count for four years, to compare
# with the training size printed above — confirm (this figure ignores leap days).
4*365

In [None]:
# First and last three daily rows of `train`, to eyeball where the series starts and ends.
pd.concat([train.head(3), train.tail(3)])

In [None]:
# First and last three daily rows of `test`, to eyeball where the series starts and ends.
pd.concat([test.head(3), test.tail(3)])

In [None]:
# Draw the year-based train and test series on one set of axes.
fig, ax = plt.subplots()
ax.plot(train)
ax.plot(test)
plt.show()

### How to split by percentage:

In [None]:
# Daily aggregate of sale_total over the whole frame (no year split), reusing
# the `aggregation` chosen for the year-based split.
s = df['sale_total'].resample('D').agg(aggregation)

In [None]:
# Chronological percentage split: the first 66% of rows train, the remainder tests.
print(len(s))
train_size = int(.66 * len(s))
print(train_size)
train1 = s[:train_size]
test1 = s[train_size:]

In [None]:
# Report how many daily observations landed on each side of the percentage split.
n_train1, n_test1 = len(train1), len(test1)
print('Observations: %d' % (n_train1 + n_test1))
print('Training: %d' % n_train1)
print('Testing: %d' % n_test1)

In [None]:
# NOTE(review): 1826 is presumably len(s) as printed above; this repeats the
# train_size arithmetic by hand, before int() truncation — confirm.
.66 * 1826

In [None]:
# Draw the percentage-based train and test series on one set of axes.
fig, ax = plt.subplots()
ax.plot(train1)
ax.plot(test1)
plt.show()

In [None]:
# First and last three rows of the percentage-split training series.
pd.concat([train1.head(3), train1.tail(3)])

In [None]:
# First and last three rows of the percentage-split test series.
pd.concat([test1.head(3), test1.tail(3)])

### Now modeling the year-based split (`train` / `test`):

In [None]:
# y_hat holds the observed test values plus one column per forecast.
# First baseline: the mean of the training series, repeated for every test row.
y_hat = pd.DataFrame({'actual': test}).assign(avg_forecast=train.mean())
y_hat.tail()

In [None]:
# Summary statistics for the actual and forecast columns collected so far.
y_hat.describe()

In [None]:
def plot_data_and_predictions(predictions, label):
    """Plot the notebook-level ``train`` and ``test`` series with a forecast on top.

    Parameters
    ----------
    predictions
        Forecast values to overlay; passed straight to ``plt.plot`` and drawn
        with a heavier line than the two data series.
    label : str
        Legend entry for the forecast.
    """
    plt.figure(figsize=(10, 8))

    curves = (
        (train, dict(label='Train')),
        (test, dict(label='Test')),
        (predictions, dict(label=label, linewidth=5)),
    )
    for series, style in curves:
        plt.plot(series, **style)

    plt.legend(loc='best')
    plt.show()


def evaluate(actual, predictions, output=True):
    """Score a forecast against the observed values.

    The mean squared error comes from ``metrics.mean_squared_error``; the RMSE
    is its square root.

    Parameters
    ----------
    actual
        Observed values.
    predictions
        Forecast values, matched element-wise against ``actual``.
    output : bool, default True
        When true, print both metrics and return ``None``.
        When false, print nothing and return ``(mse, rmse)``.
    """
    mse = metrics.mean_squared_error(actual, predictions)
    rmse = math.sqrt(mse)

    # Silent mode: hand the numbers back to the caller instead of printing them.
    if not output:
        return mse, rmse

    print('MSE:  {}'.format(mse))
    print('RMSE: {}'.format(rmse))

def plot_and_eval(predictions, actual=None, metric_fmt='{:.2f}', linewidth=4):
    """Plot one or more forecasts over the train/test series and report MSE/RMSE.

    Parameters
    ----------
    predictions : Series, or list/tuple of Series
        Forecast(s) to draw. Each one's ``name`` attribute becomes its label.
    actual : optional
        Observed values every forecast is scored against. Defaults to the
        notebook-level ``test`` series, looked up when the function is called.
    metric_fmt : str, default '{:.2f}'
        ``str.format`` template applied to each metric value.
    linewidth : default 4
        Line width used for the forecast curves.

    With a single forecast the metrics are shown in the figure title; with
    several, each legend entry carries its own metrics.
    """
    # Resolve the default at call time: a `actual=test` default would be frozen
    # to whatever `test` was when this cell ran, and would go stale on re-splits.
    if actual is None:
        actual = test

    # Accept tuples as well as lists; anything else is treated as one forecast.
    if not isinstance(predictions, (list, tuple)):
        predictions = [predictions]
    several = len(predictions) > 1

    plt.figure(figsize=(16, 8))
    plt.plot(train, label='Train')
    plt.plot(test, label='Test')

    for forecast in predictions:
        mse, rmse = evaluate(actual, forecast, output=False)
        # Format each metric on its own so braces in a series name can never be
        # misread as replacement fields by str.format.
        scored = (
            f'{forecast.name} -- MSE: {metric_fmt.format(mse)} '
            f'RMSE: {metric_fmt.format(rmse)}'
        )
        plt.plot(
            forecast,
            label=scored if several else f'{forecast.name}',
            linewidth=linewidth,
        )
        if not several:
            plt.title(scored)

    plt.legend(loc='best')
    plt.show()

In [None]:
# Plot and score the training-mean baseline against the test data.
plot_and_eval(y_hat.avg_forecast)

### Rolling 30-day average of the training series — important!

In [None]:
# Window length for the rolling mean; train is resampled daily, so this is in days.
periods = 30
# Rolling mean of the training series with the final row sliced off. The first
# periods-1 entries are NaN because the window is not yet full.
# NOTE(review): `.iloc[:-1]` drops the most recent value, whereas the multi-window
# loop further down uses `.iloc[-1]` (the last value only) — confirm which was meant.
train.rolling(periods).mean().iloc[:-1]

In [None]:
periods = 30
# Forecast every test day with the most recent 30-day mean of the training data.
# `.iloc[-1]` yields a scalar that broadcasts over y_hat's (test) index. Slicing
# with `.iloc[:-1]` instead gives a Series indexed by training dates, which aligns
# to all-NaN when assigned into y_hat and then breaks the error metrics.
y_hat['moving_avg_forecast'] = train.rolling(periods).mean().iloc[-1]

In [None]:
# Plot and score the 30-day moving-average forecast against the test data.
plot_and_eval(y_hat.moving_avg_forecast)

In [None]:
# One flat forecast per window length: the final rolling mean of the training
# series over that window, broadcast across every row of y_hat.
period_vals = [7, 30, 90, 180, 365, 720]

for periods in period_vals:
    last_rolling_mean = train.rolling(periods).mean().iloc[-1]
    y_hat[f'moving_avg_forecast_{periods}'] = last_rolling_mean

In [None]:
# Gather the per-window forecast columns and compare them on a single figure;
# with several forecasts, each legend entry shows its own MSE/RMSE.
forecast_columns = [f'moving_avg_forecast_{p}' for p in period_vals]
forecasts = [y_hat[column] for column in forecast_columns]

plot_and_eval(forecasts, linewidth=2)

In [None]:
# Holt's linear-trend exponential smoothing, fitted on the training series.
# `Holt` is already imported in the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): newer statsmodels releases rename `smoothing_slope` to
# `smoothing_trend` — confirm against the installed version before upgrading.
holt = Holt(train).fit(smoothing_level = .3, smoothing_slope = .1)
# Forecast one step per row of the test series.
y_hat['holt_linear'] = holt.forecast(test.shape[0])

# The smoothing level and slope values above can be adjusted;
# inspecting the resulting plot is one way to choose them.

In [None]:
# Plot and score the Holt linear-trend forecast against the test data.
plot_and_eval(y_hat.holt_linear)

In [None]:
# Seasonal-naive forecast: reuse each 2016 daily value 366 days later.
# The shift is written as a Timedelta because adding a bare integer to a
# DatetimeIndex is no longer supported by pandas (removed in 1.0).
# `to_frame()` keeps the result a one-column DataFrame without depending on the
# index being named 'sale_date_index_col'.
# Assuming train covers all of leap-year 2016, the shifted index runs
# 2017-01-01 .. 2018-01-01, and `.iloc[:-1]` drops that final 2018 row.
using_last_year = (
    train['2016']
    .to_frame()
    .set_index(train['2016'].index + pd.Timedelta(days=366))
    .iloc[:-1]
)

In [None]:
# using_last_year is a one-column DataFrame; pandas aligns it to y_hat's index
# when it is assigned as a column.
y_hat['last_year'] = using_last_year

In [None]:
# Plot and score the "same day last year" forecast; a thin line keeps the daily detail readable.
plot_and_eval(y_hat.last_year, linewidth=1)

In [None]:
# Last year's daily values plus the mean 365-day difference of the training series.
predictions = train['2016'] + train.diff(365).mean()
# Re-label the 2016 rows onto dates starting 2017-01-01. The hard-coded 366
# periods must match the number of 2016 rows, otherwise this assignment raises.
predictions.index = pd.date_range('20170101', periods=366)
# Drop the final re-labelled row and set the name used as the plot label.
predictions = predictions.iloc[:-1].rename('Last Year + Mean')

plot_and_eval(predictions, linewidth=1)