# Naive approach to ARIMA modelling, using MSE as the loss function and a Fourier series approximation to handle seasonality

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from itertools import product
from pandas.plotting import register_matplotlib_converters
from pmdarima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, plot_predict
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

register_matplotlib_converters()

In [118]:
# Experiment configuration: the cells below iterate over item ids
# 1..item_count and store ids 1..store_count.
item_count = 50
store_count = 10
# Smaller grid, kept for quick runs:
# item_count, store_count = 2, 2

# Number of future periods requested from the Fourier extrapolation and from
# each fitted model.
forecast_range = 90

# Load the training data and convert the "date" column to datetimes.
df = pd.read_csv("../data/train.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
df

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [14]:
def fourier_extrapolation(x, n_predict, n_harm=50):
    """Approximate a series by a linear slope plus its lowest-frequency harmonics.

    The slope of a least-squares line is subtracted from ``x``, the remainder
    is transformed with the FFT, and the ``1 + 2 * n_harm`` components with
    the smallest absolute frequency are summed back as cosines over the
    sample range extended by ``n_predict`` steps. The slope term is then
    added back.

    Args:
        x: Series to approximate. It must provide ``size`` and ``index`` and
            support arithmetic with a NumPy array; its first index label is
            used as the start of the result's daily date index.
        n_predict: Number of steps to extrapolate past the end of ``x``.
        n_harm: Number of harmonics to keep in addition to the
            zero-frequency component.

    Returns:
        A tuple ``(in_sample_signal, future_signal)`` of ``pd.Series``: the
        first ``len(x)`` reconstructed values and the following ``n_predict``
        extrapolated values, indexed by consecutive days.
    """
    n_obs = x.size
    sample_steps = np.arange(0, n_obs)

    # Only the slope is removed; the intercept stays in the signal and ends
    # up in the zero-frequency component.
    slope = np.polyfit(sample_steps, x, 1)[0]
    spectrum = np.fft.fft(x - slope * sample_steps)
    freqs = np.fft.fftfreq(n_obs)

    # Stable ordering by |frequency| (ties keep their original position),
    # then keep the lowest 1 + 2 * n_harm components.
    by_abs_freq = sorted(range(n_obs), key=lambda k: np.absolute(freqs[k]))
    kept = by_abs_freq[:1 + n_harm * 2]

    all_steps = np.arange(0, n_obs + n_predict)
    restored = np.zeros(all_steps.size)
    for k in kept:
        magnitude = np.absolute(spectrum[k]) / n_obs
        angle = np.angle(spectrum[k])
        restored += magnitude * np.cos(2 * np.pi * freqs[k] * all_steps + angle)

    # Put the linear slope back, now over the extended range.
    restored = restored + slope * all_steps

    timeline = pd.date_range(
        start=x.index[0],
        periods=len(x) + n_predict,
        freq='D',
    )
    restored_series = pd.Series(restored, index=timeline)

    return restored_series.iloc[:n_obs], restored_series.iloc[n_obs:]

In [48]:
# For every (item, store) pair, store the observed sales series together with
# the in-sample and future parts of its Fourier approximation.
series_dict = dict()
in_sample_fourier_dict = dict()
future_fourier_dict = dict()

# Group the frame once instead of re-scanning all rows with a boolean mask for
# every pair. Rows inside each group keep their original order, so each group
# equals the previous mask-based selection.
sales_by_item_store = df.groupby(["item", "store"], sort=False)

for (iid, sid) in product(range(1, item_count+1), range(1, store_count+1)):
    # NOTE: a pair absent from the data raises KeyError here.
    df_t = sales_by_item_store.get_group((iid, sid))
    series_t = pd.Series(df_t["sales"].values, df_t["date"])
    # 500 harmonics are requested here instead of the function default.
    iss, fs = fourier_extrapolation(series_t, forecast_range, 500)

    series_dict[(iid, sid)] = series_t
    in_sample_fourier_dict[(iid, sid)] = iss
    future_fourier_dict[(iid, sid)] = fs

In [121]:
# Fit one auto_arima model per (item, store) series, passing the in-sample
# Fourier signal as a single-column 2-D array in the second argument.
fitted_model_dict = {}

for iid, sid in product(range(1, item_count + 1), range(1, store_count + 1)):
    key = (iid, sid)
    in_sample_signal = np.array(in_sample_fourier_dict[key]).reshape(-1, 1)
    model = auto_arima(series_dict[key], in_sample_signal)
    fitted_model_dict[key] = model

  return np.roots(self.polynomial_reduced_ma)**-1


In [140]:
# Forecast every (store, item) series and write the stacked forecasts to
# "submission.csv" as a running row number plus the rounded sales.
pred_frames = []

for (sid, iid) in product(range(1, store_count+1), range(1, item_count+1)):
    model = fitted_model_dict[(iid, sid)]
    # The future part of the Fourier signal is passed as a single-column 2-D
    # array, mirroring how the in-sample part was passed when fitting.
    future_signal = np.array(future_fourier_dict[(iid, sid)]).reshape(-1, 1)
    pred = model.predict(forecast_range, future_signal)
    pred = pd.DataFrame({
        "date": pred.index,
        "store": sid,
        "item": iid,
        "sales": np.round(pred.values).astype(int)
    })
    pred_frames.append(pred)

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# accumulated rows on every iteration and starts from an empty frame.
# ignore_index=True yields the 0..N-1 row numbers directly.
if pred_frames:
    pred_df = pd.concat(pred_frames, axis=0, ignore_index=True)
else:
    # Nothing to forecast: keep the original empty-frame shape.
    pred_df = pd.DataFrame(columns=["date", "store", "item", "sales"])

# NOTE(review): rows are ordered store-major and the id column is named
# "index"; confirm both match the expected submission format.
pred_df["index"] = pred_df.index
pred_df = pred_df[["index", "sales"]]
pred_df.to_csv("submission.csv", index=False)

  pred_df = pd.concat([pred_df, pred], axis=0)
