# ARIMA Model

#### Experimenting with ARIMA (AutoRegressive Integrated Moving Average) modelling

Links:
- [ARIMA Model - Time Series Forecasting Python](https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/)
- [Summary of Rules for Identifying ARIMA Models](https://people.duke.edu/~rnau/arimrule.htm)
- [Akaike Information Criterion](https://www.scribbr.com/statistics/akaike-information-criterion/)

First import all the necessary modules and set up the dataframe:

In [None]:
from dotenv import load_dotenv
import os

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.api import tsa
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# read the .env file into the environment, then look up the dataset location
load_dotenv()
DATASET_PATH = os.getenv("DATASET_PATH")

# main dataframe; DATASET_PATH is concatenated directly with the file name, so it
# presumably already ends with a path separator
df_main = pd.read_excel(DATASET_PATH + "Conversion by Day.xlsx")

# distinct values of the first column in alphabetical order,
# e.g. ['ACT', 'NSW', ...] for POLICY: Risk State
categories = sorted(set(df_main[df_main.columns[0]].tolist()))

# ask which column holds the response variable and which category to analyse
response_col = int(input("Column number of the response variable: "))
category = input("Pick a category: ")

# column label of the response variable
response = df_main.columns[response_col]

# preview the first rows of the data
df_main.head()

The first step is to make the time series stationary by repeatedly applying a differencing transformation to it (whilst avoiding over-differencing). This helps to remove trends from the time series.

The number of times a series is differenced is the order of differencing, $d$. The optimal order of differencing is often the one at which [the standard deviation is lowest](https://people.duke.edu/~rnau/411arim2.htm).

In [None]:
# keep only the rows whose first column equals the chosen category
df = df_main[df_main.iloc[:, 0] == category]

def plot_series_acf(row: int, series: pd.Series, title: str):
    """
    Draws a series and its autocorrelation function (ACF) side by side.

    The plots are drawn on the module-level `axes` grid: the series goes in
    column 0 and the ACF in column 1 of the given row.

    Parameters:
        row: index of the row of `axes` to draw on
        series: the series to plot
        title: title of the series plot
    """
    series_ax = axes[row, 0]
    acf_ax = axes[row, 1]

    # left-hand panel: the raw values of the series
    series_ax.plot(series.values)
    series_ax.set_title(title)

    # right-hand panel: the ACF of the same values
    plot_acf(series.values, ax=acf_ax)

# set up plot configurations. the size and dpi are passed explicitly because rcParams
# only apply to figures created after they are changed
fig, axes = plt.subplots(4, 2, sharex='col', figsize=(9, 7), dpi=140)  # each column of plots will have the same x-axis
# plot original time series
plot_series_acf(0, df[response], "Original Series")

# difference the original series 3 times and plot each differencing & its ACF.
# each entry is (series, standard deviation, order of differencing), so the position
# of an entry in the list equals its order of differencing
differences = [(df[response], np.std(df[response]), 0)]  # initialise with original series
df_diff = df[response].copy()
for i in range(1, 4):
    df_diff = df_diff.diff().dropna()
    # add difference and standard deviation to the list
    differences.append((df_diff, np.std(df_diff), i))
    # plot series & ACF
    plot_series_acf(i, df_diff, f"Differencing Order: {i}")

plt.rcParams.update({"figure.figsize": (9, 7), "figure.dpi": 140})
plt.show()

# rank the candidates by ascending standard deviation. sorted() returns a new list, so
# `differences` itself stays indexed by order of differencing for later lookups
by_std = sorted(differences, key=lambda x: x[1])

# pick the candidate with the lowest standard deviation that is also stationary
# (p < 0.01 using the ADF test); fall back to no differencing if none qualifies
d = 0
for difference, _, order in by_std:
    p_value = adfuller(difference)[1]
    if p_value < 0.01:
        d = order
        break
print(f"Optimal order of differencing: {d}")

The next step is to determine $p$, the order of the autoregressive (AR) term, which can be done by inspecting the partial autocorrelation function (PACF) plot. The order of the AR term is equal to the number of lags that cross the significance limit in the PACF plot.

In [None]:
# plot series with optimal differencing against its PACF plot
df_diff = differences[d][0]  # extract the series with the optimal order of differencing
# size and dpi are passed explicitly: rcParams changed after a figure has been created
# do not affect that figure
fig, axes = plt.subplots(1, 2, sharex='col', figsize=(9, 3), dpi=140)

def plot_series_pacf(series: pd.Series, title: str):
    """
    Draws a series and its partial autocorrelation function (PACF) side by side.

    The plots are drawn on the module-level `axes` pair: the series goes on
    axes[0] and the PACF on axes[1].

    Parameters:
        series: the series to plot
        title: title of the series plot
    """
    series_ax, pacf_ax = axes[0], axes[1]

    # left-hand panel: the raw values of the series
    series_ax.plot(series.values)
    series_ax.set_title(title)

    # right-hand panel: the PACF of the same values, with alpha=0.01 confidence bands
    plot_pacf(series.values, ax=pacf_ax, method="ywmle", alpha=0.01)

plot_series_pacf(df_diff, "Optimal Order of Differencing")

plt.show()
plt.rcParams.update({"figure.figsize": (9, 3), "figure.dpi": 140})

# get values from PACF plot. statsmodels only computes partial autocorrelations for lags
# below half the sample size, so cap the 40 requested lags for short series
max_lags = min(40, len(df_diff) // 2 - 1)
pacf, ci = tsa.pacf(df_diff, alpha=0.05, nlags=max_lags, method='ywmle')
pacf = pacf[1:]  # trim first PACF value (lag 0) - we don't need it
ci = ci[1:]

# find how many leading partial autocorrelations are outside the significance limit (0.05),
# which will become p (order of AR term).
# NOTE: the plot above is drawn with alpha=0.01, whereas this count uses alpha=0.05
p = 0
for value, (lower, upper) in zip(pacf, ci):
    # the interval is centred on the estimate; subtracting the estimate re-centres it
    # on zero, so a value inside the shifted band is not significant
    if lower - value < value < upper - value:
        break
    p += 1
print(f"Order of AR term, p: {p}")

Going back to the ACF plot, we can determine the order of the moving-average (MA) term in a similar fashion:

In [None]:
# plot series with optimal differencing against its ACF plot
fig, axes = plt.subplots(1, 2, sharex='col')
axes[0].plot(df_diff.values)
axes[0].set_title("Optimal Order of Differencing")
plot_acf(df_diff.values, ax=axes[1])

plt.show()

acf, ci = tsa.acf(df_diff, alpha=0.05, nlags=40, fft=True)
acf = acf[1:]  # trim first ACF value (lag 0) - we don't need it
ci = ci[1:]  # trim the matching interval so values and intervals stay aligned by lag

# find how many leading autocorrelations are outside the significance limit (0.05),
# which will become q (order of MA term)
q = 0
for value, (lower, upper) in zip(acf, ci):
    # the interval is centred on the estimate, so the estimate always lies inside it;
    # a lag is significant when the interval does not contain zero
    if lower <= 0 <= upper:
        break
    q += 1
print(f"Order of MA term, q: {q}")

Create the ARIMA model using the $p$, $d$, and $q$ values:

In [None]:
# show the chosen orders, then fit ARIMA(p, d, q) to the undifferenced response values
print(p, d, q)
model = ARIMA(
    df[response].values,
    order=(p, d, q),
)
# convergence warnings from the optimiser are switched off
model_fit = model.fit(method_kwargs={"warn_convergence": False})
print(model_fit.summary())

Plot the ARIMA model against the original series:

In [None]:
# draw the observed series and the in-sample ARIMA predictions against the quote date.
# the first observation and the first prediction are left out of both lines
plt.plot(
    df["QUOTE_DATE"].iloc[1:],
    df[response].iloc[1:],
    label="Original Series",
)
plt.plot(
    df["QUOTE_DATE"].iloc[1:],
    model_fit.predict()[1:],
    label="ARIMA Model",
)
plt.legend()
plt.show()

Forecast 30 steps (30 days) into the future:

In [None]:
# 30-step forecast with 95% confidence limits (alpha=0.05)
forecast = model_fit.get_forecast(steps=30).summary_frame(alpha=0.05)

fig, axes = plt.subplots(figsize=(15, 5))

# observed series and in-sample predictions, both on a 0-based positional index;
# the first point of each is left out
df.reset_index()[response][1:].plot(ax=axes, label="Original Series")
pd.Series(model_fit.predict())[1:].plot(ax=axes, label="ARIMA Model")

# re-index the forecast so that it continues directly after the last observation
last_index = df.reset_index()[response].last_valid_index()
forecast.index = range(last_index + 1, last_index + forecast.shape[0] + 1)
# plot every forecast step so the line covers the same range as the confidence band
forecast["mean"].plot(ax=axes, label="ARIMA Model Forecast")

lower_series = forecast["mean_ci_lower"]
upper_series = forecast["mean_ci_upper"]

# shade the confidence band around the forecast
axes.fill_between(lower_series.index, lower_series, upper_series, color="k", alpha=0.05)

plt.legend()
plt.show()

Combining all the code together and letting the program loop over every category:

In [None]:
from dotenv import load_dotenv
import os

from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

# read the .env file into the environment, then look up the dataset location
load_dotenv()
DATASET_PATH = os.getenv("DATASET_PATH")

def is_stationary(time_series: pd.Series, sig_level: float = 0.01) -> bool:
    """
    Runs the ADF unit root test on the given time series and reports whether the
    series is stationary at the given significance level.

    Parameters:
    time_series: time series to test for stationarity
    sig_level: significance level for the test, defaults to 1% (0.01)

    Returns True when the p-value of the test is below sig_level, False otherwise.
    """
    # the second element of the ADF result is the p-value of the test
    return adfuller(time_series.values)[1] < sig_level

def get_p(time_series: pd.Series) -> int:
    """
    Estimates p, the order of the AR term, from the PACF of the given series.

    p is the number of consecutive lags, starting at lag 1, whose partial
    autocorrelation is at least as large in magnitude as the half-width of its
    confidence interval (computed with alpha=0.05).

    Parameters:
    time_series: the (differenced) time series to analyse
    """
    n_obs = time_series.size
    max_lag = min(int(10 * np.log10(n_obs)), n_obs // 2 - 1)

    # partial autocorrelations and their confidence intervals
    partials, intervals = pacf(time_series, alpha=0.05, nlags=max_lag, method='ywmle')

    # skip lag 0 and count until the first lag that is not significant
    significant_lags = 0
    for coefficient, interval in zip(partials[1:], intervals[1:]):
        # distance from the estimate to the lower limit of its interval
        half_width = abs(interval[0] - coefficient)
        if not abs(coefficient) >= half_width:
            break
        significant_lags += 1

    return significant_lags

def get_q(time_series: pd.Series) -> int:
    """
    Estimates q, the order of the MA term, from the ACF of the given series.

    q is the number of consecutive lags, starting at lag 1, whose autocorrelation
    is at least as large in magnitude as the half-width of its confidence interval
    (computed with alpha=0.05).

    Parameters:
    time_series: the (differenced) time series to analyse
    """
    # get values and confidence intervals from the ACF of the series passed in
    # (not from the module-level df_diff)
    acf_values, ci = acf(time_series, alpha=0.05, nlags=40, fft=True)

    # skip lag 0 and count until the first lag that is not significant
    q = 0
    for value, bounds in zip(acf_values[1:], ci[1:]):
        # bounds[0] - value is the distance from the estimate to the lower limit
        if abs(value) >= abs(bounds[0] - value):
            q += 1  # keep looking
        else:
            break

    return q

# main dataframe; DATASET_PATH is concatenated directly with the file name
df_main = pd.read_excel(DATASET_PATH + "Conversion by Day.xlsx")

# ask which columns hold the dimension and the response variable
dimension_col = int(input("Column number of dimension variable: "))
response_col = int(input("Column number of response variable: "))

# distinct values of the selected dimension column, in alphabetical order
categories = sorted(set(df_main[df_main.columns[dimension_col]].tolist()))

# column label of the response variable
response = df_main.columns[response_col]

for category in categories:
    print(category)
    # filter out all other categories, matching on the same dimension column that the
    # list of categories was built from
    df = df_main.loc[df_main.iloc[:, dimension_col] == category]

    # now we need to fit an ARIMA model to the data and use it to provide a 1-month
    # forecast. first we calculate bounds for the parameters (p, d, q) and then evaluate
    # the combinations of (p, d, q) within those bounds using AICc and MSE.

    # to find d (order of differencing): minimize standard deviation whilst ensuring that
    # the time series is stationary (p < 0.01 using ADF unit root test)

    # difference the original series 5 times and store each result along with its standard
    # deviation. the position of an entry in the list equals its order of differencing
    differences: list[tuple[pd.Series, float, int]] = [(df[response], np.std(df[response]), 0)]
    df_diff = df[response].copy()
    for i in range(1, 6):
        # difference the previous series and drop any invalid values
        df_diff = df_diff.diff().dropna()
        # store series and standard deviation to list
        differences.append((df_diff, np.std(df_diff), i))

    # rank the candidates by ascending standard deviation. sorted() returns a new list, so
    # `differences` itself stays indexed by order of differencing
    by_std = sorted(differences, key=lambda x: x[1])

    # pick the stationary candidate with the lowest standard deviation; fall back to the
    # original series (d = 0) if none of the candidates is stationary
    d = 0
    df_diff = differences[0][0]
    for difference, _, order in by_std:
        if is_stationary(difference):
            d = order
            df_diff = difference
            break  # optimal order of differencing has been found

    # to find p (number of lags): find number of lags that cross the significance limit in
    # the partial autocorrelation function (PACF) plot
    p = get_p(df_diff)

    # to find q (number of lags): find number of lags that cross the significance limit in
    # the autocorrelation function (ACF) plot
    q = get_q(df_diff)

    # adjust the p, d, q parameters if they are too extreme: difference once more while q
    # is large, stopping at the highest order of differencing that was precomputed
    while q > 5 and d < len(differences) - 1:
        d += 1
        df_diff = differences[d][0]  # update optimal differencing order
        q = get_q(df_diff)
        p = get_p(df_diff)

    print(p, d, q)

    # search every order (i, j, k) with i <= p, j <= d, k <= q for the best model.
    # the trackers are reset for each category and start at infinity so that the first
    # successfully scored model is always accepted
    best_model = None
    best_order = None
    best_mse = float("inf")
    best_aicc = float("inf")

    for i in range(p + 1):
        for j in range(d + 1):
            for k in range(q + 1):
                model_fit = ARIMA(
                    df[response].values,
                    order=(i, j, k),
                    enforce_stationarity=False,
                    enforce_invertibility=False
                ).fit(method_kwargs={"warn_convergence": False})

                # a candidate replaces the current best when its AICc is no more than 50
                # above the best AICc so far and its mean squared error (MSE) is lower
                if model_fit.aicc < best_aicc + 50 and model_fit.mse < best_mse:
                    best_aicc = model_fit.aicc
                    best_mse = model_fit.mse
                    best_model = model_fit
                    best_order = (i, j, k)

    if best_model is None:
        # no candidate produced comparable AICc/MSE scores, so there is nothing to plot
        print(f"No ARIMA model could be selected for {category}")
        continue

    # forecast 30 steps (days) into the future using the selected model
    forecast = best_model.get_forecast(steps=30).summary_frame(alpha=0.05)

    # plot the original data, the arima model, and the arima model forecast on the same plot
    fig, axes = plt.subplots()

    # observed series and in-sample predictions on a 0-based positional index; the first
    # point of each is left out
    df.reset_index()[response][1:].plot(ax=axes, label="Original Series")
    pd.Series(best_model.predict())[1:].plot(ax=axes, label="ARIMA Model")

    # re-index the forecast so that it continues directly after the last observation
    last_index = df.reset_index()[response].last_valid_index()
    forecast.index = range(last_index + 1, last_index + forecast.shape[0] + 1)
    # plot every forecast step so the line covers the same range as the confidence band
    forecast["mean"].plot(ax=axes, label="ARIMA Model Forecast")

    lower_series = forecast["mean_ci_lower"]
    upper_series = forecast["mean_ci_upper"]

    # shade the confidence band around the forecast
    axes.fill_between(lower_series.index, lower_series, upper_series, color="k", alpha=0.05)

    plt.legend()
    plt.show()