# ARMA Models and Hyperparameter Tuning
## (Autoregressive Moving Average Model)

# Hyperparameter tuning
#### Hyperparameters are the parts of a model that the data scientist sets before training and that control the learning process

In [None]:
import inspect
import time
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

# 1. Prepare Data
## 1.1. Import

<b>Create a client to connect to the MongoDB server, then assign the "air-quality" database to <code>db</code> and the "nairobi" collection to <code>nairobi</code>.</b>

In [None]:
# Connect to the MongoDB server on localhost (port 27017).
client = MongoClient(host="localhost", port=27017)

# Handle to the "air-quality" database on that server.
db = client["air-quality"]

# Handle to the "nairobi" collection inside that database.
nairobi = db["nairobi"]

In [None]:
def wrangle(collection):
    """Return the hourly mean "P2" series for site 29 from *collection*.

    Parameters
    ----------
    collection
        Object exposing a pymongo-style ``find(filter, projection=...)``
        method that yields documents with ``"P2"`` and ``"timestamp"`` keys.

    Returns
    -------
    pandas.Series
        ``"P2"`` readings resampled to one value per hour (mean of the
        readings in that hour), indexed by timestamp in the
        "Africa/Nairobi" timezone. Hours without readings are
        forward-filled from the last hour that had data.

    Raises
    ------
    KeyError
        If the query returns no documents (there is then no ``"timestamp"``
        column to index on).
    """
    # Only P2 measurements from site 29; project away everything but the
    # reading and its timestamp.
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # The timestamps are treated as naive UTC, then converted to local time.
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers: keep only readings strictly below 500.
    df = df[df["P2"] < 500]

    # Resample to hourly means and forward-fill the gaps so the series has a
    # regular frequency with no missing values, as the ARMA model expects.
    y = df["P2"].resample("1h").mean().ffill()

    return y

In [None]:
# Build the modelling series from the Nairobi collection, then preview
# its first five entries.
y = wrangle(collection=nairobi)
y.head(n=5)

## 1.2. Explore

In [None]:
# Autocorrelation plot of `y`, drawn on a wide figure.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)

# Label the axes directly on the Axes object the plot was drawn on.
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

In [None]:
# Partial autocorrelation plot of `y`, drawn on a wide figure.
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)

# Label the axes directly on the Axes object the plot was drawn on.
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

## 1.3. Split

In [None]:
# Training set: every reading from 1 October 2018 through 31 October 2018
# (label slicing on the index includes both end dates).
y_train = y.loc[slice("2018-10-01", "2018-10-31")]

# Test set: the readings selected by the date label 1 November 2018.
y_test = y.loc["2018-11-01"]

# 2. Build Model
## 2.1. Baseline

In [None]:
# Naive baseline: predict the training mean for every training observation
# and measure the resulting mean absolute error.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean for _ in range(len(y_train))]
mae_baseline = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)

print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

## 2.2. Iterate

In [None]:
# Candidate lag counts for the autoregressive (AR) part: 0, 8, 16, 24.
p_params = range(0, 25, 8)

# Candidate lag counts for the moving-average (MA) part: 0, 1, 2.
q_params = range(3)

In [None]:
# Grid search over (p, q): fit one ARIMA(p, 0, q) model per combination and
# record its in-sample MAE. `mae_grid` maps each `p` to a list of MAEs, one
# per `q` in the order of `q_params`.
mae_grid = {p: [] for p in p_params}

for p, maes in mae_grid.items():
    for q in q_params:
        # d is fixed at 0, so this is a pure ARMA(p, q) specification.
        order = (p, 0, q)

        # Time the fit so slow combinations are visible in the log.
        start_time = time.time()
        model = ARIMA(y_train, order=order).fit()
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA {order} in {elapsed_time} seconds.")

        # Score the model on its own training data.
        y_pred = model.predict()
        mae = mean_absolute_error(y_train, y_pred)
        maes.append(mae)

# Blank line, then the raw results.
print(f"\n{mae_grid}")

In [None]:
# Tabulate the grid-search results: one column per `p`, one row per position
# in `q_params`. Display the table rounded to four decimals.
mae_df = pd.DataFrame(data=mae_grid)
mae_df.round(decimals=4)

In [None]:
# Heatmap of the MAE table: columns are the `p` values, rows are the `q` values.
sns.heatmap(data=mae_df, cmap="Blues")
plt.xlabel(xlabel="p values")
plt.ylabel(ylabel="q values")
plt.title(label="ARMA Grid Search (Criterion: MAE)")

In [None]:
# Residual diagnostics for `model`. NOTE: at this point `model` is whichever
# model was fitted last, i.e. the final (p, q) combination of the grid
# search, not necessarily the best-scoring one.
#
# Start from an *empty* figure: `plot_diagnostics` adds its own 2x2 grid of
# subplots to the figure it is given, so pre-creating an Axes with
# `plt.subplots` would leave an unused, empty Axes drawn behind the panels.
fig = plt.figure(figsize=(15, 12))
model.plot_diagnostics(fig=fig)

## 2.3. Evaluate

In [None]:
# Walk-forward validation: for every observation in the test set, refit an
# ARMA(8, 1) model on all data seen so far, forecast one step ahead, then
# reveal the true value for that step and add it to the history.
#
# One-step forecasts are collected in a list and concatenated once at the
# end; `Series.append` was removed in pandas 2.0, and `pd.concat` is the
# supported replacement.
wfv_forecasts = []
history = y_train.copy()
for _ in range(len(y_test)):
    model = ARIMA(history, order=(8, 0, 1)).fit()
    # `forecast()` returns a one-element Series indexed by the next timestamp.
    next_pred = model.forecast()
    wfv_forecasts.append(next_pred)
    # Extend the history with the observed value at the forecast timestamp.
    history = pd.concat([history, y_test.loc[next_pred.index]])

# An empty test set yields an empty float Series rather than a concat error.
y_pred_wfv = pd.concat(wfv_forecasts) if wfv_forecasts else pd.Series(dtype=float)

In [None]:
# Score the walk-forward forecasts against the held-out test readings.
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))

# 3. Communicate Results

In [None]:
# Put the observed test values and the walk-forward forecasts side by side
# (aligned on their index), then draw both as lines in one interactive chart.
df_predictions = pd.DataFrame(
    data={
        "y_test": y_test,
        "y_pred_wfv": y_pred_wfv,
    }
)
fig = px.line(data_frame=df_predictions, labels={"value": "PM2.5"})
fig.show()