# 3.4. ARMA Models and Hyperparameter Tuning
## (Autoregressive Moving Average Model)

# Hyperparameter Tuning
#### Hyperparameters are the parts of a model that the data scientist sets before training and that control the learning process.

In [None]:
import inspect
import time
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA

# 1. Prepare Data
## 1.1. Import

<b>Task 3.4.1: Create a client to connect to the MongoDB server, then assign the "air-quality" database to db, and the "nairobi" collection to nairobi.</b>

In [None]:
# Connect to the local MongoDB server and grab the Nairobi air-quality collection.
client = MongoClient("localhost", 27017)
db = client["air-quality"]
nairobi = db["nairobi"]

In [None]:
def wrangle(collection, resample_rule="1H"):
    """Read "P2" readings for site 29 from `collection` into a clean Series.

    The readings are indexed by timestamp, converted from UTC to the
    "Africa/Nairobi" timezone, stripped of outliers (values of 500 or more),
    resampled to a regular interval using the mean, and forward-filled so
    that empty intervals carry the last known value.

    Parameters
    ----------
    collection
        Object exposing ``find(filter, projection=...)`` that yields records
        with "P2" and "timestamp" keys (e.g. a pymongo collection).
    resample_rule : str, default "1H"
        Resampling interval, passed straight to ``Series.resample``.
        NOTE(review): recent pandas versions prefer the lowercase alias
        ("1h") -- confirm against the installed pandas version.

    Returns
    -------
    pandas.Series
        The "P2" readings, resampled and forward-filled.
    """
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    # Read results into DataFrame
    df = pd.DataFrame(list(results)).set_index("timestamp")

    # Localize timezone: timestamps are treated as UTC, then shifted to Nairobi time
    df.index = df.index.tz_localize("UTC").tz_convert("Africa/Nairobi")

    # Remove outliers
    df = df[df["P2"] < 500]

    # Resample to a regular interval (mean per interval) and forward-fill gaps
    y = df["P2"].resample(resample_rule).mean().ffill()

    return y

<b>Task 3.4.2: Change your wrangle function so that it has a resample_rule argument that allows the user to change the resampling interval. The argument default should be "1H".</b>

<b>Task 3.4.3: Use your wrangle function to read the data from the nairobi collection into the Series y.</b>

In [None]:
# Load the cleaned readings and preview the first five rows.
y = wrangle(collection=nairobi)
y.head(5)

# 1.2. Explore

<b>Task 3.4.4: Create an ACF plot for the data in y. Be sure to label the x-axis as "Lag [hours]" and the y-axis as "Correlation Coefficient".</b>

In [None]:
# Autocorrelation of `y` at successive lags, drawn on a wide axes.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(y, ax=ax)
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

<b>Task 3.4.5: Create a PACF plot for the data in y. Be sure to label the x-axis as "Lag [hours]" and the y-axis as "Correlation Coefficient".</b>

In [None]:
# Partial autocorrelation of `y` at successive lags, drawn on a wide axes.
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(y, ax=ax)
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

# 1.3. Split

<b>Task 3.4.6: Create a training set y_train that contains only readings from October 2018, and a test set y_test that contains readings from November 1, 2018.</b>

In [None]:
# Train on all of October 2018; hold out November 1, 2018 as the test set.
y_train = y.loc[slice("2018-10-01", "2018-10-31")]
y_test = y.loc["2018-11-01"]

# 2. Build Model
## 2.1. Baseline

In [None]:
# Naive baseline: predict the training mean for every training observation.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean for _ in range(len(y_train))]
mae_baseline = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)
print("Mean P2 Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

# 2.2. Iterate

<b>Task 3.4.8: Create ranges for possible $p$ and $q$ values. p_params should range between 0 and 25, by steps of 8. q_params should range between 0 and 3 by steps of 1.</b>

ARMA
- AR -> uses past readings to make the present prediction.
- MA -> uses past prediction errors (the difference between each past prediction and the true value) to correct the present prediction.
- In time series data, unusual events can throw predictions off. For example, on a holiday such as Independence Day, fireworks or celebratory firing into the air may cause a spike in air pollution, while the day after the event the air may be fresh again, so a model that only looks at past readings makes a large error. This is why we use an ARMA model: its MA part takes those past errors into account when making the present prediction.

In [None]:
# Candidate AR lags `p` (0, 8, 16, 24); the range was picked from the PACF plot.
p_params = range(0, 25, 8)
# Candidate MA (error) lags `q` (0, 1, 2), guided by the ACF plot; usually kept small.
q_params = range(3)

<b>Task 3.4.9: Complete the code below to train a model with every combination of hyperparameters in p_params and q_params. Every time the model is trained, the mean absolute error is calculated and then saved to a dictionary. If you're not sure where to start, do the code-along with Nicholas!</b>

# Grid Search
For each combination of values in p_params and q_params, we train a model and calculate its MAE, then choose the combination of parameters that gives the minimum MAE.
For example:

|             |             |             |             |             |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| (0,0) MAE   | (8,0) MAE   | (16,0) MAE  | (24,0) MAE  |             |
|             |             |             |             |             |
| (0,1) MAE   | (8,1) MAE   | (16,1) MAE  | (24,1) MAE  |             |
|             |             |             |             |             |
| (0,2) MAE   | (8,2) MAE   | (16,2) MAE  | (24,2) MAE  |             |
    

In [None]:
# Grid search: fit one model per (p, q) pair and record its training MAE.
# `mae_grid` maps each `p` to a list of MAEs ordered like `q_params`.
mae_grid = {}
for p in p_params:
    mae_grid[p] = []
    for q in q_params:
        # The middle term of the order is fixed at 0 for every model here.
        order = (p, 0, q)
        # Time the fit so the cost of each setting is visible.
        start_time = time.time()
        model = ARIMA(y_train, order=order).fit()
        elapsed_time = round(time.time() - start_time, 2)
        print(f"Trained ARIMA {order} in {elapsed_time} seconds.")
        # Score the fitted model on the data it was trained on.
        y_pred = model.predict()
        mae = mean_absolute_error(y_true=y_train, y_pred=y_pred)
        mae_grid[p].append(mae)

print()
print(mae_grid)

<b>Task 3.4.10: Organize all the MAEs from above in a DataFrame named mae_df. Each row represents a possible value for $q$ and each column represents a possible value for $p$.</b>

In [None]:
# One column per key of `mae_grid`; each row holds one position of the per-key MAE lists.
mae_df = pd.DataFrame(data=mae_grid)
mae_df.round(decimals=4)

<b>Task 3.4.11: Create heatmap of the values in mae_grid. Be sure to label your x-axis "p values" and your y-axis "q values".</b>

In [None]:
# Heatmap of the MAE table.
sns.heatmap(data=mae_df, cmap="Blues")
plt.xlabel(xlabel="p values")
plt.ylabel(ylabel="q values")
plt.title(label="ARMA Grid Search (Criterion: MAE)")

<b>Task 3.4.12: Use the plot_diagnostics method to check the residuals for your model. Keep in mind that the plot will represent the residuals from the last model you trained, so make sure it was your best model, too!</b>

In [None]:
# Residual diagnostics for the most recently fitted `model`.
# Start from an empty figure: plot_diagnostics creates its own subplots inside
# `fig`, so an axes pre-created by plt.subplots would never be drawn on and
# would be left as a stray empty plot behind the diagnostic panels.
fig = plt.figure(figsize=(15, 12))
# Trailing semicolon keeps the returned Figure from being echoed as cell output.
model.plot_diagnostics(fig=fig);

# 2.3. Evaluate

<b>Task 3.4.13: Complete the code below to perform walk-forward validation for your model for the entire test set y_test. Store your model's predictions in the Series y_pred_wfv. Choose the values for $p$ and $q$ that best balance model performance and computation time. Remember: This model is going to have to train 24 times before you can see your test MAE!</b>

In [None]:
# Walk-forward validation: refit on all data seen so far, forecast one step
# ahead, then reveal the true value for that step before the next refit.
# One-step forecasts are collected in a list and concatenated once at the end;
# `Series.append` was removed in pandas 2.0, so `pd.concat` is used instead.
wfv_forecasts = []
history = y_train.copy()
for _ in range(len(y_test)):
    model = ARIMA(history, order=(8, 0, 1)).fit()
    # With no arguments, forecast() yields the single next step after `history`.
    next_pred = model.forecast()
    wfv_forecasts.append(next_pred)
    # Extend the training history with the observed value at the forecast timestamp.
    history = pd.concat([history, y_test.loc[next_pred.index]])

# pd.concat raises on an empty list, so fall back to an empty float Series
# when `y_test` has no rows.
y_pred_wfv = pd.concat(wfv_forecasts) if wfv_forecasts else pd.Series(dtype=float)

In [None]:
# Score the walk-forward predictions against the held-out test readings.
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_wfv)
print("Test MAE (walk forward validation):", round(test_mae, 2))

# 3. Communicate Results

<b>Task 3.4.14: First, generate the list of training predictions for your model. Next, create a DataFrame df_predictions with the true values y_test and your predictions y_pred_wfv (don't forget the index). Finally, plot df_predictions using plotly express. Make sure that the y-axis is labeled "P2".</b>

In [None]:
# Put true values and walk-forward predictions side by side, then plot both lines.
df_predictions = pd.DataFrame(
    {
        "y_test": y_test,
        "y_pred_wfv": y_pred_wfv,
    }
)
# NOTE(review): the task text asks for a "P2" y-axis label; confirm which label is wanted.
fig = px.line(data_frame=df_predictions, labels=dict(value="PM2.5"))
fig.show()