# Prophet Parameter Testing

Ten different Prophet configurations are evaluated. Results for Correlation (R), Root Mean Squared Error (RMSE), and Mean Absolute Error (MAE) are given. Each pollutant is considered individually. Some of the non-default configs performed better than the default for some pollutants.

In [1]:
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import mahalanobis

# Load aqicn data
def load_data(filepath, chunk=None):
    df = pd.read_csv(filepath, skipinitialspace=True)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    for col in df.columns:
        if col != "date":
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.sort_values(by='date').reset_index(drop=True)
    
    if chunk in ['week', 'month']:
        df = df.set_index('date')
        if chunk == 'week':
            df = df.resample('W').mean()
        elif chunk == 'month':
            df = df.resample('M').mean()
        df = df.reset_index()
    
    return df

# Evaluate a single Prophet model
def evaluate_model(df, pollutant, model_name, model, split_date='2022-01-01'):
    df_model = df[['date', pollutant]].dropna().rename(columns={'date': 'ds', pollutant: 'y'})
    train = df_model[df_model['ds'] < split_date]
    test = df_model[df_model['ds'] >= split_date]

    model.fit(train)
    future = model.make_future_dataframe(periods=len(test), freq='D')
    forecast = model.predict(future)

    result = forecast[['ds', 'yhat']].merge(df_model, on='ds', how='left').dropna()

    y_true = result['y']
    y_pred = result['yhat']

    r = np.corrcoef(y_true, y_pred)[0, 1]
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    return {
        "Model": model_name,
        "Pollutant": pollutant,
        "R": round(r, 3),
        "RMSE": round(rmse, 2),
        "MAE": round(mae, 2)
    }

# Compare ten additive configurations
def compare_models(df, pollutant):
    config_params = {
    "Default Settings": {},

    "Very Smooth Trend": {
        "changepoint_prior_scale": 0.005,
        "n_changepoints": 10,
        "changepoint_range": 0.8
    },
    
    "Smooth Trend": {
        "changepoint_prior_scale": 0.01,
        "n_changepoints": 15,
        "changepoint_range": 0.8
    },
    
    "Mild Trend Changes": {
        "changepoint_prior_scale": 0.05,
        "n_changepoints": 25,
        "changepoint_range": 0.8
    },
    
    "Balanced Trend Changes": {
        "changepoint_prior_scale": 0.1,
        "n_changepoints": 35,
        "changepoint_range": 1.0
    },
    
    "Somewhat Flexible Trend": {
        "changepoint_prior_scale": 0.15,
        "n_changepoints": 40,
        "changepoint_range": 1.0
    },
    
    "Flexible Trend": {
        "changepoint_prior_scale": 0.2,
        "n_changepoints": 50,
        "changepoint_range": 1.0
    },
    
    "Highly Flexible Trend": {
        "changepoint_prior_scale": 0.4,
        "n_changepoints": 60,
        "changepoint_range": 1.0
    },
    
    "Many Possible Changes": {
        "changepoint_prior_scale": 0.1,
        "n_changepoints": 80,
        "changepoint_range": 1.0
    },
    
    "Very Complex Trend": {
        "changepoint_prior_scale": 0.6,
        "n_changepoints": 100,
        "changepoint_range": 1.0
    }
}


    results = []
    for name, params in config_params.items():
        try:
            model = Prophet(
                yearly_seasonality=True,
                weekly_seasonality=True,
                daily_seasonality=False,
                seasonality_mode='additive',
                **params
            )
            result = evaluate_model(df, pollutant, name, model)
            results.append(result)
        except Exception as e:
            results.append({"Model": name, "Pollutant": pollutant, "Error": str(e)})

    return pd.DataFrame(results)

In [2]:
city_df = load_data("aqicn_data/shanghai.csv")

pollutants = ['pm25','pm10','o3','no2','so2','co']

for pollutant in pollutants:
    results = compare_models(city_df, pollutant)
    print(f"Results for {pollutant}")
    print(results)

08:53:23 - cmdstanpy - INFO - Chain [1] start processing
08:53:23 - cmdstanpy - INFO - Chain [1] done processing
08:53:23 - cmdstanpy - INFO - Chain [1] start processing
08:53:23 - cmdstanpy - INFO - Chain [1] done processing
08:53:24 - cmdstanpy - INFO - Chain [1] start processing
08:53:24 - cmdstanpy - INFO - Chain [1] done processing
08:53:24 - cmdstanpy - INFO - Chain [1] start processing
08:53:24 - cmdstanpy - INFO - Chain [1] done processing
08:53:25 - cmdstanpy - INFO - Chain [1] start processing
08:53:25 - cmdstanpy - INFO - Chain [1] done processing
08:53:26 - cmdstanpy - INFO - Chain [1] start processing
08:53:26 - cmdstanpy - INFO - Chain [1] done processing
08:53:26 - cmdstanpy - INFO - Chain [1] start processing
08:53:27 - cmdstanpy - INFO - Chain [1] done processing
08:53:27 - cmdstanpy - INFO - Chain [1] start processing
08:53:28 - cmdstanpy - INFO - Chain [1] done processing
08:53:28 - cmdstanpy - INFO - Chain [1] start processing
08:53:28 - cmdstanpy - INFO - Chain [1]

Results for pm25
                     Model Pollutant      R   RMSE    MAE
0         Default Settings      pm25  0.500  34.65  26.80
1        Very Smooth Trend      pm25  0.517  33.59  26.08
2             Smooth Trend      pm25  0.516  33.64  26.10
3       Mild Trend Changes      pm25  0.500  34.65  26.80
4   Balanced Trend Changes      pm25  0.509  34.19  26.45
5  Somewhat Flexible Trend      pm25  0.523  33.47  26.00
6           Flexible Trend      pm25  0.529  33.22  25.97
7    Highly Flexible Trend      pm25  0.511  33.83  26.86
8    Many Possible Changes      pm25  0.510  34.16  26.42
9       Very Complex Trend      pm25  0.496  34.33  27.34


08:53:31 - cmdstanpy - INFO - Chain [1] done processing
08:53:31 - cmdstanpy - INFO - Chain [1] start processing
08:53:31 - cmdstanpy - INFO - Chain [1] done processing
08:53:32 - cmdstanpy - INFO - Chain [1] start processing
08:53:32 - cmdstanpy - INFO - Chain [1] done processing
08:53:32 - cmdstanpy - INFO - Chain [1] start processing
08:53:32 - cmdstanpy - INFO - Chain [1] done processing
08:53:33 - cmdstanpy - INFO - Chain [1] start processing
08:53:33 - cmdstanpy - INFO - Chain [1] done processing
08:53:33 - cmdstanpy - INFO - Chain [1] start processing
08:53:34 - cmdstanpy - INFO - Chain [1] done processing
08:53:34 - cmdstanpy - INFO - Chain [1] start processing
08:53:35 - cmdstanpy - INFO - Chain [1] done processing
08:53:35 - cmdstanpy - INFO - Chain [1] start processing
08:53:36 - cmdstanpy - INFO - Chain [1] done processing
08:53:37 - cmdstanpy - INFO - Chain [1] start processing
08:53:37 - cmdstanpy - INFO - Chain [1] done processing
08:53:38 - cmdstanpy - INFO - Chain [1] 

Results for pm10
                     Model Pollutant      R   RMSE    MAE
0         Default Settings      pm10  0.456  18.55  13.76
1        Very Smooth Trend      pm10  0.446  18.76  13.89
2             Smooth Trend      pm10  0.449  18.70  13.85
3       Mild Trend Changes      pm10  0.456  18.55  13.76
4   Balanced Trend Changes      pm10  0.460  18.50  13.74
5  Somewhat Flexible Trend      pm10  0.466  18.39  13.69
6           Flexible Trend      pm10  0.474  18.28  13.68
7    Highly Flexible Trend      pm10  0.302  21.21  16.42
8    Many Possible Changes      pm10  0.458  18.53  13.75
9       Very Complex Trend      pm10  0.194  24.59  18.71


08:53:41 - cmdstanpy - INFO - Chain [1] done processing
08:53:43 - cmdstanpy - INFO - Chain [1] start processing
08:53:43 - cmdstanpy - INFO - Chain [1] done processing
08:53:44 - cmdstanpy - INFO - Chain [1] start processing
08:53:44 - cmdstanpy - INFO - Chain [1] done processing
08:53:45 - cmdstanpy - INFO - Chain [1] start processing
08:53:45 - cmdstanpy - INFO - Chain [1] done processing
08:53:45 - cmdstanpy - INFO - Chain [1] start processing
08:53:45 - cmdstanpy - INFO - Chain [1] done processing
08:53:46 - cmdstanpy - INFO - Chain [1] start processing
08:53:46 - cmdstanpy - INFO - Chain [1] done processing
08:53:46 - cmdstanpy - INFO - Chain [1] start processing
08:53:47 - cmdstanpy - INFO - Chain [1] done processing
08:53:47 - cmdstanpy - INFO - Chain [1] start processing
08:53:47 - cmdstanpy - INFO - Chain [1] done processing
08:53:48 - cmdstanpy - INFO - Chain [1] start processing
08:53:48 - cmdstanpy - INFO - Chain [1] done processing
08:53:48 - cmdstanpy - INFO - Chain [1] 

Results for o3
                     Model Pollutant      R   RMSE    MAE
0         Default Settings        o3  0.547  18.14  13.06
1        Very Smooth Trend        o3  0.556  17.90  12.89
2             Smooth Trend        o3  0.555  17.93  12.88
3       Mild Trend Changes        o3  0.547  18.14  13.06
4   Balanced Trend Changes        o3  0.546  18.16  13.09
5  Somewhat Flexible Trend        o3  0.557  17.92  12.89
6           Flexible Trend        o3  0.553  17.98  13.18
7    Highly Flexible Trend        o3  0.431  21.58  16.21
8    Many Possible Changes        o3  0.546  18.17  13.09
9       Very Complex Trend        o3  0.327  27.29  19.78


08:53:50 - cmdstanpy - INFO - Chain [1] start processing
08:53:50 - cmdstanpy - INFO - Chain [1] done processing
08:53:51 - cmdstanpy - INFO - Chain [1] start processing
08:53:51 - cmdstanpy - INFO - Chain [1] done processing
08:53:51 - cmdstanpy - INFO - Chain [1] start processing
08:53:52 - cmdstanpy - INFO - Chain [1] done processing
08:53:52 - cmdstanpy - INFO - Chain [1] start processing
08:53:52 - cmdstanpy - INFO - Chain [1] done processing
08:53:53 - cmdstanpy - INFO - Chain [1] start processing
08:53:53 - cmdstanpy - INFO - Chain [1] done processing
08:53:53 - cmdstanpy - INFO - Chain [1] start processing
08:53:53 - cmdstanpy - INFO - Chain [1] done processing
08:53:54 - cmdstanpy - INFO - Chain [1] start processing
08:53:54 - cmdstanpy - INFO - Chain [1] done processing
08:53:55 - cmdstanpy - INFO - Chain [1] start processing
08:53:55 - cmdstanpy - INFO - Chain [1] done processing
08:53:56 - cmdstanpy - INFO - Chain [1] start processing
08:53:57 - cmdstanpy - INFO - Chain [1]

Results for no2
                     Model Pollutant      R  RMSE   MAE
0         Default Settings       no2  0.627  7.28  5.47
1        Very Smooth Trend       no2  0.627  7.27  5.47
2             Smooth Trend       no2  0.627  7.28  5.48
3       Mild Trend Changes       no2  0.627  7.28  5.47
4   Balanced Trend Changes       no2  0.627  7.28  5.48
5  Somewhat Flexible Trend       no2  0.627  7.29  5.52
6           Flexible Trend       no2  0.626  7.31  5.57
7    Highly Flexible Trend       no2  0.627  7.32  5.61
8    Many Possible Changes       no2  0.627  7.28  5.48
9       Very Complex Trend       no2  0.421  9.36  7.37


08:53:58 - cmdstanpy - INFO - Chain [1] start processing
08:54:00 - cmdstanpy - INFO - Chain [1] done processing
08:54:01 - cmdstanpy - INFO - Chain [1] start processing
08:54:02 - cmdstanpy - INFO - Chain [1] done processing
08:54:02 - cmdstanpy - INFO - Chain [1] start processing
08:54:02 - cmdstanpy - INFO - Chain [1] done processing
08:54:03 - cmdstanpy - INFO - Chain [1] start processing
08:54:03 - cmdstanpy - INFO - Chain [1] done processing
08:54:04 - cmdstanpy - INFO - Chain [1] start processing
08:54:04 - cmdstanpy - INFO - Chain [1] done processing
08:54:05 - cmdstanpy - INFO - Chain [1] start processing
08:54:05 - cmdstanpy - INFO - Chain [1] done processing
08:54:06 - cmdstanpy - INFO - Chain [1] start processing
08:54:07 - cmdstanpy - INFO - Chain [1] done processing
08:54:07 - cmdstanpy - INFO - Chain [1] start processing
08:54:10 - cmdstanpy - INFO - Chain [1] done processing
08:54:10 - cmdstanpy - INFO - Chain [1] start processing
08:54:11 - cmdstanpy - INFO - Chain [1]

Results for so2
                     Model Pollutant      R  RMSE   MAE
0         Default Settings       so2  0.644  2.70  1.92
1        Very Smooth Trend       so2  0.585  3.17  2.29
2             Smooth Trend       so2  0.611  2.92  2.10
3       Mild Trend Changes       so2  0.644  2.70  1.92
4   Balanced Trend Changes       so2  0.660  2.66  1.88
5  Somewhat Flexible Trend       so2  0.559  3.76  2.61
6           Flexible Trend       so2  0.499  4.65  3.10
7    Highly Flexible Trend       so2  0.390  7.45  4.54
8    Many Possible Changes       so2  0.642  2.85  2.03
9       Very Complex Trend       so2  0.419  6.72  4.15


08:54:20 - cmdstanpy - INFO - Chain [1] start processing
08:54:20 - cmdstanpy - INFO - Chain [1] done processing
08:54:21 - cmdstanpy - INFO - Chain [1] start processing
08:54:21 - cmdstanpy - INFO - Chain [1] done processing
08:54:21 - cmdstanpy - INFO - Chain [1] start processing
08:54:22 - cmdstanpy - INFO - Chain [1] done processing
08:54:22 - cmdstanpy - INFO - Chain [1] start processing
08:54:22 - cmdstanpy - INFO - Chain [1] done processing
08:54:23 - cmdstanpy - INFO - Chain [1] start processing
08:54:23 - cmdstanpy - INFO - Chain [1] done processing
08:54:23 - cmdstanpy - INFO - Chain [1] start processing
08:54:24 - cmdstanpy - INFO - Chain [1] done processing
08:54:24 - cmdstanpy - INFO - Chain [1] start processing
08:54:26 - cmdstanpy - INFO - Chain [1] done processing
08:54:26 - cmdstanpy - INFO - Chain [1] start processing
08:54:26 - cmdstanpy - INFO - Chain [1] done processing
08:54:27 - cmdstanpy - INFO - Chain [1] start processing
08:54:29 - cmdstanpy - INFO - Chain [1]

Results for co
                     Model Pollutant      R  RMSE   MAE
0         Default Settings        co  0.424  2.04  1.61
1        Very Smooth Trend        co  0.463  1.97  1.52
2             Smooth Trend        co  0.458  1.98  1.54
3       Mild Trend Changes        co  0.424  2.04  1.61
4   Balanced Trend Changes        co  0.476  1.96  1.53
5  Somewhat Flexible Trend        co  0.481  1.96  1.52
6           Flexible Trend        co  0.486  1.95  1.51
7    Highly Flexible Trend        co  0.240  2.58  2.02
8    Many Possible Changes        co  0.473  1.97  1.53
9       Very Complex Trend        co  0.109  3.61  2.64
