# Variables With Prediction Modelling

Variables with future predictions are used to predict the Absorbance.

The variables are:
* Air temperature
* Daily Cumulated Rainfall
* Water Temperature
* Flow River (River Discharge in the Projections folder)


3 different models are used:
* XGBoost (linear booster, `gblinear`)
* LightGBM (gradient-boosted trees with linear models in the leaves)
* neural network (multi-layer perceptron)

In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Models
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor

# Utils
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

# Hyperparameter Optimization
import optuna

# MAPiE
from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score

# SHAP
import shap

plt.rcParams["font.size"] = 26

# Define paths

In [None]:
# All locations are expressed relative to the notebook's working directory.
data_folder = os.path.join("..", "..", "data", "tarragona")
raw_data_folder = os.path.join(data_folder, "raw_data")

# Future climate projections, in their processed and interpolated variants.
projection_folder = os.path.join(data_folder, "future_projections")
processed_projections_folder, interpolated_projections_folder = (
    os.path.join(projection_folder, variant)
    for variant in ("processed", "interpolated")
)

# Load dataset

In [None]:
# Read the raw historical dataset from its Excel workbook.
raw_dataset_path = os.path.join(raw_data_folder, "raw_full_dataset.xlsx")
full_df = pd.read_excel(raw_dataset_path)

In [None]:
full_df

In [None]:
full_df.isna().sum()

In [None]:
full_df = full_df.dropna()

# Remove Variables with no future projections

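They are:
- Nitrate
- pH
- Ammonium
- Dissolved Oxygen
- Conductivity
- Redox Potential

The cell below also drops Turbidity, the `cumulated_rainfall_24h`, `flowriver` and `environmental_temperature` columns, and the `is_outlier` flag.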

In [None]:
# Columns that are not used from this point on.
columns_to_remove = [
    "Nitrate (mg/L)",
    "pH",
    "Ammonium (mg/L)",
    "Dissolved Oxygen (mg/L)",
    "Conductivity (µS/cm)",
    "Redox Potential (mV)",
    "Turbidity (NTU)",
    "cumulated_rainfall_24h",
    "flowriver",
    "environmental_temperature",
    "is_outlier",
]
full_df.drop(columns=columns_to_remove, inplace=True)

## Take the Monthly Average

In [None]:
# Derive the calendar year and month from each sample's DateTime.
sample_dates = full_df["DateTime"].dt
full_df["Year"], full_df["Month"] = sample_dates.year, sample_dates.month

In [None]:
# Average every column within each (Year, Month) group, then turn the group
# keys back into ordinary columns.
monthly_groups = full_df.groupby(["Year", "Month"])
monthly_avg_df = monthly_groups.mean().reset_index()

In [None]:
full_df = monthly_avg_df

In [None]:
# Season encoded as an integer: 1 = Winter (Dec-Feb), 2 = Spring (Mar-May),
# 3 = Summer (Jun-Aug), 4 = Autumn (every other month value).
season_code_by_month = {
    12: 1, 1: 1, 2: 1,
    3: 2, 4: 2, 5: 2,
    6: 3, 7: 3, 8: 3,
}
full_df["Season"] = full_df["Month"].apply(
    lambda month: season_code_by_month.get(month, 4)
)

# POSIX timestamp (seconds) of each monthly-averaged DateTime.
full_df["Timestamp"] = full_df["DateTime"].apply(
    lambda moment: moment.timestamp()
)

### Split Data

In [None]:
# Engineered columns that are excluded from the model inputs.
to_drop = ["Timestamp", "Season", "Month"]

# Engineered columns that remain after the exclusion above.
further_features = [
    name
    for name in ("Year", "Season", "Month", "Timestamp")
    if name not in to_drop
]

# Inputs: every column except the excluded ones and the target.
X_columns_to_drop = [*to_drop, "UVA254"]
X = full_df[full_df.columns.difference(X_columns_to_drop)]

# Target, kept together with its date.
y = full_df[["DateTime", "UVA254"]]

In [None]:
# Degree-2 polynomial expansion of the numeric inputs (DateTime excluded).
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = pd.DataFrame(
    poly.fit_transform(X[X.columns.difference(["DateTime"])]),
    columns=poly.get_feature_names_out(X.columns.difference(["DateTime"])),
)

# Re-attach the dates positionally (.values ignores the index).
X_poly["DateTime"] = X["DateTime"].values

In [None]:
# Polynomial inputs and target, both without the DateTime column.
# NOTE(review): a later cell reassigns X_train_fit and y_train_fit from the
# non-polynomial features, so these values do not reach the models.
X_train_fit = X_poly.loc[:, X_poly.columns.difference(["DateTime"])]
y_train_fit = y.loc[:, y.columns.difference(["DateTime"])]

In [None]:
# Standardise the polynomial inputs and restore the column labels that the
# conversion to a NumPy array discards.
scaler = StandardScaler()
X_train_fit = pd.DataFrame(
    data=scaler.fit_transform(X_train_fit),
    columns=X_poly.columns.difference(["DateTime"]),
)

In [None]:
# Model inputs: the original (non-polynomial) features without the date.
X_train_fit = X.loc[:, X.columns.difference(["DateTime"])]
# Target frame; keeps DateTime next to UVA254.
y_train_fit = y.copy()

In [None]:
# Standardise the training inputs and restore the column labels that the
# conversion to a NumPy array discards.
scaler = StandardScaler()
X_train_fit = pd.DataFrame(
    data=scaler.fit_transform(X_train_fit),
    columns=X.columns.difference(["DateTime"]),
)

# Projections

## Load Input Projections

In [None]:
def _read_interpolated_projection(file_name):
    """Load one workbook from the interpolated projections folder."""
    return pd.read_excel(
        os.path.join(interpolated_projections_folder, file_name)
    )


# Air temperature
air_temp_rcp45_df = _read_interpolated_projection("air_temp_rcp45.xlsx")
air_temp_rcp85_df = _read_interpolated_projection("air_temp_rcp85.xlsx")

# Precipitation
rain_rcp45_df = _read_interpolated_projection("precipitation_rcp45.xlsx")
rain_rcp85_df = _read_interpolated_projection("precipitation_rcp85.xlsx")

# River flow
flow_rcp45_df = _read_interpolated_projection("flow_rcp45.xlsx")
flow_rcp85_df = _read_interpolated_projection("flow_rcp85.xlsx")

# Water temperature
water_temp_rcp45_df = _read_interpolated_projection("water_temp_rcp45.xlsx")
water_temp_rcp85_df = _read_interpolated_projection("water_temp_rcp85.xlsx")

## Create Datasets

### RCP 4.5

In [None]:
# Start from the air-temperature projection (renamed copy) and attach the
# other inputs positionally (.values ignores the source frames' index).
rcp45_df = air_temp_rcp45_df.rename(columns={"Value": "Air Temperature"})

projected_inputs = {
    "Daily Cumulated Rainfall (L/m²)": rain_rcp45_df["Rainfall (mm)"],
    "Flow River (m³/s)": flow_rcp45_df["Flow River Rate (m³/s)"],
    "Water Temperature (°C)": water_temp_rcp45_df["Water Temperature (°C)"],
}
for column_name, projected_series in projected_inputs.items():
    rcp45_df[column_name] = projected_series.values

rcp45_df = rcp45_df.drop(columns=["Unnamed: 0"])
rcp45_df["Year"] = rcp45_df["DateTime"].dt.year

In [None]:
rcp45_df

### RCP 8.5

In [None]:
# Start from the air-temperature projection (renamed copy) and attach the
# other inputs positionally (.values ignores the source frames' index).
rcp85_df = air_temp_rcp85_df.rename(columns={"Value": "Air Temperature"})

projected_inputs = {
    "Daily Cumulated Rainfall (L/m²)": rain_rcp85_df["Rainfall (mm)"],
    "Flow River (m³/s)": flow_rcp85_df["Flow River Rate (m³/s)"],
    "Water Temperature (°C)": water_temp_rcp85_df["Water Temperature (°C)"],
}
for column_name, projected_series in projected_inputs.items():
    rcp85_df[column_name] = projected_series.values

rcp85_df = rcp85_df.drop(columns=["Unnamed: 0"])
rcp85_df["Year"] = rcp85_df["DateTime"].dt.year

In [None]:
rcp85_df

## Predictions

### XGBoost

#### RCP 4.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Keep only the projection rows dated after the last historical record.
last_train_date = full_df["DateTime"].max()
rcp45_test_df = rcp45_df.loc[rcp45_df["DateTime"] > last_train_date]

In [None]:
# Degree-2 polynomial expansion of the projected inputs (DateTime excluded).
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = pd.DataFrame(
    poly.fit_transform(
        rcp45_test_df[rcp45_test_df.columns.difference(["DateTime"])]
    ),
    columns=poly.get_feature_names_out(
        rcp45_test_df.columns.difference(["DateTime"])
    ),
)

# Re-attach the dates positionally (.values ignores the index).
X_poly["DateTime"] = rcp45_test_df["DateTime"].values

In [None]:
X_test_fit = X_poly[X_poly.columns.difference(["DateTime"])]

In [None]:
# Standardise the polynomial projection inputs and restore column labels.
# NOTE(review): the next cell reassigns X_test_fit from the non-polynomial
# projection columns, so this result does not reach the models.
scaler = StandardScaler()
X_test_fit = pd.DataFrame(
    data=scaler.fit_transform(X_test_fit),
    columns=X_poly.columns.difference(["DateTime"]),
)

In [None]:
X_test_fit = rcp45_test_df[
    rcp45_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the projections with the statistics of the TRAINING features.
# Fitting a separate scaler on the projection data would re-centre every
# future feature to mean 0 / std 1 and hide any shift relative to the
# historical period the model was trained on.
train_features = X[X.columns.difference(["DateTime"])]
scaler = StandardScaler().fit(train_features)

# Select the projection columns by the training column names so that a
# missing or misnamed feature fails loudly here instead of at predict time.
X_test_fit = pd.DataFrame(
    scaler.transform(X_test_fit[train_features.columns]),
    columns=train_features.columns,
)

##### Train Model

In [None]:
# Put the columns of both matrices in the same (sorted) order.
X_train_fit = X_train_fit.loc[:, sorted(X_train_fit.columns)]
X_test_fit = X_test_fit.loc[:, sorted(X_test_fit.columns)]

In [None]:
# Hyper-parameters for the XGBoost linear booster.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both
# are set here with different values — confirm which one is intended.
best_params = dict(
    objective="reg:squarederror",
    booster="gblinear",
    eta=0.021222820197838683,
    reg_lambda=1.2716135487076726e-07,
    reg_alpha=0.25584966658518155,
    learning_rate=0.6602531811820622,
    n_estimators=369,
    updater="shotgun",
)

In [None]:
# Fit the XGBoost regressor on the standardised historical inputs.
booster = xgb.XGBRegressor(**best_params, random_state=42)
booster.fit(X=X_train_fit, y=y_train_fit["UVA254"])

##### Feature Importance

In [None]:
# Per-feature importances reported by the fitted booster, sign-flipped and
# sorted from largest to smallest.
# NOTE(review): the reason for negating feature_importances_ is not visible
# here — confirm it is intended.
feature_importance = pd.DataFrame(
    {
        "feature": booster.feature_names_in_,
        "importance": -booster.feature_importances_,
    }
).sort_values(by="importance", ascending=False)

# Bar chart, one bar per feature.
plt.figure(figsize=(25, 7.5))
plt.bar(feature_importance["feature"], feature_importance["importance"])
plt.xticks(rotation=45)  # tilt the feature names by 45°
plt.title("Feature Importance")
plt.show()

##### Predictions

In [None]:
# Wrap the booster in MAPIE to obtain prediction intervals.
# NOTE(review): the variable is called "cqr" but the wrapper is configured
# with method="naive".
mapie_cqr = MapieRegressor(
    estimator=booster, method="naive", random_state=42
)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

In [None]:
# add month to the test data to get the predictions
X_test_fit["Month"] = rcp45_test_df["DateTime"].dt.month.values

In [None]:
# Miscoverage level: 1 - alpha = 95% prediction intervals.
alpha = 0.05

# month number -> point predictions and interval bounds for that month
predictions_per_month = {}

for month in range(1, 13):
    # Rows of the projection matrix that fall in this calendar month,
    # without the helper "Month" column (it is not a model input).
    test_fit = X_test_fit[X_test_fit["Month"] == month]
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    # Month-specific names: y_med / y_lower / y_upper are reserved for the
    # full-horizon predictions and must not be overwritten by this loop.
    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
X_test_fit.drop(columns=["Month"], inplace=True)

In [None]:
# Point predictions and interval bounds over the whole projection horizon.
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# In-sample residuals: observed UVA254 minus the booster's fitted value.
train_predictions = booster.predict(X_train_fit)
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# Residuals against fitted values, with a dashed reference line at zero.
residual_fig, residual_ax = plt.subplots(figsize=(10, 5))
residual_ax.scatter(
    train_predictions, train_residuals, c="b", s=40, alpha=0.5
)
residual_ax.axhline(y=0, color="r", linestyle="--")
residual_ax.set_xlabel("Predicted Values")
residual_ax.set_ylabel("Residuals")
residual_ax.set_title("Training Set Residuals Plot")
plt.show()

In [None]:
# Histogram (with KDE overlay) of the training residuals.
hist_fig, hist_ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_residuals, kde=True, ax=hist_ax)
hist_ax.set_title("Training Set Residuals Distribution")
plt.show()

In [None]:
# Observed series and in-sample fitted values on a shared time axis.
plt.figure(figsize=(20, 10))
historical_dates = y["DateTime"]
sns.lineplot(x=historical_dates, y=y["UVA254"], label="Historical Data")
sns.lineplot(x=historical_dates, y=train_predictions, label="Fitted Values")

plt.show()

In [None]:
# Historical series followed by the projected values and their interval.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

projection_dates = rcp45_test_df["DateTime"]
sns.lineplot(x=projection_dates, y=y_med, label="Predicted Values")

# Shade the band between the lower and upper interval bounds.
plt.fill_between(
    projection_dates,
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - RCP 4.5")
plt.legend()
plt.show()

In [None]:
# Projected values split by calendar month, one colour per month, drawn on
# top of the historical series.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# one colour per calendar month
colors = sns.color_palette("husl", 12)

for month in range(1, 13):
    # Month-specific names: y_med / y_lower / y_upper hold the full-horizon
    # predictions that are stored afterwards and must not be overwritten.
    y_med_month = predictions_per_month[month]["y_med"]
    y_lower_month = predictions_per_month[month]["y_lower"]
    y_upper_month = predictions_per_month[month]["y_upper"]

    # Projection dates that fall in this calendar month.
    month_dates = rcp45_test_df.loc[
        rcp45_test_df["DateTime"].dt.month == month, "DateTime"
    ]

    sns.lineplot(
        x=month_dates,
        y=y_med_month,
        label=f"Predicted Values - Month {month}",
        color=colors[month - 1],
    )
    # Shade the band between the lower and upper interval bounds.
    plt.fill_between(
        month_dates,
        y_lower_month,
        y2=y_upper_month,
        alpha=0.2,
        label="95% Prediction Interval",
        color=colors[month - 1],
    )

# Figure-level decoration is set once: the figure shows all twelve months,
# so the title must not name a single month.
plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - Predictions per Month - RCP 4.5")
plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the XGBoost results under model-specific names.
# NOTE(review): the RCP 8.5 section assigns the same three names again, so
# the RCP 4.5 values are lost unless they are consumed before that point.
boost_pred, boost_lower_bound, boost_upper_bound = y_med, y_lower, y_upper

#### RCP 8.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Keep only the projection rows dated after the last historical record.
last_train_date = full_df["DateTime"].max()
rcp85_test_df = rcp85_df.loc[rcp85_df["DateTime"] > last_train_date]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(
    rcp85_test_df[rcp85_test_df.columns.difference(["DateTime"])]
)

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        rcp85_test_df.columns.difference(["DateTime"])
    ),
)

X_poly["DateTime"] = rcp85_test_df["DateTime"].values

In [None]:
X_test_fit = X_poly[X_poly.columns.difference(["DateTime"])]

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_test_fit = scaler.fit_transform(X_test_fit)

X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_poly.columns.difference(["DateTime"])
)

In [None]:
X_test_fit = rcp85_test_df[
    rcp85_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the projections with the statistics of the TRAINING features.
# Fitting a separate scaler on the projection data would re-centre every
# future feature to mean 0 / std 1 and hide any shift relative to the
# historical period the model was trained on.
train_features = X[X.columns.difference(["DateTime"])]
scaler = StandardScaler().fit(train_features)

# Select the projection columns by the training column names so that a
# missing or misnamed feature fails loudly here instead of at predict time.
X_test_fit = pd.DataFrame(
    scaler.transform(X_test_fit[train_features.columns]),
    columns=train_features.columns,
)

##### Train Model

In [None]:
# sort the columns
X_train_fit = X_train_fit.reindex(sorted(X_train_fit.columns), axis=1)
X_test_fit = X_test_fit.reindex(sorted(X_test_fit.columns), axis=1)

In [None]:
# Hyper-parameters for the XGBoost linear booster.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both
# are set here with different values — confirm which one is intended.
best_params = dict(
    objective="reg:squarederror",
    booster="gblinear",
    eta=0.021222820197838683,
    reg_lambda=1.2716135487076726e-07,
    reg_alpha=0.25584966658518155,
    learning_rate=0.6602531811820622,
    n_estimators=369,
    updater="shotgun",
)

In [None]:
# Fit the XGBoost regressor on the standardised historical inputs.
booster = xgb.XGBRegressor(**best_params, random_state=42)
booster.fit(X=X_train_fit, y=y_train_fit["UVA254"])

##### Feature Importance

In [None]:
# Per-feature importances reported by the fitted booster, sign-flipped and
# sorted from largest to smallest.
# NOTE(review): the reason for negating feature_importances_ is not visible
# here — confirm it is intended.
feature_importance = pd.DataFrame(
    {
        "feature": booster.feature_names_in_,
        "importance": -booster.feature_importances_,
    }
).sort_values(by="importance", ascending=False)

# Bar chart, one bar per feature.
plt.figure(figsize=(25, 7.5))
plt.bar(feature_importance["feature"], feature_importance["importance"])
plt.xticks(rotation=45)  # tilt the feature names by 45°
plt.title("Feature Importance")
plt.show()

##### Predictions

In [None]:
# Wrap the booster in MAPIE to obtain prediction intervals.
# NOTE(review): the variable is called "cqr" but the wrapper is configured
# with method="naive".
mapie_cqr = MapieRegressor(
    estimator=booster, method="naive", random_state=42
)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

In [None]:
# add month to the test data to get the predictions
X_test_fit["Month"] = rcp85_test_df["DateTime"].dt.month.values

In [None]:
# Miscoverage level: 1 - alpha = 95% prediction intervals.
alpha = 0.05

# month number -> point predictions and interval bounds for that month
predictions_per_month = {}

for month in range(1, 13):
    # Rows of the projection matrix that fall in this calendar month,
    # without the helper "Month" column (it is not a model input).
    test_fit = X_test_fit[X_test_fit["Month"] == month]
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    # Month-specific names: y_med / y_lower / y_upper are reserved for the
    # full-horizon predictions and must not be overwritten by this loop.
    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
X_test_fit.drop(columns=["Month"], inplace=True)

In [None]:
# Point predictions and interval bounds over the whole projection horizon.
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# In-sample residuals: observed UVA254 minus the booster's fitted value.
train_predictions = booster.predict(X_train_fit)
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y["DateTime"],
    y=y["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y["DateTime"], y=train_predictions, label="Fitted Values"
)

plt.show()

In [None]:
# Historical series followed by the RCP 8.5 projected values and interval.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# The predictions were computed from rcp85_test_df, so its dates (not the
# RCP 4.5 frame's) belong on the x axis.
projection_dates = rcp85_test_df["DateTime"]
sns.lineplot(x=projection_dates, y=y_med, label="Predicted Values")

# Shade the band between the lower and upper interval bounds.
plt.fill_between(
    projection_dates,
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - RCP 8.5")
plt.legend()
plt.show()

In [None]:
# RCP 8.5 projected values split by calendar month, one colour per month,
# drawn on top of the historical series.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# one colour per calendar month
colors = sns.color_palette("husl", 12)

for month in range(1, 13):
    # Month-specific names: y_med / y_lower / y_upper hold the full-horizon
    # predictions that are stored afterwards and must not be overwritten.
    y_med_month = predictions_per_month[month]["y_med"]
    y_lower_month = predictions_per_month[month]["y_lower"]
    y_upper_month = predictions_per_month[month]["y_upper"]

    # Dates come from the RCP 8.5 frame the predictions were computed from.
    month_dates = rcp85_test_df.loc[
        rcp85_test_df["DateTime"].dt.month == month, "DateTime"
    ]

    sns.lineplot(
        x=month_dates,
        y=y_med_month,
        label=f"Predicted Values - Month {month}",
        color=colors[month - 1],
    )
    # Shade the band between the lower and upper interval bounds.
    plt.fill_between(
        month_dates,
        y_lower_month,
        y2=y_upper_month,
        alpha=0.2,
        label="95% Prediction Interval",
        color=colors[month - 1],
    )

# Figure-level decoration is set once: the figure shows all twelve months
# of the RCP 8.5 scenario.
plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - Predictions per Month - RCP 8.5")
plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the XGBoost RCP 8.5 results under model-specific names.
# NOTE(review): these names were already assigned in the RCP 4.5 section;
# this assignment replaces those values.
boost_pred, boost_lower_bound, boost_upper_bound = y_med, y_lower, y_upper

### LightGBM (Gradient-Boosted Trees with Linear Regressors on leaves)

#### RCP 4.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Keep only the projection rows dated after the last historical record.
last_train_date = full_df["DateTime"].max()
rcp45_test_df = rcp45_df.loc[rcp45_df["DateTime"] > last_train_date]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(
    rcp45_test_df[rcp45_test_df.columns.difference(["DateTime"])]
)

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        rcp45_test_df.columns.difference(["DateTime"])
    ),
)

X_poly["DateTime"] = rcp45_test_df["DateTime"].values

In [None]:
X_test_fit = X_poly[X_poly.columns.difference(["DateTime"])]

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_test_fit = scaler.fit_transform(X_test_fit)

X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_poly.columns.difference(["DateTime"])
)

In [None]:
X_test_fit = rcp45_test_df[
    rcp45_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the projections with the statistics of the TRAINING features.
# Fitting a separate scaler on the projection data would re-centre every
# future feature to mean 0 / std 1 and hide any shift relative to the
# historical period the model was trained on.
train_features = X[X.columns.difference(["DateTime"])]
scaler = StandardScaler().fit(train_features)

# Select the projection columns by the training column names so that a
# missing or misnamed feature fails loudly here instead of at predict time.
X_test_fit = pd.DataFrame(
    scaler.transform(X_test_fit[train_features.columns]),
    columns=train_features.columns,
)

##### Train Model

In [None]:
# sort the columns
X_train_fit = X_train_fit.reindex(sorted(X_train_fit.columns), axis=1)
X_test_fit = X_test_fit.reindex(sorted(X_test_fit.columns), axis=1)

In [None]:
# Hyper-parameters for the LightGBM regressor.
# NOTE(review): LightGBM documents min_data_in_leaf / min_child_samples and
# bagging_fraction / subsample as aliases of each other; each pair is set
# here with two different values — confirm which ones are intended.
params = dict(
    n_estimators=16,
    learning_rate=0.6192800859019298,
    max_depth=16,
    num_leaves=20,
    min_data_in_leaf=34,
    lambda_l1=1.8585248563175933,
    lambda_l2=0.020368547806226774,
    min_split_gain=2.5,
    subsample=0.5639096844841955,
    bagging_fraction=0.026474369917739878,
    feature_fraction=0.0012608584366219668,
    min_child_samples=33,
    max_bin=20,
)

###### Train model with mapie

In [None]:
# NOTE(review): alpha is reassigned to 0.05 by the prediction cell before
# it is first used.
alpha = 0.1

# LightGBM regressor with a linear model in every leaf (linear_tree=True).
estimator = LGBMRegressor(
    objective="regression", random_state=42, linear_tree=True, **params
)
estimator.fit(X_train_fit, y_train_fit["UVA254"])

# Wrap the LightGBM estimator in MAPIE. Without this step `mapie_cqr` would
# still be the wrapper fitted around the XGBoost booster in the previous
# section, and every "LightGBM" prediction would come from XGBoost.
# Same wrapper configuration as the neural-network section (cv=10).
mapie_cqr = MapieRegressor(estimator, cv=10, random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

##### Feature Importance

In [None]:
# Per-feature importances reported by the fitted LightGBM estimator, sorted
# from largest to smallest.
feature_importance = pd.DataFrame(
    {
        "feature": estimator.feature_name_,
        "importance": estimator.feature_importances_,
    }
).sort_values(by="importance", ascending=False)

# Bar chart, one bar per feature.
plt.figure(figsize=(25, 7.5))
plt.bar(feature_importance["feature"], feature_importance["importance"])
plt.title("Feature Importance")
plt.show()

##### Predictions

In [None]:
# add month to the test data to get the predictions
X_test_fit["Month"] = rcp45_test_df["DateTime"].dt.month.values

In [None]:
# Miscoverage level: 1 - alpha = 95% prediction intervals.
alpha = 0.05

# month number -> point predictions and interval bounds for that month
predictions_per_month = {}

for month in range(1, 13):
    # Rows of the projection matrix that fall in this calendar month,
    # without the helper "Month" column (it is not a model input).
    test_fit = X_test_fit[X_test_fit["Month"] == month]
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
X_test_fit.drop(columns=["Month"], inplace=True)

In [None]:
# Point predictions and interval bounds over the whole projection horizon.
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# In-sample residuals: observed UVA254 minus the MAPIE point prediction
# (the interval part of the return value is discarded).
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# Observed series and in-sample fitted values on a shared time axis.
plt.figure(figsize=(20, 10))
training_dates = y_train_fit["DateTime"]
sns.lineplot(
    x=training_dates, y=y_train_fit["UVA254"], label="Historical Data"
)
sns.lineplot(x=training_dates, y=train_predictions, label="Fitted Values")

plt.show()

In [None]:
# Historical series followed by the RCP 4.5 projected values and interval.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

projection_dates = rcp45_test_df["DateTime"]
sns.lineplot(x=projection_dates, y=y_med, label="Predicted Values")

# Shade the band between the lower and upper interval bounds.
plt.fill_between(
    projection_dates,
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")
# The data plotted here is the RCP 4.5 projection (rcp45_test_df).
plt.title("UVA254 - RCP 4.5")
plt.legend()
plt.show()

In [None]:
# RCP 4.5 projected values split by calendar month, one colour per month,
# drawn on top of the historical series.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# one colour per calendar month
colors = sns.color_palette("husl", 12)

for month in range(1, 13):
    y_med_month = predictions_per_month[month]["y_med"]
    y_lower_month = predictions_per_month[month]["y_lower"]
    y_upper_month = predictions_per_month[month]["y_upper"]

    # Projection dates that fall in this calendar month.
    month_dates = rcp45_test_df.loc[
        rcp45_test_df["DateTime"].dt.month == month, "DateTime"
    ]

    sns.lineplot(
        x=month_dates,
        y=y_med_month,
        label=f"Predicted Values - Month {month}",
        color=colors[month - 1],
    )
    # Shade the band between the lower and upper interval bounds.
    plt.fill_between(
        month_dates,
        y_lower_month,
        y2=y_upper_month,
        alpha=0.2,
        label="95% Prediction Interval",
        color=colors[month - 1],
    )

# Figure-level decoration is set once: the figure shows all twelve months,
# so the title must not name a single month.
plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - Predictions per Month - RCP 4.5")
plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the results of this section under LightGBM-specific names.
# NOTE(review): the RCP 8.5 section assigns the same three names again, so
# the RCP 4.5 values are lost unless they are consumed before that point.
lgbm_pred, lgbm_lower_bound, lgbm_upper_bound = y_med, y_lower, y_upper

#### RCP 8.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Keep only the projection rows dated after the last historical record.
last_train_date = full_df["DateTime"].max()
rcp85_test_df = rcp85_df.loc[rcp85_df["DateTime"] > last_train_date]

In [None]:
# add polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

X_poly = poly.fit_transform(
    rcp85_test_df[rcp85_test_df.columns.difference(["DateTime"])]
)

X_poly = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(
        rcp85_test_df.columns.difference(["DateTime"])
    ),
)

X_poly["DateTime"] = rcp85_test_df["DateTime"].values

In [None]:
X_test_fit = X_poly[X_poly.columns.difference(["DateTime"])]

In [None]:
scaler = StandardScaler()
# scale the data and keep the column names
X_test_fit = scaler.fit_transform(X_test_fit)

X_test_fit = pd.DataFrame(
    X_test_fit, columns=X_poly.columns.difference(["DateTime"])
)

In [None]:
X_test_fit = rcp85_test_df[
    rcp85_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the projections with the statistics of the TRAINING features.
# Fitting a separate scaler on the projection data would re-centre every
# future feature to mean 0 / std 1 and hide any shift relative to the
# historical period the model was trained on.
train_features = X[X.columns.difference(["DateTime"])]
scaler = StandardScaler().fit(train_features)

# Select the projection columns by the training column names so that a
# missing or misnamed feature fails loudly here instead of at predict time.
X_test_fit = pd.DataFrame(
    scaler.transform(X_test_fit[train_features.columns]),
    columns=train_features.columns,
)

##### Train Model

In [None]:
# sort the columns
X_train_fit = X_train_fit.reindex(sorted(X_train_fit.columns), axis=1)
X_test_fit = X_test_fit.reindex(sorted(X_test_fit.columns), axis=1)

In [None]:
# Hyper-parameters for the LightGBM regressor.
# NOTE(review): LightGBM documents min_data_in_leaf / min_child_samples and
# bagging_fraction / subsample as aliases of each other; each pair is set
# here with two different values — confirm which ones are intended.
params = dict(
    n_estimators=16,
    learning_rate=0.6192800859019298,
    max_depth=16,
    num_leaves=20,
    min_data_in_leaf=34,
    lambda_l1=1.8585248563175933,
    lambda_l2=0.020368547806226774,
    min_split_gain=2.5,
    subsample=0.5639096844841955,
    bagging_fraction=0.026474369917739878,
    feature_fraction=0.0012608584366219668,
    min_child_samples=33,
    max_bin=20,
)

###### Train model with mapie

In [None]:
# NOTE(review): alpha is reassigned to 0.05 by the prediction cell before
# it is first used.
alpha = 0.1

# LightGBM regressor with a linear model in every leaf (linear_tree=True).
estimator = LGBMRegressor(
    objective="regression", random_state=42, linear_tree=True, **params
)
estimator.fit(X_train_fit, y_train_fit["UVA254"])

# Wrap the LightGBM estimator in MAPIE. Without this step `mapie_cqr` would
# still be a wrapper created in an earlier section rather than one built
# around the estimator fitted in this cell.
# Same wrapper configuration as the neural-network section (cv=10).
mapie_cqr = MapieRegressor(estimator, cv=10, random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

##### Feature Importance

In [None]:
# Per-feature importances reported by the fitted LightGBM estimator, sorted
# from largest to smallest.
feature_importance = pd.DataFrame(
    {
        "feature": estimator.feature_name_,
        "importance": estimator.feature_importances_,
    }
).sort_values(by="importance", ascending=False)

# Bar chart, one bar per feature.
plt.figure(figsize=(25, 7.5))
plt.bar(feature_importance["feature"], feature_importance["importance"])
plt.title("Feature Importance")
plt.show()

##### Predictions

In [None]:
# add month to the test data to get the predictions
X_test_fit["Month"] = rcp85_test_df["DateTime"].dt.month.values

In [None]:
# Miscoverage level: 1 - alpha = 95% prediction intervals.
alpha = 0.05

# month number -> point predictions and interval bounds for that month
predictions_per_month = {}

for month in range(1, 13):
    # Rows of the projection matrix that fall in this calendar month,
    # without the helper "Month" column (it is not a model input).
    test_fit = X_test_fit[X_test_fit["Month"] == month]
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
X_test_fit.drop(columns=["Month"], inplace=True)

In [None]:
# Point predictions and interval bounds over the whole projection horizon.
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# In-sample residuals: observed UVA254 minus the MAPIE point prediction
# (the interval part of the return value is discarded).
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# plot the training residuals
plt.figure(figsize=(10, 5))
plt.scatter(train_predictions, train_residuals, c="b", s=40, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Training Set Residuals Plot")
plt.show()

In [None]:
# plot the distribution of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# plot the time series fitted values
plt.figure(figsize=(20, 10))
sns.lineplot(
    x=y_train_fit["DateTime"],
    y=y_train_fit["UVA254"],
    label="Historical Data",
)
sns.lineplot(
    x=y_train_fit["DateTime"],
    y=train_predictions,
    label="Fitted Values",
)

plt.show()

In [None]:
# Historical series followed by the RCP 8.5 projected values and interval.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

projection_dates = rcp85_test_df["DateTime"]
sns.lineplot(x=projection_dates, y=y_med, label="Predicted Values")

# Shade the band between the lower and upper interval bounds.
plt.fill_between(
    projection_dates,
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)

plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - RCP 8.5")
plt.legend()
plt.show()

In [None]:
# RCP 8.5 projected values split by calendar month, one colour per month,
# drawn on top of the historical series.
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# one colour per calendar month
colors = sns.color_palette("husl", 12)

for month in range(1, 13):
    y_med_month = predictions_per_month[month]["y_med"]
    y_lower_month = predictions_per_month[month]["y_lower"]
    y_upper_month = predictions_per_month[month]["y_upper"]

    # Projection dates that fall in this calendar month.
    month_dates = rcp85_test_df.loc[
        rcp85_test_df["DateTime"].dt.month == month, "DateTime"
    ]

    sns.lineplot(
        x=month_dates,
        y=y_med_month,
        label=f"Predicted Values - Month {month}",
        color=colors[month - 1],
    )
    # Shade the band between the lower and upper interval bounds.
    plt.fill_between(
        month_dates,
        y_lower_month,
        y2=y_upper_month,
        alpha=0.2,
        label="95% Prediction Interval",
        color=colors[month - 1],
    )

# Figure-level decoration is set once: the figure shows all twelve months
# of the RCP 8.5 scenario (the data comes from rcp85_test_df).
plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - Predictions per Month - RCP 8.5")
plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the RCP 8.5 results of this section under LightGBM-specific names.
# NOTE(review): these names were already assigned in the RCP 4.5 section;
# this assignment replaces those values.
lgbm_pred, lgbm_lower_bound, lgbm_upper_bound = y_med, y_lower, y_upper

### Neural Network

#### RCP 4.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Keep only the projection rows dated after the last historical record.
last_train_date = full_df["DateTime"].max()
rcp45_test_df = rcp45_df.loc[rcp45_df["DateTime"] > last_train_date]

In [None]:
# Degree-2 polynomial expansion of the RCP 4.5 projection features
# (every column except the timestamp).
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

X_poly = pd.DataFrame(
    poly.fit_transform(
        rcp45_test_df[rcp45_test_df.columns.difference(["DateTime"])]
    ),
    # label the expanded columns with the names generated by the transformer
    columns=poly.get_feature_names_out(
        rcp45_test_df.columns.difference(["DateTime"])
    ),
)

# re-attach the timestamps positionally (the new frame has a fresh index)
X_poly["DateTime"] = rcp45_test_df["DateTime"].values

In [None]:
# Model inputs: every polynomial column except the timestamp
X_test_fit = X_poly.loc[:, X_poly.columns.difference(["DateTime"])]

In [None]:
# Standardise the polynomial features and put the column labels back on
# the resulting array.
# NOTE(review): the scaler is fitted on the projection data itself here;
# confirm this matches the scaling that was applied to the training features.
scaler = StandardScaler()
X_test_fit = pd.DataFrame(
    scaler.fit_transform(X_test_fit),
    columns=X_poly.columns.difference(["DateTime"]),
)

In [None]:
# Rebuild the model inputs from the plain RCP 4.5 features (no timestamp).
# This replaces the scaled polynomial matrix assigned to X_test_fit earlier.
X_test_fit = rcp45_test_df.loc[
    :, rcp45_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the RCP 4.5 features and restore the column labels.
# NOTE(review): the scaler is fitted on the projection data itself here;
# confirm this matches the scaling that was applied to the training features.
scaler = StandardScaler()
X_test_fit = pd.DataFrame(
    scaler.fit_transform(X_test_fit),
    columns=rcp45_test_df.columns.difference(["DateTime"]),
)

##### Train the model

In [None]:
# Hyperparameters for the neural network. The n_layers / n_units_* entries
# describe the hidden layers and are converted to hidden_layer_sizes (and
# removed from this dict) before the model is built.
best_params = dict(
    n_layers=2,
    n_units_0=85,
    n_units_1=75,
    activation="identity",
    solver="adam",
    alpha=0.7765540584565614,
    learning_rate="constant",
    power_t=0.3382710741601535,
    beta_1=0.19887581875693028,
    beta_2=0.984060053664114,
    epsilon=0.32827083622604075,
)

In [None]:
# Turn the per-layer unit counts into the list passed as hidden_layer_sizes,
# removing the architecture keys so the remaining entries of best_params can
# be forwarded to the model as keyword arguments.
# Note: this mutates best_params, so the cell cannot be re-run without
# re-creating the dict first.
hidden_layer_sizes = []
for layer_idx in range(best_params["n_layers"]):
    hidden_layer_sizes.append(best_params.pop(f"n_units_{layer_idx}"))

best_params.pop("n_layers")

###### Train model with mapie

In [None]:
# NOTE(review): this alpha is reassigned to 0.05 in the prediction cell
# before any interval is computed.
alpha = 0.1

# Multi-layer perceptron built from the tuned hyperparameters
estimator = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    early_stopping=True,
    max_iter=1000,
    random_state=42,
    **best_params,
)

# Fit on the training data; this fitted estimator is the one explained by
# SHAP in the feature-importance cells.
estimator.fit(X_train_fit, y_train_fit["UVA254"])

In [None]:
# Wrap the network in MAPIE (10 cross-validation folds) and fit the wrapper
# on the training data so it can return prediction intervals.
# NOTE(review): the variable is named *_cqr, but the wrapper used here is a
# plain MapieRegressor.
mapie_cqr = MapieRegressor(estimator=estimator, cv=10, random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

##### Feature Importance

In [None]:
# Kernel SHAP explainer around the fitted network's predict function, with
# the training features as background data.
# NOTE(review): passing the full training set as background can be slow;
# consider whether a summarised background would be sufficient.
explainer = shap.KernelExplainer(
    estimator.predict, X_train_fit, link="identity"
)

# SHAP values for every projection row (nsamples=100 per explained row)
shap_values = explainer.shap_values(X_test_fit, nsamples=100)

shap.initjs()

In [None]:
# Summary plot of the SHAP values over all projection rows
plt.figure(figsize=(20, 10))

shap.summary_plot(
    shap_values,
    features=X_test_fit,
    feature_names=list(X_test_fit.columns),
)

In [None]:
# Force plot explaining the first projection row
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0],
    features=X_test_fit.iloc[0],
    feature_names=list(X_test_fit.columns),
)

##### Prediction

In [None]:
# Temporary "Month" column so the projections can be split per calendar
# month; values are assigned positionally from the projection timestamps.
X_test_fit["Month"] = rcp45_test_df["DateTime"].dt.month.to_numpy()

In [None]:
# Miscoverage level passed to MAPIE (0.05 corresponds to the 95% label used
# in the plots).
alpha = 0.05

# month number (1-12) -> dict with the predictions and interval bounds
predictions_per_month = {}

# Predict each calendar month of the RCP 4.5 projections separately.
# (The stray, discarded row selection on rcp45_test_df that used to sit in
# this loop had no effect and has been removed.)
for month in range(1, 13):
    test_fit = X_test_fit[X_test_fit["Month"] == month]

    # drop the helper month column: it is not a model feature
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    # second axis: 0 = lower bound, 1 = upper bound; third axis: the single
    # alpha requested above
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
# Remove the helper column again so X_test_fit only holds model features
del X_test_fit["Month"]

In [None]:
# Predict the whole projection period in one call
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)

# lower / upper interval bounds for the single requested alpha
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# Predictions on the training features (the intervals are not needed here)
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)

# residual = observed - predicted
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# Residuals against predicted values for the training set
plt.figure(figsize=(10, 5))
plt.scatter(
    x=train_predictions.flatten(),
    y=train_residuals,
    c="b",
    s=40,
    alpha=0.5,
)
# zero line as a visual reference
plt.axhline(y=0, color="r", linestyle="--")
plt.title("Training Set Residuals Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Histogram (with a KDE curve) of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(data=train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# Observed vs. fitted UVA254 over the training period
plt.figure(figsize=(20, 10))

for fit_label, fit_series in (
    ("Historical Data", y_train_fit["UVA254"]),
    ("Fitted Values", train_predictions),
):
    sns.lineplot(x=y_train_fit["DateTime"], y=fit_series, label=fit_label)

plt.show()

In [None]:
# Neural-network predictions for the whole RCP 4.5 projection period,
# together with the historical series
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")
sns.lineplot(
    x=rcp45_test_df["DateTime"],
    y=y_med,
    label="Predicted Values",
)

# shaded band between the lower and upper interval bounds
plt.fill_between(
    rcp45_test_df["DateTime"],
    y_lower,
    y2=y_upper,
    color="g",
    alpha=0.2,
    label="95% Prediction Interval",
)

plt.title("UVA254 - RCP 4.5")
plt.xlabel("DateTime")
plt.ylabel("UVA254")

plt.legend()
plt.show()

In [None]:
# March and September predictions for RCP 4.5 on top of the historical
# series
plt.figure(figsize=(20, 10))

sns.lineplot(x=y["DateTime"], y=y["UVA254"], label="Historical Data")

# twelve-colour palette so that every month keeps its own colour
colors = sns.color_palette("husl", 12)

min_max_months = {3: "March", 9: "September"}

for month, name in min_max_months.items():
    y_med_month = predictions_per_month[month]["y_med"]
    # the lower bound can be negative, so it is clipped at zero
    y_lower_month = np.maximum(predictions_per_month[month]["y_lower"], 0)
    y_upper_month = predictions_per_month[month]["y_upper"]

    # predicted values for this month
    sns.lineplot(
        x=rcp45_test_df.loc[
            rcp45_test_df["DateTime"].dt.month == month, "DateTime"
        ],
        y=y_med_month,
        color=colors[month - 1],
        label=f"Predicted Values - {name}",
    )
    # shaded band between the lower and upper interval bounds
    plt.fill_between(
        rcp45_test_df.loc[
            rcp45_test_df["DateTime"].dt.month == month, "DateTime"
        ],
        y_lower_month,
        y2=y_upper_month,
        color=colors[month - 1],
        alpha=0.2,
        label="95% Prediction Interval",
    )
    plt.xlabel("DateTime")
    plt.ylabel("UVA254")

    plt.title("UVA254 - Min/Max Months - RCP 4.5")

    plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the RCP 4.5 predictions and interval bounds under network-specific
# names.
# NOTE(review): the RCP 8.5 section assigns to the same three names later,
# so these values are replaced once that section runs.
nn_pred, nn_lower_bound, nn_upper_bound = y_med, y_lower, y_upper

#### RCP 8.5

##### Prepare Data

In [None]:
further_features = ["Year"]

# Most recent timestamp available in the historical dataset
last_train_date = full_df["DateTime"].max()

# Keep only the RCP 8.5 projection rows dated after the historical data
rcp85_test_df = rcp85_df.loc[rcp85_df["DateTime"] > last_train_date]

In [None]:
# Degree-2 polynomial expansion of the RCP 8.5 projection features
# (every column except the timestamp).
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

X_poly = pd.DataFrame(
    poly.fit_transform(
        rcp85_test_df[rcp85_test_df.columns.difference(["DateTime"])]
    ),
    # label the expanded columns with the names generated by the transformer
    columns=poly.get_feature_names_out(
        rcp85_test_df.columns.difference(["DateTime"])
    ),
)

# re-attach the timestamps positionally (the new frame has a fresh index)
X_poly["DateTime"] = rcp85_test_df["DateTime"].values

In [None]:
# Model inputs: every polynomial column except the timestamp
X_test_fit = X_poly.loc[:, X_poly.columns.difference(["DateTime"])]

In [None]:
# Standardise the polynomial features and put the column labels back on
# the resulting array.
# NOTE(review): the scaler is fitted on the projection data itself here;
# confirm this matches the scaling that was applied to the training features.
scaler = StandardScaler()
X_test_fit = pd.DataFrame(
    scaler.fit_transform(X_test_fit),
    columns=X_poly.columns.difference(["DateTime"]),
)

In [None]:
# Rebuild the model inputs from the plain RCP 8.5 features (no timestamp).
# This replaces the scaled polynomial matrix assigned to X_test_fit earlier.
X_test_fit = rcp85_test_df.loc[
    :, rcp85_test_df.columns.difference(["DateTime"])
]

In [None]:
# Standardise the RCP 8.5 features and restore the column labels.
# NOTE(review): the scaler is fitted on the projection data itself here;
# confirm this matches the scaling that was applied to the training features.
scaler = StandardScaler()
X_test_fit = pd.DataFrame(
    scaler.fit_transform(X_test_fit),
    columns=rcp85_test_df.columns.difference(["DateTime"]),
)

##### Train the model

In [None]:
# Hyperparameters for the neural network. The n_layers / n_units_* entries
# describe the hidden layers and are converted to hidden_layer_sizes (and
# removed from this dict) before the model is built.
best_params = dict(
    n_layers=2,
    n_units_0=85,
    n_units_1=75,
    activation="identity",
    solver="adam",
    alpha=0.7765540584565614,
    learning_rate="constant",
    power_t=0.3382710741601535,
    beta_1=0.19887581875693028,
    beta_2=0.984060053664114,
    epsilon=0.32827083622604075,
)

In [None]:
# Turn the per-layer unit counts into the list passed as hidden_layer_sizes,
# removing the architecture keys so the remaining entries of best_params can
# be forwarded to the model as keyword arguments.
# Note: this mutates best_params, so the cell cannot be re-run without
# re-creating the dict first.
hidden_layer_sizes = []
for layer_idx in range(best_params["n_layers"]):
    hidden_layer_sizes.append(best_params.pop(f"n_units_{layer_idx}"))

best_params.pop("n_layers")

###### Train model with mapie

In [None]:
# Miscoverage level used for the prediction intervals
alpha = 0.05

# Multi-layer perceptron built from the tuned hyperparameters
estimator = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes,
    early_stopping=True,
    max_iter=1000,
    random_state=42,
    **best_params,
)

# The estimator is intentionally not fitted in this cell; it is handed
# unfitted to the MAPIE wrapper, which is fitted on the training data next.

In [None]:
# Wrap the network in MAPIE and fit the wrapper on the training data so it
# can return prediction intervals.
# cv=10 is set explicitly to match the RCP 4.5 section: both scenarios are
# fitted on the same training data, so using the same cross-validation
# setup keeps their prediction intervals comparable.
# NOTE(review): the variable is named *_cqr, but the wrapper used here is a
# plain MapieRegressor.
mapie_cqr = MapieRegressor(estimator, cv=10, random_state=42)
mapie_cqr.fit(X_train_fit, y_train_fit["UVA254"])

##### Prediction

In [None]:
# Temporary "Month" column so the projections can be split per calendar
# month; values are assigned positionally from the projection timestamps.
X_test_fit["Month"] = rcp85_test_df["DateTime"].dt.month.to_numpy()

In [None]:
# Miscoverage level passed to MAPIE (0.05 corresponds to the 95% label used
# in the plots).
alpha = 0.05

# month number (1-12) -> dict with the predictions and interval bounds
predictions_per_month = {}

# Predict each calendar month of the RCP 8.5 projections separately.
# (The stray, discarded row selection that indexed rcp45_test_df with an
# RCP 8.5 mask had no effect and has been removed.)
for month in range(1, 13):
    test_fit = X_test_fit[X_test_fit["Month"] == month]

    # drop the helper month column: it is not a model feature
    test_fit = test_fit[test_fit.columns.difference(["Month"])]

    y_med_month, y_pis_cqr = mapie_cqr.predict(test_fit, alpha=alpha)
    # second axis: 0 = lower bound, 1 = upper bound; third axis: the single
    # alpha requested above
    y_lower_month = y_pis_cqr[:, 0, 0]
    y_upper_month = y_pis_cqr[:, 1, 0]

    predictions_per_month[month] = {
        "y_med": y_med_month,
        "y_lower": y_lower_month,
        "y_upper": y_upper_month,
    }

In [None]:
# Remove the helper column again so X_test_fit only holds model features
del X_test_fit["Month"]

In [None]:
# Predict the whole projection period in one call
y_med, y_pis_cqr = mapie_cqr.predict(X_test_fit, alpha=alpha)

# lower / upper interval bounds for the single requested alpha
y_lower, y_upper = y_pis_cqr[:, 0, 0], y_pis_cqr[:, 1, 0]

##### Plots

In [None]:
# Predictions on the training features (the intervals are not needed here)
train_predictions, _ = mapie_cqr.predict(X_train_fit, alpha=alpha)

# residual = observed - predicted
train_residuals = y_train_fit["UVA254"].sub(train_predictions)

In [None]:
# Residuals against predicted values for the training set
plt.figure(figsize=(10, 5))
plt.scatter(
    x=train_predictions.flatten(),
    y=train_residuals,
    c="b",
    s=40,
    alpha=0.5,
)
# zero line as a visual reference
plt.axhline(y=0, color="r", linestyle="--")
plt.title("Training Set Residuals Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Histogram (with a KDE curve) of the training residuals
plt.figure(figsize=(10, 5))
sns.histplot(data=train_residuals, kde=True)
plt.title("Training Set Residuals Distribution")
plt.show()

In [None]:
# Observed vs. fitted UVA254 over the training period
plt.figure(figsize=(20, 10))

for fit_label, fit_series in (
    ("Historical Data", y_train_fit["UVA254"]),
    ("Fitted Values", train_predictions),
):
    sns.lineplot(x=y_train_fit["DateTime"], y=fit_series, label=fit_label)

plt.show()

In [None]:
# Neural-network predictions for the whole RCP 8.5 projection period,
# together with the historical series
plt.figure(figsize=(20, 10))

sns.lineplot(
    x=y["DateTime"],
    y=y["UVA254"],
    label="Historical Data",
)

# Clip the lower bound at zero. The clipped array replaces y_lower, so the
# cells that run after this one see the clipped values as well.
y_lower = np.maximum(y_lower, 0)

# The x axis uses the RCP 8.5 timestamps: y_med / y_lower / y_upper were
# predicted from rcp85_test_df, not from the RCP 4.5 frame.
sns.lineplot(
    x=rcp85_test_df["DateTime"], y=y_med, label="Predicted Values"
)
# shaded band between the lower and upper interval bounds
plt.fill_between(
    rcp85_test_df["DateTime"],
    y_lower,
    y2=y_upper,
    alpha=0.2,
    label="95% Prediction Interval",
    color="g",
)
plt.xlabel("DateTime")
plt.ylabel("UVA254")

plt.title("UVA254 - RCP 8.5")

plt.legend()
plt.show()

In [None]:
# March and September predictions for RCP 8.5 on top of the historical
# series
plt.figure(figsize=(20, 10))

sns.lineplot(
    x=y["DateTime"],
    y=y["UVA254"],
    label="Historical Data",
)

# twelve-colour palette so that every month keeps its own colour
colors = sns.color_palette("husl", 12)

min_max_months = {3: "March", 9: "September"}

for month, name in min_max_months.items():
    y_med_month = predictions_per_month[month]["y_med"]
    y_lower_month = predictions_per_month[month]["y_lower"]
    y_upper_month = predictions_per_month[month]["y_upper"]

    # y_lower values cannot be negative
    y_lower_month = np.maximum(y_lower_month, 0)

    # Timestamps of the RCP 8.5 rows in this month. The dates are taken from
    # rcp85_test_df itself rather than indexing the RCP 4.5 frame with an
    # RCP 8.5 mask.
    month_dates = rcp85_test_df.loc[
        rcp85_test_df["DateTime"].dt.month == month, "DateTime"
    ]

    # predicted values for this month
    sns.lineplot(
        x=month_dates,
        y=y_med_month,
        label=f"Predicted Values - {name}",
        color=colors[month - 1],
    )
    # shaded band between the lower and upper interval bounds
    plt.fill_between(
        month_dates,
        y_lower_month,
        y2=y_upper_month,
        alpha=0.2,
        label="95% Prediction Interval",
        color=colors[month - 1],
    )

# figure-level decorations, set once; the title mirrors the RCP 4.5 figure
plt.xlabel("DateTime")
plt.ylabel("UVA254")
plt.title("UVA254 - Min/Max Months - RCP 8.5")
plt.legend()
plt.show()

##### Store Results

In [None]:
# Keep the RCP 8.5 predictions and interval bounds under network-specific
# names.
# NOTE(review): these are the same names the RCP 4.5 section assigns to, so
# the RCP 4.5 values are replaced here.
nn_pred, nn_lower_bound, nn_upper_bound = y_med, y_lower, y_upper