# Series analysis

## Import libraries

In [None]:
import subprocess
from pathlib import Path
import importlib.util
import sys

# Detect if running on Google Colab
def in_colab():
    """Return True when the ``google.colab`` package can be located.

    ``importlib.util.find_spec`` imports the parent package of a dotted
    name, so on a machine where ``google`` itself is not installed it raises
    ``ModuleNotFoundError`` rather than returning ``None``. That case is
    reported as "not on Colab" instead of crashing the first cell.
    """
    try:
        return importlib.util.find_spec("google.colab") is not None
    except (ImportError, ValueError):
        # ImportError: the parent package `google` is missing.
        # ValueError: a module without a usable `__spec__` was found.
        return False

# Set base directory and handle environment
if in_colab():
    def install(package):
        """Install *package* with pip into the interpreter running this notebook."""
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # Install required packages only if not already installed
    if importlib.util.find_spec("utilsforecast") is None:
        install("utilsforecast")

    # Mount Google Drive
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')

    # Set base directory to your Drive project folder
    BASE_DIR = Path('/content/drive/MyDrive/heat-forecast')

    # Add `src/` to sys.path for custom package imports
    src_path = BASE_DIR / 'src'
    if str(src_path) not in sys.path:
        sys.path.append(str(src_path))

else:
    # Local/VM setup: assume notebook is in project_root/notebooks/
    BASE_DIR = Path.cwd().parent


In [None]:
# --- IPython Magic ---
%load_ext autoreload
%autoreload 2

# --- Standard Library ---
import sys
import logging

# --- Scientific Computing & Data Handling ---
import numpy as np
import pandas as pd
pd.set_option('display.float_format', '{:.3f}'.format)

# --- Plotting & Visualization ---
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from cycler import cycler

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# --- Statistics & Modeling ---
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr, kendalltau

# --- Machine Learning & Dimensionality Reduction ---
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# --- Forecasting & Preprocessing ---
from utilsforecast.preprocessing import fill_gaps

# --- Custom Modules ---
from heat_forecast.utils.plotting import (
    configure_time_axes, plot_weekly_seasonality, plot_daily_seasonality,
    display_scrollable
)
from heat_forecast.utils.transforms import make_is_winter

# --- Plotting Configuration ---
# Cells that check this flag draw Plotly figures when it is True and
# Matplotlib figures when it is False.
interactive = False
plt.style.use("ggplot")
palette = sns.color_palette("tab10", 5)

# Font sizes shared by every Matplotlib figure in the notebook.
font_sizes = {
    'font.size': 14,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.titlesize': 18
}
plt.rcParams.update(font_sizes)

# Default line colour cycle (black only) and grid on both major and minor ticks.
mpl.rcParams.update({
    'axes.prop_cycle': cycler(color=["#000000", "#000000"]),
    'axes.grid': True,
    'axes.grid.which': 'both',
})

# --- Logging Configuration ---
# `force=True` replaces any handlers that were installed earlier, and the
# records are sent to stdout.
log_settings = dict(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%H:%M:%S",
    force=True,
    stream=sys.stdout
)
logging.basicConfig(**log_settings)



## Description and visualization of the target series

Import the target series.

In [None]:
# Read each facility's heat-demand CSV, rename its two columns to `ds`/`y`,
# tag it with a `unique_id`, and stack everything into one long frame.
# The frames are collected in a list and concatenated once, which avoids
# re-copying the accumulated data on every iteration; the loop variable no
# longer shadows the builtin `id`.
facility_frames = []
for facility_no in range(1, 6):
    path = BASE_DIR / "data" / "timeseries" / f"impianto{facility_no}" / f"impianto{facility_no}_heat_demand.csv"
    df = pd.read_csv(path, sep=';', parse_dates=['timestamp'])
    logging.info(f'Processing data for facility {facility_no}')
    df.columns = ['ds', 'y']
    df['unique_id'] = f'F{facility_no}'
    df = fill_gaps(df, freq='h') # fill missing timestamps in ds using nans
    facility_frames.append(df)
heat_df = pd.concat(facility_frames, ignore_index=True)
# Drop any timezone information so `ds` is timezone-naive.
heat_df['ds'] = heat_df['ds'].dt.tz_localize(None)

Plot of the target series:

In [None]:
# One stacked panel per facility; the y-axis is shared across panels.
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(15, 15), sharex=False, sharey=True)
per_facility = heat_df.groupby('unique_id')
for panel_no, (uid, series_df) in enumerate(per_facility):
    panel = axes[panel_no]
    sns.lineplot(data=series_df, x='ds', y='y', ax=panel, label=uid, color=palette[panel_no])
    panel.set_title(f'{uid}', fontsize=16)

configure_time_axes(axes, heat_df['ds'])

# Figure-level title and axis labels, then tighten the layout inside a small margin.
fig.suptitle('Heat Demand for each id')
fig.supxlabel('Date time [H]')
fig.supylabel('Heat Demand [kWh]')
layout_box = [0.01, 0.01, 0.99, 0.99]
fig.tight_layout(rect=layout_box)

To inspect the time series further, specify a period to zoom in on a specific date range:

In [None]:
# Hourly timestamps of the window to zoom in on.
period = pd.date_range(start='2023-10-01', end='2024-05-01', freq='h')

# Keep only the rows whose timestamp falls inside the window.
mask = heat_df['ds'].isin(period)
windowed_df = heat_df.loc[mask]

# One panel per facility, sharing the y-axis.
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(15, 20), sharey=True)
axes = axes.flatten()
for panel_no, (uid, series_df) in enumerate(windowed_df.groupby('unique_id')):
    panel = axes[panel_no]
    sns.lineplot(data=series_df, x='ds', y='y', ax=panel, label=uid, color=palette[panel_no])
    panel.set_title(f'{uid}', fontsize=16)

configure_time_axes(axes, period)

fig.suptitle('Heat Demand for a specified period')
fig.supxlabel('Date')
fig.supylabel('Heat Demand [kWh]')
layout_box = [0.01, 0.01, 0.99, 0.99]
fig.tight_layout(rect=layout_box)

Show basic dataset summary.

In [None]:
def summarize(df, name, target_col='y', zero_count=True):
    """Log a header and display per-facility descriptive statistics.

    Parameters
    ----------
    df : DataFrame containing `unique_id`, `ds` and `target_col` columns.
    name : label inserted into the logged header line.
    target_col : name of the column to summarise.
    zero_count : when True, append a column counting values equal to zero.

    The resulting table (one row per `unique_id`) is shown with `display`;
    the function returns nothing.
    """
    logging.info(f"\nBasic series information: {name}")

    quantile_levels = {"p05": 0.05, "p25": 0.25, "p50": 0.50, "p75": 0.75, "p95": 0.95}

    # Named aggregations are added in the order the columns should appear.
    named_aggs = {stat: (target_col, stat) for stat in ("count", "mean", "std", "min")}
    for label, level in quantile_levels.items():
        # Bind `level` as a default so each lambda keeps its own quantile.
        named_aggs[label] = (target_col, lambda s, level=level: s.quantile(level))
    named_aggs["max"] = (target_col, "max")
    named_aggs["missing_count"] = (target_col, lambda s: s.isna().sum())
    named_aggs["start"] = ("ds", "min")
    named_aggs["end"] = ("ds", "max")

    if zero_count:
        named_aggs["zero_count"] = (target_col, lambda s: (s == 0).sum())

    display(df.groupby("unique_id").agg(**named_aggs))

# Full year
summarize(heat_df, "all year")

# Month number of every observation, reused for both semester filters.
month_of_obs = heat_df['ds'].dt.month

# Cold semester (Nov–Apr)
cold_months = [11, 12, 1, 2, 3, 4]
heat_cold_df = heat_df[month_of_obs.isin(cold_months)]
summarize(heat_cold_df, "coldest semester (Nov-Apr)")

# Warm semester (May–Oct)
warm_months = [5, 6, 7, 8, 9, 10]
heat_warm_df = heat_df[month_of_obs.isin(warm_months)]
summarize(heat_warm_df, "warmest semester (May-Oct)")


In [None]:
# Raise every demand value below 5 kWh to 5 kWh.
min_threshold = 5.0

# Count values below the threshold, per facility and in total.
below_threshold = heat_df[heat_df['y'] < min_threshold]
total_count = len(below_threshold)
low_values_by_facility = below_threshold.groupby('unique_id')['y'].agg(low_count='count').T
low_values_by_facility['total'] = total_count
logging.info("Low values by facility:")
display(low_values_by_facility)

# Apply the capping on a copy, then rebind heat_df to the capped data.
heat_df_capped = heat_df.copy()
needs_capping = heat_df_capped['y'] < min_threshold
heat_df_capped.loc[needs_capping, 'y'] = min_threshold
heat_df = heat_df_capped.copy()
logging.info("Capping applied.")

# Sanity check: the same count on the capped data.
below_threshold = heat_df[heat_df['y'] < min_threshold]
total_count = len(below_threshold)
low_values_by_facility = below_threshold.groupby('unique_id')['y'].agg(low_count='count').T
low_values_by_facility['total'] = total_count
logging.info("Sanity check, low values by facility:")
display(low_values_by_facility)


In [None]:
# Weekly seasonality of the heat demand, grouped by month (the month number
# is shown on the right side of the plot) and restricted to the coldest months.
# NOTE(review): the return value is stored in `lam` but is not read again in
# the visible part of this notebook — confirm what plot_weekly_seasonality returns.
lam = plot_weekly_seasonality(heat_df, only_cold_months=True, make_is_winter=make_is_winter)

The function `plot_daily_seasonality` below is analogous to the previous one, but it displays daily load profiles instead of weekly patterns.

In [None]:
# Daily seasonality of the heat demand, grouped by month (the month number
# is shown on the right side of the plot) and restricted to the coldest months.
# NOTE(review): this rebinds `lam`, replacing the value returned by the
# weekly-seasonality call — confirm that the earlier value is not needed.
lam = plot_daily_seasonality(heat_df, only_cold_months=True, make_is_winter=make_is_winter)

A closer examination of the weekly seasonal patterns reveals that F2 is the only series without a clear weekly structure, while F3 and F5 exhibit the most pronounced weekly seasonality.

### Plot of aggregated y series

Aggregate the series to the daily level by averaging.

In [None]:
# Daily mean of the numeric columns, computed separately for each facility.
hourly_by_facility = heat_df.groupby('unique_id')
daily_bins = hourly_by_facility.resample('D', on='ds', include_groups=False)
daily_means = daily_bins.mean(numeric_only=True)
# Bring ds and unique_id back as ordinary columns.
heat_daily_df = daily_means.reset_index()

Plot of the daily-aggregated target series across all years.

In [None]:
# One stacked panel per facility for the daily-averaged demand; shared y-axis.
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(15, 15), sharex=False, sharey=True)
daily_by_facility = heat_daily_df.groupby('unique_id')
for panel_no, (uid, series_df) in enumerate(daily_by_facility):
    panel = axes[panel_no]
    sns.lineplot(data=series_df, x='ds', y='y', ax=panel, label=uid, color=palette[panel_no])
    panel.set_title(f'{uid}', fontsize=16)

configure_time_axes(axes, heat_daily_df['ds'])

fig.suptitle('Heat Demand for each id')
fig.supxlabel('Date [D]')
fig.supylabel('Heat Demand [kWh]')
layout_box = [0.01, 0.01, 0.99, 0.99]
fig.tight_layout(rect=layout_box)

Plot of the aggregated target series over a specified time period.

In [None]:
# Hourly timestamps of the window to zoom in on; daily rows are kept when
# their `ds` value appears in this range.
period = pd.date_range(start='2023-10-01', end='2024-06-01', freq='h')
mask = heat_daily_df['ds'].isin(period)
windowed_daily_df = heat_daily_df.loc[mask]

# One panel per facility, sharing the y-axis.
fig, axes = plt.subplots(nrows=5, ncols=1, figsize=(15, 20), sharey=True)
axes = axes.flatten()
for panel_no, (uid, series_df) in enumerate(windowed_daily_df.groupby('unique_id')):
    panel = axes[panel_no]
    sns.lineplot(data=series_df, x='ds', y='y', ax=panel, label=uid, color=palette[panel_no])
    panel.set_title(f'{uid}')

configure_time_axes(axes, period)

fig.suptitle('Heat Demand for a specified period')
fig.supxlabel('Date')
fig.supylabel('Heat Demand [kWh]')
layout_box = [0.01, 0.01, 0.99, 0.99]
fig.tight_layout(rect=layout_box)

This confirms the earlier observations.

## Import and plot the auxiliary series

Import the auxiliary series.

In [None]:
# Read each facility's auxiliary-series CSV, rename `time` to `ds`, tag it
# with a `unique_id`, and stack everything into one long frame.
# The frames are collected in a list and concatenated once, which avoids
# re-copying the accumulated data on every iteration; the loop variable no
# longer shadows the builtin `id`.
aux_frames = []
for facility_no in range(1, 6):
    path = BASE_DIR / "data" / "timeseries" / f"impianto{facility_no}" / f"impianto{facility_no}_auxiliary_series.csv"
    df = pd.read_csv(path, sep=';', parse_dates=['time'])
    logging.info(f'Processing auxiliary data for facility {facility_no}')
    df = df.rename(columns={'time': 'ds'})
    df['unique_id'] = f'F{facility_no}'
    df = fill_gaps(df, freq='h') # fill missing timestamps in ds using nans
    aux_frames.append(df)
aux_df = pd.concat(aux_frames, ignore_index=True)
# Drop any timezone information so `ds` is timezone-naive.
aux_df['ds'] = aux_df['ds'].dt.tz_localize(None)
# Every column except the time stamp and the facility id is an auxiliary variable.
aux_cols = aux_df.columns.difference(['ds', 'unique_id']).to_list()

Plot auxiliary and target series.

In [None]:
# Probable units for the auxiliary series
units = {
    'dew_point': '°C',
    'humidity': '%',
    'pressure': 'hPa',
    'temperature': '°C',
    'wind_speed': 'm/s'
}

# Select only common timestamps for alignment in the plot
common_ds = heat_df['ds'].isin(aux_df['ds'])
heat_df_filtered = heat_df[common_ds]
shared_with_heat = aux_df['ds'].isin(heat_df_filtered['ds'])
aux_df_filtered = aux_df[shared_with_heat]

fig, axes = plt.subplots(nrows=6, ncols=1, figsize=(12, 16))
axes = axes.flatten()

# First panel: the target series, one line per facility.
target_panel = axes[0]
heat_wide = heat_df_filtered.pivot(index='ds', columns='unique_id', values='y')
sns.lineplot(data=heat_wide, ax=target_panel, palette=palette, alpha=0.7, dashes=False)
target_panel.set_title('Heat Demand [kWh]')
target_panel.legend(title='Facility ID')

# Remaining panels: one auxiliary variable each, one line per facility.
for panel_no, aux_name in enumerate(aux_cols, start=1):
    aux_panel = axes[panel_no]
    aux_long = aux_df_filtered.loc[:, ['ds', 'unique_id', aux_name]]
    aux_wide = aux_long.pivot(index='ds', columns='unique_id', values=aux_name)
    sns.lineplot(data=aux_wide, ax=aux_panel, palette=palette, alpha=0.7, dashes=False)
    aux_panel.set_title(f'{aux_name} [{units[aux_name]}]')
    aux_panel.legend(title='Facility ID')

configure_time_axes(axes, aux_df_filtered['ds'], global_legend=True, legend_fig=fig)

fig.suptitle('Auxiliary and Target Series')
fig.supxlabel('Date and Time [H]')
layout_box = [0.01, 0.02, 0.99, 0.98]
fig.tight_layout(rect=layout_box)

Let's compare the series visually again, this time restricted to a given period and using daily-aggregated data.

In [None]:
# Daily mean of the numeric auxiliary columns, computed per facility.
aux_by_facility = aux_df.groupby('unique_id')
aux_daily_means = aux_by_facility.resample('D', on='ds', include_groups=False).mean(numeric_only=True)
# Bring ds and unique_id back as ordinary columns.
aux_daily_df = aux_daily_means.reset_index()

# Hourly timestamps of the window to plot; daily rows are kept when their
# `ds` value appears in this range.
period = pd.date_range(start='2023-10-01', end='2024-06-01', freq='h')

fig, axes = plt.subplots(nrows=6, ncols=1, figsize=(15, 22))
axes = axes.flatten()

# First panel: the daily target series, one line per facility.
target_panel = axes[0]
heat_daily_period_df = heat_daily_df[heat_daily_df['ds'].isin(period)]
heat_wide = heat_daily_period_df.pivot(index='ds', columns='unique_id', values='y')
sns.lineplot(data=heat_wide, ax=target_panel, palette=palette, alpha=0.7, dashes=False)
target_panel.set_title('Heat Demand [kWh]')
target_panel.legend(title='Facility ID')

# Remaining panels: one daily auxiliary variable each, one line per facility.
aux_daily_period_df = aux_daily_df[aux_daily_df['ds'].isin(period)]
for panel_no, aux_name in enumerate(aux_cols, start=1):
    aux_panel = axes[panel_no]
    aux_long = aux_daily_period_df.loc[:, ['ds', 'unique_id', aux_name]]
    aux_wide = aux_long.pivot(index='ds', columns='unique_id', values=aux_name)
    sns.lineplot(data=aux_wide, ax=aux_panel, palette=palette, alpha=0.7, dashes=False)
    aux_panel.set_title(f'{aux_name} [{units[aux_name]}]')
    aux_panel.legend(title='Facility ID')

configure_time_axes(axes, aux_daily_period_df['ds'], global_legend=True, legend_fig=fig)

fig.suptitle('Auxiliary and Target Series')
fig.supxlabel('Date [D]')
layout_box = [0.01, 0.02, 0.99, 0.98]
fig.tight_layout(rect=layout_box)

Summary per series:

In [None]:
# Per-facility summary of every auxiliary variable (no zero count).
for aux_name in aux_cols:
    aux_slice = aux_df[['ds', 'unique_id', aux_name]]
    summarize(aux_slice, f"auxiliary column '{aux_name}'", target_col=aux_name, zero_count=False)

## Study of the correlation between exog variables for a single ID

We will analyze the correlation between variables both at the individual ID level and using daily aggregated data.

In [None]:
# Choose a specific facility ID for correlation analysis.
# The value must match one of the `unique_id` labels built when the data was
# loaded (f'F{n}' for n in 1..5).
facility_id = 'F4'

In [None]:
# Daily rows of the chosen facility, kept strictly before this cut-off date.
cutoff = pd.Timestamp('2024-06-01')

heat_is_facility = heat_daily_df['unique_id'] == facility_id
heat_facility_daily_df = heat_daily_df[heat_is_facility].copy().reset_index(drop=True)
heat_facility_daily_df = heat_facility_daily_df[heat_facility_daily_df['ds'] < cutoff]

aux_is_facility = aux_daily_df['unique_id'] == facility_id
aux_facility_daily_df = aux_daily_df[aux_is_facility].copy().reset_index(drop=True)
aux_facility_daily_df = aux_facility_daily_df[aux_facility_daily_df['ds'] < cutoff]

# Join target and auxiliary values on the day and the facility id.
facility_daily_df = heat_facility_daily_df.merge(aux_facility_daily_df, on=['ds', 'unique_id'], how='inner')

# Only the numeric columns are used in the pair plots.
df_for_plot = facility_daily_df.select_dtypes(include='number')

# Correlation annotation used for the upper triangle of the pair grid
def corrfunc(x, y, **kws):
    """Write the Pearson (P), Spearman (S) and Kendall (K) coefficients of
    `x` versus `y` at the centre of the current Matplotlib axes.

    Extra keyword arguments are accepted and ignored.
    """
    coefficients = {
        "P": pearsonr(x, y)[0],
        "S": spearmanr(x, y)[0],
        "K": kendalltau(x, y)[0],
    }
    label = "\n".join(f"{tag}: {value:.2f}" for tag, value in coefficients.items())
    plt.gca().annotate(
        label,
        xy=(0.5, 0.5),
        xycoords="axes fraction",
        ha="center", va="center",
        fontsize=10
    )

# Scatter plot with a linear fit and a LOWESS curve, used for the lower triangle
def scatter_with_trend(x, y, **kwargs):
    """Draw `y` against `x` on the current axes with two trend lines.

    A black linear regression line and a red dashed LOWESS curve
    (frac=0.3) are drawn over the points. Extra keyword arguments are
    accepted and ignored.
    """
    target_ax = plt.gca()
    sns.scatterplot(x=x, y=y, ax=target_ax, s=10, alpha=0.6)
    # Linear trend only; regplot's own scatter layer is switched off.
    sns.regplot(x=x, y=y, ax=target_ax, scatter=False, color='black', ci=None)
    lowess_xy = sm.nonparametric.lowess(y, x, frac=0.3)
    lowess_x = lowess_xy[:, 0]
    lowess_y = lowess_xy[:, 1]
    target_ax.plot(lowess_x, lowess_y, color='red', linestyle='--')
'''
# 7. PairGrid plot
g = sns.PairGrid(df_for_plot, height=2.0)
g.map_lower(scatter_with_trend)
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=True)

# Adjust labels and titles
g.set(xlabel="")
for i, col in enumerate(df_for_plot.columns):
    g.axes[0, i].set_title(col, fontsize=12)

plt.suptitle(f'Correlation & Trend Analysis for Facility {facility_id} (Daily)', fontsize=14)
plt.tight_layout()
plt.show()
'''


Since our main focus is on the cold season, let's generate the same plot using only data from that period.

In [None]:
# Filter the facility's daily frames down to the cold months.
# The month mask is built from the very frame it indexes. Building it from
# the all-facility daily frames (as before) hands `.loc` a boolean Series
# with a different index, which either selects the wrong rows or raises an
# alignment error.
winter_months = [11, 12, 1, 2, 3]
heat_facility_winter_daily_df = (
    heat_facility_daily_df
    .loc[heat_facility_daily_df['ds'].dt.month.isin(winter_months)]
)
aux_facility_winter_daily_df = (
    aux_facility_daily_df
    .loc[aux_facility_daily_df['ds'].dt.month.isin(winter_months)]
)

# Merge
facility_winter_daily_df = heat_facility_winter_daily_df.merge(aux_facility_winter_daily_df, on=['ds', 'unique_id'], how='inner')

# Select numeric columns
df_for_plot = facility_winter_daily_df.select_dtypes(include='number')

# PairGrid plot: scatter + trends below the diagonal, correlation
# coefficients above it, histograms with KDE on it.
g = sns.PairGrid(df_for_plot, height=2.0)
g.map_lower(scatter_with_trend)
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=True)

# Adjust labels and titles: variable names go on top of the first row.
g.set(xlabel="")
for i, col in enumerate(df_for_plot.columns):
    g.axes[0, i].set_title(col, fontsize=12)

plt.suptitle(f'Correlation & Trend Analysis for Facility {facility_id} and winter period only (Daily)', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Filter the heat and auxiliary dataframes for the specific facility ID
heat_facility_df = heat_df[heat_df['unique_id'] == facility_id].copy().reset_index(drop=True)
aux_facility_df = aux_df[aux_df['unique_id'] == facility_id].copy().reset_index(drop=True)

# Filter for the winter period.
# The previous mask ended with `& <frame>['ds'].dt.year`, a leftover from a
# commented-out `.isin([2022, 2023])`. That ANDs the boolean month mask with
# an integer column, so the selected rows depended on the numeric year value
# rather than on any intended year filter. Only the month condition is kept.
# To restrict to specific winters, AND the mask with
# `<frame>['ds'].dt.year.isin([...])`.
winter_months = [11, 12, 1, 2, 3]
heat_facility_winter_df = (
    heat_facility_df
    .loc[heat_facility_df['ds'].dt.month.isin(winter_months)]
)
aux_facility_winter_df = (
    aux_facility_df
    .loc[aux_facility_df['ds'].dt.month.isin(winter_months)]
)

# Merge
facility_winter_df = heat_facility_winter_df.merge(aux_facility_winter_df, on=['ds', 'unique_id'], how='inner')

# Select numeric columns
df_for_plot = facility_winter_df.select_dtypes(include='number')

# PairGrid plot: scatter + trends below the diagonal, correlation
# coefficients above it, histograms with KDE on it.
g = sns.PairGrid(df_for_plot, height=2.0)
g.map_lower(scatter_with_trend)
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=True)

# Adjust labels and titles: variable names go on top of the first row.
g.set(xlabel="")
for i, col in enumerate(df_for_plot.columns):
    g.axes[0, i].set_title(col, fontsize=12)

plt.suptitle(f'Correlation & Trend Analysis for Facility {facility_id} and winter period only (Hourly)', fontsize=14)
plt.tight_layout()
plt.show()

## Anomaly detection using DPCA

We now apply DPCA for anomaly detection, again at the individual ID level.

In [None]:
# Choose the facility ID analysed in the DPCA anomaly-detection cells below.
# This rebinds the `facility_id` used earlier for the correlation plots.
facility_id = 'F1'

In [None]:
# Hourly target and auxiliary rows of the chosen facility, joined on
# timestamp and facility id.
heat_facility_df = heat_df[heat_df['unique_id'] == facility_id].copy().reset_index(drop=True)
aux_facility_df = aux_df[aux_df['unique_id'] == facility_id].copy().reset_index(drop=True)
facility_df = heat_facility_df.merge(aux_facility_df, on=['ds', 'unique_id'], how='inner')

# For every numeric column, add copies shifted by 1..max_lag rows.
numeric_cols = facility_df.select_dtypes(include='number').columns.tolist()
max_lag = 5  # Maximum lag to consider
lags = np.arange(1, max_lag + 1)
lagged = {}
for c in numeric_cols:
    for lag in lags:
        lagged[f"{c}_lag_{lag}"] = facility_df[c].shift(lag)

# Put the lagged columns next to the originals, then discard every row that
# contains a NaN (this includes the first rows, whose lags are undefined).
lagged_df = pd.DataFrame(lagged)
facility_wlags_df = (
    pd.concat([facility_df, lagged_df], axis=1)
    .dropna()
    .reset_index(drop=True)
)

# Keep the timestamps aside, then drop the non-numeric columns for DPCA.
ds = facility_wlags_df['ds']
facility_wlags_df = facility_wlags_df.drop(columns=['ds', 'unique_id'])
logging.info("Endogenous and exogenous variables with their lags, for DPCA:")
display(facility_wlags_df.head())

In [None]:
# Standardize every column, then project onto the first two principal components.
scaler = StandardScaler()
Z = scaler.fit_transform(facility_wlags_df)

pca = PCA(n_components=2)
Z_pca = pca.fit_transform(Z)

# Collect the scores together with the timestamp, a running row number and
# a few calendar attributes of the timestamp.
pca_df = pd.DataFrame(Z_pca, columns=['PC1', 'PC2'])
pca_df['ds'] = ds.values
pca_df['index'] = np.arange(len(pca_df))  # positional row counter
for calendar_attr in ('month', 'quarter', 'hour', 'dayofyear'):
    pca_df[calendar_attr] = getattr(ds.dt, calendar_attr)

# Scatter of PC1 against PC2, coloured by month.
plot_title = f"DPCA on {facility_id} using {max_lag} lags"
if not interactive:
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(
        pca_df['PC1'],
        pca_df['PC2'],
        c=pca_df['month'],
        s=20,
        cmap='tab10',
        alpha=0.7,
    )
    plt.colorbar(scatter, label='Month')
    plt.title(plot_title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.show()
else:
    fig = px.scatter(
        pca_df,
        x="PC1",
        y="PC2",
        color='month',
        hover_data=["ds"],
        opacity=0.7,
        title=plot_title,
        width=800,   # in pixels
        height=600
    )
    fig.show()

By examining the plot using the two principal components, incorporating both 0 and 5 lags of the variables, we observe some data points that deviate from the general pattern. To determine whether these are genuine outliers, we'll visualize them in the original space.

In [None]:
match facility_id:
    case 'F1':
        possible_outliers_0lags = ['2023-02-26T07:00:00', '2019-12-22T11:00:00']
        possible_outliers_5lags = ['2019-12-22T10:00:00']
    case 'F2':
        possible_outliers_5lags = ['2022-01-13T08:00:00', '2022-01-13T09:00:00',  '2022-01-13T10:00:00'] \
            + ['2021-02-13T12:00:00', '2021-02-13T13:00:00', '2021-02-13T14:00:00'] \
            + ['2024-04-16T15:00:00', '2024-04-16T16:00:00', '2024-04-16T17:00:00', '2024-04-16T18:00:00'] 
        possible_outliers_0lags = ['2020-07-24T06:00:00', '2022-02-07T11:00:00']
    case 'F3':
        possible_outliers_0lags = ['2020-02-04T14:00:00', '2024-08-27T13:00:00']
        possible_outliers_5lags = []
    case 'F4':
        possible_outliers_0lags = ['2020-11-22T21:00:00', '2020-02-04T16:00:00', '2020-06-29T14:00:00']
        possible_outliers_5lags = ['2019-12-13T18:00:00', '2019-12-13T19:00:00', '2019-12-13T20:00:00', '2019-12-13T21:00:00'] \
            + ['2022-11-22T18:00:00', '2022-11-22T19:00:00', '2022-11-22T20:00:00'] \
            + ['2023-11-03T10:00:00', '2023-11-03T11:00:00', '2023-11-03T12:00:00']
    case 'F5':
        possible_outliers_0lags = ['2019-12-13T15:00:00', '2020-02-04T16:00:00', '2020-02-04T13:00:00', '2020-06-29T14:00:00', '2023-02-10T07:00:00']
        possible_outliers_5lags = ['2019-12-13T18:00:00', '2019-12-13T19:00:00', '2019-12-13T20:00:00', '2019-12-13T21:00:00'] \
            + ['2022-11-22T18:00:00', '2022-11-22T19:00:00', '2022-11-22T20:00:00'] \
            + ['2023-11-03T10:00:00', '2023-11-03T11:00:00', '2023-11-03T12:00:00']
possible_outliers = possible_outliers_5lags + possible_outliers_0lags

In [None]:
# parse strings into datetimes, then filter
possible_outliers_ds = [pd.to_datetime(d) for d in possible_outliers]
possible_outliers_df = facility_df[facility_df['ds'].isin(possible_outliers_ds)]

# columns and their units, listed in the same order.
# The units list used to read ['kWh', '°C', 'hPa', '°C', 'm/s', '%'], which
# paired temperature with hPa and pressure with °C; the order below matches
# `col` position by position.
# NOTE(review): this rebinds `units`, which an earlier cell defines as a
# dict keyed by auxiliary column name — re-running that earlier plot after
# this cell would index a list with a string.
col = ['y', 'dew_point', 'temperature', 'pressure', 'wind_speed', 'humidity']
units = ['kWh', '°C', '°C', 'hPa', 'm/s', '%']
units_map = dict(zip(col, units))

# prepare DataFrames indexed by timestamp, without the facility id column
df_for_plot = facility_df.drop(columns=['unique_id']).set_index('ds')
df_outliers_for_plot = possible_outliers_df.drop(columns=['unique_id']).set_index('ds')

# Plot every series of the facility with the candidate outliers marked in red.
if interactive:
    # One Plotly row per variable, titled "<name> [<unit>]".
    panel_titles = [f"{c} [{units_map[c]}]" for c in col]
    fig = make_subplots(
        rows=len(col), cols=1,
        shared_xaxes=False,
        subplot_titles=panel_titles,
        row_heights=[1]*len(col),
        vertical_spacing=0.04
    )

    for row_no, c in enumerate(col, start=1):
        # Full series as a black line; hover is disabled for it.
        series_trace = go.Scatter(
            x=df_for_plot.index,
            y=df_for_plot[c],
            mode="lines",
            line=dict(color="black"),
            showlegend=False,
            hoverinfo='skip'
        )
        # Candidate outliers as red markers with a date/value hover label.
        outlier_trace = go.Scatter(
            x=df_outliers_for_plot.index,
            y=df_outliers_for_plot[c],
            mode="markers",
            marker=dict(color="red", size=8),
            name="outlier",
            showlegend=False,
            hovertemplate="Date: %{x|%Y-%m-%d %H:%M}<br>Value: %{y}<extra></extra>"
        )
        fig.add_trace(series_trace, row=row_no, col=1)
        fig.add_trace(outlier_trace, row=row_no, col=1)

    # Overall size and title, plus a figure-level "Year" caption below the plots.
    fig.update_layout(
        height=220 * len(col),
        title_text=f"All Series and red-colored Possible Outliers - Facility {facility_id[-1]}",
    )
    fig.add_annotation(
        text="Year",
        x=0.5, y=-0.03,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=14)
    )

    fig.show()

else:
    n_rows = len(col)
    fig, axes = plt.subplots(
        nrows=n_rows,
        ncols=1,
        figsize=(12, 2.5 * n_rows),
        sharex=False
    )

    # plt.subplots returns a bare Axes when there is a single row; wrap it
    # so the loop below can iterate in every case.
    if n_rows == 1:
        axes = [axes]

    for panel, c in zip(axes, col):
        # Full series in black.
        panel.plot(
            df_for_plot.index,
            df_for_plot[c],
            color="black",
            label="Value"
        )

        # Candidate outliers in red, drawn above the line.
        panel.scatter(
            df_outliers_for_plot.index,
            df_outliers_for_plot[c],
            color="red",
            s=40,
            label="Outlier",
            zorder=5
        )

        unit_label = units_map[c]
        panel.set_title(f"{c} [{unit_label}]", fontsize=12)
        panel.set_ylabel(unit_label, fontsize=10)
        panel.grid(True)

    # Shared x-label placed at the bottom of the figure.
    fig.text(0.5, 0.01, "Date", ha="center", fontsize=14)

    fig.suptitle(
        f"All Series and Red-Colored Possible Outliers - Facility {facility_id[-1]}",
        fontsize=16
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Observations to overwrite, as {facility id: [timestamps]}.
to_adjust = {
    'F2': ['2020-07-24T06:00:00']
}

# parse strings to datetimes
to_adjust = {uid: [pd.to_datetime(dt) for dt in dts] for uid, dts in to_adjust.items()}


def _rows_to_adjust(frame):
    """Boolean mask (indexed like *frame*) of every row listed in `to_adjust`."""
    selected = pd.Series(False, index=frame.index)
    for uid, dts in to_adjust.items():
        selected |= (frame['unique_id'] == uid) & frame['ds'].isin(dts)
    return selected


# 1) forward-fill in heat_df
# One mask covers all listed rows, so the before/after tables show every
# adjusted observation (not just the last one) and an empty `to_adjust`
# simply yields empty tables.
heat_mask = _rows_to_adjust(heat_df)
logging.info("Target observations before adjustment:")
display(heat_df[heat_mask])
# set the target to NaN
heat_df.loc[heat_mask, 'y'] = np.nan
# ensure ordering, then ffill per facility
# (the forward fill applies to every NaN in `y`, not only the rows blanked above)
heat_df.sort_values(['unique_id', 'ds'], inplace=True)
heat_df['y'] = heat_df.groupby('unique_id')['y'].ffill()
logging.info("Target observations after adjustment:")
display(heat_df[heat_mask])

# 2) forward-fill in aux_df
aux_cols = aux_df.columns.difference(['ds', 'unique_id'])
aux_mask = _rows_to_adjust(aux_df)
logging.info("Auxiliary observations before adjustment:")
display(aux_df[aux_mask])
# set all auxiliary columns to NaN
aux_df.loc[aux_mask, aux_cols] = np.nan
# ensure ordering, then ffill per facility
aux_df.sort_values(['unique_id', 'ds'], inplace=True)
aux_df[aux_cols] = aux_df.groupby('unique_id')[aux_cols].ffill()
logging.info("Auxiliary observations after adjustment:")
display(aux_df[aux_mask])

## Save processed data

In [None]:
# Write the cleaned target and auxiliary frames as UTF-8 CSV files without
# the index column.
# The output folder is created first so the export also works on a fresh
# checkout where `timeseries_preprocessed/` does not exist yet.
preprocessed_dir = BASE_DIR / "data" / "timeseries_preprocessed"
preprocessed_dir.mkdir(parents=True, exist_ok=True)

path_heat = preprocessed_dir / "heat.csv"
heat_df.to_csv(path_heat, index=False, encoding="utf-8")
logging.info("Heat demand data saved to {}".format(path_heat.relative_to(BASE_DIR)))

path_aux = preprocessed_dir / "auxiliary.csv"
aux_df.to_csv(path_aux, index=False, encoding="utf-8")
logging.info("Auxiliary data saved to {}".format(path_aux.relative_to(BASE_DIR)))

## Thesis figures

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# Reload the raw heat-demand CSVs for the thesis figures; this rebinds
# `heat_df`, replacing the capped and adjusted frame built above.
# The frames are collected in a list and concatenated once, which avoids
# re-copying the accumulated data on every iteration; the loop variable no
# longer shadows the builtin `id`.
facility_frames = []
for facility_no in range(1, 6):
    path = BASE_DIR / "data" / "timeseries" / f"impianto{facility_no}" / f"impianto{facility_no}_heat_demand.csv"
    df = pd.read_csv(path, sep=';', parse_dates=['timestamp'])
    logging.info(f'Processing data for facility {facility_no}')
    df.columns = ['ds', 'y']
    df['unique_id'] = f'F{facility_no}'
    df = fill_gaps(df, freq='h') # fill missing timestamps in ds using nans
    facility_frames.append(df)
heat_df = pd.concat(facility_frames, ignore_index=True)
# Drop any timezone information so `ds` is timezone-naive.
heat_df['ds'] = heat_df['ds'].dt.tz_localize(None)
# Facility labels in order of first appearance, and how many there are.
unique_ids = heat_df['unique_id'].unique()
n_ids = len(unique_ids)

In [None]:
# Thesis figure: one row per facility showing the hourly demand strictly
# between the two dates below.
window_start = pd.Timestamp('2023-09-15')
window_end = pd.Timestamp('2024-06-15')
# Light-grey frame drawn on all four sides of every subplot.
frame_style = dict(showline=True, linewidth=1, linecolor="lightgrey", mirror=True)

fig = make_subplots(
    rows=n_ids,
    cols=1,
    shared_xaxes=False,
    shared_yaxes=True,
    vertical_spacing=0.06,
    subplot_titles=[f"Series {uid}" for uid in unique_ids]
)
for row_no, uid in enumerate(unique_ids, start=1):
    series_df = heat_df[(heat_df['unique_id'] == uid)]
    inside_window = (series_df['ds'] > window_start) & (series_df['ds'] < window_end)
    series_df = series_df[inside_window]

    demand_trace = go.Scatter(
        x=series_df['ds'],
        y=series_df['y'],
        mode='lines',
        name=str(uid),
        line=dict(color='black')
    )
    fig.add_trace(demand_trace, row=row_no, col=1)

for row_no in range(1, n_ids + 1):
    fig.update_xaxes(row=row_no, col=1, **frame_style)
    # y ticks every 200 units
    fig.update_yaxes(row=row_no, col=1, dtick=200, **frame_style)

fig.update_layout(
    height=200 * n_ids,
    width=750,
    title_text=None,
    showlegend=False,
    margin=dict(l=60, r=20, t=30, b=60),
)
fig.update_layout(template='plotly_white')
# Axis titles only on the bottom row (x) and on row 3 (y).
fig.update_xaxes(title_text="Date time", row=n_ids, col=1)
fig.update_yaxes(title_text="Heat Demand (kWh)", row=3, col=1)
# Standalone HTML export; the image-download button is set to produce SVG.
# NOTE(review): the other thesis-figure cells write to the same "fig.html",
# so each run overwrites the previous export — confirm this is intended.
export_config = {"toImageButtonOptions": {"format": "svg"}}
pio.write_html(
    fig, "fig.html",
    include_plotlyjs="inline", full_html=True,
    config=export_config
)
fig


In [None]:
# Thesis figure: one row per facility showing the hourly demand from the
# start date (inclusive) up to the end date (exclusive).
window_start = pd.Timestamp('2024-01-01')
window_end = pd.Timestamp('2024-02-12')
# Light-grey frame drawn on all four sides of every subplot.
frame_style = dict(showline=True, linewidth=1, linecolor="lightgrey", mirror=True)

fig = make_subplots(
    rows=n_ids,
    cols=1,
    shared_xaxes=False,
    shared_yaxes=True,
    vertical_spacing=0.06,
    subplot_titles=[f"Series {uid}" for uid in unique_ids]
)
for row_no, uid in enumerate(unique_ids, start=1):
    series_df = heat_df[(heat_df['unique_id'] == uid)]
    inside_window = (series_df['ds'] >= window_start) & (series_df['ds'] < window_end)
    series_df = series_df[inside_window]

    demand_trace = go.Scatter(
        x=series_df['ds'],
        y=series_df['y'],
        mode='lines',
        name=str(uid),
        line=dict(color='black')
    )
    fig.add_trace(demand_trace, row=row_no, col=1)

for row_no in range(1, n_ids + 1):
    # x ticks formatted as abbreviated month and day
    fig.update_xaxes(tickformat="%b %d", row=row_no, col=1, **frame_style)
    fig.update_yaxes(row=row_no, col=1, **frame_style)

fig.update_layout(
    height=200 * n_ids,
    width=750,
    title_text=None,
    showlegend=False,
    margin=dict(l=60, r=20, t=30, b=60),
)
fig.update_layout(template='plotly_white')
# Axis titles only on the bottom row (x) and on row 3 (y).
fig.update_xaxes(title_text="Date time", row=n_ids, col=1)
fig.update_yaxes(title_text="Heat Demand (kWh)", row=3, col=1)
# Standalone HTML export; the image-download button is set to produce SVG.
# NOTE(review): the other thesis-figure cells write to the same "fig.html",
# so each run overwrites the previous export — confirm this is intended.
export_config = {"toImageButtonOptions": {"format": "svg"}}
pio.write_html(
    fig, "fig.html",
    include_plotlyjs="inline", full_html=True,
    config=export_config
)
fig

In [None]:
from heat_forecast.utils.plotting import plotly_weekly_seasonality

# Date windows passed to the weekly-seasonality figure, keyed by their label.
# NOTE(review): the "Autumn" window runs from 2023-09-20 to 2024-12-21
# (about 15 months) while the other two span three months each — confirm
# the intended years.
season_windows = {
    "Autumn": (pd.Timestamp('2023-09-20'), pd.Timestamp('2024-12-21')),
    "Winter": (pd.Timestamp('2024-12-21'), pd.Timestamp('2025-03-21')),
    "Spring": (pd.Timestamp('2025-03-21'), pd.Timestamp('2025-06-21')),
}
groups = list(season_windows.values())
labels = list(season_windows)

fig = plotly_weekly_seasonality(
    target_df=heat_df,
    groups=groups,
    group_labels=labels,
    width=750,
    height_per_id=300,
    vertical_spacing=0.05,
    n_cols=1,
    annotate=False,
)
# Override the figure size and margins, and show the legend.
layout_overrides = dict(
    height=210 * n_ids,
    width=750,
    title_text=None,
    showlegend=True,
    margin=dict(l=60, r=20, t=30, b=60),
)
fig.update_layout(**layout_overrides)
# Standalone HTML export; the image-download button is set to produce SVG.
export_config = {"toImageButtonOptions": {"format": "svg"}}
pio.write_html(
    fig, "fig.html",
    include_plotlyjs="inline", full_html=True,
    config=export_config
)
fig



In [None]:
from scipy.stats import pearsonr, spearmanr, kendalltau
from statsmodels.nonparametric.smoothers_lowess import lowess
facility_id = 'F1'

# Restrict both daily tables to the selected facility; each result is an
# independent copy with a fresh 0-based index.
heat_facility_daily_df = (
    heat_daily_df
    .loc[lambda frame: frame['unique_id'] == facility_id]
    .copy()
    .reset_index(drop=True)
)
aux_facility_daily_df = (
    aux_daily_df
    .loc[lambda frame: frame['unique_id'] == facility_id]
    .copy()
    .reset_index(drop=True)
)

# Inner join: keep only rows whose (ds, unique_id) key appears in both tables.
facility_daily_df = pd.merge(
    heat_facility_daily_df,
    aux_facility_daily_df,
    how='inner',
    on=['ds', 'unique_id'],
)

# Only numeric columns take part in the pair plot.
df_for_plot = facility_daily_df.select_dtypes(include='number').copy()
cols = df_for_plot.columns.to_list()
n = len(cols)

# One panel per ordered pair of variables. Axes are independent and the
# spacing is kept tight so the grid reads as a single matrix.
fig = make_subplots(
    rows=n,
    cols=n,
    shared_xaxes=False,
    shared_yaxes=False,
    vertical_spacing=0.01,
    horizontal_spacing=0.01,
)

# Populate the n x n pair-plot grid. The row index selects the y variable and
# the column index selects the x variable:
#   * diagonal       -> histogram of the variable
#   * lower triangle -> scatter plus a LOWESS smoother and a linear (OLS) fit
#   * upper triangle -> Spearman / Pearson correlation shown as text
for i, row_col in enumerate(cols):
    for j, col_col in enumerate(cols):
        x = df_for_plot[col_col]
        y = df_for_plot[row_col]
        # Plotly subplot coordinates are 1-based.
        cell = dict(row=i + 1, col=j + 1)

        # Diagonal: histogram (like sns.histplot on the diagonal)
        if i == j:
            fig.add_trace(
                go.Histogram(
                    x=x,
                    nbinsx=30,
                    showlegend=False,
                    marker=dict(color='black', opacity=0.7),
                ),
                **cell,
            )

        # Lower triangle: scatter + LOWESS smoother + linear trend
        elif i > j:
            # Keep only observations where both variables are present.
            valid = x.notna() & y.notna()
            x_valid = x[valid]
            y_valid = y[valid]

            # Raw observations
            fig.add_trace(
                go.Scatter(
                    x=x_valid,
                    y=y_valid,
                    mode='markers',
                    marker=dict(size=4, opacity=0.4, color='black'),
                    showlegend=False,
                ),
                **cell,
            )

            # LOWESS smoother (solid red line); skipped for very small samples.
            # `return_sorted=True` yields (x, fitted) pairs sorted by x, so the
            # line can be drawn directly.
            if len(x_valid) > 5:
                lowess_res = lowess(y_valid, x_valid, frac=0.3, return_sorted=True)
                fig.add_trace(
                    go.Scatter(
                        x=lowess_res[:, 0],
                        y=lowess_res[:, 1],
                        mode='lines',
                        line=dict(width=3, color="#ff0000"),
                        showlegend=False,
                    ),
                    **cell,
                )

            # Linear trend (dotted green line): degree-1 least-squares fit,
            # evaluated on 50 evenly spaced points over the observed x range.
            if len(x_valid) > 1:
                m, b = np.polyfit(x_valid, y_valid, 1)
                xs = np.linspace(x_valid.min(), x_valid.max(), 50)
                ys = m * xs + b

                fig.add_trace(
                    go.Scatter(
                        x=xs,
                        y=ys,
                        mode='lines',
                        line=dict(width=3, dash='dot', color="#22ff00"),
                        showlegend=False,
                    ),
                    **cell,
                )

        # Upper triangle: correlation text (corrfunc)
        else:  # i < j
            # Pairwise-complete observations for this variable pair.
            pair = df_for_plot[[col_col, row_col]].dropna()
            if len(pair) > 1:
                # Only the coefficients that are actually displayed are
                # computed; the p-values are discarded.
                pearson_r, _ = pearsonr(pair[col_col], pair[row_col])
                spearman_r, _ = spearmanr(pair[col_col], pair[row_col])
                text = (
                    f"S: {spearman_r:.2f}<br>"
                    f"P: {pearson_r:.2f}<br>"
                )
            else:
                text = "n/a"

            # Position the text in the middle of the cell
            fig.add_trace(
                go.Scatter(
                    x=[0.5],
                    y=[0.4],
                    text=[text],
                    mode='text',
                    showlegend=False,
                    textfont=dict(size=14),
                ),
                **cell,
            )

            # Make this panel look like a label panel, not a real axis: a fixed
            # [0, 1] range keeps the text position stable and the grid is hidden.
            fig.update_xaxes(
                showgrid=False,
                showticklabels=False,
                range=[0, 1],
                **cell,
            )
            fig.update_yaxes(
                showgrid=False,
                showticklabels=False,
                range=[0, 1],
                **cell,
            )

# Hide all tick labels and draw a thin light-grey frame around every panel.
# Calling update_xaxes / update_yaxes without row/col applies the settings to
# every axis of the figure at once, so no per-cell loop over the grid is needed.
panel_frame = dict(
    showticklabels=False,
    showline=True,
    linewidth=1,
    linecolor="lightgrey",
    mirror=True,
)
fig.update_xaxes(**panel_frame)
fig.update_yaxes(**panel_frame)

# Display labels keyed by column name; a column that is not listed here is
# labelled with its raw name.
nice_names = {
    'dew_point': 'DPT (°C)',
    'temperature': 'T (°C)',
    'wind_speed': 'WS (m/s)',
    'pressure': 'P (hPa)',
    'humidity': 'RH (%)',
    'y': 'Q (kWh)'
}

# Tick labels are shown only on the outer edge of the grid: x ticks on the
# bottom row, y ticks on the first column. Axis titles are left empty because
# the variable names are drawn as annotations below.
for pos in range(1, n + 1):
    fig.update_xaxes(showticklabels=True, title_text="", row=n, col=pos)
    fig.update_yaxes(
        showticklabels=True,
        title_text="",
        title_standoff=50,
        row=pos,
        col=1,
    )

# Settings shared by every variable-name annotation: positions are expressed
# as fractions of the target panel's axis domain.
label_style = dict(
    xref='x domain',
    yref='y domain',
    showarrow=False,
    xanchor='center',
    yanchor='middle',
    font=dict(size=14),
)

# Row labels: rotated text at the left edge of each first-column panel,
# shifted 45 px further left so it clears the tick labels.
for pos, var_name in enumerate(cols, start=1):
    fig.add_annotation(
        x=0,
        y=0.5,
        text=nice_names.get(var_name, var_name),
        textangle=-90,
        xshift=-45,
        row=pos,
        col=1,
        **label_style,
    )

# Column labels: horizontal text at the bottom edge of each bottom-row panel,
# shifted 45 px down so it clears the tick labels.
for pos, var_name in enumerate(cols, start=1):
    fig.add_annotation(
        x=0.5,
        y=0,
        text=nice_names.get(var_name, var_name),
        textangle=0,
        yshift=-45,
        row=len(cols),
        col=pos,
        **label_style,
    )

# Final geometry of the pair plot: square 800 x 800 canvas, no title, a small
# gap between histogram bars, and extra bottom margin for the column labels.
fig.update_layout(
    template='plotly_white',
    title=None,
    width=800,
    height=800,
    bargap=0.1,
    hovermode='closest',
    margin=dict(l=10, r=10, t=10, b=60),
)

# Stand-alone HTML export (plotly.js embedded); the mode-bar "download image"
# button is configured to produce a PNG at 2x scale.
pio.write_html(
    fig,
    file="fig.html",
    config={"toImageButtonOptions": {"format": "png", "scale": 2}},
    include_plotlyjs="inline",
    full_html=True,
)
fig
