In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from darts import TimeSeries

from aare.constants import LOC_BERN, LOC_THUN, TIME, TEMP
from aare.remote_existenz_store import RemoteExistenzStore

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
store = RemoteExistenzStore()

In [None]:
freq = "1h"
ANYTIME = "0"  # to be used as period start when querying influx. starting at 0 just returns all the data.

In [None]:
df = store.query_hydro(ANYTIME, LOC_BERN, agg_freq=freq)
o_df = df.copy()
df

In [None]:
# restore original df for iterative development, can re-run if necessary
df = o_df.copy()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# there shouldn't be any NaNs because we set agg_create_empty to False (default)
df.isna().value_counts()

In [None]:
# At most one time step may contain more than one data point (expected to be the trailing,
# still incomplete step at the end of the series).
# NOTE: only the number of such steps is checked here, not their position.
# `.max()` (instead of `.item()`) keeps the check working when the frame has several value columns.
points_per_step = df.set_index(TIME).resample(freq).count()
crowded_steps = (points_per_step > 1).sum().max()
assert crowded_steps <= 1, "Has more than one data points within one time-step according to frequency"

In [None]:
# Resample onto the regular `freq` grid so that time steps without data show up as NaN rows.
# `first()` keeps a single value per time step, so an extra trailing point that falls into the
# same step (for example 18:00 and 18:55) is dropped.
# Doing this manually gives a bit more control and avoids having to send these empty rows
# over the air from the influx server.
df = df.set_index(TIME).resample(freq).first().reset_index(TIME)

In [None]:
df.shape

In [None]:
df.isna().value_counts()

In [None]:
px.line(df, TIME, TEMP)

We can clearly see that there is a lot of data missing from 2003 to 2009 although the existing data seems plausible and follows the trend.

We can also clearly see some outliers that extend below the 0° border, which doesn't make sense. This is also reflected in the dataset summary, where the minimum is -9.5°.

Manually creeping up on the Y-axis to see what the lowest, likely valid temperature is, it seems that 2.5° would be a good cutoff to remove outliers on the low end.

In [None]:
df.describe().T

Manual visual analysis shows that there is a period from Jan 2003 to Mar 2003 with very strange data. This data should be excluded.
You could assume that this is the start of some measurement difficulties that are only remedied in 2009, so unless we find that it's not enough data, it might be best to exclude everything from Jan 2003 up to Jul 2009, where everything seems to be in order again.

In [None]:
def between(df, from_, to_, *, time_col=None):
    """Return a boolean mask selecting the half-open time period [from_, to_).

    Args:
        df: dataframe holding the timestamps either in a column or in its index.
        from_: inclusive lower bound of the period.
        to_: exclusive upper bound of the period.
        time_col: name of the time column; defaults to the TIME constant.

    Returns:
        A boolean mask aligned with ``df``. The time column is used when it exists,
        otherwise the mask is computed from the index.
    """
    column = TIME if time_col is None else time_col
    timestamps = df[column] if column in df.columns else df.index
    return (timestamps >= from_) & (timestamps < to_)

In [None]:
df.loc[between(df, "2003-01-28", "2003-03-03"), TEMP] = np.nan
# this works if the time is the index
# df.loc["2003-01-28":"2003-03-03", TEMP] = np.nan

In [None]:
low_cutoff = 2.5

In [None]:
print("Below 0°C:", np.count_nonzero(df[TEMP] <= 0))
print(f"Between 0 and {low_cutoff}°C:", np.count_nonzero((df[TEMP] > 0) & (df[TEMP] <= low_cutoff)))

In [None]:
# eliminate all data points below the lower outlier cutoff
df.loc[df[TEMP] <= low_cutoff, TEMP] = np.nan

In [None]:
df.shape

In [None]:
df.describe().T

In [None]:
px.line(df, TIME, TEMP)

After cleanup, it appears that the data from June 10th 2009 onwards is best. The big gap with very little data from 2003 to 2009 is probably unusable.
The data before that (Jun 2001 - Jun 2003) seems mostly usable but has some large gaps as well.

It might be easier to simply discard this earlier data, as it's less than 2 years' worth and could have non-negligible differences in measurement methodology, distribution, etc. compared to the newer data from 6 years later. Comparing (Jun 2001 - Jun 2003) to (Jun 2009 - Jun 2011) (see below) doesn't raise any warning flags that the data prior to 2009 would be invalid. However, discarding it loses about 18 months of data and leaves us with 15 years (180 months) of continuous data; that's a 9% loss.

If experimentation shows that more data would be helpful, efforts to recover & clean the data prior to 2009 can be made. To start experimentation and modelling, the 15+ years after should be enough.

In [None]:
px.line(df.loc[between(df, "2001-06-01", "2003-06-01")], TIME, TEMP)

In [None]:
px.line(df.loc[between(df, "2009-06-01", "2011-06-01")], TIME, TEMP)

In [None]:
# Remove any data prior to the start date completely
good_start_date = "2009-06-10"
df.drop(df[df[TIME] < good_start_date].index, inplace=True)
# this would work as well, but then we're working with a slice copy
# df = df[df[TIME] >= good_start_date]
df.shape

In [None]:
px.line(df, TIME, TEMP)

Visually inspecting the data we can still see

* Random downward spikes of unrealistic magnitude
* Occasional gaps

Apart from those, the data seems very clean already.

In [None]:
df.describe().T

In [None]:
df.isna().value_counts()

In [None]:
df["temp_diff_to_prev"] = df[TEMP].diff().abs()
df["temp_diff_to_next"] = df[TEMP].diff(-1).abs()

In [None]:
outlier_quantile = 0.999
outlier_diff = max(df["temp_diff_to_prev"].quantile(outlier_quantile), df["temp_diff_to_next"].quantile(outlier_quantile))
outlier_diff

In [None]:
df[(df["temp_diff_to_prev"] > outlier_diff) | (df["temp_diff_to_next"] > outlier_diff)]

In [None]:
df[(df["temp_diff_to_prev"] > outlier_diff) & (df["temp_diff_to_next"] > outlier_diff)]

One variant of outlier occurs at the start or end of a measurement run, i.e. [NaN, outlier, normal measurement, ...] or the reverse. In my experience, this is a common pattern in industrial sensor data. \
A slight variation of this first variant is the case when the difference to the previous or next measurement is exactly 0 instead of NaN. \
Another variant is a random drop or spike, i.e. [normal, outlier, normal]. These have both diffs above the threshold.

All of these variants appear in the data.

P.S. Another analysis like this might be necessary after interpolating the gaps.

In [None]:
# All three masks below are evaluated against the diff columns as they are on entry to this
# cell; only TEMP is overwritten, the diffs are not recomputed in between.
# variant 1a: diff to the previous point is NaN or exactly 0, while the diff to the next
# point exceeds outlier_diff
df.loc[(df["temp_diff_to_prev"].isna() | (df["temp_diff_to_prev"] == 0)) & (df["temp_diff_to_next"] > outlier_diff), TEMP] = np.nan
# variant 1b: mirror image of 1a, i.e. diff to the next point is NaN or exactly 0, while the
# diff to the previous point exceeds outlier_diff
df.loc[(df["temp_diff_to_next"].isna() | (df["temp_diff_to_next"] == 0)) & (df["temp_diff_to_prev"] > outlier_diff), TEMP] = np.nan
# variant 2: isolated drop or spike, i.e. both the diff to the previous and to the next point
# exceed outlier_diff
df.loc[(df["temp_diff_to_prev"] > outlier_diff) & (df["temp_diff_to_next"] > outlier_diff), TEMP] = np.nan

In [None]:
# update diffs because outliers have now been removed (set to NaN)
df["temp_diff_to_prev"] = df[TEMP].diff().abs()
df["temp_diff_to_next"] = df[TEMP].diff(-1).abs()

In [None]:
# all of these look legit, although they certainly fall outside the norm
df[(df["temp_diff_to_prev"] > outlier_diff) | (df["temp_diff_to_next"] > outlier_diff)]

In [None]:
px.line(df, TIME, TEMP)

Moving to Darts now helps with gap analysis.

In [None]:
def to_ts(df):
    """Transform a dataframe into a darts TimeSeries.

    The TIME column is used as the time axis when it is present; otherwise the existing
    index is used. The dataframe passed in is not modified.

    Args:
        df: dataframe with timestamps in the TIME column or in its index.

    Returns:
        A darts TimeSeries created with the notebook-level ``freq``.
    """
    tdf = df.set_index(TIME) if TIME in df.columns else df
    # turn it into timezone-naive timestamps because that's what darts wants.
    # all the data is in UTC anyway, so a conversion is necessary on display no matter what.
    # set_axis returns a new frame: assigning to `tdf.index` would strip the timezone from
    # the caller's dataframe whenever it is already indexed by time (tdf is df in that case).
    tdf = tdf.set_axis(tdf.index.tz_localize(None), axis="index")

    return TimeSeries.from_dataframe(tdf, freq=freq)

In [None]:
ts = to_ts(df)
ts

In [None]:
ts.gaps()

In [None]:
ts.gaps().value_counts("gap_size")

Most gaps are size 1. This is also fairly visible in the data, for example at the end of 2013. The data doesn't appear to be wrong, just more sparse than it should be.

In [None]:
df[TEMP].isna().value_counts()

In [None]:
df[TEMP].interpolate(limit=1).isna().value_counts()

In [None]:
# Fill only isolated single-step gaps (a NaN with a valid value directly before and after).
# A plain `interpolate(limit=1)` would also fill the first point of every longer gap, leaving
# those gaps partially filled, and could forward-fill one trailing NaN at the end.
missing = df[TEMP].isna()
# fill_value=True treats the (non-existent) neighbours beyond both ends as missing, so NaNs at
# the very start or end are never considered isolated gaps.
isolated_gap = missing & ~missing.shift(1, fill_value=True) & ~missing.shift(-1, fill_value=True)
df["temp_filled"] = df[TEMP].mask(isolated_gap, df[TEMP].interpolate(limit_area="inside"))
# marks exactly those rows that were NaN before and carry an interpolated value now
df["was_filled"] = df[TEMP].isna() & ~df["temp_filled"].isna()

In [None]:
px.scatter(df, x=TIME, y="temp_filled", color="was_filled")

TODO Found some wild outlier on 2009-06-06, maybe move start date a bit later (e.g. 09.06.) or manually remove that outlier (can threshold to 25 for example).