In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

In [None]:
# Load the raw CSV; the 'Datum' column stays unparsed here.
# With pandas >= 2.0 the timestamps could be parsed right at load time via
# date_format='%Y-%m-%d %H:%M:%S%z' (or parse_dates={"date": ["Datum"]}),
# but that pandas version is not yet available on Arch Linux.
solar_ts = pd.read_csv(
    "data/energy_charts.csv",
    sep=",",
    header=0,
)

In [None]:
# Convert the textual 'Datum' column into timezone-aware UTC timestamps and promote it
# to the index (the column itself is dropped), then take a first look at the data.
solar_ts = (
    solar_ts
    .assign(Datum=pd.to_datetime(solar_ts['Datum'], format='%Y-%m-%d %H:%M:%S%z', utc=True))
    .set_index("Datum", drop=True)
)
solar_ts.plot()

In [None]:
# Augmented Dickey-Fuller test on a window of the raw data (rows 2 up to, but not
# including, 30000). Element 0 of the result is the test statistic, element 1 the p-value.
adfresult = adfuller(solar_ts.iloc[2:30000])
print(adfresult[0], adfresult[1], sep="\n")

In [None]:
# Year-over-year comparison: sum 'Leistung' per day of the year (rows) and per calendar
# year (columns), then draw one line per year.
# Approach from https://stackoverflow.com/questions/30379789/plot-pandas-data-frame-with-year-over-year-data
pv = pd.pivot_table(solar_ts, index=solar_ts.index.dayofyear, columns=solar_ts.index.year,
                    values='Leistung', aggfunc='sum')
# "Greys" is matplotlib's canonical name for this colormap. The "Grays" spelling is only
# registered as an alias in recent matplotlib releases and raises ValueError on older ones.
pv.plot(cmap="Greys")

In [None]:
# Same year-over-year comparison at monthly resolution: sum 'Leistung' per calendar month
# (rows) and per calendar year (columns), then draw one line per year.
# Approach from https://stackoverflow.com/questions/30379789/plot-pandas-data-frame-with-year-over-year-data
pv = pd.pivot_table(solar_ts, index=solar_ts.index.month, columns=solar_ts.index.year,
                    values='Leistung', aggfunc='sum')
# "Greys" is matplotlib's canonical name for this colormap. The "Grays" spelling is only
# registered as an alias in recent matplotlib releases and raises ValueError on older ones.
pv.plot(cmap="Greys")

In [None]:
# An example of a gap in the data: show the index entries at positions 5660 to 5679.
# TODO: there also seems to be duplicate data in this window that pandas' duplicated()
# does not find - investigate.
solar_ts.index[5660:5680]

In [None]:
# Tally the index entries flagged by duplicated(): True marks a timestamp that repeats
# an earlier entry, False one that does not.
is_repeat = pd.Series(solar_ts.index.duplicated())
is_repeat.value_counts()

In [None]:
# Spacing between consecutive timestamps in the same window (positions 5660 to 5679);
# a gap shows up as a step that is larger than its neighbours.
pd.Series(solar_ts.index[5660:5680]).diff()

In [None]:
# Timestamps of the regular 15-minute grid between the first and last observation that
# are absent from the index - these are the values that need imputation.
expected_index = pd.date_range(start=solar_ts.index.min(), end=solar_ts.index.max(), freq='15Min')
expected_index.difference(solar_ts.index)

In [None]:
# Put the data on a regular 15-minute grid: timestamps missing from the raw data get NaN,
# which is then filled by interpolating along the time index (method="time").
# Only a few values need imputation, so the choice of imputation algorithm does not matter much.
solar_ts = solar_ts.resample("15Min").first().interpolate(method="time")
# Only now can a frequency be inferred from the index; set it explicitly.
solar_ts = solar_ts.asfreq(pd.infer_freq(solar_ts.index))

In [None]:
# Number of timestamps that repeat an earlier index entry; 0 means there are no
# duplicated dates, good!
# (Although this is a bit questionable - see the duplicate-data TODO further up.)
int(solar_ts.index.duplicated().sum())

In [None]:
# Infer the sampling frequency from the index and conform the frame to it, so that the
# index carries an explicit frequency.
inferred_freq = pd.infer_freq(solar_ts.index)
solar_ts = solar_ts.asfreq(inferred_freq)

In [None]:
# Plot the frame again, now on the regular grid with the gaps filled.
solar_ts.plot()

In [None]:
# From here on, work with the 'Leistung' column as a plain Series.
solar_ts_series = solar_ts["Leistung"]

In [None]:
# Normalize: shift by the mean and scale by the standard deviation of the series.
avg = solar_ts_series.mean()
dev = solar_ts_series.std()
solar_ts_series = solar_ts_series.sub(avg).div(dev)
solar_ts_series.plot()

In [None]:
# Remove the trend by taking first differences; the leading NaN that diff() produces is dropped.
# TODO: compare with the approach in the Fourier series video, where they also detrend?
first_difference = solar_ts_series.diff(periods=1)
solar_ts_series = first_difference.dropna()
solar_ts_series.plot()

In [None]:
# Optionally take a second difference if a single one does not remove the trend:
# solar_ts_series = solar_ts_series.diff().dropna()
# solar_ts_series.plot()

In [None]:
# Remove the increasing volatility by scaling every sample with the standard deviation of
# its calendar year (TODO: use a (G)ARCH model here instead).
annual_volatility = solar_ts_series.groupby(solar_ts_series.index.year).std()
# Look up each sample's yearly volatility in one vectorised pass: Index.map accepts the
# Series as a lookup table, which avoids calling a Python lambda (and .loc) per timestamp.
# NOTE: despite its name, this holds one value per sample, not one per day.
annual_vol_per_day = solar_ts_series.index.year.map(annual_volatility)
solar_ts_series_corrected_variance = solar_ts_series / annual_vol_per_day

In [None]:
# Standard deviation of the differenced series for each calendar year.
annual_volatility

In [None]:
# The yearly volatility looked up for every entry of the series index
# (one value per sample, despite the "per_day" in the name).
annual_vol_per_day

In [None]:
# Plot the series after dividing by the per-year volatility.
solar_ts_series_corrected_variance.plot()

In [None]:
# Seasonal component: mean of the variance-corrected series per calendar month
# (ritvik takes monthly means here). Open question: why not group by dayofyear instead?
monthly_mean = solar_ts_series_corrected_variance.groupby(solar_ts_series_corrected_variance.index.month).mean()
# Look up each sample's monthly mean in one vectorised pass: Index.map accepts the Series
# as a lookup table, which avoids calling a Python lambda (and .loc) per timestamp.
# NOTE: despite its name, this holds one value per sample, not one per day.
monthly_mean_per_day = solar_ts_series_corrected_variance.index.month.map(monthly_mean)

In [None]:
# Remove the seasonal component by subtracting, sample by sample, the mean of the
# sample's calendar month. Note: the variable is overwritten, so re-running this cell
# subtracts the means a second time.
solar_ts_series_corrected_variance = solar_ts_series_corrected_variance.sub(monthly_mean_per_day)

In [None]:
# Plot the series after additionally subtracting the per-month means.
solar_ts_series_corrected_variance.plot()

In [None]:
# Augmented Dickey-Fuller test on the transformed series. Only two windows of the data
# are tested, because running the test on all samples makes my RAM explode.
# For each window the test statistic is printed first, then the p-value.
for start, stop in ((3, 30000), (120000, 150000)):
    adfresult = adfuller(solar_ts_series_corrected_variance.iloc[start:stop])
    print(adfresult[0])
    print(adfresult[1])