In [None]:
import pandas as pd

# Load Data

In [None]:
# Setup
solar_ts = pd.read_csv("data/energy_charts.csv", sep=",", header=0)
solar_ts["Datum"] = pd.to_datetime(solar_ts["Datum"], utc=True)

# Split
train_ts = solar_ts[solar_ts["Datum"] < "2023-01-01"]
val_ts = solar_ts[(solar_ts["Datum"] >= "2023-01-01") & (solar_ts["Datum"] < "2024-01-01")]
test_ts = solar_ts[solar_ts["Datum"] >= "2024-01-01"]

train_ts = train_ts.set_index(keys="Datum", drop=True)
val_ts = val_ts.set_index(keys="Datum", drop=True)
test_ts = test_ts.set_index(keys="Datum", drop=True)


# Remove leap years

# Remove Duplicates


In [8]:
# Check for duplicated dates
for ts in [train_ts, val_ts, test_ts]:
    if ts.index.duplicated().any():
        print("Duplicates found:")
        print(ts[ts.index.duplicated(keep=False)])

# Keep the first occurrence of each duplicate
for i, ts in enumerate([train_ts, val_ts, test_ts]):
    ts = ts[~ts.index.duplicated(keep='first')]
    if i == 0:
        train_ts = ts
    elif i == 1:
        val_ts = ts
    else:
        test_ts = ts

# Confirm that there are no duplicates anymore
for ts in [train_ts, val_ts, test_ts]:
    print(pd.Series(ts.index.duplicated()).value_counts())

False    280516
Name: count, dtype: int64
False    35040
Name: count, dtype: int64
False    12038
Name: count, dtype: int64


# Interpolate Missing Timesteps

In [10]:
# Check for continuity
def check_continuity(ts, split):    
    expected_range = pd.date_range(start=ts.index.min(), end=ts.index.max(), freq='15min')
    missing_times = expected_range.difference(ts.index)
    if not missing_times.empty:
        print(f"Missing timestamps detected in {split}: {missing_times}")
    else:
        print(f"No missing timestamps in {split}")

# Check for continuity before reindexing
for split, ts in zip(['train_ts', 'val_ts', 'test_ts'], [train_ts, val_ts, test_ts]):
    check_continuity(ts, split)

# Create a full range of 15-min intervals and reindex, updating the original DataFrames
train_ts = train_ts.reindex(pd.date_range(start=train_ts.index.min(), end=train_ts.index.max(), freq='15min'))
val_ts = val_ts.reindex(pd.date_range(start=val_ts.index.min(), end=val_ts.index.max(), freq='15min'))
test_ts = test_ts.reindex(pd.date_range(start=test_ts.index.min(), end=test_ts.index.max(), freq='15min'))

# Interpolate missing timesteps using time interpolation and reassign the updated DataFrames
train_ts.interpolate(method='time', inplace=True)
val_ts.interpolate(method='time', inplace=True)
test_ts.interpolate(method='time', inplace=True)

# Check for continuity after interpolation
for split, ts in zip(['train_ts', 'val_ts', 'test_ts'], [train_ts, val_ts, test_ts]):
    check_continuity(ts, split)

No missing timestamps in train_ts
No missing timestamps in val_ts
No missing timestamps in test_ts
No missing timestamps in train_ts
No missing timestamps in val_ts
No missing timestamps in test_ts


# Check Missing Values

In [15]:
# Find missing values 



TypeError: 'bool' object is not callable

# Resample from 15min to 1h

In [16]:
# Resample 
def resample_ts(ts, freq='h'):
    return ts.resample(freq).sum()

train_ts = resample_ts(train_ts)
val_ts = resample_ts(val_ts)
test_ts = resample_ts(test_ts)

In [22]:
# Check for continuity after resampling
for split, ts in zip(['train_ts', 'val_ts', 'test_ts'], [train_ts, val_ts, test_ts]):
    check_continuity(ts, split)

# get info
train_ts.info()
val_ts.info()
test_ts.info()

Missing timestamps detected in train_ts: DatetimeIndex(['2014-12-31 23:15:00+00:00', '2014-12-31 23:30:00+00:00',
               '2014-12-31 23:45:00+00:00', '2015-01-01 00:15:00+00:00',
               '2015-01-01 00:30:00+00:00', '2015-01-01 00:45:00+00:00',
               '2015-01-01 01:15:00+00:00', '2015-01-01 01:30:00+00:00',
               '2015-01-01 01:45:00+00:00', '2015-01-01 02:15:00+00:00',
               ...
               '2022-12-31 19:45:00+00:00', '2022-12-31 20:15:00+00:00',
               '2022-12-31 20:30:00+00:00', '2022-12-31 20:45:00+00:00',
               '2022-12-31 21:15:00+00:00', '2022-12-31 21:30:00+00:00',
               '2022-12-31 21:45:00+00:00', '2022-12-31 22:15:00+00:00',
               '2022-12-31 22:30:00+00:00', '2022-12-31 22:45:00+00:00'],
              dtype='datetime64[ns, UTC]', length=210384, freq=None)
Missing timestamps detected in val_ts: DatetimeIndex(['2023-01-01 00:15:00+00:00', '2023-01-01 00:30:00+00:00',
               '2023-01-01 0

In [21]:
test_ts.info()
test_ts.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3010 entries, 2024-01-01 00:00:00+00:00 to 2024-05-05 09:00:00+00:00
Freq: h
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Leistung  3010 non-null   float64
dtypes: float64(1)
memory usage: 111.6 KB


Unnamed: 0,Leistung
count,3010.0
mean,24335.84402
std,38752.120566
min,0.0
25%,0.0
50%,80.75
75%,38862.85
max,180412.1


In [None]:
# Resample
solar_ts = solar_ts.resample('h').sum()

# Set the frequency to the inferred frequency of the DataFrame index
solar_ts = solar_ts.asfreq(pd.infer_freq(solar_ts.index))
solar_ts