In [1]:
# Step up one directory; the cells below use paths relative to the new
# working directory (e.g. "data/london_smart_meters/").
%cd ../

d:\Time-Series\MTSFP


In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import defaultdict
tqdm.pandas()

In [3]:
# Make sure the output directory exists (no error if it already does).
os.makedirs("imgs/ch02", exist_ok=True)

# Dataset root, and the nested folder that holds the per-block CSV files.
source_data = Path("data/london_smart_meters/")
block_data_path = source_data.joinpath("hhblock_dataset", "hhblock_dataset")
# Fail early if the dataset is not where the notebook expects it.
assert block_data_path.is_dir()

In [12]:
# Load one block as a sample and parse its `day` column into datetimes.
block_1 = pd.read_csv(block_data_path / 'block_0.csv', parse_dates=False).assign(
    day=lambda frame: pd.to_datetime(frame['day'], yearfirst=True)
)

In [17]:
# Latest recorded day per household (LCLid); display five of them at random.
# The sample is unseeded, so the rows shown differ between runs.
block_1.groupby('LCLid')['day'].max().sample(5)

LCLid
MAC003686   2014-02-27
MAC003718   2013-10-15
MAC000002   2014-02-27
MAC003826   2014-02-27
MAC004387   2014-02-27
Name: day, dtype: datetime64[ns]

In [4]:
# Find the latest `day` present in any block file. `max_date` is later used as
# the common end date when household series are padded to equal coverage.
max_date = None
for f in tqdm(block_data_path.rglob("*.csv")):
    # Only the `day` column is needed, so skip parsing the half-hour columns.
    days = pd.read_csv(f, usecols=["day"], parse_dates=False)["day"]
    block_max = pd.to_datetime(days, yearfirst=True).max()
    # A block with no usable dates yields NaT; letting it become `max_date`
    # would make every later `>` comparison False, so skip it.
    if pd.isna(block_max):
        continue
    if max_date is None or block_max > max_date:
        max_date = block_max
print(f"Max Date across all block: {max_date}")

0it [00:00, ?it/s]

Max Date across all block: 2014-02-27 00:00:00


In [30]:
# Reshape from one row per (household, day) with hh_* columns to one row per
# (household, day, half-hour), then add the half-hour number as `offset`.
block_1 = (
    block_1.set_index(['LCLid', 'day'])
    .stack()
    .reset_index()
    .rename(columns={"level_2": "hour_block", 0: "energy_consumption"})
    .assign(
        offset=lambda frame: frame['hour_block'].str.replace('hh_', '').astype(int)
    )
)

In [32]:
# Peek at the long-form frame: one row per household, day and half-hour block.
block_1.head()

Unnamed: 0,LCLid,day,hour_block,energy_consumption,offset
0,MAC000002,2012-10-13,hh_0,0.263,0
1,MAC000002,2012-10-13,hh_1,0.269,1
2,MAC000002,2012-10-13,hh_2,0.275,2
3,MAC000002,2012-10-13,hh_3,0.256,3
4,MAC000002,2012-10-13,hh_4,0.211,4


In [5]:
def preprocess_compact(x, end_date=None):
    """Turn one household's long-form readings into a gap-free half-hourly array.

    A scaffold with 48 half-hour slots for every day from the household's first
    day up to ``end_date`` is left-joined with the readings, so slots without a
    reading come out as NaN at their correct position.

    Parameters
    ----------
    x : pandas.DataFrame
        Readings of a single household with columns ``LCLid``, ``day``
        (datetime), ``hour_block`` (``"hh_0"`` .. ``"hh_47"``) and
        ``energy_consumption``.
    end_date : datetime-like, optional
        Last day of the series. Defaults to the notebook-level ``max_date``.

    Returns
    -------
    tuple
        ``(start_date, name, ts, len_ts)``: first day, household id, the
        chronologically ordered values as a NumPy array, and its length.
    """
    if end_date is None:
        # Fall back to the global computed earlier in the notebook.
        end_date = max_date
    start_date = x["day"].min()
    name = x["LCLid"].unique()[0]

    dr = pd.date_range(start=start_date, end=end_date, freq="1D")
    # An empty day x half-hour grid, unstacked into one row per (slot, day).
    dr = (
        pd.DataFrame(columns=[f"hh_{i}" for i in range(48)], index=dr)
        .unstack()
        .reset_index()
    )
    dr.columns = ["hour_block", "day", "_"]
    dr = dr.merge(x, on=["hour_block", "day"], how="left")
    # Sort on the scaffold's own slot number. The merged `offset` column is NaN
    # for every missing reading, and NaN sorts last, which would push each gap
    # to the end of its day and shift the following readings to earlier slots.
    slot = dr["hour_block"].str.replace("hh_", "", regex=False).astype(int)
    dr = dr.assign(_slot=slot).sort_values(["day", "_slot"])
    ts = dr["energy_consumption"].values
    len_ts = len(ts)
    return start_date, name, ts, len_ts


def load_process_block_compact(
    block_df, freq="30min", ts_identifier="series_name", value_name="series_value"
):
    """Collapse a long-form block into one row per household.

    Each ``LCLid`` group is passed to ``preprocess_compact``; the results are
    gathered into a frame with the columns ``ts_identifier`` (household id),
    ``start_timestamp``, ``frequency`` (``freq`` repeated on every row),
    ``value_name`` (the value array) and ``series_length``.
    """
    names, start_dates, series, lengths = [], [], [], []
    for _, household_df in block_df.groupby("LCLid"):
        first_day, household, values, n_values = preprocess_compact(household_df)
        series.append(values)
        start_dates.append(first_day)
        names.append(household)
        lengths.append(n_values)

    return pd.DataFrame(
        {
            ts_identifier: names,
            "start_timestamp": start_dates,
            "frequency": freq,
            value_name: series,
            "series_length": lengths,
        }
    )

In [6]:
# Compact form of the sample block: one row per household.
# NOTE(review): this needs `block_1` from the cells above; the NameError
# recorded below presumably came from running it after `block_1` was deleted.
block1_compact = load_process_block_compact(block_1, freq='30min', ts_identifier='LCLid',
                                            value_name='energy_consumption')

NameError: name 'block_1' is not defined

In [80]:
# First rows of the compact frame: each household's series sits in one cell.
block1_compact.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144
1,MAC000246,2011-12-04,30min,"[0.175, 0.098, 0.144, 0.065, 0.071, 0.037, 0.0...",39216
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344


In [84]:
# Per-column memory footprint of the compact frame (deep=True), plus the total.
compact_usage = block1_compact.memory_usage(deep=True)
display(compact_usage)
print(f'Total: {compact_usage.sum() / 1024 ** 2} MB')

Index                  128
LCLid                 3300
start_timestamp        400
frequency             3100
energy_consumption    6000
series_length          400
dtype: int64

Total: 0.0127105712890625 MB


In [7]:
def preprocessed_expanded(x):
    """Expand one household's readings onto a complete day x half-hour grid.

    The grid runs from the first to the last day present in ``x``. Readings are
    left-joined on ``hour_block`` and ``day``, so grid rows without a reading
    carry NaN in every column that comes from ``x``. A ``series_length`` column
    holding the total number of grid rows is added.
    """
    first_day, last_day = x["day"].min(), x["day"].max()
    all_days = pd.date_range(start=first_day, end=last_day, freq="1D")
    slot_labels = [f"hh_{i}" for i in range(48)]

    # Empty grid, unstacked into one row per (slot, day) combination.
    grid = pd.DataFrame(columns=slot_labels, index=all_days).unstack().reset_index()
    grid.columns = ["hour_block", "day", "_"]

    expanded = grid.merge(x, on=["hour_block", "day"], how="left")
    expanded["series_length"] = len(expanded)
    return expanded


def load_process_block_expanded(block_df, freq="30min"):
    """Expand a long-form block into one row per household and half-hour.

    Every ``LCLid`` group is put on a complete grid by
    ``preprocessed_expanded``. The pieces are concatenated, a ``timestamp`` is
    built from ``day`` plus 30 minutes per half-hour slot, and the helper
    columns are dropped.

    Parameters
    ----------
    block_df : pandas.DataFrame
        Long-form readings with columns ``LCLid``, ``day``, ``hour_block`` and
        ``energy_consumption``.
    freq : str
        Value written to the ``frequency`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``LCLid`` and ``timestamp``.
    """
    grps = block_df.groupby("LCLid")
    all_series = []
    for household_id, df in tqdm(grps, leave=False):
        ts = preprocessed_expanded(df)
        # Rows the grid added for missing readings come out of the left merge
        # with NaN in `LCLid`. Stamp the group key on every row so gaps stay
        # attached to their household and sort into chronological position.
        ts["LCLid"] = household_id
        all_series.append(ts)
    block_df = pd.concat(all_series)
    # Recompute the slot number from `hour_block`: the merged `offset` is NaN
    # on the gap rows.
    block_df["offset"] = (
        block_df["hour_block"].str.replace("hh_", "", regex=False).astype(int)
    )
    block_df["timestamp"] = (
        block_df["day"] + pd.to_timedelta(block_df["offset"] * 30, unit='m')
    )
    block_df["frequency"] = freq
    block_df = block_df.sort_values(["LCLid", "timestamp"])
    block_df = block_df.drop(columns=["_", "hour_block", "offset", "day"])
    return block_df

In [114]:
# Expanded form of the sample block: one row per household and half-hour.
block1_expanded = load_process_block_expanded(block_1, freq='30min')

  0%|          | 0/50 [00:00<?, ?it/s]

In [116]:
# First rows of the expanded frame, for comparison with the compact form.
block1_expanded.head()

Unnamed: 0,LCLid,energy_consumption,series_length,timestamp,frequency
0,MAC000002,0.263,24144,2012-10-13 00:00:00,30min
503,MAC000002,0.269,24144,2012-10-13 00:30:00,30min
1006,MAC000002,0.275,24144,2012-10-13 01:00:00,30min
1509,MAC000002,0.256,24144,2012-10-13 01:30:00,30min
2012,MAC000002,0.211,24144,2012-10-13 02:00:00,30min


In [117]:
# Per-column memory footprint of the expanded frame, plus the total in MB.
expanded_usage = block1_expanded.memory_usage()
display(expanded_usage)
print(f"Total: {expanded_usage.sum()/1024**2} MB")

Index                 9834240
LCLid                 9834240
energy_consumption    9834240
series_length         9834240
timestamp             9834240
frequency             9834240
dtype: int64

Total: 56.27197265625 MB


In [118]:
# Drop the single-block exploration frames; later cells rebuild what they need.
del block1_expanded, block_1, block1_compact

In [8]:
# Convert every block file to the compact form and stack the results.
block_df_1 = []
for file in tqdm(sorted(block_data_path.glob("*.csv")), desc="Processing Blocks..."):
    block_df = (
        pd.read_csv(file, parse_dates=False)
        .assign(day=lambda frame: pd.to_datetime(frame["day"], yearfirst=True))
        # Keep only readings from 2012 onwards.
        .loc[lambda frame: frame["day"] >= "2012-01-01"]
        # Wide (hh_* columns) to long (one row per half-hour).
        .set_index(["LCLid", "day"])
        .stack()
        .reset_index()
        .rename(columns={"level_2": "hour_block", 0: "energy_consumption"})
        .assign(
            offset=lambda frame: frame["hour_block"]
            .str.replace("hh_", "")
            .astype(int)
        )
    )
    block_df_1.append(
        load_process_block_compact(
            block_df,
            freq="30min",
            ts_identifier="LCLid",
            value_name="energy_consumption",
        )
    )
hhblock_df = pd.concat(block_df_1)
# The per-block list is no longer needed once concatenated.
del block_df_1

Processing Blocks...:   0%|          | 0/112 [00:00<?, ?it/s]

In [9]:
# Drop the last block's long-form frame left over from the loop above.
del block_df

In [10]:
# Attach household metadata; `validate` raises if LCLid is duplicated on
# either side of the join.
household_info = pd.read_csv(source_data / "informations_households.csv")
hhblock_df = pd.merge(hhblock_df, household_info, on="LCLid", validate="one_to_one")

In [11]:
# Show one random household row after the metadata merge (unseeded).
hhblock_df.sample()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file
4350,MAC003008,2012-05-16,30min,"[0.06, 0.099, 0.11, 0.062, 0.054, 0.044, 0.043...",31344,Std,ACORN-L,Adversity,block_77


In [12]:
import json

# Each `energy_consumption` cell is a NumPy array with tens of thousands of
# values. `to_csv` writes such a cell as `str(array)`, which NumPy abbreviates
# with "..." for long arrays, so the saved series could not be recovered.
# Write every array as a complete JSON list instead (missing values become
# `NaN`); read a cell back with `json.loads`.
hhblock_df.assign(
    energy_consumption=hhblock_df["energy_consumption"].map(
        lambda values: json.dumps(np.asarray(values, dtype=float).tolist())
    )
).to_csv(source_data / "hhblock_df.csv")

In [9]:
# Load the bank-holiday table and index it by the parsed holiday date.
bank_holidays = (
    pd.read_csv(source_data / "uk_bank_holidays.csv", parse_dates=False)
    .assign(
        **{
            "Bank holidays": lambda frame: pd.to_datetime(
                frame["Bank holidays"], yearfirst=True
            )
        }
    )
    .set_index("Bank holidays")
)

In [37]:
# Show one random holiday row (unseeded).
bank_holidays.sample()

Unnamed: 0_level_0,Type
Bank holidays,Unnamed: 1_level_1
2012-06-04,Good Friday


In [10]:
# Put the holiday table on a regular 30-minute index. `asfreq` does not fill,
# so timestamps that were not in the original table hold NaN until the next cell.
bank_holidays = bank_holidays.resample("30min").asfreq()

In [11]:
# Within each calendar date, carry the holiday label forward over the later
# half-hours; whatever is still empty gets the "NO_HOLIDAY" label.
bank_holidays = (
    bank_holidays.groupby(bank_holidays.index.date)
    .ffill()
    .fillna("NO_HOLIDAY")
    .rename_axis('datetime')
)

In [12]:
# Load the hourly weather table and index it by the parsed `time` column.
weather_hourly = (
    pd.read_csv(source_data / "weather_hourly_darksky.csv", parse_dates=False)
    .assign(time=lambda frame: pd.to_datetime(frame["time"], yearfirst=True))
    .set_index("time")
)

In [41]:
# First rows of the hourly weather table as loaded.
weather_hourly.head()

Unnamed: 0_level_0,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-11-11 00:00:00,5.97,104,10.24,8.86,1016.76,10.24,2.77,rain,partly-cloudy-night,0.91,Partly Cloudy
2011-11-11 01:00:00,4.88,99,9.76,8.83,1016.63,8.24,2.95,rain,partly-cloudy-night,0.94,Partly Cloudy
2011-11-11 02:00:00,3.7,98,9.46,8.79,1016.36,7.76,3.17,rain,partly-cloudy-night,0.96,Partly Cloudy
2011-11-11 03:00:00,3.12,99,9.23,8.63,1016.28,7.44,3.25,rain,fog,0.96,Foggy
2011-11-11 04:00:00,1.85,111,9.26,9.21,1015.98,7.24,3.7,rain,fog,1.0,Foggy


In [13]:
# Upsample the weather to 30-minute steps; each added half-hour row repeats
# the preceding observation (forward fill).
weather_hourly = weather_hourly.resample('30min').ffill()
weather_hourly.head()

Unnamed: 0_level_0,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-11-01 00:00:00,13.63,160,13.49,11.48,1008.14,13.49,3.11,rain,clear-night,0.88,Clear
2011-11-01 00:30:00,13.63,160,13.49,11.48,1008.14,13.49,3.11,rain,clear-night,0.88,Clear
2011-11-01 01:00:00,13.26,154,12.73,11.58,1007.88,12.73,3.08,rain,partly-cloudy-night,0.93,Partly Cloudy
2011-11-01 01:30:00,13.26,154,12.73,11.58,1007.88,12.73,3.08,rain,partly-cloudy-night,0.93,Partly Cloudy
2011-11-01 02:00:00,12.94,161,13.65,12.14,1007.09,13.65,3.71,rain,clear-night,0.91,Clear


In [14]:
def map_weather_holidays(row, holidays_df=None, weather_df=None):
    """Attach holiday and weather arrays, aligned to the series, to one row.

    A timestamp index is rebuilt from the row's ``start_timestamp``,
    ``series_length`` and ``frequency``. The holiday and weather tables are
    left-joined onto it, and each resulting column is stored on the row as an
    array of ``series_length`` values.

    Parameters
    ----------
    row : pandas.Series
        One compact-form row with ``start_timestamp``, ``series_length`` and
        ``frequency``.
    holidays_df : pandas.DataFrame, optional
        Datetime-indexed frame with a ``Type`` column. Defaults to the
        notebook-level ``bank_holidays``.
    weather_df : pandas.DataFrame, optional
        Datetime-indexed weather frame. Defaults to the notebook-level
        ``weather_hourly``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with a ``holidays`` entry and one entry per weather
        column. The input row is left untouched.

    Raises
    ------
    AssertionError
        If a join changes the number of rows.
    """
    if holidays_df is None:
        holidays_df = bank_holidays
    if weather_df is None:
        weather_df = weather_hourly

    n_steps = row["series_length"]
    date_range = pd.date_range(
        row["start_timestamp"], periods=n_steps, freq=row["frequency"]
    )
    std_df = pd.DataFrame(index=date_range)
    # Timestamps with no entry in the holiday table become "NO_HOLIDAY".
    holidays = std_df.join(holidays_df, how="left").fillna("NO_HOLIDAY")
    weather = std_df.join(weather_df, how="left")
    assert len(holidays) == n_steps, "holiday join changed the series length"
    assert len(weather) == n_steps, "weather join changed the series length"

    # Write to a copy: the row object belongs to `apply`, and mutating it
    # changes the caller's data as a side effect.
    row = row.copy()
    row["holidays"] = holidays["Type"].values
    for col in weather:
        row[col] = weather[col].values
    return row


In [16]:
# Add the holiday and weather arrays to every household row.
# NOTE(review): the recorded run of this cell ended in a MemoryError (see the
# output below); processing the frame in smaller pieces may be needed - TODO confirm.
hhblock_df = hhblock_df.progress_apply(map_weather_holidays, axis=1)

  0%|          | 0/5560 [00:00<?, ?it/s]

MemoryError: Unable to allocate 1.17 MiB for an array with shape (5, 30672) and data type float64