# Read first


In [1]:
import json
import glob
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from IPython.core.interactiveshell import InteractiveShell
from datetime import date, timedelta, datetime
from tqdm.auto import tqdm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook.
pd.options.mode.chained_assignment = None


# Default figure size (width, height) for all matplotlib plots.
plt.rcParams['figure.figsize'] = [20, 12]


# Make Jupyter display the value of every top-level expression in a cell,
# not only the last one, without explicit print statements
# (see https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/)
InteractiveShell.ast_node_interactivity = "all"

# Helper Functions

In [3]:
def clean_weather_data(data, date):
    """Clean one day of scraped weather observations.

    Parameters
    ----------
    data : pandas.DataFrame or sequence of pandas.DataFrame
        Parsed observation table. It must provide the columns "Time",
        "Temperature", "Humidity" and "Condition". If these cannot be
        selected from ``data`` directly, ``data[0]`` is tried instead.
    date : str or datetime-like
        The day the table was requested for.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Datetime", with the lower-cased columns "temperature"
        (degrees Celsius, truncated to an integer), "humidity" (fraction,
        i.e. percent / 100) and "condition".

    Raises
    ------
    Exception
        Re-raises whatever the column selection raised, after printing
        ``date`` so the failing day can be identified.
    """
    # Select relevant columns (Condition is currently not used)
    columns = ["Time", "Temperature", "Humidity", "Condition"]

    try:
        selected_data = data[columns].dropna()
    except Exception:
        try:
            selected_data = data[0][columns].dropna()
        except Exception:
            print(date)
            raise

    # The first two rows belong to the previous day. Cap that number by the
    # row count so that tables with fewer than two rows do not produce a date
    # list that is longer than the table.
    # NOTE(review): this assumes dropna() above did not remove one of the
    # first two rows -- confirm against the scraped tables.
    n_rows = len(selected_data)
    n_previous = min(2, n_rows)
    current_day = str(pd.to_datetime(date).date())
    previous_day = str((pd.to_datetime(date) - timedelta(1)).date())
    selected_data["Date"] = ([previous_day] * n_previous
                             + [current_day] * (n_rows - n_previous))

    # Combine the dates and the times into one timestamp
    selected_data["Datetime"] = pd.to_datetime(selected_data.Date + " " +
                                               selected_data.Time)

    # Drop now unnecessary columns and set index
    selected_data = (selected_data
                     .drop(["Time", "Date"], axis=1)
                     .set_index("Datetime"))

    # Convert Temperature from Fahrenheit to Celsius (truncated to int)
    fahrenheit = selected_data.Temperature.str.rstrip('F').astype(int)
    selected_data["Temperature"] = ((fahrenheit - 32) * (5/9)).astype(int)

    # Convert Humidity into decimal number
    selected_data["Humidity"] = (selected_data.Humidity.str.rstrip(' %')
                                 .astype(int) / 100)

    # Assign a real Index instead of a one-shot map iterator
    selected_data.columns = selected_data.columns.str.lower()
    return selected_data



def scrape_weather(start, end, url, timeout=2000):
    """Manually scrape Wunderground for historical weather data.

    One page per day is opened in a Chrome session, the observation table is
    parsed with pandas and cleaned with ``clean_weather_data``.

    Parameters
    ----------
    start, end : str or datetime-like
        First and last day to scrape (both inclusive, as in
        ``pd.date_range``).
    url : str
        Base URL; the ISO date of each day is appended to it.
    timeout : int or float, default 2000
        Maximum number of seconds to wait for a ``<table>`` element to appear
        on each page.

    Returns
    -------
    pandas.DataFrame
        The cleaned observations of all days, concatenated in order.

    Raises
    ------
    Exception
        Re-raises the parsing error of a day (after printing that day) if
        neither the second nor the first table on the page can be parsed.
    """
    # Local import: `io` is not among the notebook's top-level imports.
    from io import StringIO

    days = pd.date_range(start=start, end=end)

    driver = webdriver.Chrome('misc/chromedriver')

    weather = []

    # try/finally so the browser is shut down even when a page fails;
    # quit() ends the whole session rather than only closing the window.
    try:
        for day in tqdm(days):
            day_str = str(day.date())
            driver.get(url + day_str)
            tables = WebDriverWait(driver, timeout).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table")))

            # Sometimes the table structure is different, so the first table
            # is tried when the second one cannot be parsed.
            # The HTML is wrapped in StringIO because passing literal HTML
            # strings to read_html is deprecated.
            try:
                table = pd.read_html(
                    StringIO(tables[1].get_attribute('outerHTML')))
            except Exception:
                try:
                    table = pd.read_html(
                        StringIO(tables[0].get_attribute('outerHTML')))
                except Exception:
                    print(day_str)
                    raise

            weather.append(clean_weather_data(table[0], day_str))
    finally:
        driver.quit()

    df = pd.concat(weather)
    return df

In [4]:
def read_dir(path, name="value", usecols=None, header=None):
    """Read all per-day CSV files of a directory into one time-indexed frame.

    Every ``*.csv`` file in ``path`` is expected to be named after the day it
    covers (the text before the first dot is passed to ``pd.date_range`` as
    the start) and to hold one row per second of that day (86400 rows).

    Parameters
    ----------
    path : str
        Directory containing the CSV files. Backslashes are treated as path
        separators on every platform.
    name : str, default "value"
        Name given to the single resulting column.
    usecols, header
        Passed through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column called ``name``, indexed by a 1-second "time" index.
    """
    print(f"Reading directory {path} ...", flush=True)

    # Normalise Windows separators so the same call works on POSIX systems.
    # Files are sorted so the concatenated index is in chronological order
    # (assuming day-named files sort chronologically).
    search_dir = path.replace("\\", "/")
    all_files = sorted(glob.glob(search_dir + "/*.csv"))

    df_list = []

    for filename in tqdm(all_files):
        df = pd.read_csv(filename,
                         header=header,
                         usecols=usecols)

        # Extract day from filename and create Datetime Index. glob may
        # return either separator, so both are handled before splitting.
        base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
        day = base_name.split('.')[0]
        idx = pd.Series(pd.date_range(start=day,
                                      freq="1s",
                                      periods=86400),
                        name="time")
        # Raises if the file does not have exactly 86400 rows.
        df.set_index(idx, inplace=True)
        df_list.append(df)

    out_df = pd.concat(df_list, axis=0, ignore_index=False)
    out_df.columns = [name]
    print("Directory processed successfully.\n")
    return out_df

In [5]:
def save_compressed(file, path, name, key):
    """Write ``file`` to ``<path><name>.h5`` as compressed HDF5.

    ``path`` and ``name`` are concatenated as-is, so ``path`` has to end with
    a separator. An existing file is overwritten (mode "w"). Progress is
    reported on stdout; any error is announced and then re-raised.
    """
    print(f"Saving {name}.h5...")
    hdf_options = {"key": key, "mode": "w", "complevel": 5}
    try:
        file.to_hdf(path_or_buf=path + name + ".h5", **hdf_options)
        print("File saved successfully.")
    except Exception as error:
        print("Something went wrong with saving. Error:")
        raise error

In [6]:
def read_compressed(path):
    """Load and return the object stored in the HDF5 file at ``path``.

    Prints a progress line first. If reading fails, a hint naming ``path`` is
    printed and the original exception is re-raised.
    """
    print("Reading file...")
    try:
        return pd.read_hdf(path)
    except Exception as error:
        hint = (f"Something went wrong while reading the file. Make sure {path} "
                "contains a h5 with that name.")
        print(hint)
        raise error

In [7]:
def resample_and_export(df, sampling_rate, dataset, name):
    """Resample ``df`` to ``sampling_rate`` (mean) and save the result.

    The resampled frame is written via ``save_compressed`` to
    ``data/<dataset>/preprocessed/<name>_<sampling_rate>.h5`` under the HDF
    key ``name``. Nothing is returned.
    """
    print(f"Resampling {name} to {sampling_rate}...")
    resampled = df.resample(sampling_rate).mean()
    target_dir = "/".join(("data", dataset, "preprocessed", ""))
    file_stem = "_".join((name, sampling_rate))
    save_compressed(file=resampled, path=target_dir, name=file_stem, key=name)

In [7]:
# Appliance columns of interest. An earlier list (mains, fridge, oven,
# microwave, washing machine, toaster) was assigned right before this one and
# immediately overwritten; that dead assignment has been removed.
columns = ["fridge", "washing machine", "microwave", "laptop computer", "television"]

# DRED



## Mains

In [8]:
# Load the raw aggregated readings of DRED; the second line of the file
# (header=1) provides the column names.
mains_DRED = pd.read_csv(
    filepath_or_buffer="data/DRED/raw/Aggregated_data.csv", 
    header=1,
    low_memory=False,
    index_col=None
)

# Drop incomplete rows, then build the "time" column from the first
# ("Unnamed: 0") column. The last six characters of every stamp are cut off
# before parsing (presumably a UTC offset such as "+02:00" -- TODO confirm).
# Stamps that do not match the format become NaT because of errors='coerce'.
mains_DRED.dropna(inplace=True)
mains_DRED["time"] = pd.to_datetime(
    mains_DRED["Unnamed: 0"].astype(str).str[:-6], 
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Replace the raw stamp column by the parsed one and use it as index.
mains_DRED.drop(["Unnamed: 0"], axis=1, inplace=True)
mains_DRED.set_index("time", inplace=True)

In [9]:
# Persist the time-indexed mains data as
# data/DRED/preprocessed/mains_preprocessed.h5 (HDF key "mains").
save_compressed(
    file=mains_DRED,
    path="data/DRED/preprocessed/",
    name="mains_preprocessed",
    key="mains"
)

Saving mains_preprocessed.h5...
File saved successfully.


## Appliances

In [10]:
# Load the raw per-appliance readings of DRED; the second line of the file
# (header=1) provides the column names.
appliances_DRED = pd.read_csv(
    filepath_or_buffer="data/DRED/raw/Appliance_data.csv", 
    header=1,
    low_memory=False,
    index_col=None
)

# Same treatment as the mains data: drop incomplete rows, cut the last six
# characters off each stamp (presumably a UTC offset -- TODO confirm) and
# parse the rest; unparseable stamps become NaT (errors='coerce').
appliances_DRED.dropna(inplace=True)
appliances_DRED["time"] = pd.to_datetime(
    appliances_DRED["Unnamed: 0"].astype(str).str[:-6], 
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce'
)

# Replace the raw stamp column by the parsed one and use it as index.
appliances_DRED.drop(["Unnamed: 0"], axis=1, inplace=True)
appliances_DRED.set_index("time", inplace=True)

In [11]:
# Persist the time-indexed appliance data as
# data/DRED/preprocessed/appliances_preprocessed.h5 (HDF key "appliances").
save_compressed(
    file=appliances_DRED,
    path="data/DRED/preprocessed/",
    name="appliances_preprocessed",
    key="appliances"
)

Saving appliances_preprocessed.h5...
File saved successfully.


## Weather data

In [16]:
# Scrape the daily weather history pages for Rotterdam (station EHRD) for
# every day from 2015-07-05 to 2015-12-05. This opens a browser session and
# loads one page per day, so the cell is slow.
weather_df_DRED = scrape_weather(
    start="2015-07-05",
    end="2015-12-05",
    url='https://www.wunderground.com/history/daily/nl/rotterdam/EHRD/date/'
)

  0%|          | 0/154 [00:00<?, ?it/s]

In [18]:
# Keep a copy of the scraped weather exactly as returned by scrape_weather.
save_compressed(
    file=weather_df_DRED,
    path="data/DRED/preprocessed/",
    name="weather_scraped",
    key="weather"
)

# Resample the scraped observations to 1s intervals to match the mains
# frequency for merging later. Gaps are simply filled forward.
# Only the numeric columns are resampled, so the (string) condition column is
# dropped explicitly here instead of relying on median() to skip it silently.
# .ffill() replaces the deprecated fillna(method='ffill').

weather_df_DRED_full = (
    weather_df_DRED
    .select_dtypes(include="number")
    .resample('1s')
    .median()
    .ffill()
)

save_compressed(
    file=weather_df_DRED_full,
    path="data/DRED/preprocessed/",
    name="weather_preprocessed_1s",
    key="weather"
)

Saving weather_scraped.h5...
File saved successfully.
Saving weather_preprocessed_1s.h5...
File saved successfully.


## Occupancy

In [19]:
# We use the smartphone's Bluetooth data, which indicates its connection to
# various beacons in the house, to determine when the household is occupied.

bt_rssi_DRED = pd.read_csv(
    filepath_or_buffer="data/DRED/raw/BT_rssi.csv", 
    header=0,
    low_memory=False,
    index_col=None
)

In [20]:
# We only need a connection to any beacon to consider the house occupied, so
# one row per timestamp is enough and every remaining row counts as occupied.

bt_rssi_DRED.drop_duplicates(subset="Time", inplace=True)
bt_rssi_DRED["Occupied"] = 1

# Parse the stamps into a "time" index; stamps that do not match the format
# become NaT (errors='coerce').
bt_rssi_DRED["time"] = pd.to_datetime(bt_rssi_DRED["Time"],
                         format='%Y-%m-%d %H:%M:%S',
                         errors='coerce')
bt_rssi_DRED.set_index('time', inplace=True)


# Only the "Occupied" flag is needed from here on; drop the beacon details.
bt_rssi_DRED.drop(
    labels=["Time", "RSSI", "Temperature", "BatteryLevel", "Id", 
            "Proximity", "Location"],
    axis=1,
    inplace=True
)

In [21]:
# Resample the data to 1 second intervals
# Fill all gaps shorter than the minimum duration (1h), otherwise set Occupied=0
# This gap is set manually and tries to minimize false negatives (cases where
# the house is marked as unoccupied).
# Selecting gaps by length: https://stackoverflow.com/a/30538371

# Threshold in rows; at the 1s sampling used below this is 3600 s = 1 h.
min_unoccupied_length = 3600

# Seconds without any Bluetooth record become NaN after resampling.
bt_rssi_DRED_res = bt_rssi_DRED.resample("1s").median()
mask = bt_rssi_DRED_res.copy()
# Number consecutive runs: the counter increases whenever a row switches
# between "has a value" and "is NaN".
grp = ((mask.notnull() != mask.shift().notnull()).cumsum())
grp['ones'] = 1

# A second is occupied (1) if it belongs to a run shorter than the threshold
# or if a Bluetooth record exists for it; otherwise it is unoccupied (0).
occupancy_DRED = pd.DataFrame(
    data = ((grp.groupby("Occupied")['ones'].transform('count')
             < min_unoccupied_length)| bt_rssi_DRED_res["Occupied"].notnull()
           ).astype(int),
    columns=["occupied"])

## Join everything together

In [22]:
# Join mains, appliances, weather and occupancy on their time index and keep
# only the timestamps for which every column has a value.
# No lsuffix/rsuffix here: pandas does not support suffixes when `other` is a
# list (older versions silently ignored them, newer versions raise a
# ValueError), so dropping them keeps the result and lets the cell run.
data_DRED = mains_DRED.join(
    other=[appliances_DRED, weather_df_DRED_full, occupancy_DRED]
).dropna()

In [23]:
# Persist the joined DRED data as
# data/DRED/preprocessed/dred_preprocessed.h5 (HDF key "data").
save_compressed(
    file=data_DRED,
    path="data/DRED/preprocessed/",
    name="dred_preprocessed",
    key="data"
)

Saving dred_preprocessed.h5...
File saved successfully.


## Export aggregated data

In [24]:
# Aggregate the joined DRED data to 6-second means and save the result as
# data/DRED/preprocessed/dred_6s.h5 (HDF key "dred").
data_DRED_6s = data_DRED.resample("6s").mean()

save_compressed(
    file=data_DRED_6s,
    path="data/DRED/preprocessed/",
    name="dred_6s",
    key="dred"
)

Saving dred_6s.h5...
File saved successfully.


In [25]:
# Aggregate the joined DRED data to 1-minute means and save the result as
# data/DRED/preprocessed/dred_1min.h5 (HDF key "dred").
data_DRED_1min = data_DRED.resample("1min").mean()

save_compressed(
    file=data_DRED_1min,
    path="data/DRED/preprocessed/",
    name="dred_1min",
    key="dred"
)

Saving dred_1min.h5...
File saved successfully.


## Cleanup

In [26]:
# Remove all DRED intermediates from the namespace. Everything needed later
# has been written to data/DRED/preprocessed/ by the cells above.
# NOTE: re-running any DRED cell after this one requires re-running the DRED
# section from its start.
del (
    mains_DRED,
    appliances_DRED,
    weather_df_DRED, weather_df_DRED_full,
    bt_rssi_DRED, bt_rssi_DRED_res, mask, grp,
    occupancy_DRED,
    data_DRED,
    data_DRED_6s,
    data_DRED_1min
)

# ECO

In [27]:
# Reusable function to process occupancy in ECO dataset

def process_occupancy(occ, min_unoccupied_length=3600):
    """Turn melted ECO occupancy data into a time-indexed 0/1 frame.

    Parameters
    ----------
    occ : pandas.DataFrame
        Melted occupancy table with the columns "Unnamed: 0" (day, parsed as
        ``%d-%b-%Y``), "variable" (time of day, optionally wrapped in single
        quotes, parsed as ``%H:%M:%S``) and "value" (1 = occupied).
        The input is not modified.
    min_unoccupied_length : int, default 3600
        Unoccupied runs with fewer rows than this are filled as occupied.
        NOTE(review): this equals one hour only if the data has one row per
        second -- confirm for the input files.

    Returns
    -------
    pandas.DataFrame
        Sorted by its "time" index, with a single integer column "occupied".
    """
    # Create time index
    occupancy = occ.copy()
    occupancy["time"] = pd.to_datetime(
        occupancy["Unnamed: 0"].astype(str) + ' '
        + occupancy["variable"].str.strip("'"),
        format="%d-%b-%Y %H:%M:%S")

    occupancy = (occupancy
                 .set_index("time")
                 .drop(["Unnamed: 0", "variable"], axis=1)
                 .sort_index())

    # For consistency with DRED, unoccupied gaps shorter than
    # min_unoccupied_length rows are filled as occupied.
    # Runs are identified by changes of the occupied flag itself. Grouping by
    # null-ness (as done for DRED, where gaps are NaN) would put all rows
    # into one run here, because unoccupied is encoded as 0, not as NaN.
    is_occupied = occupancy["value"] == 1
    run_id = is_occupied.astype(int).diff().ne(0).cumsum()
    # .values: group positionally so duplicate timestamps cannot break the
    # alignment between the flag and its run number.
    run_length = is_occupied.groupby(run_id.values).transform("count")

    filled = is_occupied | (run_length < min_unoccupied_length)
    return filled.astype(int).to_frame("occupied")

## Weather

In [54]:
# We only scrape the days for which we also have occupancy data.
# Summer period: 2012-07-15 to 2012-08-25, station LSZH.
weather_summer = scrape_weather(start="2012-07-15",
                                end="2012-08-25",
                                url="https://www.wunderground.com/history/daily/ch/r%C3%BCmlang/LSZH/date/")

  0%|          | 0/42 [00:00<?, ?it/s]

In [55]:
# Winter period: 2012-11-24 to 2013-01-09, same station (LSZH) as the summer
# scrape.
weather_winter = scrape_weather(start="2012-11-24",
                                end="2013-01-09",
                                url="https://www.wunderground.com/history/daily/ch/r%C3%BCmlang/LSZH/date/")

  0%|          | 0/47 [00:00<?, ?it/s]

In [56]:
# Combine both scraped periods and keep a copy of the raw result.
weather_df_ECO = pd.concat([weather_summer, weather_winter])
save_compressed(
    file=weather_df_ECO, 
    path="data/ECO/preprocessed/",
    name="weather_scraped",
    key="weather"
)

# Resample to 1s intervals and fill gaps forward. Only the numeric columns
# are resampled, so the (string) condition column is dropped explicitly
# instead of relying on median() to skip it silently. .ffill() replaces the
# deprecated fillna(method='ffill').
# Note that the forward fill also spans the period between the summer and
# the winter scrape.
weather_df_ECO_full = (
    weather_df_ECO
    .select_dtypes(include="number")
    .resample('1s')
    .median()
    .ffill()
)
save_compressed(
    file=weather_df_ECO_full, 
    path="data/ECO/preprocessed/",
    name="weather_preprocessed_1s",
    key="weather"
)

Saving weather_scraped.h5...
File saved successfully.
Saving weather_preprocessed_1s.h5...
File saved successfully.


In [57]:
# The raw scrapes are no longer needed; weather_df_ECO_full is kept because
# the per-house joins below use it.
del(weather_df_ECO, weather_summer, weather_winter)

## ECO House 1

### Mains

In [58]:
# Read the per-day smart-meter CSVs of house 1 (first column only) into a
# single "mains" column and store the result as mains01_preprocessed.h5.
mains_ECO01 = read_dir(
    path=r'data\ECO\House_1\01_sm_csv',
    name="mains",
    usecols=[0]
)

save_compressed(
    file=mains_ECO01,
    path="data/ECO/preprocessed/",
    name="mains01_preprocessed",
    key="mains"
)

Reading directory data\ECO\House_1\01_sm_csv ...


  0%|          | 0/179 [00:00<?, ?it/s]

Directory processed successfully.

Saving mains01_preprocessed.h5...
File saved successfully.


### Fridge

In [59]:
# Read the per-day CSVs of house 1, plug folder 01, into a "fridge" column
# and store the result as fridge01_preprocessed.h5.
fridge_ECO01 = read_dir(
    path=r'data\ECO\House_1\01_plugs_csv\01',
    name="fridge"
)

save_compressed(
    file=fridge_ECO01,
    path="data/ECO/preprocessed/",
    name="fridge01_preprocessed",
    key="fridge"
)

Reading directory data\ECO\House_1\01_plugs_csv\01 ...


  0%|          | 0/173 [00:00<?, ?it/s]

Directory processed successfully.

Saving fridge01_preprocessed.h5...
File saved successfully.


### Washing machine

In [60]:
# Read the per-day CSVs of house 1, plug folder 05, into a "washing_machine"
# column and store the result as washing_machine01_preprocessed.h5.
washing_machine_ECO01 = read_dir(
    path=r'data\ECO\House_1\01_plugs_csv\05',
    name="washing_machine"
)

save_compressed(
    file=washing_machine_ECO01,
    path="data/ECO/preprocessed/",
    name="washing_machine01_preprocessed",
    key="washing_machine"
)

Reading directory data\ECO\House_1\01_plugs_csv\05 ...


  0%|          | 0/173 [00:00<?, ?it/s]

Directory processed successfully.

Saving washing_machine01_preprocessed.h5...
File saved successfully.


### Occupancy

In [61]:
# Load the summer and winter occupancy tables of house 1.
occ_summer_ECO01 = pd.read_csv(
    "data/ECO/House_1/01_occupancy_csv/01_summer.csv")
occ_winter_ECO01 = pd.read_csv(
    "data/ECO/House_1/01_occupancy_csv/01_winter.csv")

# Unpivot both tables (the "Unnamed: 0" column is kept as identifier, all
# other columns become variable/value rows) and combine them into one frame.
occ_summer_ECO01 = occ_summer_ECO01.melt(id_vars="Unnamed: 0")
occ_winter_ECO01 = occ_winter_ECO01.melt(id_vars="Unnamed: 0")
occupancy_ECO01 = pd.concat([occ_summer_ECO01, occ_winter_ECO01])

# Preprocess into final format (time index, single "occupied" column)
occupancy_ECO01 = process_occupancy(occupancy_ECO01)

In [62]:
# Persist the processed occupancy of house 1 as
# data/ECO/preprocessed/occupancy01_preprocessed.h5 (HDF key "occupancy").
save_compressed(
    file=occupancy_ECO01,
    path="data/ECO/preprocessed/",
    name="occupancy01_preprocessed",
    key="occupancy"
)

Saving occupancy01_preprocessed.h5...
File saved successfully.


### Export aggregated data

#### Fridge

In [63]:
# Join occupancy with fridge, mains and weather on the time index and keep
# only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
fridge_full_ECO01 = occupancy_ECO01.join(
    other=[fridge_ECO01, mains_ECO01, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=fridge_full_ECO01,
    sampling_rate="6s",
    dataset="ECO",
    name="fridge")

# Resample 1 minute
resample_and_export(
    df=fridge_full_ECO01,
    sampling_rate="1min",
    dataset="ECO",
    name="fridge")

Resampling fridge to 6s...
Saving fridge_6s.h5...
File saved successfully.
Resampling fridge to 1min...
Saving fridge_1min.h5...
File saved successfully.


#### Washing machine

In [64]:
# Join occupancy with washing machine, mains and weather on the time index
# and keep only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
washing_machine_full_ECO01 = occupancy_ECO01.join(
    other=[washing_machine_ECO01, mains_ECO01, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=washing_machine_full_ECO01,
    sampling_rate="6s",
    dataset="ECO",
    name="washing_machine")

# Resample 1 minute
resample_and_export(
    df=washing_machine_full_ECO01,
    sampling_rate="1min",
    dataset="ECO",
    name="washing_machine")

Resampling washing_machine to 6s...
Saving washing_machine_6s.h5...
File saved successfully.
Resampling washing_machine to 1min...
Saving washing_machine_1min.h5...
File saved successfully.


#### All data

In [65]:
# Join occupancy with both appliances, mains and weather of house 1 on the
# time index and keep only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
data_ECO01 = occupancy_ECO01.join(
    other=[fridge_ECO01, washing_machine_ECO01,
           mains_ECO01, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=data_ECO01,
    sampling_rate="6s",
    dataset="ECO",
    name="full_data01"
)

# Resample 1 minute
resample_and_export(
    df=data_ECO01,
    sampling_rate="1min",
    dataset="ECO",
    name="full_data01"
)

Resampling full_data01 to 6s...
Saving full_data01_6s.h5...
File saved successfully.
Resampling full_data01 to 1min...
Saving full_data01_1min.h5...
File saved successfully.


### Cleanup
Delete all variables no longer needed to free up RAM.

In [66]:
# Remove all house 1 intermediates from the namespace.
# Remember: weather_df_ECO_full is still needed for other houses, so it is
# deliberately not deleted here.

del(
    mains_ECO01, 
    fridge_ECO01, fridge_full_ECO01,
    washing_machine_ECO01, washing_machine_full_ECO01,
    occ_summer_ECO01, occ_winter_ECO01, occupancy_ECO01,
    data_ECO01    
   )


## ECO House 2

### Mains

In [67]:
# Read the per-day smart-meter CSVs of house 2 (first column only) into a
# single "mains" column and store the result as mains02_preprocessed.h5.
mains_ECO02 = read_dir(
    path=r'data\ECO\House_2\02_sm_csv',
    name="mains",
    usecols=[0]
)

save_compressed(
    file=mains_ECO02,
    path="data/ECO/preprocessed/",
    name="mains02_preprocessed",
    key="mains"
)

Reading directory data\ECO\House_2\02_sm_csv ...


  0%|          | 0/244 [00:00<?, ?it/s]

Directory processed successfully.

Saving mains02_preprocessed.h5...
File saved successfully.


### Television

In [68]:
# Read the per-day CSVs of house 2, plug folder 11, into a "television"
# column and store the result as television02_preprocessed.h5.
television_ECO02 = read_dir(
    path=r'data\ECO\House_2\02_plugs_csv\11',
    name="television"
)

save_compressed(
    file=television_ECO02,
    path="data/ECO/preprocessed/",
    name="television02_preprocessed",
    key="television"
)

Reading directory data\ECO\House_2\02_plugs_csv\11 ...


  0%|          | 0/240 [00:00<?, ?it/s]

Directory processed successfully.

Saving television02_preprocessed.h5...
File saved successfully.


### Laptop

In [69]:
# Read the per-day CSVs of house 2, plug folder 09, into a "laptop" column
# and store the result as laptop02_preprocessed.h5.
laptop_ECO02 = read_dir(
    path=r'data\ECO\House_2\02_plugs_csv\09',
    name="laptop"
)

save_compressed(
    file=laptop_ECO02,
    path="data/ECO/preprocessed/",
    name="laptop02_preprocessed",
    key="laptop"
)

Reading directory data\ECO\House_2\02_plugs_csv\09 ...


  0%|          | 0/240 [00:00<?, ?it/s]

Directory processed successfully.

Saving laptop02_preprocessed.h5...
File saved successfully.


### Stove

In [70]:
# Read the per-day CSVs of house 2, plug folder 10, into a "stove" column
# and store the result as stove02_preprocessed.h5.
stove_ECO02 = read_dir(
    path=r'data\ECO\House_2\02_plugs_csv\10',
    name="stove"
)

save_compressed(
    file=stove_ECO02,
    path="data/ECO/preprocessed/",
    name="stove02_preprocessed",
    key="stove"
)

Reading directory data\ECO\House_2\02_plugs_csv\10 ...


  0%|          | 0/28 [00:00<?, ?it/s]

Directory processed successfully.

Saving stove02_preprocessed.h5...
File saved successfully.


### Occupancy

In [71]:
# Load the summer and winter occupancy tables of house 2.
occ_summer_ECO02 = pd.read_csv(
    "data/ECO/House_2/02_occupancy_csv/02_summer.csv")
occ_winter_ECO02 = pd.read_csv(
    "data/ECO/House_2/02_occupancy_csv/02_winter.csv")

# Unpivot both tables (the "Unnamed: 0" column is kept as identifier, all
# other columns become variable/value rows) and combine them into one frame.
occ_summer_ECO02 = occ_summer_ECO02.melt(id_vars="Unnamed: 0")
occ_winter_ECO02 = occ_winter_ECO02.melt(id_vars="Unnamed: 0")
occupancy_ECO02 = pd.concat([occ_summer_ECO02, occ_winter_ECO02])

# Preprocess into final format (time index, single "occupied" column)
occupancy_ECO02 = process_occupancy(occupancy_ECO02)

In [72]:
# Persist the processed occupancy of house 2 as
# data/ECO/preprocessed/occupancy02_preprocessed.h5 (HDF key "occupancy").
save_compressed(
    file=occupancy_ECO02,
    path="data/ECO/preprocessed/",
    name="occupancy02_preprocessed",
    key="occupancy"
)

Saving occupancy02_preprocessed.h5...
File saved successfully.


### Export aggregated data

#### Television

In [73]:
# Join occupancy with television, mains and weather on the time index and
# keep only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
television_full_ECO02 = occupancy_ECO02.join(
    other=[television_ECO02, mains_ECO02, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=television_full_ECO02,
    sampling_rate="6s",
    dataset="ECO",
    name="television"
)

# Resample 1 minute
resample_and_export(
    df=television_full_ECO02,
    sampling_rate="1min",
    dataset="ECO",
    name="television"
)

Resampling television to 6s...
Saving television_6s.h5...
File saved successfully.
Resampling television to 1min...
Saving television_1min.h5...
File saved successfully.


#### Laptop

In [74]:
# Join occupancy with laptop, mains and weather on the time index and keep
# only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
laptop_full_ECO02 = occupancy_ECO02.join(
    other=[laptop_ECO02, mains_ECO02, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=laptop_full_ECO02,
    sampling_rate="6s",
    dataset="ECO",
    name="laptop"
)

# Resample 1 minute
resample_and_export(
    df=laptop_full_ECO02,
    sampling_rate="1min",
    dataset="ECO",
    name="laptop"
)

Resampling laptop to 6s...
Saving laptop_6s.h5...
File saved successfully.
Resampling laptop to 1min...
Saving laptop_1min.h5...
File saved successfully.


#### Stove

In [75]:
# Join occupancy with stove, mains and weather on the time index and keep
# only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
stove_full_ECO02 = occupancy_ECO02.join(
    other=[stove_ECO02, mains_ECO02, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=stove_full_ECO02,
    sampling_rate="6s",
    dataset="ECO",
    name="stove"
)

# Resample 1 minute
resample_and_export(
    df=stove_full_ECO02,
    sampling_rate="1min",
    dataset="ECO",
    name="stove"
)

Resampling stove to 6s...
Saving stove_6s.h5...
File saved successfully.
Resampling stove to 1min...
Saving stove_1min.h5...
File saved successfully.


#### All data

In [76]:
# Join occupancy with all three appliances, mains and weather of house 2 on
# the time index and keep only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
data_ECO02 = occupancy_ECO02.join(
    other=[television_ECO02, laptop_ECO02,
           stove_ECO02, mains_ECO02, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=data_ECO02,
    sampling_rate="6s",
    dataset="ECO",
    name="full_data02"
)

# Resample 1 minute
resample_and_export(
    df=data_ECO02,
    sampling_rate="1min",
    dataset="ECO",
    name="full_data02"
)

Resampling full_data02 to 6s...
Saving full_data02_6s.h5...
File saved successfully.
Resampling full_data02 to 1min...
Saving full_data02_1min.h5...
File saved successfully.


### Cleanup

In [77]:
# Remove all house 2 intermediates from the namespace.
# Remember: weather_df_ECO_full is still needed for other houses, so it is
# deliberately not deleted here.

del(
    mains_ECO02, 
    television_ECO02, television_full_ECO02,
    laptop_ECO02, laptop_full_ECO02,
    stove_ECO02, stove_full_ECO02,
    occ_summer_ECO02, occ_winter_ECO02, occupancy_ECO02,
    data_ECO02
   )

## ECO House 4

### Mains

In [78]:
# Read the per-day smart-meter CSVs of house 4 (first column only) into a
# single "mains" column and store the result as mains04_preprocessed.h5.
mains_ECO04 = read_dir(
    path=r'data\ECO\House_4\04_sm_csv',
    name="mains",
    usecols=[0]
)

save_compressed(
    file=mains_ECO04,
    path="data/ECO/preprocessed/",
    name="mains04_preprocessed",
    key="mains"
)

Reading directory data\ECO\House_4\04_sm_csv ...


  0%|          | 0/219 [00:00<?, ?it/s]

Directory processed successfully.

Saving mains04_preprocessed.h5...
File saved successfully.


### Microwave

In [79]:
# Read the per-day CSVs of house 4, plug folder 08, into a "microwave" column
# and store the result as microwave04_preprocessed.h5.
microwave_ECO04 = read_dir(
    path=r'data\ECO\House_4\04_plugs_csv\08',
    name="microwave"
)

save_compressed(
    file=microwave_ECO04,
    path="data/ECO/preprocessed/",
    name="microwave04_preprocessed",
    key="microwave"
)

Reading directory data\ECO\House_4\04_plugs_csv\08 ...


  0%|          | 0/195 [00:00<?, ?it/s]

Directory processed successfully.

Saving microwave04_preprocessed.h5...
File saved successfully.


### Occupancy

In [80]:
# Load the summer and winter occupancy tables of house 4.
occ_summer_ECO04 = pd.read_csv(
    "data/ECO/House_4/04_occupancy_csv/04_summer.csv")
occ_winter_ECO04 = pd.read_csv(
    "data/ECO/House_4/04_occupancy_csv/04_winter.csv")

# Unpivot both tables (the "Unnamed: 0" column is kept as identifier, all
# other columns become variable/value rows) and combine them into one frame.
occ_summer_ECO04 = occ_summer_ECO04.melt(id_vars="Unnamed: 0")
occ_winter_ECO04 = occ_winter_ECO04.melt(id_vars="Unnamed: 0")
occupancy_ECO04 = pd.concat([occ_summer_ECO04, occ_winter_ECO04])

# Preprocess into final format (time index, single "occupied" column)
occupancy_ECO04 = process_occupancy(occupancy_ECO04)

In [81]:
# Persist the processed occupancy of house 4 as
# data/ECO/preprocessed/occupancy04_preprocessed.h5 (HDF key "occupancy").
save_compressed(
    file=occupancy_ECO04,
    path="data/ECO/preprocessed/",
    name="occupancy04_preprocessed",
    key="occupancy"
)

Saving occupancy04_preprocessed.h5...
File saved successfully.


### Export aggregated data

#### Microwave

In [82]:
# Join occupancy with microwave, mains and weather on the time index and keep
# only complete rows.
# No lsuffix/rsuffix: pandas does not support suffixes when `other` is a list
# (older versions ignored them, newer versions raise a ValueError).
microwave_full_ECO04 = occupancy_ECO04.join(
    other=[microwave_ECO04, mains_ECO04, weather_df_ECO_full]
).dropna()

# Resample 6 seconds
resample_and_export(
    df=microwave_full_ECO04,
    sampling_rate="6s",
    dataset="ECO",
    name="microwave"
)

# Resample 1 minute
resample_and_export(
    df=microwave_full_ECO04,
    sampling_rate="1min",
    dataset="ECO",
    name="microwave"
)

Resampling microwave to 6s...
Saving microwave_6s.h5...
File saved successfully.
Resampling microwave to 1min...
Saving microwave_1min.h5...
File saved successfully.


### Cleanup

In [83]:
# Remove all house 4 intermediates and the shared ECO weather frame.
# If this house is not processed last, either exclude weather_df_ECO_full
# from the deletion or reload it from disk afterwards, because the joins of
# the other houses need it.

del(
    mains_ECO04, 
    microwave_ECO04, microwave_full_ECO04,
    occ_summer_ECO04, occ_winter_ECO04, occupancy_ECO04,
    weather_df_ECO_full
   )

# Train Test Split

## DRED

In [98]:
def normalize_df(train_df, stats, exclude_cols):
    """Standardize the columns of a DataFrame with precomputed statistics.

    Every column that is not listed in ``exclude_cols`` is replaced by
    ``(value - mean) / std``, where ``mean`` and ``std`` are looked up in
    ``stats["mean"][col]`` and ``stats["std"][col]``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to normalize. Despite its name, the notebook also passes
        validation, test and ECO frames. The input is left untouched.
    stats : dict
        Mapping with the keys ``"mean"`` and ``"std"``; each value maps a
        column name to the statistic used for that column.
    exclude_cols : collection of str
        Columns that are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A normalized copy of ``train_df``.

    Raises
    ------
    KeyError
        If a column that is not excluded has no entry in ``stats``.
    """
    # Work on a copy: normalizing the caller's frame in place would silently
    # change any other reference to it and makes re-running a cell unsafe.
    normalized = train_df.copy()
    for col in normalized.columns:
        if col not in exclude_cols:
            mean = stats["mean"][col]
            std = stats["std"][col]
            normalized[col] = (normalized[col] - mean) / std
    return normalized

### 1 min data

In [100]:
# Forward slashes keep the path free of backslash escapes (e.g. "\D", "\p")
# and match the other data paths used in this notebook.
df_DRED_1min = read_compressed(path="data/DRED/preprocessed/dred_1min.h5")

Reading file...


In [115]:
# Create training, validation and testing frames by iteratively adding
# 3-week (train) and 1-week (validation, test) segments of the data to the
# respective datasets.
#
# Each segment is the half-open interval [start_date, end_date). Label
# slicing with `.loc[start:end]` includes both end points, which would put
# every boundary timestamp into two neighbouring datasets.

train_DRED_1min = []
validation_DRED_1min = []
test_DRED_1min = []

start_date = datetime.strptime("2015-07-20", "%Y-%m-%d")
end_date = start_date
max_date = datetime.strptime("2015-12-06", "%Y-%m-%d")

# (label used in the log line, list to extend, segment length in days)
segments = [
    ("train", train_DRED_1min, 21),
    ("val", validation_DRED_1min, 7),
    ("test", test_DRED_1min, 7),
]

while start_date < max_date:
    for label, target, n_days in segments:
        end_date = start_date + timedelta(days=n_days)

        in_segment = ((df_DRED_1min.index >= start_date)
                      & (df_DRED_1min.index < end_date))
        target.append(df_DRED_1min.loc[in_segment])
        # The printed end date is exclusive
        print(f"Added to {label}:\t{start_date.date()} - {end_date.date()}")

        # The next segment starts where this one ended
        start_date = end_date

train_df_DRED_1min = pd.concat(train_DRED_1min)
validation_df_DRED_1min = pd.concat(validation_DRED_1min)
test_df_DRED_1min = pd.concat(test_DRED_1min)

Added to train:	2015-07-20 - 2015-08-10
Added to val:	2015-08-10 - 2015-08-17
Added to test:	2015-08-17 - 2015-08-24
Added to train:	2015-08-24 - 2015-09-14
Added to val:	2015-09-14 - 2015-09-21
Added to test:	2015-09-21 - 2015-09-28
Added to train:	2015-09-28 - 2015-10-19
Added to val:	2015-10-19 - 2015-10-26
Added to test:	2015-10-26 - 2015-11-02
Added to train:	2015-11-02 - 2015-11-23
Added to val:	2015-11-23 - 2015-11-30
Added to test:	2015-11-30 - 2015-12-07


In [102]:
# Calculate mean/std of the training data and store them as json.
# The values are converted to built-in floats so that serialization does not
# depend on the numpy scalar type pandas returns (e.g. float32 values are not
# JSON serializable).

stats_DRED_1min = {
    "mean": {col: float(value)
             for col, value in train_df_DRED_1min.mean().items()},
    "std": {col: float(value)
            for col, value in train_df_DRED_1min.std().items()},
}
with open('stats_DRED_1min.json', 'w') as fp:
    json.dump(stats_DRED_1min, fp, indent=4)

In [103]:
# Columns that keep their original scale
exclude_columns = ["temperature", "humidity", "occupied"]

# Normalize train, validation and test (in that order) with the statistics
# of the 1 min training data.
train_df_DRED_1min, validation_df_DRED_1min, test_df_DRED_1min = [
    normalize_df(frame, stats=stats_DRED_1min, exclude_cols=exclude_columns)
    for frame in (
        train_df_DRED_1min,
        validation_df_DRED_1min,
        test_df_DRED_1min,
    )
]


In [92]:
# Appliance columns for which separate datasets are exported
columns = [
    "fridge",
    "washing machine",
    "television",
    "laptop computer",
    "microwave",
]

In [111]:
output_dir = "data/appliances/"

# For every appliance write one csv file per split. Each file contains the
# appliance column followed by the shared input columns; the index is not
# written.
for app in columns:
    selected = [app, "mains", "occupied", "temperature", "humidity"]

    train = train_df_DRED_1min[selected]
    validation = validation_df_DRED_1min[selected]
    test = test_df_DRED_1min[selected]

    for split_name, frame in (("training", train),
                              ("validation", validation),
                              ("test", test)):
        frame.to_csv(
            f"{output_dir}{app}/{app}_{split_name}_1min_.csv",
            index=False,
            float_format='%g'
        )

### 6s data

If there is time, this section should be refactored to reduce the redundancy with the 1 min split above.

In [116]:
# Forward slashes keep the path free of backslash escapes (e.g. "\D", "\p")
# and match the other data paths used in this notebook.
df_DRED_6s = read_compressed(path="data/DRED/preprocessed/dred_6s.h5")

# Create training, validation and testing frames by iteratively adding
# 3-week (train) and 1-week (validation, test) segments of the data to the
# respective datasets.
#
# Each segment is the half-open interval [start_date, end_date). Label
# slicing with `.loc[start:end]` includes both end points, which would put
# every boundary timestamp into two neighbouring datasets.

train_DRED_6s = []
validation_DRED_6s = []
test_DRED_6s = []

start_date = datetime.strptime("2015-07-20", "%Y-%m-%d")
end_date = start_date
max_date = datetime.strptime("2015-12-06", "%Y-%m-%d")

# (label used in the log line, list to extend, segment length in days)
segments = [
    ("train", train_DRED_6s, 21),
    ("val", validation_DRED_6s, 7),
    ("test", test_DRED_6s, 7),
]

while start_date < max_date:
    for label, target, n_days in segments:
        end_date = start_date + timedelta(days=n_days)

        in_segment = ((df_DRED_6s.index >= start_date)
                      & (df_DRED_6s.index < end_date))
        target.append(df_DRED_6s.loc[in_segment])
        # The printed end date is exclusive
        print(f"Added to {label}:\t{start_date.date()} - {end_date.date()}")

        # The next segment starts where this one ended
        start_date = end_date

train_df_DRED_6s = pd.concat(train_DRED_6s)
validation_df_DRED_6s = pd.concat(validation_DRED_6s)
test_df_DRED_6s = pd.concat(test_DRED_6s)

Reading file...
Added to train:	2015-07-20 - 2015-08-10
Added to val:	2015-08-10 - 2015-08-17
Added to test:	2015-08-17 - 2015-08-24
Added to train:	2015-08-24 - 2015-09-14
Added to val:	2015-09-14 - 2015-09-21
Added to test:	2015-09-21 - 2015-09-28
Added to train:	2015-09-28 - 2015-10-19
Added to val:	2015-10-19 - 2015-10-26
Added to test:	2015-10-26 - 2015-11-02
Added to train:	2015-11-02 - 2015-11-23
Added to val:	2015-11-23 - 2015-11-30
Added to test:	2015-11-30 - 2015-12-07


In [117]:
# Calculate mean/std of the training data and store them as json.
# The values are converted to built-in floats so that serialization does not
# depend on the numpy scalar type pandas returns (e.g. float32 values are not
# JSON serializable).

stats_DRED_6s = {
    "mean": {col: float(value)
             for col, value in train_df_DRED_6s.mean().items()},
    "std": {col: float(value)
            for col, value in train_df_DRED_6s.std().items()},
}
with open('stats_DRED_6s.json', 'w') as fp:
    json.dump(stats_DRED_6s, fp, indent=4)

In [118]:
# Columns that keep their original scale
exclude_columns = ["temperature", "humidity", "occupied"]

# Normalize train, validation and test (in that order) with the statistics
# of the 6 s training data.
train_df_DRED_6s, validation_df_DRED_6s, test_df_DRED_6s = [
    normalize_df(frame, stats=stats_DRED_6s, exclude_cols=exclude_columns)
    for frame in (
        train_df_DRED_6s,
        validation_df_DRED_6s,
        test_df_DRED_6s,
    )
]


In [119]:
output_dir = "data/appliances/"

# For every appliance write one csv file per split. Each file contains the
# appliance column followed by the shared input columns; the index is not
# written.
for app in columns:
    selected = [app, "mains", "occupied", "temperature", "humidity"]

    train = train_df_DRED_6s[selected]
    validation = validation_df_DRED_6s[selected]
    test = test_df_DRED_6s[selected]

    for split_name, frame in (("training", train),
                              ("validation", validation),
                              ("test", test)):
        frame.to_csv(
            f"{output_dir}{app}/{app}_{split_name}_6s_.csv",
            index=False,
            float_format='%g'
        )

## ECO

### Fridge

In [138]:
# 1 Minute data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
fridge_ECO_1min = read_compressed(path="data/ECO/preprocessed/fridge_1min.h5")

# Scale with the statistics of the DRED 1 min training data
fridge_ECO_1min = normalize_df(
    fridge_ECO_1min,
    stats=stats_DRED_1min,
    exclude_cols=exclude_columns
)

fridge_ECO_1min.to_csv(
    output_dir + "fridge/fridge_ECO_1min_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [139]:
# 6 seconds data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
fridge_ECO_6s = read_compressed(path="data/ECO/preprocessed/fridge_6s.h5")

# Scale with the statistics of the DRED 6 s training data
fridge_ECO_6s = normalize_df(
    fridge_ECO_6s,
    stats=stats_DRED_6s,
    exclude_cols=exclude_columns
)

# NOTE(review): unlike the other ECO exports this file name has no trailing
# underscore before ".csv". Kept as is because consumers of the file are not
# visible here -- confirm before renaming.
fridge_ECO_6s.to_csv(
    output_dir + "fridge/fridge_ECO_6s.csv",
    index=False,
    float_format='%g'
)

Reading file...


### Microwave

In [154]:
# 1 Minute data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
microwave_ECO_1min = read_compressed(
    path="data/ECO/preprocessed/microwave_1min.h5"
)

# Scale with the statistics of the DRED 1 min training data
microwave_ECO_1min = normalize_df(
    microwave_ECO_1min,
    stats=stats_DRED_1min,
    exclude_cols=exclude_columns
)

microwave_ECO_1min.to_csv(
    output_dir + "microwave/microwave_ECO_1min_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [156]:
# 6 seconds data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
microwave_ECO_6s = read_compressed(
    path="data/ECO/preprocessed/microwave_6s.h5"
)

# Scale with the statistics of the DRED 6 s training data
microwave_ECO_6s = normalize_df(
    microwave_ECO_6s,
    stats=stats_DRED_6s,
    exclude_cols=exclude_columns
)

microwave_ECO_6s.to_csv(
    output_dir + "microwave/microwave_ECO_6s_.csv",
    index=False,
    float_format='%g'
)

Reading file...


### Washing machine

In [152]:
# 1 Minute data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
washing_machine_ECO_1min = read_compressed(
    path="data/ECO/preprocessed/washing_machine_1min.h5"
)
# Temporary fix for naming. If time, change preprocessing function.
# The renamed column matches the "washing machine" entry used for DRED.
washing_machine_ECO_1min = washing_machine_ECO_1min.rename(
    columns={"washing_machine": "washing machine"}
)

# Scale with the statistics of the DRED 1 min training data
washing_machine_ECO_1min = normalize_df(
    washing_machine_ECO_1min,
    stats=stats_DRED_1min,
    exclude_cols=exclude_columns
)

washing_machine_ECO_1min.to_csv(
    output_dir + "washing machine/washing_machine_ECO_1min_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [157]:
# 6 second data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
washing_machine_ECO_6s = read_compressed(
    path="data/ECO/preprocessed/washing_machine_6s.h5"
)
# Temporary fix for naming. If time, change preprocessing function.
# The renamed column matches the "washing machine" entry used for DRED.
washing_machine_ECO_6s = washing_machine_ECO_6s.rename(
    columns={"washing_machine": "washing machine"}
)

# Scale with the statistics of the DRED 6 s training data
washing_machine_ECO_6s = normalize_df(
    washing_machine_ECO_6s,
    stats=stats_DRED_6s,
    exclude_cols=exclude_columns
)

washing_machine_ECO_6s.to_csv(
    output_dir + "washing machine/washing_machine_ECO_6s_.csv",
    index=False,
    float_format='%g'
)

Reading file...


### Television

In [160]:
# 1 Minute data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
television_ECO_1min = read_compressed(
    path="data/ECO/preprocessed/television_1min.h5"
)

# Scale with the statistics of the DRED 1 min training data
television_ECO_1min = normalize_df(
    television_ECO_1min,
    stats=stats_DRED_1min,
    exclude_cols=exclude_columns
)

television_ECO_1min.to_csv(
    output_dir + "television/television_ECO_1min_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [161]:
# 6 second data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
television_ECO_6s = read_compressed(
    path="data/ECO/preprocessed/television_6s.h5"
)

# Scale with the statistics of the DRED 6 s training data
television_ECO_6s = normalize_df(
    television_ECO_6s,
    stats=stats_DRED_6s,
    exclude_cols=exclude_columns
)

television_ECO_6s.to_csv(
    output_dir + "television/television_ECO_6s_.csv",
    index=False,
    float_format='%g'
)

Reading file...


### Laptop

In [166]:
# 1 Minute data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
laptop_ECO_1min = read_compressed(
    path="data/ECO/preprocessed/laptop_1min.h5"
)

# Temporary fix for naming. If time, change preprocessing function.
# The renamed column matches the "laptop computer" entry used for DRED.
laptop_ECO_1min = laptop_ECO_1min.rename(
    columns={"laptop": "laptop computer"}
)

# Scale with the statistics of the DRED 1 min training data
laptop_ECO_1min = normalize_df(
    laptop_ECO_1min,
    stats=stats_DRED_1min,
    exclude_cols=exclude_columns
)

laptop_ECO_1min.to_csv(
    output_dir + "laptop computer/laptop_ECO_1min_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [167]:
# 6 second data
# Forward slashes keep the path free of backslash escapes and portable
# across operating systems.
laptop_ECO_6s = read_compressed(
    path="data/ECO/preprocessed/laptop_6s.h5"
)

# Temporary fix for naming. If time, change preprocessing function.
# The renamed column matches the "laptop computer" entry used for DRED.
laptop_ECO_6s = laptop_ECO_6s.rename(
    columns={"laptop": "laptop computer"}
)

# Scale with the statistics of the DRED 6 s training data
laptop_ECO_6s = normalize_df(
    laptop_ECO_6s,
    stats=stats_DRED_6s,
    exclude_cols=exclude_columns
)

laptop_ECO_6s.to_csv(
    output_dir + "laptop computer/laptop_ECO_6s_.csv",
    index=False,
    float_format='%g'
)

Reading file...


In [83]:
# NOTE(review): absolute, machine-specific location; no later use of `path`
# is visible in this part of the notebook -- confirm whether it is needed.
path = (
    r"C:\Users\Josef\Google Drive\Uni\Master"
    r"\3 Wintersemester 20-21\Seminar Information Systems"
    r"\Contribution\Seq2Seq\seq2point-nilm-master"
    r"\dataset_management\redd\out"
)

Was nun?

ECO
- Gleicher Split: 15 Sommer, 15 Winter
- Normiere mit std/mean von DRED


OUTPUT
- Speichere Train x, Train y