# Building Data Cube Integration & Slicing

This script aggregates the cleaned building-data-genome-project-2 data into a data-cube-frame. The cube is then sliced into three cuboids of the 2D lattice, namely {time, site}, {time, attribute} and {attribute, site}.

![3DCube](../figures/3DCube.png)

The cuboids are thereafter saved.

In [2]:
# Import modules
import pandas as pd
import numpy as np
import time

# Attributes that make up the cube: meter readings followed by weather variables
meter_data = ["electricity", "hotwater", "chilledwater"]
weather_cols = ["airTemperature", "seaLvlPressure"]
columns_considered = [*meter_data, *weather_cols]

# Remote locations of the metadata and weather tables
url_root = 'https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data/'
url_path_meta = "metadata/"
url_path_weather = "weather/"

# Local folders, written with Windows-style backslash separators
path_meters = "..\\data\\cleaned\\"
path_data_out = "..\\data\\cube\\"

# One cleaned csv file per meter type
meter_files = ["".join([path_meters, meter, ".csv"]) for meter in meter_data]

## Cube manipulation Function

To integrate the building cube into a dataframe, we structure the meter data as multi-column dataframes using {meter, building_id} as column keys.
The cube integration and manipulation functions are gathered below.

In [3]:
def set_upperlevel_column(df, upperlevel_column_name):
    """Nest every column of ``df`` under one upper-level label.

    The columns are replaced in place by a two-level MultiIndex of
    ``(upperlevel_column_name, original_column)`` pairs, and the same
    dataframe object is returned.
    """
    df.columns = pd.MultiIndex.from_tuples(
        [(upperlevel_column_name, lower_name) for lower_name in df.columns]
    )
    return df

def intersection(lst1, lst2):
    """Return the distinct elements present in both inputs, as a list.

    The order of the returned list is not specified.
    """
    common = set(lst1).intersection(set(lst2))
    return list(common)

def union(list1, list2):
    """Return the distinct elements present in either input, as a list.

    The order of the returned list is not specified.
    """
    merged = set()
    merged.update(list1)
    merged.update(list2)
    return list(merged)

def mergeAll(meter_df, weather_df, metadata_df, intersect_fct=intersection):
    """Merge meter, metadata and weather data into one long-format dataframe.

    Parameters
    ----------
    meter_df : pandas.DataFrame
        Frame with two-level columns ``(meter_type, building_id)``. Its index
        is reset and must yield a column named "timestamp".
    weather_df : pandas.DataFrame
        Weather table with at least "timestamp" and "site_id" columns; the
        timestamps are parsed with the format ``%Y-%m-%d %H:%M:%S``.
    metadata_df : pandas.DataFrame
        Building table with "building_id", "site_id" and one column per meter
        type whose value is "Yes" when the building has that meter.
    intersect_fct : callable
        Function taking two lists of building ids and returning the combined
        list (e.g. ``intersection`` or ``union``). It decides which buildings
        are kept across meter types.

    Returns
    -------
    pandas.DataFrame
        One row per (building_id, timestamp) with one column per meter type,
        the building's "site_id" and the weather columns of that site.
    """
    # Meter types in order of first appearance (deterministic, unlike a set)
    meter_type_list = meter_df.columns.get_level_values(0).unique().tolist()

    # Building ids available for each meter type
    blg_dict = {meter: list(meter_df[meter].columns) for meter in meter_type_list}

    # Combine the per-meter building lists with the requested set operation
    blg_list_intersect = None
    for meter in meter_type_list:
        if blg_list_intersect is None:
            blg_list_intersect = blg_dict[meter]
        else:
            blg_list_intersect = intersect_fct(blg_dict[meter], blg_list_intersect)

    # Metadata rows of retained buildings flagged "Yes" for at least one of the
    # meters. All meters are considered at once: filtering meter by meter and
    # keeping only the last result would leave buildings lacking that last
    # meter without a site_id (and hence without weather) after the merge.
    in_scope = metadata_df["building_id"].isin(blg_list_intersect)
    has_any_meter = (metadata_df[meter_type_list] == "Yes").any(axis=1)
    selected = in_scope & has_any_meter
    site_list_unique = list(metadata_df.loc[selected, "site_id"].unique())
    # One metadata row per building so the left merge cannot duplicate readings
    df_meta = metadata_df.loc[selected, ["building_id", "site_id"]].drop_duplicates(
        subset="building_id")

    # Filters weather with only current sites
    df_weather = weather_df.loc[weather_df["site_id"].isin(site_list_unique)].copy()
    # Converts timestamp to datetime object
    df_weather["timestamp"] = pd.to_datetime(df_weather["timestamp"], format="%Y-%m-%d %H:%M:%S")

    # Melt each meter to long format, indexed by (building_id, timestamp)
    retained = set(blg_list_intersect)
    dfs = []
    for meter in meter_type_list:
        # Keep the original column order of the buildings that were retained
        keep = [blg for blg in blg_dict[meter] if blg in retained]
        df = pd.melt(meter_df[meter][keep].reset_index(),
                     id_vars="timestamp",
                     var_name="building_id",
                     value_name=meter)
        dfs.append(df.set_index(["building_id", "timestamp"]))
    merged = pd.concat(dfs, axis=1)

    # Attach the site of each building, then the weather of that site and time
    merged = pd.merge(merged.reset_index(), df_meta, how="left", on="building_id").merge(
        df_weather, how="left", on=["timestamp", "site_id"])
    return merged

def multicol_2ndColumnSelection(df_multicol, allcol1, col2):
    """Extract one 2nd-level column across several 1st-level columns.

    For every 1st-level key in ``allcol1`` the series stored under
    ``(key, col2)`` in ``df_multicol`` is copied into a new single-level
    dataframe whose columns are named after the 1st-level keys.
    """
    selected = {upper: df_multicol[upper, col2].copy() for upper in allcol1}
    return pd.DataFrame(selected)

def multi2singlecol_1stCol(df_in):
    """Flatten a two-level column dataframe into a single-level, long one.

    Each 1st-level key of ``df_in`` becomes one value column; the 2nd-level
    keys are moved into a new "building_id" column. The result is indexed by
    the original index values (repeated once per building).

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame with two-level columns ``(meter_type, building_id)``. If its
        index has no name, the name "index" is used for the output index.

    Returns
    -------
    pandas.DataFrame
        Columns "building_id" followed by one column per 1st-level key, in
        their order of first appearance in ``df_in``.
    """
    # melt() needs a named id column; fall back to "index" when unnamed
    index_name = df_in.index.name if df_in.index.name is not None else "index"

    # 1st-level keys in order of first appearance (deterministic, unlike a set)
    meter_type_list = df_in.columns.get_level_values(0).unique().tolist()

    dfs = []
    for meter in meter_type_list:
        wide = df_in[meter].rename_axis(index=index_name).reset_index()
        long_df = pd.melt(wide,
                          id_vars=index_name,
                          var_name="building_id",
                          value_name=meter)
        dfs.append(long_df.set_index(["building_id", index_name]))

    # Align all meters on (building_id, index) and restore the original index
    meter_df = pd.concat(dfs, axis=1)
    return meter_df.reset_index().set_index([index_name], drop=True)

def reduce_mem_usage(df, verbose=True, *, use_float16=True):
    """Downcast the numeric columns of a dataframe to smaller dtypes.

    Source: https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction

    Columns of dtype int16/int32/int64/float16/float32/float64 are cast to the
    smallest candidate dtype whose range contains the column's min and max.
    A column is never cast to a dtype as wide as or wider than its current
    one. Other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool
        When True, print the final memory usage and the relative reduction.
    use_float16 : bool
        When True (default), float columns may be cast to float16, which keeps
        only about three significant decimal digits. Pass False to stop at
        float32.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, from narrowest to widest
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            targets, limits_of = int_targets, np.iinfo
        else:
            targets, limits_of = float_targets, np.finfo
        for target in targets:
            if np.dtype(target).itemsize >= col_type.itemsize:
                # Remaining candidates would not save memory: keep the dtype
                break
            limits = limits_of(target)
            # Inclusive bounds: the extreme values are representable.
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # so such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame when computing the percentage
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Weather: only the join keys and the selected weather variables
weather = pd.read_csv(url_root + url_path_weather + "weather.csv",
                      usecols=["timestamp", "site_id"] + weather_cols)

# Meta data: building/site keys plus one flag column per meter type
meta = pd.read_csv(url_root + url_path_meta + "metadata.csv",
                   usecols=["building_id", "site_id"] + meter_data)

# Meter data
dfs = []  # one multicolumn dataframe per meter file
for f in meter_files:
    # Meter type = file name without folders or extension. Taking the last
    # path component (either separator) avoids depending on the folder depth.
    meter_type = f.replace("\\", "/").rsplit("/", 1)[-1].split(".")[0]
    meter = pd.read_csv(f, index_col="timestamp")  # load the dataset
    # Nest the building columns under the meter type
    meter = set_upperlevel_column(meter, meter_type)
    dfs.append(meter)
# Concatenate all meters side by side
df_meter = pd.concat(dfs, axis=1)
del(dfs, meter, f, meter_files)
# Format index to datetime object
df_meter.index = pd.to_datetime(df_meter.index, format='%Y-%m-%d %H:%M:%S')

In [6]:
# Combine meter readings with metadata and weather, keeping the union of building ids
df_all = mergeAll(meter_df=df_meter,
                  weather_df=weather,
                  metadata_df=meta,
                  intersect_fct=union)
# Downcast numeric columns to shrink the merged frame
df_all = reduce_mem_usage(df=df_all, verbose=True)

# Pivot back to a wide frame with {attribute, building_id} columns
df_cube = df_all.pivot(
    index="timestamp",
    columns="building_id",
    values=columns_considered,
)
df_cube.head()

Mem. usage decreased to 1212.18 Mb (36.1% reduction)


Unnamed: 0_level_0,electricity,electricity,electricity,electricity,electricity,electricity,electricity,electricity,electricity,electricity,...,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure,seaLvlPressure
building_id,Bear_assembly_Angel,Bear_assembly_Beatrice,Bear_assembly_Danial,Bear_assembly_Diana,Bear_assembly_Genia,Bear_assembly_Harry,Bear_assembly_Jose,Bear_assembly_Roxy,Bear_assembly_Ruby,Bear_education_Alfredo,...,Wolf_office_Emanuel,Wolf_office_Haydee,Wolf_office_Joana,Wolf_office_Nadia,Wolf_office_Rochelle,Wolf_public_Norma,Wolf_retail_Harriett,Wolf_retail_Marcella,Wolf_retail_Toshia,Wolf_science_Alfreda
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2016-01-01 00:00:00,225.75,9.5625,,,183.125,,149.75,9.8125,51.625,0.100647,...,,,,,,,,,,
2016-01-01 01:00:00,225.75,9.5625,,,183.125,,149.75,9.8125,51.625,0.100647,...,,,,,,,,,,
2016-01-01 02:00:00,225.75,9.5625,,,183.125,,149.75,9.8125,51.625,0.100647,...,,,,,,,,,,
2016-01-01 03:00:00,222.375,9.8125,,,185.25,,152.25,9.875,51.625,0.10437,...,,,,,,,,,,
2016-01-01 04:00:00,227.375,9.546875,,,185.5,,151.25,9.9375,51.3125,0.106262,...,,,,,,,,,,


# Cube slicing
## A - Building Benchmarking
The {time, site} cuboid selection provides an inter-building analytical frame, typically relevant for cross-building benchmarking in top-down approaches.

In [7]:
# Attribute held fixed for this slice: the first entry of columns_considered
attribute = columns_considered[0]

# {time, site} cuboid: all building columns stored under that attribute
df_cubA = df_cube[attribute]
# Optional clean-up, left disabled:
#df_cubA.dropna(axis=1, how='all', inplace=True)

# Write the slice to disk
df_cubA.to_csv("".join([path_data_out, "cuboid_A_", attribute, '.csv']))

## B - In-site view
The {time, attribute} cuboid selection covers the intra-building frame, common to bottom-up approaches. It supports within-site exploration of how a given building operates across time and building-specific attributes.

In [8]:
# Building held fixed for this slice
blg_id = "Fox_education_Melinda"

# {time, attribute} cuboid: one column per attribute for that building
df_cubB = multicol_2ndColumnSelection(df_multicol=df_cube,
                                      allcol1=columns_considered,
                                      col2=blg_id)
# Optional clean-up, left disabled:
#df_cubB.dropna(axis=1, how='all', inplace=True)

# Write the slice to disk
df_cubB.to_csv("".join([path_data_out, "cuboid_B_", blg_id, '.csv']))

## C - Cross building/attribute slice
The {site, attribute} cuboid allows combined cross-building/cross-attribute analysis within a fixed time slice of interest, for temporal drill-in analytics.

In [9]:
# Day held fixed for this slice, bounded from 00:00:00 to 23:00:00 inclusive
timestamp = "2016-06-07"
start_date = "".join([timestamp, " 00:00:00"])
end_date = "".join([timestamp, " 23:00:00"])
# Boolean mask over the cube's time index
timerange_considered = np.logical_and(df_cube.index >= start_date,
                                      df_cube.index <= end_date)

# {attribute, site} cuboid: every cube column, restricted to the selected rows
df_cubC = df_cube.loc[timerange_considered]
# Optional clean-up, left disabled:
#df_cubC.dropna(axis=1, how='all', inplace=True)

# Write the slice to disk
df_cubC.to_csv("".join([path_data_out, "cuboid_C_", timestamp, '.csv']))