# Data Cleaning

This script loads raw data from the Building Data Genome Project 2 and performs data cleaning.

In [None]:
import pandas as pd
import numpy as np
import time
from glob import glob

In [12]:
# Path definition & parameter selection
# Meter types to load and weather attributes to keep
meter_data = ["electricity", "gas", "hotwater", "chilledwater"]
weather_cols = ["airTemperature", "windSpeed", "cloudCoverage"]

# Remote root of the dataset and the sub-folders read by this notebook
url_root = 'https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data/'
path_meters = "meters/raw/"
path_meta = "metadata/"
path_weather = "weather/"

# Local output folders (relative, backslash-separated paths)
path_data_out = "..\\data\\"
path_fig_out = "..\\figures\\"

# One CSV URL per selected meter type
meter_files = [f"{url_root}{path_meters}{meter}.csv" for meter in meter_data]

### Reading

DataFrames are organized with two-level (multi-index) columns: {meter, building_id}.

In [None]:
def set_upperlevel_column(df, upperlevel_column_name):
    """Add an upper column level above the existing columns of a dataframe.

    Every existing column label ``c`` becomes ``(upperlevel_column_name, c)``,
    turning the columns into a two-level ``pandas.MultiIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are relabelled. It is modified in place.
    upperlevel_column_name : hashable
        Label used for the new upper level.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    if len(df.columns) == 0:
        # MultiIndex.from_tuples cannot infer the number of levels from an
        # empty list, so build the empty two-level index explicitly.
        df.columns = pd.MultiIndex.from_arrays([[], []])
    else:
        df.columns = pd.MultiIndex.from_tuples(
            [(upperlevel_column_name, lower) for lower in df.columns]
        )
    return df

# Meter data
# Load one CSV per meter type and place them side by side under a two-level
# column index {meter_type, building_id}.
dfs = []  # one dataframe per meter type
for file in meter_files:
    # The meter type is the file name without its extension
    # (".../electricity.csv" -> "electricity"). Using the last path component
    # keeps this correct whatever the depth of url_root / path_meters.
    meter_type = file.rsplit("/", 1)[-1].split(".")[0]
    meter = pd.read_csv(file)  # load the dataset
    meter = meter.set_index("timestamp")
    # Define multicolumn dataframe
    meter = set_upperlevel_column(meter, meter_type)
    dfs.append(meter)
df_meter = pd.concat(dfs, axis=1)  # concatenate all meters column-wise
# NOTE: meter_files is deleted here, so re-running this cell requires
# re-running the path definition cell first.
del (dfs, meter, file, meter_files, meter_type)
df_meter.index = pd.to_datetime(df_meter.index, format='%Y-%m-%d %H:%M:%S')
df_meter


In [11]:
# Preview the full URL of the weather CSV (displayed as the cell output).
# NOTE(review): the recorded output below ('...data//weatherweather.csv') does not
# match the current url_root / path_weather values and appears to come from an
# earlier run - re-execute this cell to refresh it.
url_root + path_weather + "weather.csv"

'https://media.githubusercontent.com/media/buds-lab/building-data-genome-project-2/master/data//weatherweather.csv'

In [13]:
# Weather: read only the key columns and the selected weather attributes
weather = pd.read_csv(
    f"{url_root}{path_weather}weather.csv",
    usecols=["timestamp", "site_id", *weather_cols],
)
weather

Unnamed: 0,timestamp,site_id,airTemperature,cloudCoverage,windSpeed
0,2016-01-01 00:00:00,Panther,19.4,,0.0
1,2016-01-01 01:00:00,Panther,21.1,6.0,0.0
2,2016-01-01 02:00:00,Panther,21.1,,1.5
3,2016-01-01 03:00:00,Panther,20.6,,0.0
4,2016-01-01 04:00:00,Panther,21.1,,1.5
...,...,...,...,...,...
331161,2017-12-31 19:00:00,Mouse,8.5,,8.2
331162,2017-12-31 20:00:00,Mouse,8.5,,7.2
331163,2017-12-31 21:00:00,Mouse,8.2,,10.3
331164,2017-12-31 22:00:00,Mouse,7.5,,12.9


In [None]:
# Meta data: building / site identifiers plus one column per meter type
meta = pd.read_csv(
    f"{url_root}{path_meta}metadata.csv",
    usecols=[
        "building_id",
        "site_id",
        "electricity",
        "hotwater",
        "chilledwater",
        "water",
        "steam",
        "solar",
        "gas",
        "irrigation",
    ],
)
meta

In [None]:
def intersection(lst1, lst2):
    """Return the distinct elements of ``lst1`` that also appear in ``lst2``.

    The result is a list without duplicates, ordered by first appearance in
    ``lst1``. Keeping a deterministic order (rather than set iteration order,
    which varies between runs for strings) makes downstream column order
    reproducible. Elements must be hashable.
    """
    members = set(lst2)
    return [item for item in dict.fromkeys(lst1) if item in members]

def union(list1, list2):
    """Return the distinct elements found in ``list1`` or ``list2``.

    The result is a list without duplicates, ordered by first appearance
    (all of ``list1`` first, then the new elements of ``list2``), so the
    output is reproducible between runs. Elements must be hashable.
    """
    return list(dict.fromkeys([*list1, *list2]))

def mergeAll(meter_df, weather_df, metadata_df, intersect_fct=intersection):
    """Merge meter, metadata and weather data into one long-format dataframe.

    Parameters
    ----------
    meter_df : pandas.DataFrame
        Meter readings with two-level columns {meter_type, building_id}.
        Its index must be named "timestamp" (it is reset and used as the
        ``id_vars`` of the melt).
    weather_df : pandas.DataFrame
        Weather observations with at least "timestamp" and "site_id" columns;
        "timestamp" is parsed with the format "%Y-%m-%d %H:%M:%S".
    metadata_df : pandas.DataFrame
        Must contain "building_id", "site_id" and one column per meter type
        holding "Yes" for the buildings equipped with that meter.
    intersect_fct : callable, optional
        Function combining two lists of building ids across meter types:
        ``intersection`` keeps buildings present for every meter type,
        ``union`` keeps buildings present for at least one.

    Returns
    -------
    pandas.DataFrame
        Frame keyed by "building_id" and "timestamp" with one column per
        meter type, the "site_id" and the weather columns.

    Raises
    ------
    ValueError
        If ``meter_df`` has no columns.
    """
    # Upper-level column labels (meter types), in order of first appearance
    meter_type_list = list(dict.fromkeys(meter_df.columns.get_level_values(0)))
    if not meter_type_list:
        raise ValueError("meter_df has no meter columns to merge")

    # Building ids per meter type, combined across meter types with intersect_fct
    blg_dict = {}
    blg_list_intersect = None
    for meter in meter_type_list:
        blg_dict[meter] = list(meter_df[meter].columns.values)
        if blg_list_intersect is None:
            blg_list_intersect = blg_dict[meter]
        else:
            blg_list_intersect = intersect_fct(blg_dict[meter], blg_list_intersect)

    # Metadata rows of the retained buildings, gathered over ALL meter types.
    # (Keeping only the rows of the last meter type would leave buildings that
    # lack that meter without a site_id, and therefore without weather data.)
    in_scope = metadata_df["building_id"].isin(blg_list_intersect)
    meta_frames = [
        metadata_df.loc[(metadata_df[metername] == "Yes") & in_scope,
                        ["building_id", "site_id"]]
        for metername in meter_type_list
    ]
    df_meta = pd.concat(meta_frames, ignore_index=True).drop_duplicates()
    site_list_unique = list(df_meta["site_id"].unique())

    # Filter weather to the sites in use and convert timestamps to datetime
    df_weather = weather_df.loc[weather_df["site_id"].isin(site_list_unique)].copy()
    df_weather["timestamp"] = pd.to_datetime(df_weather["timestamp"], format="%Y-%m-%d %H:%M:%S")

    # Melt each meter type to long format, indexed by (building_id, timestamp)
    dfs = []
    for meter in meter_type_list:
        # Plain `intersection` is intentional here: it restricts this meter's
        # buildings to the retained set whatever intersect_fct was used.
        selected = intersection(blg_dict[meter], blg_list_intersect)
        df = pd.melt(meter_df[meter][selected].reset_index(),
                     id_vars="timestamp",
                     var_name="building_id",
                     value_name=meter)
        dfs.append(df.set_index(["building_id", "timestamp"]))
    merged = pd.concat(dfs, axis=1)
    del dfs, df

    # Attach the site of each building, then the weather of that site
    merged = pd.merge(merged.reset_index(), df_meta, how="left", on="building_id")
    return merged.merge(df_weather, how="left", on=["timestamp", "site_id"])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range.

    Source: https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits contain the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 on the same range criterion, else float64.

    NOTE: the float downcast is chosen on range only. float16 and float32
    carry fewer significant digits than float64, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to shrink. It is modified in place.
    verbose : bool, optional
        If True, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN column has NaN min/max, fails every comparison and
            # therefore stays float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame when computing the percentage
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## PreProcessing
### Multidimensional cuboid selection - General formatting

In [None]:
## Formatting
# Merge meters, weather and metadata; `union` combines the building ids of all meter types
df_all = mergeAll(meter_df=df_meter, weather_df=weather, metadata_df=meta, intersect_fct=union)
columns_considered = [*meter_data, *weather_cols]  # all selected attributes

# Un-melt into a multicolumn frame {attributeX, building_id}, then shrink its dtypes
df1 = reduce_mem_usage(
    df_all.pivot(index="timestamp", columns="building_id", values=columns_considered),
    verbose=True,
)
df1

### Multidimensional cuboid manipulation & transformation functions

In [None]:
def multicol_2ndColumnSelection(df_multicol, allcol1, col2):
    """Extract, for one 2nd-level column label, the data of several 1st-level labels.

    For every 1st-level label in ``allcol1`` the column ``(label, col2)`` of
    ``df_multicol`` is copied into a new single-level dataframe whose columns
    are named after the 1st-level labels.
    """
    selected = {
        upper: df_multicol[upper, col2].copy()
        for upper in allcol1
    }
    return pd.DataFrame(selected)

from sklearn.preprocessing import MinMaxScaler
def scale_NanRobust(data_array, scaler):
    """Scale the non-NaN entries of an array, leaving NaN entries in place.

    Adapted from: https://stackoverflow.com/questions/55280054/handling-missing-nan-values-on-sklearn-preprocessing

    Parameters
    ----------
    data_array : numpy.ndarray
        Numeric array, possibly containing NaN.
    scaler : object
        Any object exposing ``fit_transform`` on a 2-D ``(n_samples, 1)``
        array (e.g. a scikit-learn scaler). It is re-fitted on every call.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``data_array``: scaled values where
        the input was valid, NaN elsewhere.
    """
    valid_mask = ~np.isnan(data_array)
    # Start from an all-NaN result and fill in only the valid positions
    result = np.full(data_array.shape, np.nan)
    if not valid_mask.any():
        # Nothing to fit on (empty or all-NaN input): fitting a scaler on zero
        # samples would fail, so return the all-NaN result directly.
        return result
    valid_values = data_array[valid_mask]
    result[valid_mask] = scaler.fit_transform(valid_values.reshape(-1, 1)).reshape(valid_values.shape)
    return result

def scale_df_columns_NanRobust(df_in, target_columns, scaler=None):
    """Scale selected columns of a dataframe independently, preserving NaN positions.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input dataframe; it is not modified.
    target_columns : list
        Labels of the columns to scale. Each column is scaled on its own.
    scaler : object, optional
        Object exposing ``fit_transform`` (e.g. a scikit-learn scaler). When
        omitted, ``MinMaxScaler(feature_range=(1, 2))`` is created for the
        call, so no scaler instance is shared between calls.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the same index as ``df_in``: the scaled target columns
        first (missing values kept in identical places), followed by the
        untouched non-target columns in their original order.
    """
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(1, 2))

    df = df_in[target_columns].copy()
    # Scale each target column separately so every column gets its own fit
    scaled_columns = [scale_NanRobust(df[col].values, scaler) for col in df.columns]
    if scaled_columns:
        scaled_values = np.column_stack(scaled_columns)
    else:
        # No target column: keep an empty (n_rows, 0) block instead of failing
        scaled_values = np.empty((len(df_in), 0))
    # Reuse the input index directly (works whether or not the index is named)
    df_scaled = pd.DataFrame(scaled_values, columns=df.columns, index=df_in.index)

    # Append the non-target columns in their original order
    target_set = set(target_columns)
    nontarget_columns = [col for col in df_in.columns if col not in target_set]
    if nontarget_columns:
        df_scaled[nontarget_columns] = df_in[nontarget_columns]
    return df_scaled