In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Directory with the raw OpenEI residential data, located under the user's home directory.
raw_data_path = Path.home().joinpath('Data/USA_residential_openei')
# Directory (relative to the notebook) holding the preprocessed single-site CSVs;
# the cells below both read from and write to it.
output_path = Path('../data/US_single_site_preprocessed')

### Import data

In [13]:
# --- Parameters selecting which preprocessed dataset to load ---
state = 'FL' # US State abbreviation
n_households = 45
year = 2015

# `pd.datetime` was removed in pandas 2.0; `pd.Timestamp` is the supported
# replacement (and is itself a `datetime.datetime` subclass).
start = pd.Timestamp(year=year, month=1, day=1)
end   = pd.Timestamp(year=year + 1, month=1, day=1)

# Hourly timestamps covering [start, end): the trailing `[:-1]` drops the
# Jan 1st 00:00 stamp of the following year.
# The `DatetimeIndex(start=..., end=..., freq=...)` constructor form was removed
# in pandas 1.0; `pd.date_range` is the supported way to build the range.
index = pd.date_range(start=start, end=end, freq='h')[:-1]

# Common file-name prefix, e.g. 'FL_45_'.
prefix = '{}_{}_'.format(state, n_households)


def _read_single_column(csv_path):
    """Read a header-less CSV and collapse it to a Series if it has one column.

    `read_csv(..., squeeze=True)` was removed in pandas 2.0; calling
    `DataFrame.squeeze('columns')` on the result is the documented replacement
    and behaves the same way (a multi-column file is returned as a DataFrame).
    """
    return pd.read_csv(csv_path, header=None).squeeze('columns')


cooling_df     = _read_single_column(output_path / (prefix + 'cooling.csv'))

heating_df     = _read_single_column(output_path / (prefix + 'heating.csv'))

electricity_df = _read_single_column(output_path / (prefix + 'electricity.csv'))

# Put the three measures side by side, one column per measure.
df = pd.concat([cooling_df, heating_df, electricity_df], axis=1,
               keys=['cooling', 'heating', 'electricity'])

# Replace the positional row index by the hourly timestamps. `set_index` raises
# if the number of rows does not match the number of hours in `index`.
df = df.set_index(index)

df.head()

Unnamed: 0,cooling,heating,electricity
2015-01-01 00:00:00,0.0,109.111,36.4
2015-01-01 01:00:00,0.0,123.874,29.682
2015-01-01 02:00:00,0.0,136.301,27.501
2015-01-01 03:00:00,0.0,150.053,27.319
2015-01-01 04:00:00,0.0,165.931,28.923


### Normalize

In [14]:
# Z-score every measure (column) using its mean and standard deviation
# computed over all rows of `df`.
norm_df = df.sub(df.mean()).div(df.std())

# Reshape so that each column is one calendar day and each row is a
# (time of day, measure) pair.
norm_days_df = (
    norm_df
    # split each timestamp into its calendar date and its time of day
    .assign(date=norm_df.index.date, time=norm_df.index.time)
    # name the column axis so the stacked level is labelled 'measure'
    .rename_axis('measure', axis=1)
    .set_index(['date', 'time'])
    # measures move into the row index -> rows keyed by (date, time, measure)
    .stack()
    # dates (index level 0) move out to the columns
    .unstack(level=0)
)

norm_days_df.head()

Unnamed: 0_level_0,date,2015-01-01 00:00:00,2015-01-02 00:00:00,2015-01-03 00:00:00,2015-01-04 00:00:00,2015-01-05 00:00:00,2015-01-06 00:00:00,2015-01-07 00:00:00,2015-01-08 00:00:00,2015-01-09 00:00:00,2015-01-10 00:00:00,...,2015-12-22 00:00:00,2015-12-23 00:00:00,2015-12-24 00:00:00,2015-12-25 00:00:00,2015-12-26 00:00:00,2015-12-27 00:00:00,2015-12-28 00:00:00,2015-12-29 00:00:00,2015-12-30 00:00:00,2015-12-31 00:00:00
time,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
00:00:00,cooling,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,...,-1.180571,-1.164938,-1.13365,-1.077344,-1.071924,-0.997679,-1.045966,-1.064287,-1.050311,-1.00572
00:00:00,heating,2.779788,2.574989,3.599843,3.74082,4.978284,2.646651,2.459612,2.158621,3.019868,3.22235,...,2.164917,1.81531,1.325795,2.519483,2.941257,2.844411,1.898138,1.603532,1.749707,0.902358
00:00:00,electricity,-0.865555,-0.900704,-0.876321,-0.873682,-0.842016,-0.896218,-0.871888,-0.880807,-0.888196,-0.88086,...,-1.14385,-1.150395,-1.156675,-1.114664,-0.855738,-0.840538,-0.872943,-0.885082,-0.848085,-0.859907
01:00:00,cooling,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,-1.241445,...,-1.192687,-1.189081,-1.16122,-1.127289,-1.113067,-1.057725,-1.080413,-1.107221,-1.110043,-1.087132
01:00:00,heating,3.218222,2.870723,3.974129,3.971575,5.69303,3.133522,2.894749,2.544281,3.634204,3.678395,...,2.626278,2.146801,1.649861,2.870278,3.359763,3.316879,2.401522,2.110242,2.167114,1.188946


### Compute distance matrix

In [15]:
from scipy.spatial.distance import pdist, squareform

In [16]:
# Columns of `norm_days_df` are days, so transpose to get one row vector per
# day; `pdist` returns the condensed pairwise Euclidean distances between those
# rows and `squareform` expands them into a full symmetric matrix.
dist = squareform(
    pdist(norm_days_df.to_numpy().T, metric='euclidean')
)

# Same matrix, labelled by day on both axes for inspection.
dist_df = pd.DataFrame(
    data=dist,
    index=norm_days_df.columns,
    columns=norm_days_df.columns,
)

dist_df.head()

date,2015-01-01 00:00:00,2015-01-02 00:00:00,2015-01-03 00:00:00,2015-01-04 00:00:00,2015-01-05 00:00:00,2015-01-06 00:00:00,2015-01-07 00:00:00,2015-01-08 00:00:00,2015-01-09 00:00:00,2015-01-10 00:00:00,...,2015-12-22 00:00:00,2015-12-23 00:00:00,2015-12-24 00:00:00,2015-12-25 00:00:00,2015-12-26 00:00:00,2015-12-27 00:00:00,2015-12-28 00:00:00,2015-12-29 00:00:00,2015-12-30 00:00:00,2015-12-31 00:00:00
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,0.0,4.498932,5.851426,6.103524,7.143042,2.736782,1.488267,2.319302,2.090402,2.28485,...,6.094777,7.456383,7.972682,5.987562,2.974106,3.042855,4.642304,5.760903,6.309674,8.53219
2015-01-02,4.498932,0.0,4.530294,6.64272,10.310488,6.239225,4.517155,3.034836,5.801419,6.209175,...,7.423845,8.506064,7.130686,5.475902,5.164634,6.200015,6.422875,7.280563,8.3755,9.609127
2015-01-03,5.851426,4.530294,0.0,3.788723,8.294561,8.352001,6.747938,5.631794,5.939656,7.118954,...,9.555127,11.37138,10.523005,6.691423,5.684137,7.897554,9.465435,10.568368,11.406316,13.197639
2015-01-04,6.103524,6.64272,3.788723,0.0,6.513817,8.290218,7.308395,6.285298,5.140002,7.175264,...,9.909201,11.931949,11.576897,7.520354,5.989067,8.091211,10.13627,11.193306,11.88253,13.96034
2015-01-05,7.143042,10.310488,8.294561,6.513817,0.0,8.06172,8.231281,9.003791,5.510071,6.203822,...,10.253195,12.246423,13.612814,9.852274,7.223173,7.897633,10.70225,11.730456,11.820924,14.577361


### Save distance matrix

In [17]:
# Write the raw distance matrix (no row/column labels) as comma-separated text
# next to the input CSVs, reusing the same file-name prefix.
_filename = prefix + 'distance.csv'
np.savetxt(
    fname=output_path.joinpath(_filename),
    X=dist,
    delimiter=",",
)