In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [2]:
# dataset: https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014
# The file is ';'-separated and writes decimals with a comma; the first column is
# parsed as the datetime index.
data = pd.read_csv(
    'LD2011_2014.txt',
    sep=";",
    decimal=',',
    index_col=0,
    parse_dates=True,
)
# One column per series.
num_timeseries = len(data.columns)
data.head()

Unnamed: 0,MT_001,MT_002,MT_003,MT_004,MT_005,MT_006,MT_007,MT_008,MT_009,MT_010,...,MT_361,MT_362,MT_363,MT_364,MT_365,MT_366,MT_367,MT_368,MT_369,MT_370
2011-01-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2011-01-01 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2011-01-01 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2011-01-01 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2011-01-01 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Aggregate into 2-hour bins; dividing the bin sum by 8 assumes eight readings per bin.
data_kw = data.resample('2H').sum() / 8
num_timeseries = data_kw.shape[1]
# Keep rows 10000..10999 of every column and drop leading zeros from each series.
# NOTE(review): the row window is positional; presumably it is meant to cover the
# 2013 date range used in the later cells - confirm against the index.
timeseries = [
    np.trim_zeros(data_kw.iloc[10000:11000, column], trim='f')
    for column in range(num_timeseries)
]

In [None]:
# 5 x 2 grid sharing the x axis: one panel for each of the first ten series,
# showing one month of data.
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(20, 20), sharex=True)
axx = axs.ravel()
for i, ax in enumerate(axx):
    timeseries[i].loc["2013-04-13":"2013-05-13"].plot(ax=ax)
    ax.set_xlabel("date")
    ax.set_ylabel("kW consumption")
    ax.grid(which='minor', axis='x')

In [None]:
# we use 2 hour frequency for the time series
freq = '2H'
# Sanity check: number of points the first series has in the modelling window.
# The series is picked with an explicit index so the result does not depend on
# whatever value a loop variable from an earlier cell happens to hold.
len(timeseries[0].loc["2013-04-13":"2013-07-13"])

In [None]:
# Boundaries of the modelling window: dataset start, end of training, end of testing.
# Each timestamp is created with the series frequency attached.
# NOTE(review): the `freq` argument of pd.Timestamp is deprecated in late pandas 1.x
# releases and removed in pandas 2.0 - confirm the pandas version this runs on.
start_dataset, end_training, end_testing = (
    pd.Timestamp(boundary, freq=freq)
    for boundary in ("2013-04-13 00:00:00", "2013-06-13 00:00:00", "2013-07-13 00:00:00")
)

In [None]:
# One time step as a Timedelta. Subtracting it from `end_training` makes the
# label slice stop just before `end_training` (pandas label slicing includes the
# upper bound) without relying on integer arithmetic on a Timestamp, which
# pandas >= 1.0 no longer supports.
one_step = pd.Timedelta(freq)

training_data = []
for ts in timeseries:
    window = ts.loc[start_dataset:end_training - one_step]
    # "start" has to be the timestamp of the first value actually present in
    # "target": a series may begin after `start_dataset` (positional pre-slicing,
    # trimmed leading zeros). Fall back to `start_dataset` for an empty window.
    first_stamp = window.index[0] if len(window) else start_dataset
    training_data.append({
        "start": str(first_stamp),
        "target": window.tolist(),
    })
print(len(training_data))

In [None]:
num_test_windows = 4

test_data = []
for ts in timeseries:
    # Label slicing includes both bounds, so the window runs from `end_training`
    # through `end_testing` (or to the end of the series if it stops earlier).
    window = ts.loc[end_training:end_testing]
    # Report the timestamp of the first value actually present, so "start" stays
    # aligned with "target" for series that begin after `end_training`.
    first_stamp = window.index[0] if len(window) else end_training
    test_data.append({
        "start": str(first_stamp),
        "target": window.tolist(),
    })
len(test_data[0]['target'])

In [None]:
def write_dicts_to_file(path, data):
    """Write every record of `data` to `path` as one JSON document per line.

    The file is opened in binary mode and overwritten. Each record is
    serialised with `json.dumps`, followed by a newline, and encoded as UTF-8.
    """
    with open(path, 'wb') as sink:
        sink.writelines(
            (json.dumps(record) + "\n").encode("utf-8") for record in data
        )

In [None]:
# Persist both splits, one JSON record per line.
for out_path, records in (("train.json", training_data), ("test.json", test_data)):
    write_dicts_to_file(out_path, records)

In [None]:
#https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption
# ';'-separated file in which '?' marks a missing reading. The first column is
# parsed as the datetime index.
# NOTE(review): per the dataset description the dates are written dd/mm/yyyy and
# the values use a '.' decimal point, hence dayfirst=True and decimal='.'.
# Without dayfirst, ambiguous dates such as 1/2/2007 can be read month-first.
data_individual = pd.read_csv('household_power_consumption.txt', sep=";", index_col=0,
                              parse_dates=True, dayfirst=True, decimal='.', na_values=['?'])
data_individual.shape

In [None]:
# Make sure the three sub-metering columns are floats before aggregating them.
for column in ('Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'):
    data_individual[column] = data_individual[column].astype('float')
data_individual.dtypes

In [None]:
# Collapse the readings to one row per calendar day by summing each column.
data_kw_household = data_individual.resample(rule='1D').sum()

In [None]:
# Daily total across the three sub-meters.
data_kw_household['total_reading'] = (
    data_kw_household['Sub_metering_1']
    .add(data_kw_household['Sub_metering_2'])
    .add(data_kw_household['Sub_metering_3'])
)

In [None]:
years = ['2007', '2008', '2009']

# Build one record per year. "start" is taken from the first timestamp of the
# yearly slice so that it is a complete, valid timestamp aligned with "target".
household_train_records = []
for year in years:
    yearly_total = data_kw_household['total_reading'].loc[year]
    household_train_records.append({
        "start": str(yearly_total.index[0]),
        "target": yearly_total.tolist(),
    })

# One JSON sample per line. The helper overwrites the file, so re-running this
# cell does not append duplicate records.
write_dicts_to_file("individual_household_train.json", household_train_records)


# Given the consumption over the last 30 days, predict the next 30 days.

In [None]:
# One stacked panel per entry of `years`, showing the daily total reading.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 20))
axx = axs.ravel()
for i, ax in enumerate(axx):
    data_kw_household['total_reading'].loc[years[i]].plot(ax=ax)
    ax.set_xlabel("date")
    ax.set_ylabel("kW consumption")
    ax.grid(which='minor', axis='x')

In [None]:
years = ['2010']

# Build one record per year. "start" is taken from the first timestamp of the
# yearly slice so that it is a complete, valid timestamp aligned with "target".
household_test_records = []
for year in years:
    yearly_total = data_kw_household['total_reading'].loc[year]
    household_test_records.append({
        "start": str(yearly_total.index[0]),
        "target": yearly_total.tolist(),
    })

# One JSON sample per line. The helper overwrites the file, so re-running this
# cell does not append duplicate records.
write_dicts_to_file("individual_household_test.json", household_test_records)

In [None]:
import requests
import json
def get_prediction(data, timeout=60):
    """POST `data` as a JSON body to the forecasting endpoint and return the reply text.

    Parameters
    ----------
    data : JSON-serialisable object
        Request payload; it is serialised with `json.dumps`.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that an
        unresponsive endpoint cannot hang the notebook indefinitely.

    Returns
    -------
    str
        The response body decoded as UTF-8 (not parsed).

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within `timeout` seconds.
    """
    url = 'https://2b8pgrcpx9.execute-api.us-east-1.amazonaws.com/time-series-AICamp/time-series-aicamp'
    r = requests.post(url, data=json.dumps(data), timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the caller.
    r.raise_for_status()
    # Use the public `content` attribute rather than the private `_content`.
    return r.content.decode("utf-8")

In [None]:
# Focus on a single series: index 1 of `test_data`
house = 1
num_windows = 10       # number of sliding windows to forecast
context_length = 100   # number of points sent with each request

# The request configuration is the same for every window, so build it once.
request_configuration = {
    "output_types": ["mean", "quantiles"],
    "quantiles": ["0.1", "0.9"],
}

series_start = pd.Timestamp(test_data[house]["start"])
one_step = pd.Timedelta(freq)

predictions = []
for a in range(num_windows):
    instance = test_data[house].copy()
    # Slide the window by `a` steps and move "start" with it, so the timestamp
    # sent to the endpoint matches the first value of the window.
    instance["start"] = str(series_start + a * one_step)
    instance["target"] = test_data[house]["target"][a:a + context_length]
    test_sample = {"instances": [instance], "configuration": request_configuration}
    reply = json.loads(get_prediction(test_sample))
    # Keep the 0.9 quantile forecast (the point forecast is under ['mean']).
    predictions.append(reply['body']['predictions'][0]['quantiles']['0.9'])

In [None]:
import matplotlib.pyplot as plt
# Compare the forecast of the last window (index 9) with the observed values
# at positions 109..128 of the same series.
fig, ax = plt.subplots()
ax.plot(predictions[9], label="predictions")
ax.plot(test_data[house]["target"][109:129], label="true value")
ax.set_xlabel('freq = 2H')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()


In [None]:
# Root-mean-square error of the first window's forecast against the observed
# values at positions 100..119.
# NOTE(review): `predictions` holds the 0.9-quantile forecasts, so this measures
# the error of the upper quantile, not of the mean forecast.
forecast = np.asarray(predictions[0])
observed = np.asarray(test_data[house]["target"][100:120])
rmse = np.sqrt(((forecast - observed) ** 2).mean())

In [None]:
# Display the RMSE value stored in `rmse`.
rmse

In [None]:
# Recompute the first-window RMSE inline (same inputs as the `rmse` variable).
squared_errors = np.square(np.array(predictions[0]) - np.array(test_data[house]["target"][100:120]))
np.sqrt(squared_errors.mean())