In [1]:
# plotting
#import matplotlib as mpl
#mpl.style.use('ggplot')
#import matplotlib.pyplot as plt

# math and data manipulation
import numpy as np
import pandas as pd

# to handle paths
from pathlib import Path

# set random seeds 
from numpy.random import seed
from tensorflow import set_random_seed

# One shared seed for both NumPy's and TensorFlow's global generators.
RANDOM_SEED = 2018
np.random.seed(RANDOM_SEED)
set_random_seed(RANDOM_SEED)


In [2]:
import sys
# Append the literal entry "numpy_path" to the module search path.
# NOTE(review): "numpy_path" looks like a placeholder rather than a real
# directory — confirm the intended location.
sys.path.append("numpy_path")

## DATA PREPROCESSING (Date time parsing and normalization)

#### [1]Date Time Parsing

In [3]:
# Read the three CSV files (parsing `timestamp` as datetimes) and keep only
# the listed columns, in the listed order.
train_data = (
    pd.read_csv('consumption_train.csv', index_col=0, parse_dates=['timestamp'])
    .loc[:, ["series_id", "timestamp", "consumption", "temperature"]]
)
test_data = (
    pd.read_csv('cold_start_test.csv', index_col=0, parse_dates=['timestamp'])
    .loc[:, ['series_id', 'timestamp', 'consumption', 'temperature']]
)

sub_data = (
    pd.read_csv('submission_format.csv', index_col='pred_id', parse_dates=['timestamp'])
    .loc[:, ['series_id', 'timestamp', 'temperature', 'consumption', 'prediction_window']]
)

FileNotFoundError: File b'consumption_train.csv' does not exist

#### [2]Creating Lagged Features 

In [None]:
def create_lagged_features(df, lag=1):
    """Append `lag` shifted copies of the ``consumption`` column.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        A frame that has a ``consumption`` column. Anything that is not a
        DataFrame is wrapped with ``pd.DataFrame(df, columns=['consumption'])``.
    lag : int, default 1
        Number of lagged columns to add; they are named ``consumption_1``
        through ``consumption_<lag>``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the lagged ones, with
        every row that contains a NaN removed. The object passed in by the
        caller is never modified.
    """
    # isinstance (instead of an exact type comparison) lets DataFrame
    # subclasses through without re-wrapping them.
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df, columns=['consumption'])

    # add a column lagged by `i` steps
    for i in range(1, lag + 1):
        df = df.join(df['consumption'].shift(i).rename(f'consumption_{i}'))

    # Return a filtered copy: an in-place dropna would alter the caller's
    # frame whenever the loop above did not rebind `df` (lag < 1).
    return df.dropna()

# example series: pick one series id and preview its lagged frame
test_series = train_data.loc[train_data['series_id'] == 100283]
create_lagged_features(test_series['consumption'], lag=3).head()

#### [3]Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

def prepare_training_data(consumption_series, lag):
    """Scale a consumption series and turn it into lagged (X, y) samples.

    Parameters
    ----------
    consumption_series : pandas.Series
        Consumption values for one series.
    lag : int
        Number of lagged columns to build for each sample.

    Returns
    -------
    X : numpy.ndarray
        Lagged inputs, reshaped to ``(n_samples, 1, n_lag_columns)``.
    y : numpy.ndarray
        The scaled, un-lagged consumption value for each sample.
    scaler : MinMaxScaler
        The scaler fitted on this series (feature range -1 to 1).
    """
    # fit the scaler on this series only, as a single-column matrix
    scaler = MinMaxScaler(feature_range=(-1, 1))
    column_vector = consumption_series.values.reshape(-1, 1)
    scaled = scaler.fit_transform(column_vector)

    # lagged frame: first column is the target, the rest are the inputs
    lagged_frame = create_lagged_features(scaled, lag=lag)
    y = lagged_frame['consumption'].values
    X = lagged_frame.drop('consumption', axis=1).values

    # keras expects 3 dimensional X: insert a middle axis of length one
    n_samples, n_lags = X.shape
    X = X.reshape(n_samples, 1, n_lags)

    return X, y, scaler

# Try the helper on the example series with five lags and print the
# resulting shapes and the fitted scaler, one per line.
_X, _y, scaler = prepare_training_data(test_series.consumption, 5)
print(_X.shape, _y.shape, scaler, sep='\n')


In [None]:
# modeling
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

# progress bar
from tqdm import tqdm

## Post Processing

### Long short-term memory (LSTM)

##### LSTM units are the building blocks of a recurrent neural network (RNN); they are well suited to classifying, processing, and making predictions from time-series data.

In [None]:
# lag of 24 to simulate smallest cold start window. Our series
# will be converted to a num_timesteps x lag size matrix
lag =  24

# model parameters
num_neurons = 24
batch_size = 1  # this forces the lstm to step through each time-step one at a time
# fixed input shape (batch, 1, lag); matches the (n_samples, 1, lag) layout
# that prepare_training_data produces when fed one sample at a time
batch_input_shape=(batch_size, 1, lag)

# instantiate a sequential model
model = Sequential()

# add LSTM layer - stateful MUST be true here in 
# order to learn the patterns within a series
model.add(LSTM(units=num_neurons, 
              batch_input_shape=batch_input_shape, 
              stateful=True))

# followed by a dense layer with a single output for regression
model.add(Dense(1))

# compile with mean absolute error as the loss and the Adam optimizer
model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
%%time
# Number of distinct series in the training data (not referenced again in
# the visible cells).
num_training_series = train_data.series_id.nunique()
# How many full passes to make over all training series.
num_passes_through_data = 3

for i in tqdm(range(num_passes_through_data), 
              total=num_passes_through_data, 
              desc='Learning Consumption Trends - Epoch'):
    
    # visit every series once per pass; the LSTM state is reset after each
    # series has been fit (see the end of the loop body)
    for ser_id, ser_data in train_data.groupby('series_id'):

        # prepare the data (a fresh scaler is fitted per series)
        X, y, scaler = prepare_training_data(ser_data.consumption, lag)

        # fit the model: note that we don't shuffle batches (it would ruin the sequence)
        # and that we reset states only after an entire X has been fit, instead of after
        # each (size 1) batch, as is the case when stateful=False
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
        model.reset_states()

In [None]:
def generate_hourly_forecast(num_pred_hours, consumption, model, scaler, lag):
    """Roll a one-step model forward to forecast `num_pred_hours` values.

    Each prediction is appended to the input window (and the oldest value
    dropped) so that it feeds the next prediction.

    Parameters
    ----------
    num_pred_hours : int
        Number of consecutive steps to forecast.
    consumption : pandas.Series
        Observed consumption; its last `lag` values seed the window.
    model : object
        Must provide ``predict(X, batch_size=1)`` that accepts an array of
        shape ``(1, 1, lag)`` and returns something indexable as ``[0][0]``.
    scaler : object
        Must provide ``transform`` and ``inverse_transform`` on column
        vectors (shape ``(n, 1)``).
    lag : int
        Window length fed to the model.

    Returns
    -------
    numpy.ndarray
        1-D array of `num_pred_hours` forecasts, back on the original scale.

    Raises
    ------
    ValueError
        From the reshape, when `consumption` holds fewer than `lag` values
        and at least one prediction is requested.
    """
    # allocate prediction frame
    preds_scaled = np.zeros(num_pred_hours)

    # initial window is the last `lag` scaled values from the cold start
    window = scaler.transform(consumption.values.reshape(-1, 1))[-lag:].ravel()

    # forecast
    for i in range(num_pred_hours):
        # predict scaled value for next time step
        yhat = model.predict(window.reshape(1, 1, lag), batch_size=1)[0][0]
        preds_scaled[i] = yhat

        # Slide the window: drop the oldest value, append the prediction.
        # A plain slice-and-append only touches the new slot; the earlier
        # shift/fillna approach also replaced any NaN already inside the
        # window with the newest prediction.
        window = np.append(window[1:], yhat)

    # revert scale back to original range
    hourly_preds = scaler.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()
    return hourly_preds

### [1]Predicting for submission format

In [None]:
# start from a deep copy of the submission format; values are filled in below
my_submission = sub_data.copy(deep=True)

In [None]:
# Take an independent copy: a plain alias would make every later write to
# `mytest` also overwrite `test_data`, which is still read as the source of
# each series' observed consumption.
mytest = test_data.copy()

In [None]:

%%time
# For each prediction window: how many rows the submission expects, and how
# many hourly forecasts are needed to cover them.
pred_window_to_num_preds = {'hourly': 24, 'daily': 7, 'weekly': 2}
pred_window_to_num_pred_hours = {'hourly': 24, 'daily': 7 * 24, 'weekly': 2 * 7 * 24}

num_test_series = my_submission.series_id.nunique()

# states are cleared once here; they are not reset between series below
model.reset_states()

for ser_id, pred_df in tqdm(my_submission.groupby('series_id'), 
                            total=num_test_series, 
                            desc="Forecasting from Cold Start Data"):
        
    # get info about this series' prediction window
    # (takes the first distinct value found for the series)
    pred_window = pred_df.prediction_window.unique()[0]
    num_preds = pred_window_to_num_preds[pred_window]
    num_pred_hours = pred_window_to_num_pred_hours[pred_window]
    
    # prepare cold start data
    series_data = test_data[test_data.series_id == ser_id].consumption
    cold_X, cold_y, scaler = prepare_training_data(series_data, lag)
    
    # fine tune our lstm model to this site using cold start data    
    # (the same model object is updated for every series in turn)
    model.fit(cold_X, cold_y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
    
    # make hourly forecasts for duration of pred window
    preds = generate_hourly_forecast(num_pred_hours, series_data, model, scaler, lag)
    
    # reduce by taking sum over each sub window in pred window
    # (np.split cuts the hourly forecasts into num_preds equal chunks)
    reduced_preds = [pred.sum() for pred in np.split(preds, num_preds)]
    
    # store result in submission DataFrame
    ser_id_mask = my_submission.series_id == ser_id
    my_submission.loc[ser_id_mask, 'consumption'] = reduced_preds

In [None]:
# summary statistics of the predictions for the 'hourly' window only
my_submission.loc[my_submission['prediction_window'] == 'hourly', 'consumption'].describe()

In [None]:
# preview the first five rows of the filled-in submission
my_submission.head()

In [None]:
# Write the filled-in submission to CSV, labelling the index column `pred_id`.
# NOTE(review): the file name is spelled "my_submmission.csv" (double "m") —
# confirm this is intended before anything downstream depends on it.
my_submission.to_csv("my_submmission.csv", index_label='pred_id')

### [2]Predicting for test data

In [None]:

%%time
pred_window_to_num_preds = {'hourly': 24, 'daily': 7, 'weekly': 2}
pred_window_to_num_pred_hours = {'hourly': 24, 'daily': 7 * 24, 'weekly': 2 * 7 * 24}

num_test_series = mytest.series_id.nunique()

model.reset_states()

for ser_id, pred_df in tqdm(mytest.groupby('series_id'), 
                            total=num_test_series, 
                            desc="Forecasting from Cold Start Data"):
        
    # get info about this series' prediction window
    pred_window = pred_df.prediction_window.unique()[0]
    num_preds = 1
    num_pred_hours = 1
    
    # prepare cold start data
    series_data = test_data[test_data.series_id == ser_id].consumption
    cold_X, cold_y, scaler = prepare_training_data(series_data, lag)
    
    # fine tune our lstm model to this site using cold start data    
    model.fit(cold_X, cold_y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
    
    # make hourly forecasts for duration of pred window
    preds = generate_hourly_forecast(1, series_data, model, scaler, lag)
    
    # reduce by taking sum over each sub window in pred window
    #reduced_preds = [pred.sum() for pred in np.split(preds, num_preds)]
    
    # store result in submission DataFrame
    ser_id_mask = mytest.series_id == ser_id
    mytest.loc[ser_id_mask, 'consumption'] = preds

In [None]:
# show the consumption column of the test frame after the loop above
mytest['consumption']

###### Accuracy for test data

In [None]:
from sklearn.metrics import r2_score
# R^2 score with y1 passed as y_true and y0 passed as y_pred.
# NOTE(review): y1 and y0 must already exist in the kernel when this cell
# runs — confirm an earlier cell assigns them.
r2_score(y1, y0) 

### [3]Predicting on Train data

In [None]:
# deep copy, so writing predictions into `mytrain` leaves `train_data` intact
mytrain = train_data.copy(deep=True)

In [None]:
## for train
#%%time
# One-step-ahead forecast for every training series. Each series is used to
# fine-tune the model, then the single forecast value is written to every row
# of that series in `mytrain`.
num_test_series = mytrain.series_id.nunique()

# only one value is forecast per series in this cell
num_preds = 1
num_pred_hours = 1

model.reset_states()

for ser_id, pred_df in tqdm(mytrain.groupby('series_id'), 
                            total=num_test_series, 
                            desc="Forecasting from Cold Start Data"):

    # observed consumption for this series, read from the untouched frame
    series_data = train_data[train_data.series_id == ser_id].consumption
    cold_X, cold_y, scaler = prepare_training_data(series_data, lag)

    # fine tune our lstm model to this series
    model.fit(cold_X, cold_y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)

    # forecast a single step past the end of the observed data
    preds = generate_hourly_forecast(num_pred_hours, series_data, model, scaler, lag)

    # store the forecast on every row of this series; assigning the scalar
    # (not the length-1 array) lets pandas broadcast it without depending on
    # how a given pandas version treats length-mismatched iterables
    ser_id_mask = mytrain.series_id == ser_id
    mytrain.loc[ser_id_mask, 'consumption'] = preds[0]

In [None]:
# yy0: consumption column of the working copy; yy1: consumption column of
# the original training frame. Print both, one after the other.
yy0 = mytrain['consumption']

yy1 = train_data['consumption']
print(yy0, yy1, sep='\n')

###### Accuracy for Train Data

In [None]:
from sklearn.metrics import r2_score
# R^2 score with yy1 (from train_data) passed as y_true and yy0 (from
# mytrain) passed as y_pred.
r2_score(yy1, yy0) 