# Introduction: Encoding Time Series Features

In this notebook, we will explore the options for encoding times and dates in a time series problem. The primary objective is to determine the optimal method for representing time in a time-series problem, particularly as it relates to building energy data.

In [1]:
# Standard Data Science Helpers
import numpy as np
import pandas as pd
import os

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.set_config_file(world_readable=True, theme="pearl")
cf.go_offline(connected=True)

# Extra options
pd.options.display.max_rows = 10
pd.options.display.max_columns = 30
# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from time_features_utils import *

In [10]:
data = pd.read_csv('data/building_1.csv', parse_dates=['timestamp'], 
                    usecols=['timestamp', 'energy'])
data.head()

Unnamed: 0,timestamp,energy
0,2014-01-01 06:15:00,43.012862
1,2014-01-01 06:30:00,43.780204
2,2014-01-01 06:45:00,43.012709
3,2014-01-01 07:00:00,42.631804
4,2014-01-01 07:15:00,42.627307


In [11]:
# Remove missing values
data = data.dropna(subset=['energy'])

# Count frequencies
freq_counts = data['timestamp'].diff(1).value_counts()
# Most common frequency in minutes
freq = round(freq_counts.idxmax().total_seconds() / 60)
freq

15

In [18]:
def data_reading(building_filename):
    """
    Read in building energy data from csv file
    """
    data = pd.read_csv(building_filename, parse_dates=['timestamp'], 
                    usecols=['timestamp', 'energy'])
    # Remove missing values
    data = data.dropna(subset=['energy'])
    
    # Count frequencies
    freq_counts = data['timestamp'].diff(1).value_counts()
    # Most common frequency in minutes
    freq = round(freq_counts.idxmax().total_seconds() / 60)
    
    # Set the index
    data = data.set_index('timestamp').sort_index()
    return data, freq, len(data)

In [19]:
def data_testing(building_filename, model):
    """
    Test the model on the building energy data in a csv file
    
    :param filename: string filename of building
    :param model: sklearn compatible model
    
    :return: dataframe of results
    """
    # File the building id for indexing
    building_id = building_filename.split('_')[-1].split('.csv')[0]
    # Read in the file
    data, freq, dpoints = data_reading(building_filename)
    # Test the model on the data
    results = test_time_features(data, model)
    # Record the results
    results['freq'] = freq
    results['dpoints'] = dpoints
    results['building_id'] = building_id
    return results

In [37]:
def test_time_features(data, model):

    data = pd.concat(
        [data, create_time_features(data.index, cyc_encode=True)], axis=1)

    scores = []
    methods = []

    y = data.pop('energy')

    normal_features = [
        'timestamp_' + t
        for t in ['hour', 'dayofweek', 'month', 'dayofyear', 'year']
    ]
    normal_cyc_features = [
        'sin_' + t for t in normal_features
        if t not in ['timestamp_dayofyear', 'timestamp_year']
    ] + [
        'cos_' + t for t in normal_features
        if t not in ['timestamp_dayofyear', 'timestamp_year']
    ]

    frac_features = [
        'timestamp_' + t
        for t in ['fracday', 'fracweek', 'fracmonth', 'fracyear']
    ]
    
    frac_cyc_features = ['sin_' + t for t in frac_features
                         ] + ['cos_' + t for t in frac_features]

    domain_features = [
        'sin_timestamp_fracday', 'cos_timestamp_fracday', 'timestamp_dayofweek',
        'sin_timestamp_fracyear', 'cos_timestamp_fracyear'
    ]

    results = {}
    dataset_names = ['normal', 'normal_cyc', 'frac', 'frac_cyc', 'domain']

    for features, name in zip(
        [normal_features, normal_cyc_features, frac_features, frac_cyc_features, domain_features],
            dataset_names):
        
        dataset = data[features].copy()
        
        # Drop any columns with only 1 value
        to_drop = dataset.columns[(dataset.nunique() == 1)]
        dataset = dataset.drop(columns=to_drop)
        
        # Set the y value
        dataset['energy'] = y.copy()
        tss = TimeSeriesSplit(10)
        print(cross_val_score(model, dataset, y, cv=tss))
        try:
            data_results = monthly_validation(dataset, model)
            scores.append(data_results['score'])
            methods.append(name)
        
        except Exception as e:
            print(e, name)

    results = pd.DataFrame(dict(score=scores, method=methods))
    return results

In [38]:
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()

data = data_testing('data/building_10.csv', linear_model)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [31]:
data

Unnamed: 0,score,method,freq,dpoints,building_id
0,0.767523,normal,15,105286,10
1,0.792206,normal_cyc,15,105286,10
2,0.768929,frac,15,105286,10
3,0.252185,frac_cyc,15,105286,10
4,0.53998,domain,15,105286,10


In [None]:
data

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit


# cross_val_score(model, dataset, y, cv=tss)

In [92]:
linear_results = []

for file in tqdm_notebook(os.listdir('data/')):
    if file.endswith('.csv'):
        linear_results.append(data_testing(f'data/{file}', linear_model))
        
        
all_linear_results = pd.concat(linear_results)
all_linear_results.to_csv('results/linear_model.csv')

HBox(children=(IntProgress(value=0, max=602), HTML(value='')))

Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required. frac_cyc
Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required. frac_cyc
Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a min

In [94]:
all_linear_results.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax(), 'method']).value_counts()

frac          413
normal_cyc    140
normal         30
frac_cyc        5
dtype: int64

In [103]:
all_linear_results.pivot_table(index='building_id', columns='method', values='score').iplot()

DataError: No numeric types to aggregate

In [95]:
all_linear_results.set_index('building_id').iplot(y='score', categories='method')

KeyError: ''

In [105]:
from sklearn.ensemble import RandomForestRegressor

random_model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, n_jobs=-1)
data_testing('data/building_10.csv', random_model)

Unnamed: 0,method,score,freq,dpoints,building_id
0,normal,0.81854,15,105286,10
1,normal_cyc,0.812786,15,105286,10
2,frac,0.818358,15,105286,10
3,frac_cyc,0.822429,15,105286,10


In [106]:
random_results = []

for file in tqdm_notebook(os.listdir('data/')):
    if file.endswith('.csv'):
        random_results.append(data_testing(f'data/{file}', random_model))
        
        
all_random_results = pd.concat(random_results)
all_random_results.to_csv('results/random_forest_model.csv')

HBox(children=(IntProgress(value=0, max=602), HTML(value='')))

Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required. frac_cyc
Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required. frac_cyc
Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required. normal
Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required. normal_cyc
Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required. frac
Found array with 0 sample(s) (shape=(0, 8)) while a min

In [107]:
all_random_results.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax, 'method']).value_counts()

frac_cyc      271
normal        170
normal_cyc     87
frac           60
dtype: int64