# Time Series Forecasting.

#### Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
# import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor

### Read the data in a pandas dataframe.

In [2]:
df = pd.read_csv('./Data/data/PJME_hourly.csv')

# set the datetime column as the index
df = df.set_index('Datetime')
df.index = pd.to_datetime(df.index)
# show the first five rows of the data.
df.head() 

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2002-12-31 01:00:00,26498.0
2002-12-31 02:00:00,25147.0
2002-12-31 03:00:00,24574.0
2002-12-31 04:00:00,24393.0
2002-12-31 05:00:00,24860.0


In [3]:
# show the last five rows of the data.
df.tail()

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2018-01-01 20:00:00,44284.0
2018-01-01 21:00:00,43751.0
2018-01-01 22:00:00,42402.0
2018-01-01 23:00:00,40164.0
2018-01-02 00:00:00,38608.0


### plot the data for visualization.

In [4]:
img_name = 'img.png'
img = df.plot(style='.', c='r', figsize=(15, 5), title='Energy Consumption in MW.')
plt.savefig(img_name)

plt.show()

#### Check For Outliers.

In [5]:
img1_name = 'img1.png'
img1 = df.loc[df['PJME_MW'] < 20000.00].plot(style='.', c='g', figsize=(15, 5), title='Outlier analysis.')

plt.savefig(img1_name)
plt.show()

#### Remove Outliers From The Dataframe

In [6]:
df = df.query('PJME_MW >= 19000').copy()
df.head()

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2002-12-31 01:00:00,26498.0
2002-12-31 02:00:00,25147.0
2002-12-31 03:00:00,24574.0
2002-12-31 04:00:00,24393.0
2002-12-31 05:00:00,24860.0


#### Get The Minimum And Maximum Date Of The Dataset.

In [7]:
df.index.min(), df.index.max()

(Timestamp('2002-01-01 01:00:00'), Timestamp('2018-08-03 00:00:00'))

#### Train Test Split Using The Index.

In [8]:
train = df.loc[df.index < '2017-01-01']
test = df.loc[df.index >= '2017-01-01']

#### Plot The Train And Test Data

In [9]:
fig, axs = plt.subplots(figsize=(15, 5))
train.plot(style='.', c='r', ax=axs, title='Train Test Set.')
test.plot(style='.', c='g', ax=axs)
axs.axvline('2017-01-01', color='b', ls='-')

plt.legend(['Train set.', 'Test Set.'])
img3_name = 'img3_name.png'

plt.savefig(img3_name)
plt.show()

#### Plot One Week Of Data

In [11]:
df.loc[(df.index > '2018-01-01') & (df.index < '2018-01-08')].plot(style='.', figsize=(15, 5), c='g', title='One Week Data')

plt.savefig('img4_name')
plt.show()

#### Plot One Month Of Data

In [14]:
df.loc[(df.index > '2003-01-01') & (df.index < '2003-02-01')].plot(style='.', c='b', figsize=(15, 5), title='One Month data Plot.')

plt.savefig('img5_name.png')
plt.show()

## Cross Validation On Timeseries Data.

In [15]:
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()

#### Loop Through Our Dataset Using Timeseries Object.

In [16]:
fig, axs = plt.subplots(5, 1, figsize=(15, 10), sharex=True)
fold = 0
for train_idx, val_idx in tss.split(df): 
    train_set = df.iloc[train_idx]
    test_set = df.iloc[val_idx]

    train_set['PJME_MW'].plot(title=f'Train Test Set Split Fold: {fold}.', ax=axs[fold])
    test_set['PJME_MW'].plot(ax=axs[fold], label='Test Set.')
    axs[fold].axvline(test_set.index.min(), ls='--', color='r')
    
    fold += 1
    plt.tight_layout()
    plt.savefig('img6.png')

    plt.show()

#### Creating Features For the Data

In [19]:
def create_features(df):
    '''
    Feature creation fo the 
    dataset using the index.
    '''
    df['Hour'] = df.index.hour
    df['Day'] = df.index.day
    df['DayOfWeek'] = df.index.dayofweek
    df['Month'] = df.index.month
    df['DayOfYear'] = df.index.dayofyear
    df['WeekOfYear'] = df.index.isocalendar().week
    df['Quarter'] = df.index.quarter
    df['Year'] = df.index.year

    return df

In [20]:
df = create_features(df)
df.head()

Unnamed: 0_level_0,PJME_MW,Hour,Day,DayOfWeek,Month,DayOfYear,WeekOfYear,Quarter,Year
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2002-01-01 01:00:00,30393.0,1,1,1,1,1,1,1,2002
2002-01-01 02:00:00,29265.0,2,1,1,1,1,1,1,2002
2002-01-01 03:00:00,28357.0,3,1,1,1,1,1,1,2002
2002-01-01 04:00:00,27899.0,4,1,1,1,1,1,1,2002
2002-01-01 05:00:00,28057.0,5,1,1,1,1,1,1,2002


### Forecasting Horizon.

#### Adding Lag Features.

In [22]:
def add_lags(df):
    '''Add lag features to help in
    the training of the model.'''
    target = df['PJME_MW'].to_dict() # The target is what we will use to add horizons in days to our dataframe.
    df = df.copy()
    df['Lag1'] = (df.index - pd.Timedelta('364 days')).map(target)
    df['Lag2'] = (df.index - pd.Timedelta('728 days')).map(target)
    df['Lag3'] = (df.index - pd.Timedelta('1092 days')).map(target)
    df['Lag4'] = (df.index - pd.Timedelta('1456 days')).map(target)
    
    return df

In [23]:
df = add_lags(df)
df.tail()

Unnamed: 0_level_0,PJME_MW,Hour,Day,DayOfWeek,Month,DayOfYear,WeekOfYear,Quarter,Year,Lag1,Lag2,Lag3,Lag4
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-08-02 20:00:00,44057.0,20,2,3,8,214,31,3,2018,42256.0,41485.0,38804.0,37889.0
2018-08-02 21:00:00,43256.0,21,2,3,8,214,31,3,2018,41210.0,40249.0,38748.0,37419.0
2018-08-02 22:00:00,41552.0,22,2,3,8,214,31,3,2018,39525.0,38698.0,37330.0,35897.0
2018-08-02 23:00:00,38500.0,23,2,3,8,214,31,3,2018,36490.0,35406.0,34552.0,32832.0
2018-08-03 00:00:00,35486.0,0,3,4,8,215,31,3,2018,33539.0,32094.0,31695.0,29716.0


In [25]:
df.head()

Unnamed: 0_level_0,PJME_MW,Hour,Day,DayOfWeek,Month,DayOfYear,WeekOfYear,Quarter,Year,Lag1,Lag2,Lag3,Lag4
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002-01-01 01:00:00,30393.0,1,1,1,1,1,1,1,2002,,,,
2002-01-01 02:00:00,29265.0,2,1,1,1,1,1,1,2002,,,,
2002-01-01 03:00:00,28357.0,3,1,1,1,1,1,1,2002,,,,
2002-01-01 04:00:00,27899.0,4,1,1,1,1,1,1,2002,,,,
2002-01-01 05:00:00,28057.0,5,1,1,1,1,1,1,2002,,,,


#### Run The Function on our dataset.

In [26]:
df = create_features(df)
df.head()

Unnamed: 0_level_0,PJME_MW,Hour,Day,DayOfWeek,Month,DayOfYear,WeekOfYear,Quarter,Year,Lag1,Lag2,Lag3,Lag4
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2002-01-01 01:00:00,30393.0,1,1,1,1,1,1,1,2002,,,,
2002-01-01 02:00:00,29265.0,2,1,1,1,1,1,1,2002,,,,
2002-01-01 03:00:00,28357.0,3,1,1,1,1,1,1,2002,,,,
2002-01-01 04:00:00,27899.0,4,1,1,1,1,1,1,2002,,,,
2002-01-01 05:00:00,28057.0,5,1,1,1,1,1,1,2002,,,,


#### Plot Feature Relation To The Target Variable.

In [None]:
fig, axs = plt.subplots(1, 7, figsize=(40, 10), sharey=True)
sn.boxplot(data=df, x='Hour', y='PJME_MW', ax=axs[0], palette='Blues', hue='PJME_MW')
sn.boxplot(data=df, x='Day', y='PJME_MW', ax=axs[1], palette='Greens', hue='PJME_MW')
sn.boxplot(data=df, x='DayOfWeek', y='PJME_MW', ax=axs[2], palette='Reds', hue='PJME_MW') 
sn.boxplot(data=df, x='Month', y='PJME_MW', ax=axs[3], palette='Reds', hue='PJME_MW') 
sn.boxplot(data=df, x='DayOfYear', y='PJME_MW', ax=axs[4], palette='Greens', hue='PJME_MW') 
sn.boxplot(data=df, x='Quarter', y='PJME_MW', ax=axs[5], palette='Blues', hue='PJME_MW') 
sn.boxplot(data=df, x='Year', y='PJME_MW', ax=axs[6], palette='Reds', hue='PJME_MW')

axs[0].set_title('Energy Use Per Hour In MW.') 
axs[1].set_title('Daily Energy Consumption In MW.') 
axs[2].set_title('Day Of Week Energy Consumption In MW.') 
axs[3].set_title('Monthly Energy Consumption In MW')
axs[4].set_title('Day Of Year Energy Consumption In MW.')
axs[5].set_title('Quarterly Energy Consumption In MW.')
axs[6].set_title('Yearly Energy Consumption In MW.')

plt.tight_layout()
plt.savefig('img7.png')

plt.show()

In [28]:
df.columns

Index(['PJME_MW', 'Hour', 'Day', 'DayOfWeek', 'Month', 'DayOfYear',
       'WeekOfYear', 'Quarter', 'Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4'],
      dtype='object')

#### Training Our Model Using Cross Validation.

In [32]:
fig, axs = plt.subplots(5, 1, figsize=(15, 10), sharex=True)
fold = 0
scores = []
preds = []

for train_idx, val_idx in tss.split(df): 
    train_set = df.iloc[train_idx]
    test_set = df.iloc[val_idx]

    FEATURES = ['Hour', 'Day', 'DayOfWeek', 'Month', 'DayOfYear', 'WeekOfYear', 'Quarter', 'Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4']
    TARGET = ['PJME_MW']

    train = create_features(train_set)
    test = create_features(test_set)

    x_train = train[FEATURES]
    x_test = test[FEATURES]

    y_train = train[TARGET]
    y_test = test[TARGET]

    train_set['PJME_MW'].plot(title=f'Train Test Set Split Fold: {fold}.', ax=axs[fold])
    test_set['PJME_MW'].plot(ax=axs[fold], label='Test Set.')
    axs[fold].axvline(test_set.index.min(), ls='--', color='r')
    
    fold += 1
    plt.tight_layout()
    plt.savefig('img6.png')

    plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hour'] = df.index.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Day'] = df.index.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DayOfWeek'] = df.index.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

In [None]:
df.columns

In [30]:
fig, axs = plt.subplots(5, 1, figsize=(15, 10), sharex=True)
fold = 0
scores = []
preds = []

for train_idx, val_idx in tss.split(df): 
    train_set = df.iloc[train_idx]
    test_set = df.iloc[val_idx]

    FEATURES = ['Hour', 'Day', 'DayOfWeek', 'Month', 'DayOfYear', 'WeekOfYear', 'Quarter', 'Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4']
    TARGET = ['PJME_MW']

    train = create_features(train_set)
    test = create_features(test_set)

    x_train = train[FEATURES]
    x_test = test[FEATURES]

    y_train = train[TARGET]
    y_test = test[TARGET]

    model = HistGradientBoostingRegressor(learning_rate= 0.01, max_iter=500, max_leaf_nodes=50)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    preds.append(y_pred)

    score = np.sqrt(mean_squared_error(y_test, y_pred))

    scores.append(score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hour'] = df.index.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Day'] = df.index.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DayOfWeek'] = df.index.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

TypeError: 'DataFrame' object is not callable