In [1]:
import pandas as pd, re
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from datetime import datetime, timedelta
import datetime as dt
import calendar
import matplotlib.dates as mdates
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error
import math
import holidays
from operator import itemgetter
from sklearn.model_selection import train_test_split, KFold

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(absolute_percentage_error(y_true, y_pred))

def absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.abs((y_true - y_pred) / y_true) * 100

In [4]:
df = pd.read_csv('demandForecastingData.csv', parse_dates=['Date'])

In [5]:
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Quarter', 'Hour', 'Minute', 'Day', 'Dayofweek', 'Dayofyear']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    if drop: df.drop(fldname, axis=1, inplace=True)
        

In [6]:
add_datepart(df, 'Date', drop=False)

# Lasso Regression Model - All Variables

In [7]:
def get_best_alpha(X, y_train, df_train_year, alphas=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], max_variables=100000, correction=False):
    bestAlpha = min(alphas)
    bestMAPE = 1000
    for alpha in alphas:
        featureSelection = SelectFromModel(Lasso(alpha=alpha))
        featureSelection.fit(X, y_train)
        
        num_variables = len(X.columns[featureSelection.get_support()].get_values())
        if (num_variables > max_variables):
            continue
        x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            
        regressor = LinearRegression()
        result = regressor.fit(x_train, y_train)
        y_pred = regressor.predict(x_train)
        
        
        if (correction == True):
            y_pred = df_train_year['Load Last Week'] - y_pred
            y_actual = df_train_year['Load']
            MAPE = mean_absolute_percentage_error(y_actual, y_pred)
        else:
            MAPE = mean_absolute_percentage_error(y_train, y_pred)
            
        print(f'MAPE {MAPE} Alpha {alpha}')
        if bestMAPE > MAPE:
            bestAlpha = alpha
            bestMAPE = MAPE       
    return bestAlpha

In [8]:
metrics_df = pd.DataFrame(columns=['Training Year', 'Test Year', 'MAPE', 'Variables', 'No. of Variables', 'Alpha'])
for train_year in range(2012, 2018):
    df_train_year = df[df.Year == train_year].reset_index()
    X = df_train_year.drop(columns=['index', 'Date', 'Load'])
    y_train = df_train_year['Load']
    alpha = get_best_alpha(X, y_train, df_train_year)
    for test_year in range(2012, 2018):
            df_test_year = df[df.Year == test_year].reset_index()
                        
            featureSelection = SelectFromModel(Lasso(alpha=alpha))
            featureSelection.fit(X, y_train)
            x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            y_test = df_test_year['Load']
            x_test = df_test_year[X.columns[featureSelection.get_support()].get_values()]
            
            regressor = LinearRegression()
            result = regressor.fit(x_train, y_train)
            y_pred = regressor.predict(x_test)
            
            metrics_df.loc[len(metrics_df)]=[train_year,test_year,mean_absolute_percentage_error(y_test, y_pred), X.columns[featureSelection.get_support()].get_values(), len(X.columns[featureSelection.get_support()].get_values()), alpha] 

MAPE 8.653676326840882 Alpha 1
MAPE 8.743245644876627 Alpha 2
MAPE 8.79679807190293 Alpha 3
MAPE 8.79679807190293 Alpha 4
MAPE 8.79679807190293 Alpha 5
MAPE 9.048094644020617 Alpha 6
MAPE 9.048094644020617 Alpha 7
MAPE 9.068402842418507 Alpha 8
MAPE 9.068447522461009 Alpha 9
MAPE 9.104359162032994 Alpha 10




MAPE 8.570935894276587 Alpha 1
MAPE 8.61550325631171 Alpha 2
MAPE 8.621655354552184 Alpha 3
MAPE 8.621655354552184 Alpha 4
MAPE 8.675035882552725 Alpha 5
MAPE 8.803973666184794 Alpha 6
MAPE 8.885462026944742 Alpha 7
MAPE 8.885462026944742 Alpha 8
MAPE 8.947183115937534 Alpha 9
MAPE 8.947183115937534 Alpha 10




MAPE 8.62165320067166 Alpha 1
MAPE 8.644348380382164 Alpha 2
MAPE 8.644348380382164 Alpha 3
MAPE 8.70762252532548 Alpha 4
MAPE 8.70762252532548 Alpha 5
MAPE 8.70762252532548 Alpha 6
MAPE 8.859162028085347 Alpha 7
MAPE 8.966833292084212 Alpha 8
MAPE 8.966833292084212 Alpha 9
MAPE 9.075880503871304 Alpha 10
MAPE 8.747946336130887 Alpha 1




MAPE 8.777475527945692 Alpha 2
MAPE 8.82095900639638 Alpha 3
MAPE 8.82095900639638 Alpha 4
MAPE 8.908806668071692 Alpha 5
MAPE 9.079738475233611 Alpha 6
MAPE 9.123607421805476 Alpha 7
MAPE 9.123607421805476 Alpha 8
MAPE 9.123607421805476 Alpha 9
MAPE 9.217131815462109 Alpha 10




MAPE 8.808406816499746 Alpha 1
MAPE 8.840917050125395 Alpha 2
MAPE 8.889646043194197 Alpha 3
MAPE 8.889646043194197 Alpha 4
MAPE 8.968535968097424 Alpha 5
MAPE 8.968535968097424 Alpha 6
MAPE 9.153346884124193 Alpha 7
MAPE 9.35293282064097 Alpha 8
MAPE 9.35293282064097 Alpha 9
MAPE 9.368060052017656 Alpha 10




MAPE 9.466643578806618 Alpha 1
MAPE 9.565187179010938 Alpha 2
MAPE 9.565187179010938 Alpha 3
MAPE 9.537119753903076 Alpha 4
MAPE 9.537119753903076 Alpha 5
MAPE 9.852733672711246 Alpha 6
MAPE 9.852733672711246 Alpha 7
MAPE 9.855314490148023 Alpha 8
MAPE 9.916496135088584 Alpha 9
MAPE 9.916496135088584 Alpha 10




In [9]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].sort_values('MAPE', ascending=True)

Unnamed: 0,Training Year,Test Year,MAPE,Variables,No. of Variables,Alpha
8,2013,2014,8.991725,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",30,1
15,2014,2015,9.093867,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",29,1
1,2012,2013,9.17503,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Temperature over 12hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator sunny day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Quarter, Hour, Day, Dayofweek, Dayofyear]",31,1
22,2015,2016,9.330781,"[Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Temperature over 12hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Potential solar irradiance, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Hour, Minute, Day, Dayofweek, Dayofyear]",29,1
9,2013,2015,9.527448,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",30,1
16,2014,2016,9.60211,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",29,1
2,2012,2014,9.752127,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Temperature over 12hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator sunny day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Quarter, Hour, Day, Dayofweek, Dayofyear]",31,1
3,2012,2015,10.009814,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Temperature over 12hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator sunny day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Quarter, Hour, Day, Dayofweek, Dayofyear]",31,1
10,2013,2016,10.329354,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",30,1
29,2016,2017,10.62465,"[Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-36hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Temperature over 24hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (sine wave), Daily cycle (cosine wave), Holiday_Alternate, Week, Hour, Minute, Day, Dayofweek, Dayofyear]",28,1


# Lasso Regression SDLW Weather Corrected Model

In [10]:
def displaced_by_week(df, year):
    end_date = datetime(year+1, 1, 1)
    start_date = datetime(year, 1, 1)
    start_date_displaced_week = start_date - timedelta(minutes=10080)
    end_date_displaced_week = end_date - timedelta(minutes=10080)
    df_displaced = df[(df.Date >= start_date_displaced_week) & (df.Date < end_date_displaced_week)].reset_index(drop=True)
    return df_displaced

In [11]:
metrics_df = pd.DataFrame(columns=['Training Year', 'Test Year', 'MAPE', 'Variables', 'No. of Variables', 'Alpha'])
for train_year in range(2012, 2018):
    df_train_year = df[df.Year == train_year].reset_index()
    df_same_week_displaced_train = displaced_by_week(df, train_year)
    df_train_year['Load Last Week'] = df_same_week_displaced_train['Load']
    df_train_year['Load Difference'] = df_train_year['Load Last Week'] - df_train_year['Load']
    df_train_year['Temperature Difference'] = df_train_year['Temperature'] - df_same_week_displaced_train['Temperature']
    df_train_year['Temp-48 Difference'] = df_train_year['Temperature-48hrs'] - df_same_week_displaced_train['Temperature-48hrs']
    df_train_year['Wind speed Difference'] = df_train_year['Wind speed'] - df_same_week_displaced_train['Wind speed']
    df_train_year['Sun duration*potential solar irradiance Difference'] = df_train_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_train['Sun duration*potential solar irradiance']
    df_train_year['Humidity Difference'] = df_train_year['Humidity'] - df_same_week_displaced_train['Humidity']
    y_train = df_train_year['Load Difference'].values
    X = df_train_year.drop(columns=['index', 'Date', 'Load', 'Load Last Week', 'Load Difference'])
    alpha = get_best_alpha(X, y_train, df_train_year, correction=True)
    for test_year in range(2012, 2018):
            df_test_year = df[df.Year == test_year].reset_index()
            df_same_week_displaced_test = displaced_by_week(df, test_year)
            df_test_year['Load Last Week'] = df_same_week_displaced_test['Load']
            df_test_year['Load Difference'] = df_test_year['Load Last Week'] - df_test_year['Load']
            
            df_test_year['Temperature Difference'] = df_test_year['Temperature'] - df_same_week_displaced_test['Temperature']
            df_test_year['Temp-48 Difference'] = df_test_year['Temperature-48hrs'] - df_same_week_displaced_test['Temperature-48hrs']
            df_test_year['Wind speed Difference'] = df_test_year['Wind speed'] - df_same_week_displaced_test['Wind speed']
            df_test_year['Sun duration*potential solar irradiance Difference'] = df_test_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_test['Sun duration*potential solar irradiance']
            df_test_year['Humidity Difference'] = df_test_year['Humidity'] - df_same_week_displaced_test['Humidity']
            
            featureSelection = SelectFromModel(Lasso(alpha=alpha))
                    
            featureSelection.fit(X, y_train)
            
            x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            y_test = df_test_year['Load Difference']
            x_test = df_test_year[X.columns[featureSelection.get_support()].get_values()]
            
            regressor = LinearRegression()
            result = regressor.fit(x_train, y_train)
            y_pred = regressor.predict(x_test)
            
            predicted_load = df_test_year['Load Last Week'] - y_pred
            actual_load = df_test_year['Load']
            
            metrics_df.loc[len(metrics_df)]=[train_year,test_year, mean_absolute_percentage_error(actual_load, predicted_load), X.columns[featureSelection.get_support()].get_values(), len(X.columns[featureSelection.get_support()].get_values()), alpha] 

MAPE 3.5798578434937918 Alpha 1
MAPE 3.6043226661164147 Alpha 2
MAPE 3.6026620910527534 Alpha 3
MAPE 3.59272498336935 Alpha 4
MAPE 3.5952815338525705 Alpha 5
MAPE 3.6209582123965296 Alpha 6
MAPE 3.6255339698157307 Alpha 7
MAPE 3.793580632928586 Alpha 8
MAPE 3.851375406210835 Alpha 9
MAPE 3.851375406210835 Alpha 10
MAPE 3.3449770067510407 Alpha 1
MAPE 3.3490888054779244 Alpha 2
MAPE 3.382345541469905 Alpha 3
MAPE 3.3837748488747974 Alpha 4
MAPE 3.3907729192749763 Alpha 5
MAPE 3.5159351238383785 Alpha 6
MAPE 3.5159351238383785 Alpha 7
MAPE 3.5159351238383785 Alpha 8
MAPE 3.5159351238383785 Alpha 9
MAPE 3.5159351238383785 Alpha 10
MAPE 3.1699688431929616 Alpha 1
MAPE 3.170363787793612 Alpha 2
MAPE 3.1727830780925106 Alpha 3
MAPE 3.174547704681739 Alpha 4
MAPE 3.176154368491541 Alpha 5
MAPE 3.3428872514725305 Alpha 6
MAPE 3.407871263639541 Alpha 7
MAPE 3.407871263639541 Alpha 8
MAPE 3.407871263639541 Alpha 9
MAPE 3.407871263639541 Alpha 10
MAPE 3.439979978386949 Alpha 1
MAPE 3.540675981258

In [12]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].sort_values('MAPE', ascending=True)

Unnamed: 0,Training Year,Test Year,MAPE,Variables,No. of Variables,Alpha
8,2013,2014,3.247632,"[Temperature, Temperature-24hrs, Temperature-72hrs, Wind speed, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",21,1
15,2014,2015,3.535719,"[Temperature-6hrs, Temperature-18hrs, Temperature-36hrs, Temperature-72hrs, Temperature-96hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Holiday_Alternate, Week, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",23,1
2,2012,2014,3.560689,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-24hrs, Temperature-48hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Hour, Day, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",24,1
9,2013,2015,3.589122,"[Temperature, Temperature-24hrs, Temperature-72hrs, Wind speed, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",21,1
1,2012,2013,3.661991,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-24hrs, Temperature-48hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Hour, Day, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",24,1
3,2012,2015,3.768921,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-24hrs, Temperature-48hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Hour, Day, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",24,1
16,2014,2016,3.781191,"[Temperature-6hrs, Temperature-18hrs, Temperature-36hrs, Temperature-72hrs, Temperature-96hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Holiday_Alternate, Week, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",23,1
22,2015,2016,3.886706,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-18hrs, Temperature-24hrs, Temperature-72hrs, Temperature-96hrs, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Binary indicator sunny day, Weekday, Yearly cycle (sine wave), Holiday_Alternate, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Sun duration*potential solar irradiance Difference, Humidity Difference]",26,1
10,2013,2016,3.991638,"[Temperature, Temperature-24hrs, Temperature-72hrs, Wind speed, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Hour, Day, Dayofweek, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",21,1
4,2012,2016,4.124199,"[Temperature, Temperature-6hrs, Temperature-12hrs, Temperature-24hrs, Temperature-48hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed^3, Wind direction, Cloud height, Visibility, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Hour, Day, Dayofyear, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",24,1


In [13]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].MAPE.mean()

4.250755888617716

# Random splits

In [14]:
def displaced_by_week_whole_data(df, start_date, end_date):
    start_date_displaced_week = start_date - timedelta(minutes=10080)
    end_date_displaced_week = end_date - timedelta(minutes=10080)
    df_displaced = df[(df.Date >= start_date_displaced_week) & (df.Date < end_date_displaced_week)].reset_index(drop=True)
    return df_displaced

In [15]:
metrics_df = pd.DataFrame(columns=['MAPE', 'Variables', 'No. of Variables', 'Alpha'])

end_date = datetime(2018, 1, 1)
start_date = datetime(2012, 1, 1)
df_to_be_split = df.reset_index()
df_same_week_displaced_to_be_split = displaced_by_week_whole_data(df_to_be_split, start_date, end_date).reset_index()
df_to_be_split = df_to_be_split[(df_to_be_split.Date >= start_date) & (df_to_be_split.Date < end_date)].reset_index()
df_to_be_split['Load Last Week'] = df_same_week_displaced_to_be_split['Load']
df_to_be_split['Load Difference'] = df_to_be_split['Load Last Week'] - df_to_be_split['Load']
df_to_be_split['Temperature Difference'] = df_to_be_split['Temperature'] - df_same_week_displaced_to_be_split['Temperature']
df_to_be_split['Temp-48 Difference'] = df_to_be_split['Temperature-48hrs'] - df_same_week_displaced_to_be_split['Temperature-48hrs']
df_to_be_split['Wind speed Difference'] = df_to_be_split['Wind speed'] - df_same_week_displaced_to_be_split['Wind speed']
df_to_be_split['Sun duration*potential solar irradiance Difference'] = df_to_be_split['Sun duration*potential solar irradiance'] - df_same_week_displaced_to_be_split['Sun duration*potential solar irradiance']
df_to_be_split['Humidity Difference'] = df_to_be_split['Humidity'] - df_same_week_displaced_to_be_split['Humidity']
y = df_to_be_split['Load Difference']

X_train, X_test, y_train, y_test = train_test_split(df_to_be_split, y, test_size=0.2)

X = X_train.drop(columns=['index', 'Date', 'Load', 'Load Last Week', 'Load Difference'])
alpha = get_best_alpha(X, y_train, X_train, alphas=np.arange(1, 1000, 5), max_variables=10, correction=True)

featureSelection = SelectFromModel(Lasso(alpha=alpha))
                    
featureSelection.fit(X, y_train)

x_train = X_train[X.columns[featureSelection.get_support()].get_values()]
y_test = X_test['Load Difference']
x_test = X_test[X.columns[featureSelection.get_support()].get_values()]

regressor = LinearRegression()
result = regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)
            
predicted_load = X_test['Load Last Week'] - y_pred
actual_load = X_test['Load']
            
metrics_df.loc[len(metrics_df)]=[mean_absolute_percentage_error(actual_load, predicted_load), X.columns[featureSelection.get_support()].get_values(), len(X.columns[featureSelection.get_support()].get_values()), alpha] 

MAPE 4.08611083778972 Alpha 36
MAPE 4.08611083778972 Alpha 41
MAPE 4.067476566753158 Alpha 46
MAPE 4.172665836972013 Alpha 51
MAPE 4.172665836972013 Alpha 56
MAPE 4.172665836972013 Alpha 61
MAPE 4.172665836972013 Alpha 66
MAPE 4.172665836972013 Alpha 71
MAPE 4.172665836972013 Alpha 76
MAPE 4.172665836972013 Alpha 81
MAPE 4.172665836972013 Alpha 86
MAPE 4.177547101783532 Alpha 91
MAPE 4.177547101783532 Alpha 96
MAPE 4.172665836972013 Alpha 101
MAPE 4.173032484479211 Alpha 106
MAPE 4.179925485914042 Alpha 111
MAPE 4.179925485914042 Alpha 116
MAPE 4.179925485914042 Alpha 121
MAPE 4.179925485914042 Alpha 126
MAPE 4.173508886594713 Alpha 131
MAPE 4.173508886594713 Alpha 136
MAPE 4.173508886594713 Alpha 141
MAPE 4.173508886594713 Alpha 146
MAPE 4.173508886594713 Alpha 151
MAPE 4.173508886594713 Alpha 156
MAPE 4.173508886594713 Alpha 161
MAPE 4.173508886594713 Alpha 166
MAPE 4.173508886594713 Alpha 171
MAPE 4.289895965885125 Alpha 176
MAPE 4.289895965885125 Alpha 181
MAPE 4.289895965885125 Al

In [16]:
metrics_df

Unnamed: 0,MAPE,Variables,No. of Variables,Alpha
0,4.071131,"[level_0, Wind speed^2, Wind speed^3, Wind direction, Cloud height, Visibility, Dayofyear, Temperature Difference, Humidity Difference]",9,46


# Elastic Net Regression SDLW Weather Corrected Model


In [17]:
def get_best_alpha_and_l1_ratio(X, y_train, df_train_year):
    alphas = [1, 10, 100]
    ratios = np.arange(0, 1, 0.1)
    bestAlpha = 1
    bestl1_ratio = 0.25
    bestMAPE = 1000
    for l1_ratio in ratios:
        for alpha in alphas:
            featureSelection = SelectFromModel(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
            featureSelection.fit(X, y_train)
            x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            
            regressor = LinearRegression()
            result = regressor.fit(x_train, y_train)
            y_pred = regressor.predict(x_train)
            MAPE = mean_absolute_percentage_error(y_train, y_pred)
            if bestMAPE > MAPE:
                bestAlpha = alpha
                bestMAPE = MAPE
                bestl1_ratio = l1_ratio
    return bestAlpha, bestl1_ratio

In [18]:
metrics_df = pd.DataFrame(columns=['Training Year', 'Test Year', 'MAPE', 'Variables', 'No. of Variables', 'Alpha', 'L1_Ratio'])
for train_year in range(2012, 2018):
    df_train_year = df[df.Year == train_year].reset_index()
    df_same_week_displaced_train = displaced_by_week(df, train_year)
    df_train_year['Load Last Week'] = df_same_week_displaced_train['Load']
    df_train_year['Load Difference'] = df_train_year['Load Last Week'] - df_train_year['Load']
    df_train_year['Temperature Difference'] = df_train_year['Temperature'] - df_same_week_displaced_train['Temperature']
    df_train_year['Temp-48 Difference'] = df_train_year['Temperature-48hrs'] - df_same_week_displaced_train['Temperature-48hrs']
    df_train_year['Wind speed Difference'] = df_train_year['Wind speed'] - df_same_week_displaced_train['Wind speed']
    df_train_year['Sun duration*potential solar irradiance Difference'] = df_train_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_train['Sun duration*potential solar irradiance']
    df_train_year['Humidity Difference'] = df_train_year['Humidity'] - df_same_week_displaced_train['Humidity']
    y_train = df_train_year['Load Difference']
    X = df_train_year.drop(columns=['index', 'Date', 'Load', 'Load Last Week', 'Load Difference'])
    alpha, l1_ratio = get_best_alpha_and_l1_ratio(X, y_train, df_train_year)
    for test_year in range(2012, 2018):
            df_test_year = df[df.Year == test_year].reset_index()
            df_same_week_displaced_test = displaced_by_week(df, test_year)
            df_test_year['Load Last Week'] = df_same_week_displaced_test['Load']
            df_test_year['Load Difference'] = df_test_year['Load Last Week'] - df_test_year['Load']
            
            df_test_year['Temperature Difference'] = df_test_year['Temperature'] - df_same_week_displaced_test['Temperature']
            df_test_year['Temp-48 Difference'] = df_test_year['Temperature-48hrs'] - df_same_week_displaced_test['Temperature-48hrs']
            df_test_year['Wind speed Difference'] = df_test_year['Wind speed'] - df_same_week_displaced_test['Wind speed']
            df_test_year['Sun duration*potential solar irradiance Difference'] = df_test_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_test['Sun duration*potential solar irradiance']
            df_test_year['Humidity Difference'] = df_test_year['Humidity'] - df_same_week_displaced_test['Humidity']
            
            featureSelection = SelectFromModel(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
                    
            featureSelection.fit(X, y_train)
            
            x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            y_test = df_test_year['Load Difference']
            x_test = df_test_year[X.columns[featureSelection.get_support()].get_values()]
            
            regressor = LinearRegression()
            result = regressor.fit(x_train, y_train)
            y_pred = regressor.predict(x_test)
            
            predicted_load = df_test_year['Load Last Week'] - y_pred
            actual_load = df_test_year['Load']
            
            metrics_df.loc[len(metrics_df)]=[train_year,test_year, mean_absolute_percentage_error(actual_load, predicted_load), X.columns[featureSelection.get_support()].get_values(), len(X.columns[featureSelection.get_support()].get_values()), alpha, l1_ratio] 

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  


In [19]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].sort_values('MAPE', ascending=True)

Unnamed: 0,Training Year,Test Year,MAPE,Variables,No. of Variables,Alpha,L1_Ratio
8,2013,2014,3.420187,"[Temperature, Wind speed, Humidity, Binary indicator windy day, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Day, Temperature Difference, Temp-48 Difference, Wind speed Difference]",12,1,0.25
2,2012,2014,3.424776,"[Temperature-48hrs, Temperature over 6hrs, Yearly cycle (sine wave), Holiday_Alternate, Quarter, Day, Temp-48 Difference, Humidity Difference]",8,1,0.25
15,2014,2015,3.500788,"[Temperature-6hrs, Binary indicator sunny day, Binary indicator windy day, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Dayofweek, Temperature Difference, Temp-48 Difference, Humidity Difference]",11,1,0.25
1,2012,2013,3.553135,"[Temperature-48hrs, Temperature over 6hrs, Yearly cycle (sine wave), Holiday_Alternate, Quarter, Day, Temp-48 Difference, Humidity Difference]",8,1,0.25
3,2012,2015,3.68566,"[Temperature-48hrs, Temperature over 6hrs, Yearly cycle (sine wave), Holiday_Alternate, Quarter, Day, Temp-48 Difference, Humidity Difference]",8,1,0.25
9,2013,2015,3.698706,"[Temperature, Wind speed, Humidity, Binary indicator windy day, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Day, Temperature Difference, Temp-48 Difference, Wind speed Difference]",12,1,0.25
16,2014,2016,3.826659,"[Temperature-6hrs, Binary indicator sunny day, Binary indicator windy day, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Dayofweek, Temperature Difference, Temp-48 Difference, Humidity Difference]",11,1,0.25
4,2012,2016,3.966709,"[Temperature-48hrs, Temperature over 6hrs, Yearly cycle (sine wave), Holiday_Alternate, Quarter, Day, Temp-48 Difference, Humidity Difference]",8,1,0.25
10,2013,2016,4.163365,"[Temperature, Wind speed, Humidity, Binary indicator windy day, Yearly cycle (sine wave), Yearly cycle (cosine wave), Holiday_Alternate, Week, Day, Temperature Difference, Temp-48 Difference, Wind speed Difference]",12,1,0.25
22,2015,2016,4.204275,"[Cloud height, Humidity, Dayofyear, Humidity Difference]",4,100,0.7


In [20]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].MAPE.mean()

4.332844864889896

# Elastic Net Regression SDLW Weather Corrected Model w/ no Holidays

In [21]:
metrics_df = pd.DataFrame(columns=['Training Year', 'Test Year', 'MAPE', 'Variables', 'No. of Variables', 'Alpha', 'L1_Ratio'])
for train_year in range(2012, 2018):
    df_train_year = df[df.Year == train_year].reset_index()
    df_same_week_displaced_train = displaced_by_week(df, train_year)
    df_train_year['Load Last Week'] = df_same_week_displaced_train['Load']
    df_train_year['Load Difference'] = df_train_year['Load Last Week'] - df_train_year['Load']
    df_train_year['Temperature Difference'] = df_train_year['Temperature'] - df_same_week_displaced_train['Temperature']
    df_train_year['Temp-48 Difference'] = df_train_year['Temperature-48hrs'] - df_same_week_displaced_train['Temperature-48hrs']
    df_train_year['Wind speed Difference'] = df_train_year['Wind speed'] - df_same_week_displaced_train['Wind speed']
    df_train_year['Sun duration*potential solar irradiance Difference'] = df_train_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_train['Sun duration*potential solar irradiance']
    df_train_year['Humidity Difference'] = df_train_year['Humidity'] - df_same_week_displaced_train['Humidity']
    df_train_year = df_train_year[df_train_year['Holiday_Alternate'] == False]
    y_train = df_train_year['Load Difference']
    X = df_train_year.drop(columns=['index', 'Date', 'Load', 'Load Last Week', 'Load Difference'])
    alpha, l1_ratio = get_best_alpha_and_l1_ratio(X, y_train, df_train_year)
    for test_year in range(2012, 2018):
            df_test_year = df[df.Year == test_year].reset_index()
            df_same_week_displaced_test = displaced_by_week(df, test_year)
            df_test_year['Load Last Week'] = df_same_week_displaced_test['Load']
            df_test_year['Load Difference'] = df_test_year['Load Last Week'] - df_test_year['Load']
            
            df_test_year['Temperature Difference'] = df_test_year['Temperature'] - df_same_week_displaced_test['Temperature']
            df_test_year['Temp-48 Difference'] = df_test_year['Temperature-48hrs'] - df_same_week_displaced_test['Temperature-48hrs']
            df_test_year['Wind speed Difference'] = df_test_year['Wind speed'] - df_same_week_displaced_test['Wind speed']
            df_test_year['Sun duration*potential solar irradiance Difference'] = df_test_year['Sun duration*potential solar irradiance'] - df_same_week_displaced_test['Sun duration*potential solar irradiance']
            df_test_year['Humidity Difference'] = df_test_year['Humidity'] - df_same_week_displaced_test['Humidity']
            
            df_test_year = df_test_year[df_test_year['Holiday_Alternate'] == False]
            featureSelection = SelectFromModel(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
                    
            featureSelection.fit(X, y_train)
            
            x_train = df_train_year[X.columns[featureSelection.get_support()].get_values()]
            y_test = df_test_year['Load Difference']
            x_test = df_test_year[X.columns[featureSelection.get_support()].get_values()]
            
            regressor = LinearRegression()
            result = regressor.fit(x_train, y_train)
            y_pred = regressor.predict(x_test)
            
            predicted_load = df_test_year['Load Last Week'] - y_pred
            actual_load = df_test_year['Load']
            
            metrics_df.loc[len(metrics_df)]=[train_year,test_year, mean_absolute_percentage_error(actual_load, predicted_load), X.columns[featureSelection.get_support()].get_values(), len(X.columns[featureSelection.get_support()].get_values()), alpha, l1_ratio] 

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


In [22]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].sort_values('MAPE', ascending=True).head(20)

Unnamed: 0,Training Year,Test Year,MAPE,Variables,No. of Variables,Alpha,L1_Ratio
8,2013,2014,2.897846,"[Temperature, Temperature-24hrs, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Day, Dayofweek, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",13,1,0.25
1,2012,2013,2.956907,"[Temperature-24hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Day, Temperature Difference, Temp-48 Difference, Humidity Difference]",16,1,0.25
15,2014,2015,3.011306,"[Temperature-18hrs, Temperature-72hrs, Sun duration, Binary indicator sunny day, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Dayofweek, Temperature Difference, Temp-48 Difference, Wind speed Difference, Sun duration*potential solar irradiance Difference, Humidity Difference]",16,1,0.25
2,2012,2014,3.023205,"[Temperature-24hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Day, Temperature Difference, Temp-48 Difference, Humidity Difference]",16,1,0.25
9,2013,2015,3.157341,"[Temperature, Temperature-24hrs, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Day, Dayofweek, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",13,1,0.25
3,2012,2015,3.249682,"[Temperature-24hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Day, Temperature Difference, Temp-48 Difference, Humidity Difference]",16,1,0.25
16,2014,2016,3.366383,"[Temperature-18hrs, Temperature-72hrs, Sun duration, Binary indicator sunny day, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Dayofweek, Temperature Difference, Temp-48 Difference, Wind speed Difference, Sun duration*potential solar irradiance Difference, Humidity Difference]",16,1,0.25
22,2015,2016,3.422028,"[Temperature-12hrs, Temperature-48hrs, Temperature over 18hrs, Humidity, Temperature Difference, Temp-48 Difference, Humidity Difference]",7,100,0.1
4,2012,2016,3.568234,"[Temperature-24hrs, Temperature-48hrs, Temperature-72hrs, Temperature-96hrs, Temperature over 6hrs, Wind speed, Humidity, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Quarter, Day, Temperature Difference, Temp-48 Difference, Humidity Difference]",16,1,0.25
10,2013,2016,3.587615,"[Temperature, Temperature-24hrs, Binary indicator windy day, Weekday, Yearly cycle (sine wave), Yearly cycle (cosine wave), Daily cycle (cosine wave), Day, Dayofweek, Temperature Difference, Temp-48 Difference, Wind speed Difference, Humidity Difference]",13,1,0.25


In [23]:
metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']].MAPE.mean()

3.7894077887720594

In [24]:
variables_2d_array = metrics_df[metrics_df['Test Year'] > metrics_df['Training Year']]['Variables'].tolist()

In [25]:
variables_2d_array

[array(['Temperature-24hrs', 'Temperature-48hrs', 'Temperature-72hrs',
        'Temperature-96hrs', 'Temperature over 6hrs', 'Wind speed',
        'Humidity', 'Weekday', 'Yearly cycle (sine wave)',
        'Yearly cycle (cosine wave)', 'Daily cycle (cosine wave)',
        'Quarter', 'Day', 'Temperature Difference', 'Temp-48 Difference',
        'Humidity Difference'], dtype=object),
 array(['Temperature-24hrs', 'Temperature-48hrs', 'Temperature-72hrs',
        'Temperature-96hrs', 'Temperature over 6hrs', 'Wind speed',
        'Humidity', 'Weekday', 'Yearly cycle (sine wave)',
        'Yearly cycle (cosine wave)', 'Daily cycle (cosine wave)',
        'Quarter', 'Day', 'Temperature Difference', 'Temp-48 Difference',
        'Humidity Difference'], dtype=object),
 array(['Temperature-24hrs', 'Temperature-48hrs', 'Temperature-72hrs',
        'Temperature-96hrs', 'Temperature over 6hrs', 'Wind speed',
        'Humidity', 'Weekday', 'Yearly cycle (sine wave)',
        'Yearly cycle (cosine 

In [26]:
variable_counts = {}
for variables in variables_2d_array:
    for variable in variables:
        if variable in variable_counts:
            count = variable_counts[variable]
        else:
            count = 0
        count += 1
        variable_counts[variable] = count

In [27]:
sorted_variable_counts = sorted(variable_counts.items(), key=lambda x: x[1], reverse=True)

In [28]:
sorted_variable_counts

[('Temperature Difference', 15),
 ('Temp-48 Difference', 15),
 ('Humidity Difference', 15),
 ('Yearly cycle (sine wave)', 13),
 ('Yearly cycle (cosine wave)', 13),
 ('Daily cycle (cosine wave)', 13),
 ('Weekday', 12),
 ('Day', 10),
 ('Temperature-24hrs', 9),
 ('Quarter', 9),
 ('Temperature-72hrs', 8),
 ('Dayofweek', 8),
 ('Wind speed Difference', 8),
 ('Temperature-48hrs', 7),
 ('Humidity', 7),
 ('Binary indicator windy day', 7),
 ('Temperature-96hrs', 6),
 ('Wind speed', 6),
 ('Temperature over 6hrs', 5),
 ('Temperature', 4),
 ('Sun duration', 4),
 ('Sun duration*potential solar irradiance Difference', 4),
 ('Temperature-18hrs', 3),
 ('Binary indicator sunny day', 3),
 ('Temperature-12hrs', 2),
 ('Temperature over 18hrs', 2),
 ('Temperature-36hrs', 1),
 ('Daily cycle (sine wave)', 1)]