In [2]:
from datetime import datetime
from datetime import timedelta
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
%matplotlib inline

from prediction import helpers
from prediction.predictor import Predictor
import utils.data as dutils

Load dataset, add new feauters upon which we will experiment and split to train and test.

In [5]:
# start day
START_DAY = datetime(2015,1,1)
WINDOW = timedelta(days=14)
EVAL_DAYS = 365

# get list of all user datasets
PATH = '.exports/user_data_joined'
USER_PATHS = [os.path.join(PATH, x) for x in os.listdir(PATH)]

## SVR

First of all, lets do the baseline without added features:

In [15]:
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)

results = []


print('Processing user: ', end='')
for index, user_path in enumerate(USER_PATHS[0:3]):
    data = pd.read_csv(user_path)
    data = dutils.add_artificial_features(data, 'spotreba')
    print('%d' % (index + 1), end=', ')
    
    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')
        
helpers.print_evaluations(results)

Processing user: 1, 2, 3, 

MASE:	1.300935,	2.957778
MSE:	0.553004,	0.596614
RMSE:	0.639341,	0.379799
MAE:	0.409388,	0.236613


## RandomForest

In [22]:
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], RandomForestRegressor, {
        'n_estimators': 20,
        'max_features': 'log2'
    })

results = []

print('Processing user: ', end='')
for index, user_path in enumerate(USER_PATHS[0:3]):
    
    data = pd.read_csv(user_path)
    data = dutils.add_artificial_features(data, 'spotreba')
    
    print('%d' % (index + 1), end=', ')
    
    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')
helpers.print_evaluations(results)

Processing user: 1, 2, 3, 

MASE:	1.374538,	2.824779
MSE:	0.657955,	0.651956
RMSE:	0.707657,	0.396455
MAE:	0.451952,	0.267850


# First difference

In [None]:
prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'diff'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)
        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        test_data_dyn = test_data.copy()
        predicted_day = []

        for index, row in test_data_dyn.iterrows():
            row['diff'] = test_data_dyn.loc[index, 'diff']
            predicted = prd.predict(row)
            test_data_dyn.loc[index, 'spotreba'] = predicted[0]
            try:
                test_data_dyn.loc[index+1, 'diff'] = test_data_dyn.loc[index-1]['spotreba'] - test_data_dyn.loc[index-2]['spotreba']
            except KeyError:
                pass
                #print('este sa neda')
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))
    
helpers.print_evaluations(results)

# Second difference

In [None]:
prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'diff', 'diff2'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        test_data_dyn = test_data.copy()
        predicted_day = []

        for index, row in test_data_dyn.iterrows():
            row['diff'] = test_data_dyn.loc[index, 'diff']
            row['diff2'] = test_data_dyn.loc[index, 'diff2']

            predicted = prd.predict(row)

            test_data_dyn.loc[index, 'spotreba'] = predicted[0]

            try:
                test_data_dyn.loc[index+1, 'diff'] = test_data_dyn.loc[index-1]['spotreba'] - test_data_dyn.loc[index-2]['spotreba']
                test_data_dyn.loc[index+1, 'diff2'] = test_data_dyn.loc[index+1]['diff'] - test_data_dyn.loc[index-2]['spotreba'] - test_data_dyn.loc[index-3]['spotreba']
            except KeyError:
                #print('este sa neda')
                pass
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)

# Previous hour consumption

In [None]:
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        test_data_dyn = test_data.copy()
        predicted_day = []

        for index, row in test_data_dyn.iterrows():
            row['prev_con'] = test_data_dyn.loc[index, 'prev_con']
            predicted = prd.predict(row)
            test_data_dyn.loc[index, 'spotreba'] = predicted[0]
            try:
                test_data_dyn.loc[index+1, 'prev_con'] = predicted[0]
            except KeyError:
                print('este sa neda')
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))
    
helpers.print_evaluations(results)

# Previous 2 hours consumption

In [None]:
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con', 'prev_con2'])

results = []


for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)


    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        test_data_dyn = test_data.copy()
        predicted_day = []

        for index, row in test_data_dyn.iterrows():
            row['prev_con'] = test_data_dyn.loc[index, 'prev_con']
            row['prev_con2'] = test_data_dyn.loc[index, 'prev_con2']
            predicted = prd.predict(row)
            test_data_dyn.loc[index, 'spotreba'] = predicted[0]
            try:
                test_data_dyn.loc[index+1, 'prev_con'] = predicted[0]
                test_data_dyn.loc[index+2, 'prev_con2'] = predicted[0]
            except KeyError:
                print('este sa neda')
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))
    
helpers.print_evaluations(results)

In [None]:
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con', 'prev_con2', 'prev_con3'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        test_data_dyn = test_data.copy()
        predicted_day = []

        for index, row in test_data_dyn.iterrows():
            row['prev_con'] = test_data_dyn.loc[index, 'prev_con']
            row['prev_con2'] = test_data_dyn.loc[index, 'prev_con2']
            row['prev_con3'] = test_data_dyn.loc[index, 'prev_con3']
            predicted = prd.predict(row)
            test_data_dyn.loc[index, 'spotreba'] = predicted[0]
            try:
                test_data_dyn.loc[index+1, 'prev_con'] = predicted[0]
                test_data_dyn.loc[index+2, 'prev_con2'] = predicted[0]
                test_data_dyn.loc[index+2, 'prev_con3'] = predicted[0]
            except KeyError:
                print('este sa neda')
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))
    
helpers.print_evaluations(results)