In [1]:
from datetime import datetime
from datetime import timedelta
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
%matplotlib inline

from prediction import helpers
from prediction.predictor import Predictor
import utils.data as dutils

In [2]:
def _lag_with_zeros(values, lag, length):
    """Delay ``values`` by ``lag`` rows, padding the front with zeros.

    The result always has exactly ``length`` entries, so it can be assigned
    as a DataFrame column even when the frame has fewer than ``lag`` rows.
    """
    lagged = np.zeros(length, dtype=values.dtype)
    if length > lag:
        lagged[lag:] = values[:length - lag]
    return lagged


def load_data(path):
    """Read a consumption CSV and append lagged feature columns.

    Every added column for row ``i`` is built only from rows before ``i``
    of the 'spotreba' column; positions without enough history hold 0.

    Added columns:
        diff      -- spotreba[i-1] - spotreba[i-2]
        diff2     -- spotreba[i-1] - 2*spotreba[i-2] + spotreba[i-3]
        prev_con  -- spotreba[i-1]
        prev_con2 -- spotreba[i-2]
        prev_con3 -- spotreba[i-3]

    Parameters:
        path: anything accepted by ``pandas.read_csv``; the file must have
            a 'spotreba' column.

    Returns:
        The loaded DataFrame with the five feature columns added.
    """
    data = pd.read_csv(path)
    consumption = data['spotreba'].to_numpy()
    rows = len(data)

    # first difference, delayed so that it never looks at the current row
    data['diff'] = _lag_with_zeros(np.diff(consumption), 2, rows)

    # second difference, delayed the same way
    data['diff2'] = _lag_with_zeros(np.diff(consumption, n=2), 3, rows)

    # consumption one, two and three rows (hours) back
    data['prev_con'] = _lag_with_zeros(consumption, 1, rows)
    data['prev_con2'] = _lag_with_zeros(consumption, 2, rows)
    data['prev_con3'] = _lag_with_zeros(consumption, 3, rows)

    return data

Load the dataset, add the new features we will experiment with, and split it into train and test sets.

In [3]:
# first evaluated day
start_day = datetime(2015, 1, 1)
# span of history selected for training before each evaluated day
window = timedelta(days=14)
# number of consecutive days evaluated, starting at start_day
eval_days = 365

# get list of all user datasets
# os.listdir returns entries in arbitrary order; sort them so that slices
# such as user_paths[0:3] select the same users on every run and machine.
path = '.exports/user_data_joined'
user_paths = [os.path.join(path, x) for x in sorted(os.listdir(path))]

## SVR

First of all, let's run the baseline without the added features:

In [None]:
# Baseline: SVR on the calendar features only, retrained for every evaluated day.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)

results = []
one_day = timedelta(days=1)

for index, user_path in enumerate(user_paths[:3]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(eval_days):
        eval_day = start_day + timedelta(days=i)

        # The training range ends the day before the evaluated day and
        # starts `window` earlier.
        train_end = eval_day - one_day
        train_data = dutils.select_range(data, train_end - window, train_end)
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)
        predicted = prd.predict(test_data)

        # One evaluation entry per (user, day).
        results.append(prd.eval(predicted, test_data))

helpers.print_evaluations(results)

## RandomForest

In [8]:
# Baseline: random forest on the calendar features only, retrained for every
# evaluated day.
# NOTE(review): assumes Predictor forwards this dict to the estimator's
# constructor, as the existing keys suggest — confirm. random_state pins the
# forest's randomness so the printed metrics can be reproduced.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], RandomForestRegressor, {
        'n_estimators': 20,
        'max_features': 'log2',
        'random_state': 0
    })

results = []


for index, user_path in enumerate(user_paths[0:3]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        # Training range ends the day before the evaluated day.
        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        # One evaluation entry per (user, day).
        results.append(prd.eval(predicted, test_data))

helpers.print_evaluations(results)

Processing user 0
Processing user 1
Processing user 2
MAPE:	56.904509,	45.449332
MSE:	0.656622,	0.653381
RMSE:	0.706412,	0.396993
MAE:	0.450946,	0.268276


# First difference

In [None]:
# Recursive forecast with the lagged first difference as an extra feature.
# Each day is predicted row by row; after the first row the 'diff' feature is
# rebuilt from the model's own predictions instead of the real consumption
# of the evaluated day.
# NOTE(review): assumes dutils.select_range keeps the 'prev_con' and
# 'prev_con2' columns added by load_data — confirm.
prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'diff'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)
        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted_day = []
        # Two most recent consumption values, oldest first. Seeded with the
        # real values preceding the day, then extended with predictions.
        recent = None

        for row_label, row in test_data.iterrows():
            # Work on a copy so test_data stays untouched for prd.eval.
            features = row.copy()
            if recent is None:
                # First row: its 'diff' already comes from real past values.
                recent = [row['prev_con2'], row['prev_con']]
            else:
                # diff for this row = consumption one row back minus two rows back
                features['diff'] = recent[1] - recent[0]

            predicted = prd.predict(features)
            recent = [recent[1], predicted[0]]
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)

# Second difference

In [None]:
# Recursive forecast with the lagged first and second differences as extra
# features. Each day is predicted row by row; after the first row both
# features are rebuilt from the model's own predictions instead of the real
# consumption of the evaluated day.
# NOTE(review): assumes dutils.select_range keeps the 'prev_con',
# 'prev_con2' and 'prev_con3' columns added by load_data — confirm.
prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'diff', 'diff2'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted_day = []
        # Three most recent consumption values, oldest first. Seeded with the
        # real values preceding the day, then extended with predictions.
        recent = None

        for row_label, row in test_data.iterrows():
            # Work on a copy so test_data stays untouched for prd.eval.
            features = row.copy()
            if recent is None:
                # First row: its features already come from real past values.
                recent = [row['prev_con3'], row['prev_con2'], row['prev_con']]
            else:
                # Same definitions as in load_data:
                #   diff  = c[-1] - c[-2]
                #   diff2 = (c[-1] - c[-2]) - (c[-2] - c[-3])
                features['diff'] = recent[2] - recent[1]
                features['diff2'] = recent[2] - 2 * recent[1] + recent[0]

            predicted = prd.predict(features)
            recent = [recent[1], recent[2], predicted[0]]
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)

# Previous hour consumption

In [None]:
# Recursive forecast with the previous row's consumption as an extra feature.
# Each day is predicted row by row; after the first row 'prev_con' holds the
# model's previous prediction instead of the real consumption.
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted_day = []
        # Prediction made for the previous row; None until the first row is done.
        last_value = None

        for row_label, row in test_data.iterrows():
            # Work on a copy so test_data stays untouched for prd.eval.
            features = row.copy()
            if last_value is not None:
                features['prev_con'] = last_value

            predicted = prd.predict(features)
            last_value = predicted[0]
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)

# Previous 2 hours consumption

In [None]:
# Recursive forecast with the consumption of the previous two rows as extra
# features. Each day is predicted row by row; predictions replace the real
# consumption in the lag features as soon as they fall inside the day.
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con', 'prev_con2'])

results = []


for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)


    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted_day = []
        # Two most recent consumption values, oldest first. Seeded with the
        # real values preceding the day, then extended with predictions.
        recent = None

        for row_label, row in test_data.iterrows():
            # Work on a copy so test_data stays untouched for prd.eval.
            features = row.copy()
            if recent is None:
                recent = [row['prev_con2'], row['prev_con']]
            # No-op on the first row, where `recent` equals the row's own lags.
            features['prev_con2'], features['prev_con'] = recent

            predicted = prd.predict(features)
            recent = [recent[1], predicted[0]]
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)

In [None]:
# Recursive forecast with the consumption of the previous three rows as extra
# features. Each day is predicted row by row; predictions replace the real
# consumption in the lag features as soon as they fall inside the day, each
# prediction reaching 'prev_con' one row later, 'prev_con2' two rows later
# and 'prev_con3' three rows later.
prd = Predictor('spotreba',  ['month', 'weekday', 'hour', 'prev_con', 'prev_con2', 'prev_con3'])

results = []

for index, user_path in enumerate(user_paths[0:10]):
    data = load_data(user_path)
    print('Processing user %d' % index)

    for i in range(0, eval_days):
        eval_day = start_day + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - window - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted_day = []
        # Three most recent consumption values, oldest first. Seeded with the
        # real values preceding the day, then extended with predictions.
        recent = None

        for row_label, row in test_data.iterrows():
            # Work on a copy so test_data stays untouched for prd.eval.
            features = row.copy()
            if recent is None:
                recent = [row['prev_con3'], row['prev_con2'], row['prev_con']]
            # No-op on the first row, where `recent` equals the row's own lags.
            features['prev_con3'], features['prev_con2'], features['prev_con'] = recent

            predicted = prd.predict(features)
            recent = [recent[1], recent[2], predicted[0]]
            predicted_day.append(predicted[0])

        results.append(prd.eval(np.array(predicted_day), test_data))

helpers.print_evaluations(results)