# Predictions with clustered data

In [1]:
from datetime import datetime
from datetime import timedelta
import os

from sklearn import svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from prediction import helpers
from prediction.predictor import Predictor
import utils.data as dutils
import utils.metrics as metrics

In [2]:
# Evaluation setup shared by the cells below: the first evaluated day, the
# length of the training window, and the number of consecutive evaluated days.
START_DAY = datetime(2015, 1, 1)
WINDOW = timedelta(days=14)
EVAL_DAYS = 365

# Directory holding one CSV per user.
PATH = '.exports/user_data_joined'
# os.listdir returns entries in arbitrary order, yet the cells below address
# users by position (USER_PATHS[user_index], USER_PATHS[0:10]).  Sorting makes
# that position -> file mapping deterministic across runs and machines.
# NOTE(review): confirm the ids in assignments.csv were produced against the
# same sorted ordering of this directory.
USER_PATHS = [os.path.join(PATH, x) for x in sorted(os.listdir(PATH))]

First of all, let's set up a baseline: a simple SVR model evaluated on the first 10 users.

In [3]:
def baseline():
    """Evaluate a plain SVR on the first 10 user files and print the metrics.

    For every user and every one of the EVAL_DAYS days starting at START_DAY
    the predictor is re-trained on the range selected just before that day and
    evaluated on that day.  All per-day evaluations are collected and handed
    to helpers.print_evaluations.
    """
    model = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
    one_day = timedelta(days=1)
    evaluations = []

    print('Processing user: ', end='')
    for position, csv_path in enumerate(USER_PATHS[0:10], start=1):
        print('%d' % position, end=', ')
        frame = pd.read_csv(csv_path)

        for offset in range(EVAL_DAYS):
            target_day = START_DAY + timedelta(days=offset)
            last_train_day = target_day - one_day

            history = dutils.select_range(frame, last_train_day - WINDOW, last_train_day)
            target = dutils.select_range(frame, target_day, target_day)

            model.train(history)
            forecast = model.predict(target)
            evaluations.append(model.eval(forecast, target, history))

    print('\n')

    helpers.print_evaluations(evaluations)


baseline()

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 

MASE:	1.446697,	2.239348
MSE:	0.727987,	0.881286
RMSE:	0.729538,	0.442449
MAE:	0.501727,	0.326223


# Cluster 0

In [12]:
# Cluster membership table; column names are supplied here, so every row of
# the file is read as data.
assignments = pd.read_csv(
    '.exports/clusters/assignments.csv',
    names=['id', 'cluster_id'],
)

# Ids of the users assigned to cluster 0.
in_cluster0 = assignments['cluster_id'] == 0
cluster0_ids = assignments.loc[in_cluster0, 'id'].tolist()

# Cluster-level series for cluster 0, relabelled and extended by
# dutils.add_weekdays.
cluster0_raw = pd.read_csv('.exports/clusters/0.csv', index_col=0)
cluster0_raw.columns = ['year', 'month', 'day', 'spotreba', 'hour']
cluster_data = dutils.add_weekdays(cluster0_raw)

## Trained and evaluated on cluster

In [19]:
# Train and evaluate directly on the cluster-level series: one model fit per
# evaluated day, trained on the range selected just before that day.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
results = []
one_day = timedelta(days=1)

for day_offset in range(EVAL_DAYS):
    eval_day = START_DAY + timedelta(days=day_offset)
    train_end = eval_day - one_day

    train_data = dutils.select_range(cluster_data, train_end - WINDOW, train_end)
    test_data = dutils.select_range(cluster_data, eval_day, eval_day)

    prd.train(train_data)
    predicted = prd.predict(test_data)
    results.append(prd.eval(predicted, test_data, train_data))

helpers.print_evaluations(results)

MASE:	1.176497,	0.425597
MSE:	0.142418,	0.116917
RMSE:	0.353846,	0.131192
MAE:	0.279319,	0.108169


## Cluster week average as a feature

In [54]:
%%time

first_year = dutils.select_range(cluster_data, datetime(2014, 1, 1), datetime(2014, 12, 31))
week_average = first_year.groupby(['weekday', 'hour']).mean()['spotreba']

prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'week_average'], svm.SVR)

#userdata = dutils.select_range(pd.read_csv(USER_PATHS[0]), datetime(2014, 1, 1), datetime(2015, 12, 31))


print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = pd.read_csv(USER_PATHS[user_index])
    
    data['week_average'] = 0

    for indices, val in week_average.items():
        #print(indices, val)
        weekday = indices[0]
        hour = indices[1]

        data.loc[((data['weekday'] == weekday) & (data['hour'] == hour)),'week_average'] = val

    results = []

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')
        
helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.392215,	0.911413
MSE:	4.833897,	5.243744
RMSE:	1.943702,	1.027580
MAE:	1.389036,	0.761725


## Cluster monthly average as a feature

In [55]:
%%time

first_year = dutils.select_range(cluster_data, datetime(2014, 1, 1), datetime(2014, 12, 31))
week_average = first_year.groupby(['month', 'weekday', 'hour']).mean()['spotreba']

prd = Predictor('spotreba', ['month', 'weekday', 'hour', 'week_average'], svm.SVR)

print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = pd.read_csv(USER_PATHS[user_index])
    
    data['month_average'] = 0

    for indices, val in week_average.items():
        #print(indices, val)
        month = indices[0]
        weekday = indices[1]
        hour = indices[2]

        data.loc[(
                (data['month'] == month)
                & (data['weekday'] == weekday) 
                & (data['hour'] == hour)
            ),'week_average'] = val

    results = []

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')
        
helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.392648,	0.911868
MSE:	4.836204,	5.255245
RMSE:	1.944012,	1.028116
MAE:	1.389413,	0.762155
Wall time: 5min 43s


## Cluster weekend average as a feature

In [58]:
%%time

first_year = dutils.select_range(cluster_data, datetime(2014, 1, 1), datetime(2014, 12, 31))
weekend_average = first_year.groupby(['weekend', 'hour']).mean()['spotreba']

prd = Predictor('spotreba', ['weekend', 'hour', 'weekend_average'], svm.SVR)

print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = dutils.add_weekdays(pd.read_csv(USER_PATHS[user_index]))
    
    data['weekend_average'] = 0

    for indices, val in weekend_average.items():
        #print(indices, val)
        weekend = indices[0]
        hour = indices[1]

        data.loc[(
                (data['weekend'] == weekend) 
                & (data['hour'] == hour)
            ),'weekend_average'] = val

    results = []

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')
        
helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.281207,	0.950426
MSE:	4.527612,	5.243757
RMSE:	1.832503,	1.081456
MAE:	1.275556,	0.794679
Wall time: 3min 3s


## Trained on personal data - month, weekday, hour

In [6]:
# Per-user models for cluster 0: each user's own history is both the training
# and the evaluation source; evaluations of all users are pooled.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
results = []
one_day = timedelta(days=1)

print('Processing user: ', end='')
for user_no, user_index in enumerate(cluster0_ids, start=1):
    print('%d' % user_no, end=', ')
    data = pd.read_csv(USER_PATHS[user_index])

    for day_offset in range(EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=day_offset)
        train_end = eval_day - one_day

        train_data = dutils.select_range(data, train_end - WINDOW, train_end)
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)
        predicted = prd.predict(test_data)
        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.371523,	1.292701
MSE:	2.012039,	2.778121
RMSE:	1.238078,	0.692245
MAE:	0.915845,	0.525045


## Trained on personal data - weekend, hour

In [60]:
%%time

prd = Predictor('spotreba', ['weekend', 'hour'], svm.SVR)

results = []

print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = dutils.add_weekdays(pd.read_csv(USER_PATHS[user_index]))

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.303226,	1.297499
MSE:	1.921200,	2.778209
RMSE:	1.195899,	0.700732
MAE:	0.866833,	0.524111
Wall time: 2min 52s


## Trained on personal data - all

In [61]:
%%time

prd = Predictor('spotreba', ['month', 'day', 'weekday', 'weekend', 'hour'], svm.SVR)

results = []

print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = dutils.add_weekdays(pd.read_csv(USER_PATHS[user_index]))

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	1.507634,	1.021880
MSE:	2.252021,	2.652018
RMSE:	1.339654,	0.676276
MAE:	1.033734,	0.512194
Wall time: 3min 37s


## Trained on cluster data

In [5]:
# Train on the cluster-level series, evaluate on each individual user of
# cluster 0; evaluations of all users are pooled.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
results = []
one_day = timedelta(days=1)

print('Processing user: ', end='')
for user_no, user_index in enumerate(cluster0_ids, start=1):
    print('%d' % user_no, end=', ')
    data = pd.read_csv(USER_PATHS[user_index])

    for day_offset in range(EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=day_offset)
        train_end = eval_day - one_day

        # Training rows come from the cluster series, test rows from the user.
        train_data = dutils.select_range(cluster_data, train_end - WINDOW, train_end)
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)
        predicted = prd.predict(test_data)
        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	6.345158,	3.774479
MSE:	4.715070,	5.024109
RMSE:	1.915637,	1.022450
MAE:	1.528209,	0.887734


## Trained on mixed data

In [44]:
# Train on the element-wise mean of the cluster series and the user's own
# series, evaluate on the user.
# (A stray bare `mixed_data` expression used to open this cell; it raised
# NameError on a fresh kernel because the name is only bound inside the loop.)
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)

results = []

print('Processing user: ', end='')
for index, user_index in enumerate(cluster0_ids):
    print('%d'%(index+1), end=', ')
    data = dutils.select_range(pd.read_csv(USER_PATHS[user_index]), datetime(2014, 1, 1), datetime(2015, 12, 31))

    # Rows are paired by position, so the user frame and the cluster frame
    # must have the same length (np.c_ raises otherwise).
    # `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
    mixed_data = cluster_data.copy()
    mixed_data['spotreba'] = np.c_[cluster_data['spotreba'].values, data['spotreba'].values].mean(axis=1)

    for i in range(0, EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=i)

        train_data = dutils.select_range(mixed_data, eval_day - WINDOW - timedelta(days=1), eval_day - timedelta(days=1))
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)

        predicted = prd.predict(test_data)

        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

MASE:	2.632462,	1.502354
MSE:	2.641510,	3.255659
RMSE:	1.426142,	0.779505
MAE:	1.072722,	0.607509


# Cluster 1

In [9]:
# Ids of the users assigned to cluster 1.
cluster1_ids = assignments.loc[assignments['cluster_id'] == 1, 'id'].tolist()

## Trained on personal data

In [15]:
# CSV paths of the cluster 1 users, in the order of cluster1_ids.
data_paths = [USER_PATHS[user_index] for user_index in cluster1_ids]

In [17]:
# Per-user models for cluster 1: each user's own history is both the training
# and the evaluation source; evaluations of all users are pooled.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
results = []
one_day = timedelta(days=1)

print('Processing user: ', end='')
for user_no, user_path in enumerate(data_paths, start=1):
    print('%d' % user_no, end=', ')
    data = pd.read_csv(user_path)

    for day_offset in range(EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=day_offset)
        train_end = eval_day - one_day

        train_data = dutils.select_range(data, train_end - WINDOW, train_end)
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)
        predicted = prd.predict(test_data)
        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, MASE:	1.305228,	1.057415
MSE:	0.997915,	1.245958
RMSE:	0.876413,	0.479391
MAE:	0.608321,	0.342043


## Trained on cluster average

In [7]:
# Cluster-level series for cluster 1, relabelled and extended by
# dutils.add_weekdays.
# NOTE: this rebinds `cluster_data`, the same name the Cluster 0 cells read,
# so re-running those cells after this one uses cluster 1 data.
cluster1_raw = pd.read_csv('.exports/clusters/1.csv', index_col=0)
cluster1_raw.columns = ['year', 'month', 'day', 'spotreba', 'hour']
cluster_data = dutils.add_weekdays(cluster1_raw)

# Train on the cluster series, evaluate on each individual user of cluster 1.
prd = Predictor('spotreba', ['month', 'weekday', 'hour'], svm.SVR)
results = []
one_day = timedelta(days=1)

print('Processing user: ', end='')
for user_no, user_index in enumerate(cluster1_ids, start=1):
    print('%d' % user_no, end=', ')
    data = pd.read_csv(USER_PATHS[user_index])

    for day_offset in range(EVAL_DAYS):
        eval_day = START_DAY + timedelta(days=day_offset)
        train_end = eval_day - one_day

        # Training rows come from the cluster series, test rows from the user.
        train_data = dutils.select_range(cluster_data, train_end - WINDOW, train_end)
        test_data = dutils.select_range(data, eval_day, eval_day)

        prd.train(train_data)
        predicted = prd.predict(test_data)
        results.append(prd.eval(predicted, test_data, train_data))

print('\n')

helpers.print_evaluations(results)

Processing user: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 

MASE:	3.177730,	1.477857
MSE:	1.217662,	1.478113
RMSE:	0.991021,	0.485325
MAE:	0.753836,	0.358887
