In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import theano
import theano.tensor as T
import crbm as C
import time

In [2]:
allData = pd.read_csv('../household_power_consumption.txt',';',index_col=0,na_values='?',header=0,parse_dates=[[0, 1]],infer_datetime_format=True)

allData.dtypes

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtype: object

In [3]:
#reduce the number of data coputing the max of each hour
groupedByH = allData.groupby(pd.TimeGrouper('H')).max()
#groupedByH

In [4]:
#split the dataset in traininig,validation and test set
def splitDataset(dt):
    y = dt.year
    if(y>=2006 and y<=2008):
        return 'training'
    if(y==2009):
        return 'validation'
    if(y==2010):
        return 'test'

#remove the null rows
def removeNullRows(dataSet):
    idxNAN = pd.isnull(dataSet).any(1).nonzero()[0]

#since there are the Nan, we should remove it before the training
#therefore, we split the traiing set as sequnces of series without Nan
    start = 0
    idxSequences = []
    seqlen = []
    for idx in idxNAN:
        if(start < idx):
            #print str(start) + '-' + str(idx-1)
            idxSequences += range(start,idx)
            seqlen += [idx-start]
            start = idx+1
        else:
            start = start +1
    #print str(start) + '-' + str(len(dataSet))
    idxSequences += range(start,len(dataSet))
    seqlen +=  [len(dataSet)-start]
    #print idxSequences
    return dataSet.iloc[idxSequences],seqlen

#normalize the values
def normalizeValues(dataSet):
    return (dataSet - dataSet.mean())/ dataSet.std()


In [5]:
splittedDataset = groupedByH.groupby(splitDataset)

#split dataset
trainingSet = splittedDataset.get_group('training')
validationSet = splittedDataset.get_group('validation')
testSet = splittedDataset.get_group('test')

#remove null values
trainingSet,seqlenTR = removeNullRows(trainingSet)
validationSet,seqlenVAL = removeNullRows(validationSet)
testSet,seqlenTE = removeNullRows(testSet)

#normaliza all values with 0 mean and 1 std. dev.
trainingSet = normalizeValues(trainingSet)
validationSet = normalizeValues(validationSet)
testSet = normalizeValues(testSet)

In [6]:
#build batch data for the training
batchdata = trainingSet.values

#day and month of the batchdata
idx_train = trainingSet.index
dow = idx_train.dayofweek
m = idx_train.month
h = idx_train.hour
d = idx_train.dayofyear

season_year = np.cos(((24 * (d-1) + h)*2*np.pi/(365*24-1))+3*np.pi/2)
season_week = np.cos((dow-1)*2*np.pi/6)
season_day = np.cos(h*2*np.pi/23)

#now create a matrix s.t. the column are seasonYear | seasonWeek | seasonDay | allOtherData
batchdata = np.column_stack((season_day,batchdata))
batchdata = np.column_stack((season_week,batchdata))
batchdata = np.column_stack((season_year,batchdata))

batchdata = np.asmatrix(batchdata)
batchdata = theano.shared(np.asarray(batchdata, dtype=theano.config.floatX))

# compute number of visible units
n_dim = batchdata.get_value(borrow=True).shape[1]

In [7]:
#build validation set

validationSet_matrix = validationSet.values

#add seasonality
idx_val = validationSet.index
dow = idx_val.dayofweek
m = idx_val.month
h = idx_val.hour
d = idx_val.dayofyear

season_year = np.cos(((24 * (d-1) + h)*2*np.pi/(365*24-1))+3*np.pi/2)
season_week = np.cos((dow-1)*2*np.pi/6)
season_day = np.cos(h*2*np.pi/23)

#now create a matrix s.t. the column are seasonYear | seasonWeek | seasonDay | allOtherData
validationSet_matrix = np.column_stack((season_day,validationSet_matrix))
validationSet_matrix = np.column_stack((season_week,validationSet_matrix))
validationSet_matrix = np.column_stack((season_year,validationSet_matrix))

In [8]:
def my_training(n_hidden = 5, delay=3):

    print '\nN_HIDDEN='+str(n_hidden)+' DELAY='+str(delay) 
    #learning rate
    learning_rate = 0.001;

    #the size of the trainining epoch
    batch_size = 24;

    #number of training epochs
    training_epochs=200;

    # allocate symbolic variables for the data
    index = T.lvector()    # index to a [mini]batch
    index_hist = T.lvector()  # index to history
    x = T.matrix('x')  # the data
    x_history = T.matrix('x_history')

    #theano.config.compute_test_value='warn'
    #x.tag.test_value = np.random.randn(batch_size, n_dim)
    #x_history.tag.test_value = np.random.randn(batch_size, n_dim*delay)

    # initialize storage for the persistent chain
    # (state = hidden layer of chain)

    # construct the CRBM class
    crbm = C.CRBM(input=x, input_history=x_history, n_visible=n_dim, n_hidden=n_hidden, delay=delay)

    # get the cost and the gradient corresponding to one step of CD-15
    cost, updates = crbm.get_cost_updates(lr=learning_rate, k=1)

    batchdataindex = []
    last = 0
    for s in seqlenTR:
        batchdataindex += range(last + delay, last + s)
        last += s
    permindex = np.array(batchdataindex)
    n_train_batches = len(permindex)/ batch_size

    train_crbm = theano.function([index, index_hist], cost,
               updates=updates,
               givens={
                        x: batchdata[index],
                        x_history: batchdata[index_hist].reshape((batch_size, delay * n_dim))
                      },
               name='train_crbm')

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            #print '\n'
            # indexing is slightly complicated
            # build a linear index to the starting frames for this batch
            # (i.e. time t) gives a batch_size length array for data
            data_idx = permindex[batch_index * batch_size:(batch_index + 1)* batch_size]
            #print batch_index
            #print data_idx
            # now build a linear index to the frames at each delay tap
            # (i.e. time t-1 to t-delay)
            # gives a batch_size x delay array of indices for history
            hist_idx = np.array([data_idx - n for n in xrange(1, delay + 1)]).T
            #print hist_idx
            this_cost = train_crbm(data_idx, hist_idx.ravel())
            #print batch_index, this_cost
            mean_cost += [this_cost]

        print '\rTraining epoch %d, cost is ' % epoch, np.mean(mean_cost),

    end_time = time.clock()

    pretraining_time = (end_time - start_time)

    print ('\nTraining took %f minutes' % (pretraining_time / 60.))
    
    return crbm

In [9]:
#validate on the whole validation set
def my_validation(crbm):
    n_samples=1
    delay = crbm.delay
    
    data_idx = []
    last = 0
    for s in seqlenVAL:
        data_idx += range(last + delay, last + s)
        last += s

    data_idx = np.asarray(data_idx)
    orig_data = np.asarray(validationSet_matrix[data_idx],dtype=theano.config.floatX)


    hist_idx = np.array([data_idx - n for n in xrange(1, crbm.delay + 1)]).T
    hist_idx = hist_idx.ravel()

    orig_history = np.asarray(validationSet_matrix[hist_idx].reshape((len(data_idx), crbm.delay * crbm.n_visible)),dtype=theano.config.floatX)

    generated_series = crbm.generate(orig_data, orig_history, n_samples=n_samples,n_gibbs=30)

    MSE=[None]*crbm.n_visible
    SMAPE=[None]*crbm.n_visible
    for i in range(crbm.n_visible):
        plotGEN = generated_series[:,n_samples-1,i]
        #plt.subplot(crbm.n_visible, 1, i+1)
        #plt.plot(plotGEN)
        #plt.plot(bd[start:end,i])  
        MSE[i] = np.sum(np.power(plotGEN - orig_data[:,i],2))/(len(orig_data))
        SMAPE[i] = np.sum(np.abs(plotGEN - orig_data[:,i]) / (np.abs(plotGEN) + np.abs(orig_data[:,i]))) / len(orig_data) *100                                                                                               
    return MSE,SMAPE

In [10]:
n_hidden_values = [3,5,7,10,20]
delay_values = [1,2,3,4,5]

#in the computation we should ignore the value about seasonlaity, i.e. the first 3
store_MSE = np.zeros((len(n_hidden_values),len(delay_values),n_dim-3))
store_SMAPE = np.zeros_like(store_MSE)

for idx_nh in range(len(n_hidden_values)):
    nh = n_hidden_values[idx_nh]
    for idx_d in range(len(delay_values)):
        d = delay_values[idx_d]
        crbmMdl = my_training(n_hidden=nh,delay=d)
        MSE,SMAPE = my_validation(crbm=crbmMdl)
        store_MSE[idx_nh,idx_d,:] = MSE[3:]
        store_SMAPE[idx_nh,idx_d,:] = SMAPE[3:]



N_HIDDEN=3 DELAY=1
Training epoch 199, cost is  3.6694582108 
Training took 2.122001 minutes
Generating frame 0

N_HIDDEN=3 DELAY=2
Training epoch 199, cost is  3.33951425762 
Training took 2.070955 minutes
Generating frame 0

N_HIDDEN=3 DELAY=3
Training epoch 199, cost is  3.32537270082 
Training took 2.105050 minutes
Generating frame 0

N_HIDDEN=3 DELAY=4
Training epoch 199, cost is  3.30644145663 
Training took 2.115609 minutes
Generating frame 0

N_HIDDEN=3 DELAY=5
Training epoch 199, cost is  3.01180724315 
Training took 2.154035 minutes
Generating frame 0

N_HIDDEN=5 DELAY=1
Training epoch 199, cost is  3.59389780973 
Training took 2.104648 minutes
Generating frame 0

N_HIDDEN=5 DELAY=2
Training epoch 199, cost is  3.38363568755 
Training took 2.193111 minutes
Generating frame 0

N_HIDDEN=5 DELAY=3
Training epoch 199, cost is  3.24341882943 
Training took 2.263880 minutes
Generating frame 0

N_HIDDEN=5 DELAY=4
Training epoch 199, cost is  3.03703211904 
Training took 2.297712 mi

  ), axis=1)})


In [11]:
best_idx_nh,best_idx_d = np.unravel_index(store_MSE.mean(axis=2).argmin(),(len(n_hidden_values),len(delay_values)))
best_d = delay_values[best_idx_d]
best_nh = n_hidden_values[best_idx_nh]
print 'BEST N HIDDEN = '+str(best_nh)
print 'BEST DELAY = '+str(best_d)
print 'BEST MSE = '+str(store_MSE[best_idx_nh,best_idx_d,:].mean())

BEST N HIDDEN = 3
BEST DELAY = 4
BEST MSE = 0.980655235337


In [12]:
best_idx_nh,best_idx_d = np.unravel_index(store_SMAPE.mean(axis=2).argmin(),(len(n_hidden_values),len(delay_values)))
best_d = delay_values[best_idx_d]
best_nh = n_hidden_values[best_idx_nh]
print 'BEST N HIDDEN = '+str(best_nh)
print 'BEST DELAY = '+str(best_d)
print 'BEST SMAPE = '+str(store_SMAPE[best_idx_nh,best_idx_d,:].mean())

BEST N HIDDEN = 3
BEST DELAY = 5
BEST SMAPE = 53.7702167135
