In [None]:
import tbtools.dev as tbdev
import utils.data.design_matrices as dm
import utils.data as ud
import utils.features as uf

import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

#### Gets a MemoryError

In [None]:
# NOTE: per the heading of this section, this call runs into a MemoryError.
# `boa` is not referenced again in the visible part of the notebook.
# The meaning of the three arguments is defined by uf.bag_of_alarms (not shown here).
boa = uf.bag_of_alarms([1,2], '5 min', '5 min')

# Design matrices

## The issue of sample rate

Try 2 minutes, between 0600 and 2310.

#### Discussion
C, dC, R, bag-of-alarms are all sampled at an interval of 4 seconds - but only if the new sample contains info.
Due to memory limitations, we cannot resample bag-of-alarms to full 4 seconds over the whole period (September 2012).

But some things can help us:
1. The slaughterhouse is inactive at nighttime
    - Restrict sampling to work hours (approx 0600-2315)
2. The data is highly autocorrelated
    - So we can take reasonably distant samples and use them, dropping the samples lying in-between, as they are already really correlated with the surrounding samples.
    
So, using uniformly distributed samples over the whole day, we can get a reasonably good representation of each day.

Depending on the window sizes and lags used, we can then ensure that the samples don't overlap.
Since we're looking to predict 5 minutes ahead (at first), I think that the sampling interval should be <= 5 minutes.

But given the following example, where C goes from very little to a lot in 5 minutes, perhaps it should be more often. Let's try 2 minutes?

#### An example of quick change over 5 minutes

In [None]:
# Load C through the project helper uf.C. The result has a datetime-like index
# (it is sliced by timestamp strings and grouped by index.date in later cells).
# NOTE(review): the meaning of the argument 1 is not visible here — confirm in uf.C.
c = uf.C(1)

In [None]:
# Plot C over the 20-minute window 15:50-16:10 on 2012-09-07 to illustrate
# how quickly it can change.
c['2012-09-07 15:50':'2012-09-07 16:10'].plot()

#### What are the working hours?

First C: 0609
Latest C: 2301

In [None]:
# Group C by calendar day and, for each day, take the time of day of the
# first ('min') and last ('max') sample. Then print the column-wise minimum
# and maximum of those daily times across all days.
# NOTE(review): passing a {name: func} dict to agg on a Series groupby was
# removed in pandas 1.0 — confirm the type of `c` and the pandas version in use.
df = c.groupby(c.index.date).agg({'min':lambda x: x.index.time.min(), 'max':lambda x: x.index.time.max()})
print('minimum:\n', df.min())
print('maximum:\n', df.max())

## Do it 

In [None]:
# Reload the `ud` module through the project helper tbdev.reload and rebind
# the name to whatever it returns (presumably so that edits to the module
# source are picked up without restarting the kernel — confirm in tbtools.dev).
ud = tbdev.reload(ud)

#### Overall settings

In [None]:
# Daily sampling window as time-of-day strings. These are used as the default
# day_start / day_end arguments of get_indices, so this cell must be run
# before the cell that defines get_indices.
daily_start = '06:00'
daily_end = '23:10'

#### Code stuff

In [None]:
# Reload the `uf` module through the project helper tbdev.reload and rebind
# the name to whatever it returns (presumably so that edits to the module
# source are picked up without restarting the kernel — confirm in tbtools.dev).
uf = tbdev.reload(uf)

In [None]:
def get_name(sample_step, split, lag, dcwindow, rn, boawindow):
    """
    Build the name that identifies one design-matrix dump.

    Every setting is rendered as '<tag> <value>' and the parts are joined
    with two spaces, in this fixed order:
    ss (sample_step), L (lag), dcw (dcwindow), rn (rn), bw (boawindow),
    split (split).
    """
    templates = ['ss {}', 'L {}', 'dcw {}', 'rn {}', 'bw {}', 'split {}']
    settings = [sample_step, lag, dcwindow, rn, boawindow, split]
    # Format each part on its own, then join; the result is the same text as
    # formatting one joined template with all values at once.
    parts = [template.format(value)
             for template, value in zip(templates, settings)]
    return '  '.join(parts)

In [None]:
def get_indices(sample_step, version='16',
                day_start=daily_start, day_end=daily_end):
    """
    Build the sample timestamps for the train, validation and test splits.

    sample_step is a frequency string like '2 min': the spacing between
        consecutive sample times within one day.
    version is forwarded to ud.design_matrices.get_split_days. It is
        described as 16 or 20 (how many days to take into account); use 16
        if you intend to use bag-of-alarms. The default is the string '16'.
    day_start / day_end are the times of day where sampling starts / stops.
        Their defaults are the values daily_start / daily_end had when this
        cell was executed.

    Returns a tuple of three sorted indices, ordered (train, val, test).
    """
    def day_index(day):
        # Evenly spaced timestamps covering [day_start, day_end] on `day`.
        first = '{} {}'.format(day, day_start)
        last = '{} {}'.format(day, day_end)
        return pd.date_range(first, last, freq=sample_step)

    def split_index(split_days):
        # Merge the per-day ranges of one split into a single sorted index.
        merged = day_index(split_days[0])
        for day in split_days[1:]:
            merged = merged.union(day_index(day))
        return merged.sort_values()

    # Look up the days of all three splits first, then build the indices.
    days_per_split = [ud.design_matrices.get_split_days(version, split)
                      for split in ('train', 'val', 'test')]
    return tuple(split_index(split_days) for split_days in days_per_split)

In [None]:
import os

def construct(sample_step, lag, dcwindow, rn, boawindow):
    """
    Build the design matrix of every split and pickle it to disk.

    One dump is written per split (train, val, test) into
    ud.paths.Paths.design_matrices, named by get_name from the settings.
    If dumps for all three splits already exist, the user is asked on the
    prompt whether to overwrite them; any answer other than 'y' aborts.

    sample_step is forwarded to get_indices (time between samples).
    lag is forwarded to uf.sample_all as `lag`.
    dcwindow is forwarded as delta_c_kwargs={'window': dcwindow}.
    rn is forwarded as r_kwargs={'n': rn}.
    boawindow is forwarded as bag_of_alarms_kwargs={'window': boawindow}.

    Returns None.

    NOTE: earlier versions of this function built the names in the order
    (train, test, val) while get_indices returns (train, val, test), so the
    validation matrix was stored under the 'split test' name and vice versa.
    Dumps written by such a version should be regenerated.
    """
    # Must be in the same order as the tuple returned by get_indices
    # (train, val, test): names and indices are paired positionally below.
    splits = ('train', 'val', 'test')
    names = [get_name(sample_step=sample_step,
                      split=split,
                      lag=lag,
                      dcwindow=dcwindow,
                      rn=rn,
                      boawindow=boawindow)
             for split in splits]

    basepath = ud.paths.Paths.design_matrices

    # Only ask for confirmation when every split has already been dumped.
    if all(os.path.isfile(os.path.join(basepath, n)) for n in names):
        print('Dumps with settings ({}) already exist'.format(names[0]))
        if input('Proceed anyway? y/n > ') != 'y':
            print('Aborting...')
            return
        print('Proceeding, overwriting old files')

    def _construct(sampleidx):
        # Sample all features at the given timestamps with the chosen settings.
        return uf.sample_all(sampleidx,
                             lag=lag,
                             delta_c_kwargs={'window': dcwindow},
                             r_kwargs={'n': rn},
                             bag_of_alarms_kwargs={'window': boawindow})

    for name, idx in zip(names, get_indices(sample_step)):
        path = os.path.join(basepath, name)
        df = _construct(idx)
        df.to_pickle(path)
        print(name, 'stored in', path)

#### Run it!

In [None]:
# Build and pickle one design matrix per split with these settings.
# If dumps with the same settings already exist for all splits, construct
# asks for confirmation on the prompt before overwriting them.
construct(sample_step='2 min', lag='10 min', dcwindow='2 min', rn=10, boawindow='2 min')