### Preprocessing utilities

Preprocessing helpers that are easy to separate out and that I might reuse elsewhere are collected here.

In [None]:
import numpy as np

In [None]:
def batch_subsects(ragged_batch, window, stride, axis=0, **kwargs):
    """
    Build a uniform batch of fixed-length windows from a ragged batch of timeseries.

    Each timeseries is cut into sliding windows of length ``window`` whose starting
    points are ``stride`` apart, and all windows are stacked into a single array.
    If the batch isn't ragged, there is undoubtedly a more efficient way to do this.

    Parameters
    ----------
    ragged_batch : list of array-likes
        list of timeseries samples with differing lengths. Each must be of shape
        (tseries_i_len, *tseries_shape) where tseries_i_len can be unique to each
        timeseries in the batch but tseries_shape has to be shared by all of them
        (otherwise the windows cannot be stacked). Anything ``np.asarray`` accepts works.
    window : int
        size of window desired. Timeseries shorter than this are skipped.
    stride : int
        length of stride to take between window starting points (sections overlap if stride < window)
    axis : int, default 0
        axis of the result along which the windows are stacked (forwarded to ``np.stack``)
    **kwargs
        accepted for call-site compatibility and ignored

    Returns
    -------
    new_batch : numpy array
        the new batch of timeseries samples. Default shape (new_batch_len, window, *ragged_batch[0].shape[1:])

    Raises
    ------
    ValueError
        if ``window`` or ``stride`` is smaller than 1, or if no timeseries yields a window

    Notes
    -----
    Window starting points are ``0, stride, 2*stride, ...`` strictly below
    ``tseries_i_len - window``. The window that would end exactly on the last datapoint
    is therefore never produced, and a timeseries of length exactly ``window`` contributes
    nothing. NOTE(review): this looks like an off-by-one, but it is kept as is because
    changing it would change the number of windows existing callers receive.
    """
    if window < 1 or stride < 1:
        raise ValueError(f"window and stride must be positive, got window={window}, stride={stride}")

    new_batch_list = []

    for series in ragged_batch:
        series = np.asarray(series)  # lets plain lists / tuples through, no-op for ndarrays
        slen = series.shape[0]
        if slen < window:  # can't use if segment is too small
            continue
        # exclusive upper bound on purpose, see Notes
        new_batch_list.extend(
            series[start:start + window] for start in range(0, slen - window, stride)
        )

    if not new_batch_list:
        # np.stack would raise a ValueError here as well, just with a less helpful message
        raise ValueError("No timeseries in ragged_batch is long enough to produce a window.")

    return np.stack(new_batch_list, axis=axis)  # shove it all together
    

In [None]:
def rebalance_ctrl_group(inputs_list, labels, null_label=0, thinning_factor=.5, seed=27):
    """
    Removes some of the control group of a categorical / series by-datapoint categorical dataset

    A sample (one entry along the first axis of ``labels``) counts as a control-group
    ("null") sample when every one of its label values equals ``null_label``. A random
    subset of those samples is dropped from the labels and from every array in
    ``inputs_list``; samples containing any other label are always kept.

    Parameters
    ----------
    inputs_list : list of arrays of shape (batch_size, ~, samples_length)
        The input to thin (may contain multiple datasets). Only the first (batch) axis
        is touched, and it must line up with the first axis of ``labels``.
    labels : array of shape (batch_size, samples_length)
        The labels for the input dataset. Categories could be one-hot or otherwise
        so long as the null label matches the kwarg passed in (default 0).
        1-D labels of shape (batch_size,) work too.
    null_label : value or numpy array of values
        the label for the overrepresented (control group) category. It is compared to
        ``labels`` with ordinary numpy broadcasting, so an array value has to be
        broadcastable against the trailing axes of ``labels``.
    thinning_factor : float
        The factor to reduce. balanced_batch_size = batch_size - int(ctrlgrp_batch_size * thinning_factor)
        Can't be greater than 1 or less than 0.
    seed : int
        Seed for the random selection of nonevents to delete. The same seed always
        removes the same samples.

    Returns
    -------
    balanced_inputs_list : list of arrays of shape (balanced_batch_size, ~, samples_length)
        Inputs with less null category overrepresentation
    balanced_labels : array of shape (balanced_batch_size, *labels.shape[1:])
        Labels with less null category overrepresentation

    Raises
    ------
    ValueError
        if ``inputs_list`` is not a list or ``thinning_factor`` is outside [0, 1]
    """
    if not isinstance(inputs_list, list):
        raise ValueError("Please format your inputs as a list, even if you have only one input.")
    if not 0 <= thinning_factor <= 1:
        raise ValueError(f"thinning_factor must be between 0 and 1, got {thinning_factor}")

    # local generator: better than np.random.seed() because that is a global change
    rng = np.random.default_rng(seed)

    labels = np.asarray(labels)
    print(f"Total batch: {labels.shape[0]}")

    # elementwise "is this label something other than the null label"
    differs = labels != np.asarray(null_label)
    if differs.ndim > 1:
        # collapse everything but the batch axis: True where the sample holds any non-null label
        has_event = differs.any(axis=tuple(range(1, differs.ndim)))
    else:
        has_event = differs

    # np.nonzero returns a tuple with one index array per dimension; we want the batch indices
    null_samples = np.nonzero(~has_event)[0]
    num_null_samples = len(null_samples)
    print(f"Number of null samples: {num_null_samples}")
    print(f"Number of non-null samples: {np.count_nonzero(has_event)}")

    # pick null samples to delete via rng
    num_dels = int(thinning_factor * num_null_samples)
    print(f"With thinning factor {thinning_factor} will remove {num_dels} null samples")
    rdels = rng.choice(null_samples, num_dels, replace=False)  # which batch rows to delete

    balanced_inputs_list = [np.delete(inputs, rdels, axis=0) for inputs in inputs_list]
    balanced_labels = np.delete(labels, rdels, axis=0)

    return balanced_inputs_list, balanced_labels

In [None]:
# def rebalance_ctrl_group_tf(inputs_list, labels, null_label=0, thinning_factor=.5, seed=27):
#     """
#     USES TENSORFLOW CONVENTION SHAPE
#     Removes some of the control group of a categorical / series by-datapoint categorical dataset
    
#     Parameters
#     ----------
#     inputs_list : list of arrays of shape (batch_size, samples_length, ~)
#         The input to thin (may contain multiple datasets)
#     labels : array of shape (batch_size, categorization)
#         The labels for the input dataset. Categories could be one-hot or otherwise
#         so long as the null label matches the kwarg passed in (default 0)
#     null_label : value or numpy array of values
#         the label for the overrepresented (control group) category
#     thinning_factor : float
#         The factor to reduce. balanced_batch_size = batch_size - int(ctrlgrp_batch_size * thinning_factor)
#         Can't be greater than 1 or less than 0.
#     seed : int
#         Seed for the random selection of nonevents to delete
    
#     Outputs
#     -------
#     balanced_inputs_list : list of arrays of shape (balanced_batch_size, samples_length, ~)
#         Inputs with less null category overrepresentation
#     balanced_labels : array of shape (balanced_batch_size, samples_length, categorization)
#         Labels with less null category overrepresentation
    
#     """
#     rng = np.random.default_rng(2021)  # better than doing np.random.seed() because that is a global change
#     if type(inputs_list) is not list:
#         raise ValueError("Please format your inputs as a list, even if you have only one input.")
#     if not hasattr(null_label, "__len__"):  # single value to vector for comparing
#         null_label = np.array([null_label])
#     null_label_compat = np.reshape(null_label, (1, 1, *labels.shape[2:])) # reshape label array to broadcast for comparison w/ labels
#     print(f"Total batch: {labels.shape[0]}")
#     balanced_inputs_list = []
#     non_batch_axes = tuple( i for i in range(1,len(labels.shape)))
#     non_null_units = labels != null_label_compat  # null_label_compat gets broadcast from (1,1,categorization)?
#     non_null_samples = np.sum(non_null_units, axis=non_batch_axes)  # should be 1d now. Values positive integers or 0
#     null_samples = np.nonzero(non_null_samples == 0)[0] # now we know where we need to thin
#     # !!! remember np.nonzero returns tuple of tuples !!!
#     num_null_samples = len(null_samples)
#     print(f"Number of null samples: {num_null_samples}")
#     print(f"Number of non-null samples: {np.sum(non_null_samples != 0)}")
    
#     # pick null samples to delete via rng
#     num_dels = int(thinning_factor*num_null_samples)
#     print(f"With thinning factor {thinning_factor} will remove {num_dels} null samples")
#     rdels = tuple(rng.choice(null_samples, num_dels, replace=False)) # which batch rows to delete
    
#     for inputs in inputs_list:
#         balanced_inputs_list.append(np.delete(inputs, rdels, axis=0))
#     balanced_labels = np.delete(labels, rdels, axis=0)    
    
#     return balanced_inputs_list, balanced_labels
    
    
    
    

In [None]:
def batch_unpadded_subsects(ragged_batch, padding, stride, axis=0, **kwargs):
    """
    Takes a list of timeseries samples with differing lengths and creates a new batch of
    equally long, back-to-back segments, leaving out ``padding`` datapoints at the edges.

    Intended for TRUE VALUES used in the loss function evaluation when the INPUT VALUES are taking
    additional (padding) datapoints outside of the prediction timewindow. That way each padded input
    segment matches correctly to a non-padded 'true value' segment.
    If the batch isn't ragged, there is undoubtedly a more efficient way to do this.

    Parameters
    ----------
    ragged_batch : list of array-likes
        list of timeseries samples with differing lengths. Each must be of shape
        (tseries_i_len, *tseries_shape) where tseries_i_len can be unique to each
        timeseries in the batch but tseries_shape has to be shared by all of them
        (otherwise the segments cannot be stacked). Anything ``np.asarray`` accepts works.
    padding : int
        amount of datapoints to omit on each side of each array to account for padding on other inputs
    stride : int
        distance between segment starting points, which is also the length of every
        segment (so consecutive segments never overlap)
    axis : int, default 0
        axis of the result along which the segments are stacked (forwarded to ``np.stack``)
    **kwargs
        accepted for call-site compatibility and ignored

    Returns
    -------
    new_batch : numpy array
        the new batch of timeseries samples. Default shape (new_batch_len, stride, *ragged_batch[0].shape[1:])

    Raises
    ------
    ValueError
        if ``padding`` is negative, ``stride`` is smaller than 1, or no timeseries yields a segment

    Notes
    -----
    Segment starting points are ``padding, padding + stride, ...`` strictly below
    ``tseries_i_len - stride - padding``. Timeseries too short to hold a full segment plus
    the padding on both sides contribute nothing.
    """
    if padding < 0 or stride < 1:
        raise ValueError(
            f"padding must be non-negative and stride positive, got padding={padding}, stride={stride}"
        )

    new_batch_list = []

    for series in ragged_batch:
        series = np.asarray(series)  # lets plain lists / tuples through, no-op for ndarrays
        slen = series.shape[0]
        # range() rather than slicing an index array: for a series shorter than
        # stride + padding the stop is negative, and a slice would wrap it around to
        # the end of the series and hand back a truncated segment
        starts = range(padding, slen - stride - padding, stride)
        new_batch_list.extend(series[start:start + stride] for start in starts)

    if not new_batch_list:
        # np.stack would raise a ValueError here as well, just with a less helpful message
        raise ValueError("No timeseries in ragged_batch is long enough to produce a segment.")

    return np.stack(new_batch_list, axis=axis)  # shove it all together
    