In [1]:
# additional section can be used for generation of present/future pairs
def generate_offset_snapshot_list(trajectory_list, offset):
    """Build time-lagged (past, future) versions of every trajectory.

    The past version of a trajectory drops its last ``offset`` snapshots,
    the future version drops its first ``offset`` snapshots, so that entry
    ``i`` of the past version is paired with the snapshot ``offset`` steps
    later in the future version. The two results can serve as input and
    desired output of an autoencoder trained on future-predictive variables.

    Parameters
    ----------
    trajectory_list : list or np.array of sliceable trajectories
    offset : non-negative int, the time lag in snapshots

    Returns
    -------
    (past_trajectory_list, future_trajectory_list)
        For ``offset == 0`` both entries are ``trajectory_list`` itself,
        returned as-is (no copy, no conversion to np.array). Otherwise two
        np.arrays; when the truncated trajectories differ in length these
        are 1-D object arrays holding one trajectory per entry.

    Raises
    ------
    ValueError
        If ``offset`` is negative.
    """
    # With no lag both versions equal the input, so skip all work.
    if offset == 0:
        return trajectory_list, trajectory_list
    # A negative offset would make the slices below pair the first snapshots
    # with the last ones instead of producing a time lag.
    if offset < 0:
        raise ValueError("offset needs to be >= 0, got {}".format(offset))

    def _stack(trajectories):
        # np.array raises ValueError for trajectories of differing length on
        # NumPy >= 1.24; fall back to an explicit 1-D object array then.
        try:
            return np.array(trajectories)
        except ValueError:
            ragged = np.empty(len(trajectories), dtype=object)
            for index, trajectory in enumerate(trajectories):
                ragged[index] = trajectory
            return ragged

    past_trajectory_list = _stack(
        [trajectory[:-offset] for trajectory in trajectory_list])
    future_trajectory_list = _stack(
        [trajectory[offset:] for trajectory in trajectory_list])
    return past_trajectory_list, future_trajectory_list

In [None]:
def get_snapshot_label_and_weight_list(trajectory_list, trajectory_label_list, offset = 0, progress_label = False):
    """Flatten trajectories into per-snapshot arrays of data, labels and weights.

    Recognized path labels are -1.0 (AA), -0.5 (AB), 0.5 (BA) and 1.0 (BB).

    * AA / BB snapshots get the label -1.0 / 1.0 and the weight
      ``LABEL_AA_weight_factor`` / ``LABEL_BB_weight_factor`` divided by the
      total number of snapshots of that path type.
    * AB and BA snapshots share one weight: the sum of the AB and the BA
      factor, each divided by the snapshot count of its path type.
      Their label is 0.0, or, if ``progress_label`` is set, the relative
      position along the full (untruncated) path: counting up from 0 to 1
      for AB paths and down from 1 to 0 for BA paths.

    ``offset`` must be the value that was used to generate the present/future
    trajectory lists. The progress computation treats ``trajectory_list`` as
    the *future* version (first ``offset`` snapshots removed); progress
    labels computed for the past version are therefore not meaningful.
    Callers in this file discard them.

    Prints the four snapshot counts (AA, AB, BA, BB) as a side effect.

    Returns
    -------
    (snapshots, snapshot_labels, snapshot_weights) as np.arrays of equal length.

    Raises
    ------
    ValueError
        If a non-empty trajectory carries a label other than the four above.
        Previously such a trajectory got weights but no labels, which left
        the returned arrays misaligned.
    """
    def _snapshot_count(path_label):
        # Total number of snapshots over all trajectories with this label.
        return sum(len(trajectory_list[index])
                   for index, label in enumerate(trajectory_label_list)
                   if label == path_label)

    def _reciprocal(count):
        # A path type that does not occur contributes no weight; dividing
        # by its zero count would raise ZeroDivisionError.
        return 1.0 / count if count else 0.0

    len_AA = _snapshot_count(-1.0)
    len_AB = _snapshot_count(-0.5)
    len_BA = _snapshot_count(0.5)
    len_BB = _snapshot_count(1.0)

    reciprocal_len_AA = _reciprocal(len_AA)
    reciprocal_len_AB = _reciprocal(len_AB)
    reciprocal_len_BA = _reciprocal(len_BA)
    reciprocal_len_BB = _reciprocal(len_BB)
    print(len_AA, len_AB, len_BA, len_BB)

    snapshot_list = []
    snapshot_label_list = []
    snapshot_weight_list = []
    for trajectory_nr, trajectory in enumerate(trajectory_list):
        trajectory_label = trajectory_label_list[trajectory_nr]
        trajectory_len = len(trajectory)
        if trajectory_len == 0:
            # Nothing to emit for an empty trajectory.
            continue

        if trajectory_label == -1.0:
            weight = reciprocal_len_AA*LABEL_AA_weight_factor
            labels = [-1.0]*trajectory_len
        elif trajectory_label == 1.0:
            weight = reciprocal_len_BB*LABEL_BB_weight_factor
            labels = [1.0]*trajectory_len
        elif trajectory_label == -0.5 or trajectory_label == 0.5:
            weight = reciprocal_len_AB*LABEL_AB_weight_factor \
                   + reciprocal_len_BA*LABEL_BA_weight_factor
            if not progress_label:
                labels = [0.0]*trajectory_len
            else:
                # trajectory_len is the truncated length, so the last index
                # of the full path is trajectory_len - 1 + offset, and
                # snapshot_nr sits at position snapshot_nr + offset in it.
                last_index = trajectory_len - 1.0 + offset
                if trajectory_label == -0.5:
                    # AB path: progress counts upwards.
                    labels = [(snapshot_nr + offset)/last_index
                              for snapshot_nr in range(trajectory_len)]
                else:
                    # BA path: progress counts downwards, i.e. it is the
                    # mirror image (1 - AB progress) of the same position:
                    # (last_index - (snapshot_nr + offset))/last_index.
                    # The offset cancels in the numerator; subtracting it
                    # again would give negative labels at the path end.
                    labels = [(trajectory_len - 1 - snapshot_nr)/last_index
                              for snapshot_nr in range(trajectory_len)]
        else:
            raise ValueError(
                "Unknown trajectory label {!r} for trajectory {}".format(
                    trajectory_label, trajectory_nr))

        snapshot_list.extend(trajectory[snapshot_nr]
                             for snapshot_nr in range(trajectory_len))
        snapshot_label_list.extend(labels)
        snapshot_weight_list.extend([weight]*trajectory_len)

    return np.array(snapshot_list), np.array(snapshot_label_list), np.array(snapshot_weight_list)

In [None]:
def remove_outliers(snapshot_list, lower_bound, upper_bound):
    """Clamp every column of the snapshots to its own [lower, upper] range.

    Values below the lower bound of their column are set to that lower
    bound, values above the upper bound to that upper bound; all other
    values are left unchanged. The input is not modified.

    Parameters
    ----------
    snapshot_list : 2-D array-like, one snapshot per row
    lower_bound, upper_bound : per-column bounds; only the first
        ``len(lower_bound)`` columns of ``snapshot_list`` are processed and
        returned.

    Returns
    -------
    np.array of shape (number of snapshots, len(lower_bound)).

    Note: the clamping is done by a single vectorized ``np.clip`` call
    instead of a Python loop over every entry. NaN entries stay NaN.
    """
    column_count = len(lower_bound)
    snapshots = np.asarray(snapshot_list)[:, :column_count]
    lower = np.asarray(lower_bound)
    upper = np.asarray(upper_bound)[:column_count]
    # The bounds broadcast along the rows, so each column gets its own limits.
    return np.clip(snapshots, lower, upper)

In [None]:
def normalize_snapshots(snapshot_list, mean, std):
    """Normalize the snapshots by subtracting the mean and dividing by the std.

    ``mean`` and ``std`` are broadcast against ``snapshot_list`` (typically
    one value per column). Entries of ``std`` that are exactly zero, which
    happens for a feature that is constant in the reference data, are
    treated as 1 so that such a feature is only centered instead of being
    turned into NaN/inf by a division by zero.
    """
    std = np.asarray(std)
    safe_std = np.where(std == 0, 1.0, std)
    return (snapshot_list - mean)/safe_std

In [None]:
def shuffled_train_test_split(trajectory_list, trajectory_label_list, split_ratio, offset = 0, progress_label = False):
    """Turn trajectories into shuffled, cleaned and normalized train/test arrays.

    Steps, in order:
    1. Build the past/future versions of the trajectories for ``offset``.
    2. Flatten both into snapshot arrays; labels and weights are taken from
       the future version only.
    3. Shuffle all four arrays with the module's ``shuffle`` helper
       (presumably sklearn.utils.shuffle, which keeps rows aligned; confirm
       against the file's imports).
    4. Split at ``int(number of snapshots * split_ratio)`` into train / test.
    5. Clamp outliers in all snapshot arrays using percentile bounds
       (``OUTLIER_CUTOFF``) computed on the *training past* snapshots.
    6. Normalize all snapshot arrays with mean and std computed on the
       clamped *training future* snapshots.

    Returns
    -------
    Tuple of eight arrays:
    (train_past, train_future, train_labels, train_weights,
     test_past, test_future, test_labels, test_weights)
    """
    assert isinstance(split_ratio, float), "Split ratio needs to be a float between 0.0 and 1.0"

    lagged_past, lagged_future = generate_offset_snapshot_list(trajectory_list, offset)

    # Labels and weights of the past version are not used, only its snapshots.
    past_snapshots = get_snapshot_label_and_weight_list(
        lagged_past, trajectory_label_list, offset,
        progress_label = progress_label)[0]
    future_snapshots, labels, weights = get_snapshot_label_and_weight_list(
        lagged_future, trajectory_label_list, offset,
        progress_label = progress_label)

    past_snapshots, future_snapshots, labels, weights = shuffle(
        past_snapshots, future_snapshots, labels, weights)

    # Outlier bounds and normalization constants are derived from the
    # training part only, hence they can only be computed after the split.
    cut = int(len(labels) * split_ratio)

    def _split(array):
        # Independent copies of the leading (train) and trailing (test) part.
        return array[:cut].copy(), array[cut:].copy()

    train_past, test_past = _split(past_snapshots)
    train_future, test_future = _split(future_snapshots)
    train_labels, test_labels = _split(labels)
    train_weights, test_weights = _split(weights)

    # Lower and upper bound of every column according to OUTLIER_CUTOFF.
    lower_bound = np.percentile(train_past, 100*OUTLIER_CUTOFF, axis = 0)
    upper_bound = np.percentile(train_past, 100*(1-OUTLIER_CUTOFF), axis = 0)

    # Clamp outliers in all four snapshot arrays with the same bounds.
    train_past, test_past, train_future, test_future = [
        remove_outliers(snapshots, lower_bound, upper_bound)
        for snapshots in (train_past, test_past, train_future, test_future)]

    # Mean and std of the (clamped) training snapshots.
    train_mean = np.mean(train_future, axis = 0)
    train_std = np.std(train_future, axis = 0)

    # Normalize all four snapshot arrays with the training statistics.
    train_past, test_past, train_future, test_future = [
        normalize_snapshots(snapshots, train_mean, train_std)
        for snapshots in (train_past, test_past, train_future, test_future)]

    return (train_past, train_future, train_labels, train_weights,
            test_past, test_future, test_labels, test_weights)



In [None]:
def generate_ds(trajectory_list, trajectory_label_list, split_ratio, offset = 0, progress_label = False):
    """Create shuffled train and test tf.data datasets from trajectories.

    The trajectories are processed by ``shuffled_train_test_split``; its
    eight result arrays are wrapped into two datasets. Every dataset
    element is a 3-tuple of dictionaries:

    * ``{INPUT_NAME: past snapshot}``
    * ``{OUTPUT_NAME_1: snapshot label, OUTPUT_NAME_2: future snapshot}``
    * ``{OUTPUT_NAME_1: snapshot weight, OUTPUT_NAME_2: snapshot weight}``

    (presumably consumed as inputs / targets / sample weights by the model;
    confirm against the training code). Both datasets are shuffled with a
    buffer as large as train and test set together.

    Returns
    -------
    (train_ds, test_ds) followed by the eight arrays returned by
    ``shuffled_train_test_split`` in their original order.
    """
    (train_past, train_future, train_labels, train_weights,
     test_past, test_future, test_labels, test_weights) \
        = shuffled_train_test_split(
            trajectory_list, trajectory_label_list, split_ratio,
            offset = offset, progress_label = progress_label)

    dataset_size = len(train_future) + len(test_future)

    def _as_dataset(past, future, labels, weights):
        # Feeds in a tuple of dictionaries (a tuple of lists would be the
        # alternative); the same weight is used for both outputs.
        slices = ({INPUT_NAME: past},
                  {OUTPUT_NAME_1: labels,
                   OUTPUT_NAME_2: future},
                  {OUTPUT_NAME_1: weights,
                   OUTPUT_NAME_2: weights})
        return tf.data.Dataset.from_tensor_slices(slices).shuffle(dataset_size)

    train_ds = _as_dataset(train_past, train_future, train_labels, train_weights)
    test_ds = _as_dataset(test_past, test_future, test_labels, test_weights)

    return (train_ds, test_ds,
            train_past, train_future, train_labels, train_weights,
            test_past, test_future, test_labels, test_weights)

In [None]:
def show_batch(dataset):
    """Print the input features of the first element taken from ``dataset``.

    Each element is expected to unpack into (inputs, labels, weights), with
    ``inputs`` being a dictionary of tensors. Only the inputs are printed,
    one line per key; labels and weights are ignored.
    """
    first_element = dataset.take(1)
    for features, _labels, _weights in first_element:
        for name, tensor in features.items():
            values = tensor.numpy()
            print("{:20s}: {}".format(name, values))