#### Load the dataset, preprocess it

In [1]:
from utils import *
from pandas_profiling import ProfileReport
# Seed NumPy's global RNG so the random splits further down are reproducible.
np.random.seed(2020)

# Load the raw table (get_table comes from utils) and drop rows with missing values.
dataset_df = get_table().dropna()
# Keep only rows whose 'battery_plugged' value is exactly 0 or 1.
mask = (dataset_df['battery_plugged'] == 0) | (dataset_df['battery_plugged'] == 1)
# .copy() turns the filtered result into an independent frame, so the
# assignments below modify dataset_df itself and not a temporary slice.
dataset_df = dataset_df[mask].copy()
# month in 8-12, 1-3, day in 1-31
# the following replacements keep 'monthday' chronologically sorted when hashed later
# (.loc replaces the former chained indexing `df['month'][cond] = v`, which is
#  not guaranteed to write back to the frame and is a no-op under copy-on-write)
dataset_df.loc[dataset_df['month'] == 1, 'month'] = 13
dataset_df.loc[dataset_df['month'] == 2, 'month'] = 14
dataset_df.loc[dataset_df['month'] == 3, 'month'] = 15
# Combined sort key: month * 100 + day (e.g. month 13, day 7 -> 1307).
dataset_df['monthday'] = dataset_df['month']*100 + dataset_df['day']

# Select the feature columns: every column whose name contains 'packages_running_',
# followed by 'battery_plugged', 'battery_level', 'slot' and 'monthday' (in that order).
text = 'packages_running_'
keep = [i for i in dataset_df.columns if text in i] + ['battery_plugged'] + ['battery_level'] + ['slot'] + ['monthday']
# Positional slicing: this leaves out keep[49] and keep[54].
# NOTE(review): which columns these are depends on the column order of the loaded
# table and is not visible here - confirm, and consider dropping them by name.
dataset_df = dataset_df[keep[:49] + keep[50:54] + keep[55:]]
# Transposing, dropping duplicate rows and transposing back removes columns that are
# exact duplicates of an earlier column. reset_index() then stores the old index in a
# new 'index' column and gives the frame a fresh 0..n-1 index.
dataset_df = dataset_df.dropna().T.drop_duplicates().T.reset_index()
# hash_states comes from utils; it is fed 'monthday' as an (n, 1) array.
# Presumably it returns one integer label per distinct row (i.e. per day) - TODO confirm.
dataset_df['md_key'] = hash_states(dataset_df['monthday'].to_numpy()[:,None])
# 'monthday' and 'slot' are no longer needed once 'md_key' exists.
dataset_df = dataset_df.drop(['monthday', 'slot'], axis=1)
# Drop ten package columns by name.
# NOTE(review): the reason for excluding exactly these packages is not shown here.
dataset_df = dataset_df.drop(['packages_running_android', 'packages_running_com.android.calculator2',\
                             'packages_running_com.android.keychain','packages_running_com.android.packageinstaller',\
                             'packages_running_com.android.providers.applications', 'packages_running_com.android.providers.downloads',\
                             'packages_running_com.google.android.email', 'packages_running_edu.udo.cs.ess.mobidac.target',\
                             'packages_running_org.openintents.filemanager', 'packages_running_stream.android'], axis=1)

# get indices of dataset elements per day, so that we can use this partitioning of the data in training and validation
# (assumes the 'md_key' labels are the integers 0..max - TODO confirm against hash_states)
num_days = dataset_df['md_key'].to_numpy().max() + 1
# by_day is a list that, for each day key i in 0..num_days-1, contains all dataset index labels with md_key == i
by_day = [np.array(dataset_df.index[dataset_df['md_key'] == i].tolist()) for i in range(num_days)]
# keep only days with at least 5 samples (len > 4)
by_day_filtered = [item for item in by_day if len(item) > 4]
# we can access day i by calling dataset_df.loc[by_day[i]]


# Feature matrix: every remaining column except the bookkeeping column 'index',
# the regression target 'battery_level' and the day key 'md_key'.
# in this state space, battery plugged is the last column: activity_vectors[:,-1]
activity_vectors = dataset_df.drop(['index', 'battery_level', 'md_key'], axis=1).to_numpy()
# Regression targets: the 'battery_level' column, one value per row of activity_vectors.
targets = dataset_df['battery_level'].to_numpy()
print('Activity vectors shape:', activity_vectors.shape)
print('Targets shape:', targets.shape)
# Optional pandas_profiling report of the filtered frame (disabled):
#profile = ProfileReport(dataset_df, title="Filtered Profiling Report")
#profile.to_file("filtered_report.html")

Activity vectors shape: (2445, 35)
Targets shape: (2445,)


#### Transform the state space

This is an interface for attempting some clustering, or some other informed pruning of the state space. It is, however, necessary that the outputs of the state space transform are still binary vectors, as this guarantees compatibility with later modules. Also, the final column `activity_vectors[:,-1]` must remain untouched, as it stores the battery_plugged state, which we need for the regressor split later.

In [2]:
def state_space_transform(activity_vectors, mode='Id'):
    """Transform every column of the state matrix except the last one.

    Parameters
    ----------
    activity_vectors : array whose last column (axis 1) is passed through unchanged.
    mode : 'Id' keeps the leading columns as they are; 'Clown' replaces them
        with ones of the same shape and dtype.

    Returns
    -------
    A new array with the same shape as ``activity_vectors``: the transformed
    leading columns followed by the untouched last column.

    Raises
    ------
    NotImplementedError
        If ``mode`` is neither 'Id' nor 'Clown'.
    """
    # Reject unknown modes before touching the data.
    if mode not in ('Id', 'Clown'):
        raise NotImplementedError("Unknown state space transform!")

    leading = activity_vectors[:, :-1]
    if mode == 'Clown':
        leading = np.ones_like(leading)

    # Index with a list so the last column keeps its own axis for concatenation.
    last_column = activity_vectors[:, [-1]]
    return np.concatenate((leading, last_column), axis=1)
        
# 'Id' mode: the states are the activity vectors themselves (returned as a new array).
states = state_space_transform(activity_vectors, mode='Id')
print('States shape:', states.shape)

States shape: (2445, 35)


#### Assign labels to the states

This assigns an integer label to each unique state, allowing us to connect different data structures in a way that we always know which states they are representing.

In [3]:
# One integer label per row of `states` (hash_states comes from utils).
out_labels = hash_states(states)
dataset_df['out_labels'] = out_labels
# lookup_states comes from utils; called with every label 0..max it presumably returns
# one representative state vector per label - TODO confirm.
helper_states = lookup_states(np.arange(0, out_labels.max()+1), dataset_df['out_labels'].values, states)
# Assumes the labels are contiguous integers starting at 0, so max + 1 is their count.
num_unique_states = out_labels.max()+1
print('Number of unique states:', num_unique_states)

# For every kept day, the sequence of state labels of that day's rows.
hash_by_day = [dataset_df['out_labels'].loc[inds].values for inds in by_day_filtered]
# Index 18 is just an arbitrary example; it requires at least 19 kept days.
print('by_day_filtered example:', by_day_filtered[18])
print('hash_by_day example:    ', hash_by_day[18])

Number of unique states: 637
by_day_filtered example: [448 449 450 451 452 453 454]
hash_by_day example:     [ 26 158 158 158 158 156 156]


#### Get training, validation and test sets for the prediction component

This is just a dummy implementation; the validation scheme from the task description can be implemented once we are aware of all interactions between the modules.

In [4]:
# Shuffle the positions of the per-day label sequences.
ind_set = np.random.permutation(np.arange(len(hash_by_day))).astype(int)

# The first `split` fraction of the shuffled days is used for training, the rest for validation.
split = 0.90
ind_set_train, ind_set_valid = np.split(ind_set, [int(split*len(ind_set))])

print('Number of sequences in training set:', len(ind_set_train))
print('Number of sequences in validation set:', len(ind_set_valid))

# Materialise the two sets as lists of label sequences.
train_set_prediction = [hash_by_day[day] for day in ind_set_train]
valid_set_prediction = [hash_by_day[day] for day in ind_set_valid]

# Fit the transition model on the training days only (fit_predictor comes from utils).
P = fit_predictor(train_set_prediction, num_unique_states, mode='MAP')
print('Transition matrix shape:', P.shape)

Number of sequences in training set: 94
Number of sequences in validation set: 11
Transition matrix shape: (637, 637)


#### Prediction output transform

Apply the transition models to generate distributions over future states. These distributions need to be sent back to the state space, because we need to plug them into a regressor trained in the state space later. There are several conceivable ways to go from a distribution over states to an element of the state space, and this cell provides an interface for that.

In [5]:
# document these!
def predict(list_of_seqs, P, N_steps, num_unique_states):
    """Predict a distribution over states ``N_steps`` ahead for every usable time step.

    Parameters
    ----------
    list_of_seqs : iterable of 1-D integer label sequences (one per day).
    P : square one-step transition matrix. It is applied from the left to the
        one-hot encoded inputs (``P @ one_hot``).
    N_steps : non-negative integer prediction horizon.
    num_unique_states : number of distinct state labels, forwarded to
        ``one_hot_encode`` (from utils) as ``num_states``.

    Returns
    -------
    ``matrix_power(P, N_steps) @ one_hot``: one predicted distribution for each
    input label that still has an observation ``N_steps`` later in its sequence.
    """
    # Only elements that have a successor N_steps later can be inputs. The explicit
    # length arithmetic also covers N_steps == 0 (where item[:-0] would be empty)
    # and sequences shorter than N_steps (which contribute nothing).
    in_flat = np.concatenate([item[:max(len(item) - N_steps, 0)] for item in list_of_seqs])
    one_hot_prediction_input = one_hot_encode(in_flat, num_states = num_unique_states)
    # N-step transition model: the matrix power of P. np.power would raise each
    # entry to the power individually, which is not a multi-step transition.
    pred = np.linalg.matrix_power(P, N_steps)
    # calculate a distribution over future states
    return pred @ one_hot_prediction_input

def list_to_prediction_targets(list_of_seqs, N_steps, labels, state_set):
    """Build the prediction targets: each sequence shifted forward by ``N_steps``.

    The first ``N_steps`` labels of every sequence are dropped, the remainders
    are concatenated, and the resulting labels are handed to ``lookup_states``
    (from utils) together with ``labels`` and ``state_set``. Its return value
    is passed through unchanged.
    """
    shifted_seqs = [seq[N_steps:] for seq in list_of_seqs]
    future_labels = np.concatenate(shifted_seqs)
    return lookup_states(future_labels, labels, state_set)

def prediction_output_transform(pred_out, labels, state_set, mode):
    """Map predicted state distributions back towards the state space.

    Modes
    -----
    'activity_dist'    : result of ``state_dist_to_activity_dist`` (from utils).
    'argmax'           : result of ``state_dist_to_most_likely_state`` (from utils).
    'nearest_neighbor' : the 'activity_dist' result thresholded at 0.5
                         (boolean, True where the value is >= 0.5).

    Raises
    ------
    NotImplementedError
        For any other ``mode``.
    """
    if mode == 'argmax':
        return state_dist_to_most_likely_state(pred_out, labels, state_set)

    if mode in ('activity_dist', 'nearest_neighbor'):
        activity_dist = state_dist_to_activity_dist(pred_out, labels, state_set)
        # 'nearest_neighbor' additionally rounds every component to a boolean.
        return activity_dist if mode == 'activity_dist' else activity_dist >= 0.5

    raise NotImplementedError("Unknown prediction output transform!")
        
def bit_flips(prediction_dist, target):
    """Fraction of components where the rounded prediction differs from the target.

    ``prediction_dist`` is thresholded at 0.5 (>= 0.5 counts as 1), both arrays
    are cast to int, and the share of mismatching components is computed along
    the last axis.
    """
    predicted_bits = (prediction_dist >= 0.5).astype(int)
    true_bits = target.astype(int)
    agreement = predicted_bits == true_bits
    # 1 - (share of matching components) = share of flipped components
    return 1 - agreement.mean(axis=-1)

In [6]:
# Prediction horizon in time steps.
N_steps = 1
# predicted distribution over states, N_steps ahead, for the validation sequences
estimate = predict(valid_set_prediction, P, N_steps, num_unique_states)
# convert to sets of individual distributions over activity components
prediction = prediction_output_transform(estimate, dataset_df['out_labels'].values, states, 'activity_dist')
# simple targets - just time-shifted the input by N_steps
pred_targets = list_to_prediction_targets(valid_set_prediction, N_steps, dataset_df['out_labels'].values, states)
# calculate loss
loss = BCE(prediction, pred_targets) # BCE comes from utils; presumably binary cross-entropy - TODO confirm
loss_bit = bit_flips(prediction, pred_targets) # alternative metric: share of flipped bits after rounding at 0.5
# print reduced loss
print('Prediction loss over the data for', N_steps, 'time step(s): mean:', loss.mean(), 'std:', loss.std())
print('Prediction bit flips over the data for', N_steps, 'time step(s): mean:', loss_bit.mean(), 'std:', loss_bit.std())

Prediction loss over the data for 1 time step(s): mean: 0.4523963185924837 std: 0.09233427479952037
Prediction bit flips over the data for 1 time step(s): mean: 0.21349206349206348 std: 0.07196635546289737


#### Transform the target space

Sebastian already offered some advice here, like taking the square root of the values to allow for a better line fit through them. This cell provides an interface for such techniques. Note that these should always be invertible, because we need to convert the regressor output back to the original target space. This should be implemented in the 'Backward' direction case.

This part is a little hacky, and might need some changes to be able to elegantly account for using two regressors.

In [7]:
def target_space_transform(targets, mode='Id', direction='Forward'):
    """Map regression targets into the regressor's target space, or back.

    Parameters
    ----------
    targets : array of target values (or regressor outputs for 'Backward').
    mode : 'Id' returns ``targets`` itself in both directions.
        'Sqrt' returns ``sqrt(|targets|)`` in the 'Forward' direction and
        ``-(targets ** 2)`` in the 'Backward' direction. 'Backward' therefore
        undoes 'Forward' only for non-positive original targets.
    direction : 'Forward' or 'Backward'.

    Raises
    ------
    NotImplementedError
        For an unknown ``mode`` ("Unknown target space transform!") or, with a
        known mode, an unknown ``direction`` ("Unknown direction!").
    """
    # The mode is validated first, then the direction.
    if mode not in ('Id', 'Sqrt'):
        raise NotImplementedError("Unknown target space transform!")
    if direction not in ('Forward', 'Backward'):
        raise NotImplementedError("Unknown direction!")

    if mode == 'Id':
        return targets
    if direction == 'Forward':
        return np.sqrt(np.abs(targets))
    return -np.power(targets, 2)
        
# Called with the defaults (mode='Id', direction='Forward'), so this is `targets` unchanged;
# the regression cell further down applies its own transform per charging/discharging subset.
targets_transformed = target_space_transform(targets)

#### Get training, validation and test sets for the regression component

Do this for negative and positive targets separately!

In [8]:
print('Regression component train/valid/test here!')
# Shuffle the sample positions (one position per row of `states` / `targets_transformed`).
ind_set_regressor = np.random.permutation(np.arange(targets_transformed.shape[0])).astype(int)

# The first `split` fraction of the shuffled samples is used for training, the rest for validation.
split = 0.9
ind_set_regressor_train, ind_set_regressor_valid = np.split(
    ind_set_regressor, [int(split*len(ind_set_regressor))]
)

print('Number of sequences in training set:', len(ind_set_regressor_train))
print('Number of sequences in validation set:', len(ind_set_regressor_valid))

# Gather states and targets for both subsets.
train_states, valid_states = states[ind_set_regressor_train], states[ind_set_regressor_valid]
train_targets, valid_targets = (
    targets_transformed[ind_set_regressor_train],
    targets_transformed[ind_set_regressor_valid],
)

Regression component train/valid/test here!
Number of sequences in training set: 2200
Number of sequences in validation set: 245


In [9]:
# Convention used below: X has one ROW per feature (plus a leading bias row of ones)
# and one COLUMN per sample, so X @ X.T is the (features+1) x (features+1) Gram matrix
# and predictions are obtained as X.T @ w.
def build_state_mat(states):
    """Return the design matrix for ``states`` with a bias feature prepended.

    ``states`` has one row per sample. The result is transposed, i.e. it has
    shape (states.shape[1] + 1, states.shape[0]) and its first row is all ones.
    """
    return np.concatenate([np.ones((states.shape[0],1)), states], axis=-1).T

def fit_regressor(states, targets, mode='MLE', eps=1e-25):
    """Fit linear regression weights (bias first) by regularised least squares.

    Parameters
    ----------
    states : sample matrix, one row per sample (the bias column is added here).
    targets : target values, one per sample.
    mode : 'MLE' adds ``eps`` times the identity to the Gram matrix;
        'MAP' adds ``targets.var()`` times the identity instead (``eps`` is
        ignored). In both modes the bias weight is regularised as well.
    eps : diagonal term used in 'MLE' mode.

    Returns
    -------
    Weight vector ``w`` with ``w[0]`` the bias; predict with
    ``build_state_mat(new_states).T @ w``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is neither 'MLE' nor 'MAP'.
    """
    X = build_state_mat(states)
    if mode == 'MLE':
        ridge = eps
    elif mode == 'MAP':
        ridge = targets.var()
    else:
        raise NotImplementedError("Unknown regressor mode!")
    gram = X @ X.T + ridge*np.eye(X.shape[0])
    # Solve the normal equations directly instead of forming the explicit inverse:
    # same solution, but cheaper and numerically more accurate.
    return np.linalg.solve(gram, X @ targets)

In [10]:
# Fit and evaluate the regressor(s). With split_regressor=True, two separate models are
# fitted: one on samples whose last state column (battery_plugged) is > 0.5 ("charging")
# and one on the remaining samples ("discharging"). Otherwise one model sees all samples.
split_regressor = True
regressor_mode = 'MAP'
if split_regressor:
    # Target-space transform applied to each subset before fitting.
    charging_transform_mode = 'Id'
    discharging_transform_mode = 'Sqrt'
    # Split the training data on the last state column; that column is then removed
    # from the features ([:,:-1]) because it is constant within each subset.
    train_charging_mask = train_states[:,-1] > 0.5
    train_discharging_mask = np.invert(train_charging_mask)
    train_states_charging = train_states[train_charging_mask][:,:-1]
    train_targets_charging = train_targets[train_charging_mask]
    train_states_discharging = train_states[train_discharging_mask][:,:-1]
    train_targets_discharging = train_targets[train_discharging_mask]
    train_targets_charging_transformed = target_space_transform(train_targets_charging, mode=charging_transform_mode)
    train_targets_discharging_transformed = target_space_transform(train_targets_discharging, mode=discharging_transform_mode)
    
    print('Number of samples in the charging training set:', train_targets_charging.shape[0])
    print('Number of samples in the discharging training set:', train_targets_discharging.shape[0])
    
    # One weight vector per subset, fitted on the transformed targets.
    w_charging = fit_regressor(train_states_charging, train_targets_charging_transformed, mode=regressor_mode)
    w_discharging = fit_regressor(train_states_discharging, train_targets_discharging_transformed, mode=regressor_mode)
    
    # Same split for the validation data.
    valid_charging_mask = valid_states[:,-1] > 0.5
    valid_discharging_mask = np.invert(valid_charging_mask)
    valid_states_charging = valid_states[valid_charging_mask][:,:-1]
    valid_states_discharging = valid_states[valid_discharging_mask][:,:-1]
    valid_targets_charging = valid_targets[valid_charging_mask]
    valid_targets_discharging = valid_targets[valid_discharging_mask]
    # build_state_mat returns a (features+1, samples) matrix, hence the .T when predicting.
    valid_state_mat_charging = build_state_mat(valid_states_charging)
    valid_state_mat_discharging = build_state_mat(valid_states_discharging)
    valid_charging_out_transformed = valid_state_mat_charging.T @ w_charging
    valid_discharging_out_transformed = valid_state_mat_discharging.T @ w_discharging
    # Map the predictions back to the original target space before measuring the error.
    valid_charging_out = target_space_transform(valid_charging_out_transformed, mode=charging_transform_mode, direction='Backward')
    valid_discharging_out = target_space_transform(valid_discharging_out_transformed, mode=discharging_transform_mode, direction='Backward')
    # Per-sample absolute error (L1) against the untransformed validation targets.
    loss_regressor_charging = np.abs(valid_charging_out - valid_targets_charging)
    loss_regressor_discharging = np.abs(valid_discharging_out - valid_targets_discharging)
    
    print('Number of samples in the charging valid set:', valid_targets_charging.shape[0])
    print('Number of samples in the discharging valid set:', valid_targets_discharging.shape[0])
    
    print('Regressor L1 over the charging data: mean:', loss_regressor_charging.mean(), 'std:', loss_regressor_charging.std())
    print('Regressor L1 over the discharging data: mean:', loss_regressor_discharging.mean(), 'std:', loss_regressor_discharging.std())
    
    # Pool both subsets so the summary below covers the whole validation set.
    loss_regressor = np.concatenate([loss_regressor_charging, loss_regressor_discharging])
    
else:
    # Single regressor on all samples; the battery_plugged column stays in the features.
    transform_mode = 'Id'
    train_targets_transformed = target_space_transform(train_targets, mode=transform_mode)
    w = fit_regressor(train_states, train_targets_transformed, mode=regressor_mode)
    
    valid_state_mat = build_state_mat(valid_states)
    valid_out_transformed = valid_state_mat.T @ w
    valid_out = target_space_transform(valid_out_transformed, mode=transform_mode, direction='Backward')
    loss_regressor = np.abs(valid_out - valid_targets)
    
# Summary over all validation samples (both branches define loss_regressor).
print('Regressor L1 over the data: mean:', loss_regressor.mean(), 'std:', loss_regressor.std())

Number of samples in the charging training set: 88
Number of samples in the discharging training set: 2112
Number of samples in the charging valid set: 14
Number of samples in the discharging valid set: 231
Regressor L1 over the charging data: mean: 6.824713463756572 std: 4.93255997852312
Regressor L1 over the discharging data: mean: 1.126126339002913 std: 3.267995721260056
Regressor L1 over the data: mean: 1.4517598889888363 std: 3.6344779665913904


### TODO: MAP regression mode

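Need to implement this. Note: `fit_regressor` above already contains a `'MAP'` branch (the regulariser is `targets.var()` times the identity), and the regression cell runs with `regressor_mode = 'MAP'`, so check whether this TODO is still open or only the choice of prior needs revisiting.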

### Combine Prediction and Regression

Core idea: Pick one specific predictor, then do the regression on the same training set. Hopefully the validation scheme for the regressor will provide some justification for this. 