#### Load the dataset, preprocess it

In [1]:
from utils import *
from pandas_profiling import ProfileReport
np.random.seed(2020)

dataset_df = get_table().dropna()
mask = (dataset_df['battery_plugged'] == 0) | (dataset_df['battery_plugged'] == 1)
dataset_df = dataset_df[mask]
# month in 8-12, 1-3, day in 1-31
# the following replacements keep 'monthday' chronologically sorted when hashed later
dataset_df['month'][dataset_df['month'] == 1] = 13
dataset_df['month'][dataset_df['month'] == 2] = 14
dataset_df['month'][dataset_df['month'] == 3] = 15
dataset_df['monthday'] = dataset_df['month']*100 + dataset_df['day']

text = 'packages_running_'
keep = [i for i in dataset_df.columns if text in i] + ['battery_plugged'] + ['battery_level'] + ['slot'] + ['monthday']
dataset_df = dataset_df[keep[:49] + keep[50:54] + keep[55:]]
dataset_df = dataset_df.dropna().T.drop_duplicates().T.reset_index()
dataset_df['md_key'] = hash_states(dataset_df['monthday'].to_numpy()[:,None])
dataset_df = dataset_df.drop(['monthday', 'slot'], axis=1)
dataset_df = dataset_df.drop(['packages_running_android', 'packages_running_com.android.calculator2',\
                             'packages_running_com.android.keychain','packages_running_com.android.packageinstaller',\
                             'packages_running_com.android.providers.applications', 'packages_running_com.android.providers.downloads',\
                             'packages_running_com.google.android.email', 'packages_running_edu.udo.cs.ess.mobidac.target',\
                             'packages_running_org.openintents.filemanager', 'packages_running_stream.android'], axis=1)

# get indices of dataset elements per day, so that we can use this partitioning of the data in training and validation
num_days = dataset_df['md_key'].to_numpy().max() + 1
# by day is a list that for each day, contains all dataset indices for that day
by_day = [np.array(dataset_df.index[dataset_df['md_key'] == i].tolist()) for i in range(num_days)]
# keep only days with at least 5 samples
by_day_filtered = [item for item in by_day if len(item) > 4]
# we can access day i by calling dataset_df.loc[by_day[i]]


# in this state space, battery plugged is the last column: activity_vectors[:,-1]
activity_vectors = dataset_df.drop(['index', 'battery_level', 'md_key'], axis=1).to_numpy()
targets = dataset_df['battery_level'].to_numpy()
print('Activity vectors shape:', activity_vectors.shape)
print('Targets shape:', targets.shape)
#profile = ProfileReport(dataset_df, title="Filtered Profiling Report")
#profile.to_file("filtered_report.html")

i = 101
j = 3
print(by_day_filtered[i][j])
A = activity_vectors[by_day_filtered[i]][j]
B = dataset_df.iloc[by_day_filtered[i][j]].values[1:-2]
print(B)
print(np.allclose(A,B))

Activity vectors shape: (2445, 35)
Targets shape: (2445,)
2320
[1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0.]
True


##### Transform the state space

This is an interface for attempting some clustering, or some other informed pruning of the state space. It is however necessary that the outputs of the state space transform are still binary vectors, as this guarantees compatibility with later modules. Also, the final column activity_vectors$[:,-1]$ should remain untouched, as it stores the battery_plugged state which we need for the regressor split later.

In [2]:
def state_space_transform(activity_vectors, mode='Id'):
    if mode == 'Id':
        out = activity_vectors[:,:-1]
    elif mode == 'Clown':
        out = np.ones_like(activity_vectors[:,:-1])
    else:
        raise NotImplementedError("Unknown state space transform!")
    return np.concatenate([out, activity_vectors[:,-1, None]], axis=1)
        
states = state_space_transform(activity_vectors, mode='Id')
print('States shape:', states.shape)

States shape: (2445, 35)


#### Assign labels to the states

This assigns an integer label to each unique state, allowing us to connect different data structures in a way that we always know which states they are representing.

In [3]:
out_labels = hash_states(states)
dataset_df['out_labels'] = out_labels
helper_states = lookup_states(np.arange(0, out_labels.max()+1), dataset_df['out_labels'].values, states)
num_unique_states = out_labels.max()+1
print('Number of unique states:', num_unique_states)

hash_by_day = [dataset_df['out_labels'].loc[inds].values for inds in by_day_filtered]
print('by_day_filtered example:', by_day_filtered[18])
print('hash_by_day example:    ', hash_by_day[18])

Number of unique states: 637
by_day_filtered example: [448 449 450 451 452 453 454]
hash_by_day example:     [ 26 158 158 158 158 156 156]


In [4]:
def L1(out, target):
    return np.abs(out - target)

In [5]:
ind_set = np.arange(len(hash_by_day)).astype(int)

split_regressor = True
predictor_mode = 'MLE'
regressor_mode = 'MAP'
charging_transform_mode = 'Id'
discharging_transform_mode = 'Sqrt'
transform_mode = 'Id'

train_prediction_losses = []
valid_prediction_losses = []

train_regression_losses = []
valid_regression_losses = []

prediction_criteria = [BCE, bit_flips]
regression_criteria = [L1]

for ind in ind_set:
    train_inds = np.delete(ind_set, ind)
    valid_inds = [ind]
    train_set_prediction = [hash_by_day[ind] for ind in train_inds]
    valid_set_prediction = [hash_by_day[ind] for ind in valid_inds]
    train_ind_prediction = np.concatenate([by_day_filtered[ind] for ind in train_inds])
    valid_ind_prediction = np.concatenate([by_day_filtered[ind] for ind in valid_inds])
    
    train_states = states[train_ind_prediction]
    train_targets = targets[train_ind_prediction]
    valid_states = states[valid_ind_prediction]
    valid_targets = targets[valid_ind_prediction]
    
    P = fit_predictor(train_set_prediction, num_unique_states, mode=predictor_mode)
    
    N_steps = 1
    estimate = predict(valid_set_prediction, P, N_steps, num_unique_states)
    # convert to sets of individual distributions over activity components
    prediction = prediction_output_transform(estimate, dataset_df['out_labels'].values, states, 'activity_dist')
    # simple targets - just time-shifted the input by N_steps
    pred_targets = list_to_prediction_targets(valid_set_prediction, N_steps, dataset_df['out_labels'].values, states)
    # calculate loss
    loss = BCE(prediction, pred_targets) #percentage bitflips as alternatives
    loss_bit = bit_flips(prediction, pred_targets) #percentage bitflips as alternatives
    valid_prediction_loss = [crit(prediction, pred_targets) for crit in prediction_criteria]
    valid_prediction_losses.append(valid_prediction_loss)
    # print reduced loss
    print('Prediction loss over the data for', N_steps, 'time step(s): mean:', loss.mean(), 'std:', loss.std())
    print('Prediction bit flips over the data for', N_steps, 'time step(s): mean:', loss_bit.mean(), 'std:', loss_bit.std())
    
    if split_regressor:
        train_charging_mask = train_states[:,-1] > 0.5
        train_discharging_mask = np.invert(train_charging_mask)
        train_states_charging = train_states[train_charging_mask][:,:-1]
        train_targets_charging = train_targets[train_charging_mask]
        train_states_discharging = train_states[train_discharging_mask][:,:-1]
        train_targets_discharging = train_targets[train_discharging_mask]
        train_targets_charging_transformed = target_space_transform(train_targets_charging, mode=charging_transform_mode)
        train_targets_discharging_transformed = target_space_transform(train_targets_discharging, mode=discharging_transform_mode)
        print('Number of samples in the charging training set:', train_targets_charging.shape[0])
        print('Number of samples in the discharging training set:', train_targets_discharging.shape[0])
        w_charging = fit_regressor(train_states_charging, train_targets_charging_transformed, mode=regressor_mode)
        w_discharging = fit_regressor(train_states_discharging, train_targets_discharging_transformed, mode=regressor_mode)
        valid_charging_mask = valid_states[:,-1] > 0.5
        valid_discharging_mask = np.invert(valid_charging_mask)
        valid_states_charging = valid_states[valid_charging_mask][:,:-1]
        valid_states_discharging = valid_states[valid_discharging_mask][:,:-1]
        valid_targets_charging = valid_targets[valid_charging_mask]
        valid_targets_discharging = valid_targets[valid_discharging_mask]
        valid_state_mat_charging = build_state_mat(valid_states_charging)
        valid_state_mat_discharging = build_state_mat(valid_states_discharging)
        valid_charging_out_transformed = valid_state_mat_charging.T @ w_charging
        valid_discharging_out_transformed = valid_state_mat_discharging.T @ w_discharging
        valid_charging_out = target_space_transform(valid_charging_out_transformed, mode=charging_transform_mode, direction='Backward')
        valid_discharging_out = target_space_transform(valid_discharging_out_transformed, mode=discharging_transform_mode, direction='Backward')
        loss_regressor_charging = np.abs(valid_charging_out - valid_targets_charging)
        loss_regressor_discharging = np.abs(valid_discharging_out - valid_targets_discharging)
        print('Number of samples in the charging valid set:', valid_targets_charging.shape[0])
        print('Number of samples in the discharging valid set:', valid_targets_discharging.shape[0])
        print('Regressor L1 over the charging data: mean:', loss_regressor_charging.mean(), 'std:', loss_regressor_charging.std())
        print('Regressor L1 over the discharging data: mean:', loss_regressor_discharging.mean(), 'std:', loss_regressor_discharging.std())
        loss_regressor = np.concatenate([loss_regressor_charging, loss_regressor_discharging])
    
    else:
        train_targets_transformed = target_space_transform(train_targets, mode=transform_mode)
        w = fit_regressor(train_states, train_targets_transformed, mode=regressor_mode)
        valid_state_mat = build_state_mat(valid_states)
        valid_out_transformed = valid_state_mat.T @ w
        valid_out = target_space_transform(valid_out_transformed, mode=transform_mode, direction='Backward')
        loss_regressor = np.abs(valid_out - valid_targets)
    
    print('Regressor L1 over the data: mean:', loss_regressor.mean(), 'std:', loss_regressor.std())
    print()
    

Prediction loss over the data for 1 time step(s): mean: 11.019514373614362 std: 1.6281735335151468
Prediction bit flips over the data for 1 time step(s): mean: 0.319047619047619 std: 0.04714045207910319
Number of samples in the charging training set: 98
Number of samples in the discharging training set: 2312
Number of samples in the charging valid set: 3
Number of samples in the discharging valid set: 22
Regressor L1 over the charging data: mean: 9.704060464198784 std: 6.671881792187041
Regressor L1 over the discharging data: mean: 1.4834314835374711 std: 1.4615136783884093
Regressor L1 over the data: mean: 2.4699069612168283 std: 3.789154971044653

Prediction loss over the data for 1 time step(s): mean: 12.862716726380533 std: 1.383075623445517
Prediction bit flips over the data for 1 time step(s): mean: 0.3724137931034484 std: 0.04004414075448588
Number of samples in the charging training set: 98
Number of samples in the discharging training set: 2307
Number of samples in the chargin

  print('Regressor L1 over the charging data: mean:', loss_regressor_charging.mean(), 'std:', loss_regressor_charging.std())
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


Prediction loss over the data for 1 time step(s): mean: 12.142203378583506 std: 0.6128104338725463
Prediction bit flips over the data for 1 time step(s): mean: 0.35155279503105596 std: 0.017742679325572306
Number of samples in the charging training set: 100
Number of samples in the discharging training set: 2311
Number of samples in the charging valid set: 1
Number of samples in the discharging valid set: 23
Regressor L1 over the charging data: mean: 3.9352391575041654 std: 0.0
Regressor L1 over the discharging data: mean: 1.5411580223924943 std: 3.1621867183122823
Regressor L1 over the data: mean: 1.6409114030221472 std: 3.1323551984927693

Prediction loss over the data for 1 time step(s): mean: 11.841866192540806 std: 1.642171375168823
Prediction bit flips over the data for 1 time step(s): mean: 0.34285714285714286 std: 0.04754573110501964
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2307
Number of samples in the charging vali

Prediction loss over the data for 1 time step(s): mean: 10.484985691312172 std: 2.8744107684731697
Prediction bit flips over the data for 1 time step(s): mean: 0.30357142857142855 std: 0.0832227156980789
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2309
Number of samples in the charging valid set: 0
Number of samples in the discharging valid set: 25
Regressor L1 over the charging data: mean: nan std: nan
Regressor L1 over the discharging data: mean: 1.0342330743695214 std: 0.9932676447933119
Regressor L1 over the data: mean: 1.0342330743695214 std: 0.9932676447933119

Prediction loss over the data for 1 time step(s): mean: 17.76279928881121 std: 1.3955773144415544
Prediction bit flips over the data for 1 time step(s): mean: 0.5142857142857143 std: 0.04040610178208844
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2327
Number of samples in the charging valid set: 0
Number of

Prediction loss over the data for 1 time step(s): mean: 10.930953408499205 std: 0.26295787053734476
Prediction bit flips over the data for 1 time step(s): mean: 0.31648351648351636 std: 0.0076134101431599025
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2320
Number of samples in the charging valid set: 0
Number of samples in the discharging valid set: 14
Regressor L1 over the charging data: mean: nan std: nan
Regressor L1 over the discharging data: mean: 0.7280556130275181 std: 1.4046106593771341
Regressor L1 over the data: mean: 0.7280556130275181 std: 1.4046106593771341

Prediction loss over the data for 1 time step(s): mean: 14.210239431048967 std: 0.789457746169387
Prediction bit flips over the data for 1 time step(s): mean: 0.4114285714285715 std: 0.022857142857142864
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2328
Number of samples in the charging valid set: 0
Numb

Prediction loss over the data for 1 time step(s): mean: 12.78921548794407 std: 0.19337686516026395
Prediction bit flips over the data for 1 time step(s): mean: 0.37028571428571433 std: 0.0055988336977901235
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2308
Number of samples in the charging valid set: 0
Number of samples in the discharging valid set: 26
Regressor L1 over the charging data: mean: nan std: nan
Regressor L1 over the discharging data: mean: 0.34563664220997886 std: 0.606980096494267
Regressor L1 over the data: mean: 0.34563664220997886 std: 0.606980096494267

Prediction loss over the data for 1 time step(s): mean: 12.069594388551208 std: 0.6305508224130915
Prediction bit flips over the data for 1 time step(s): mean: 0.34945054945054943 std: 0.018256316182237532
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2307
Number of samples in the charging valid set: 0
Num

Prediction loss over the data for 1 time step(s): mean: 18.142346282161874 std: 0.6166908700689838
Prediction bit flips over the data for 1 time step(s): mean: 0.5252747252747253 std: 0.017855029460738414
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2320
Number of samples in the charging valid set: 0
Number of samples in the discharging valid set: 14
Regressor L1 over the charging data: mean: nan std: nan
Regressor L1 over the discharging data: mean: 0.48462934146612613 std: 0.2652846802184471
Regressor L1 over the data: mean: 0.48462934146612613 std: 0.2652846802184471

Prediction loss over the data for 1 time step(s): mean: 16.154644620688384 std: 5.372574232657589
Prediction bit flips over the data for 1 time step(s): mean: 0.4677248677248677 std: 0.15555195619058587
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2306
Number of samples in the charging valid set: 0
Number

Prediction loss over the data for 1 time step(s): mean: 17.493665966253467 std: 2.9929109860006666
Prediction bit flips over the data for 1 time step(s): mean: 0.5064935064935064 std: 0.08665364840318066
Number of samples in the charging training set: 100
Number of samples in the discharging training set: 2312
Number of samples in the charging valid set: 1
Number of samples in the discharging valid set: 22
Regressor L1 over the charging data: mean: 12.485157308359419 std: 0.0
Regressor L1 over the discharging data: mean: 2.7586514580752985 std: 5.924411549181129
Regressor L1 over the data: mean: 3.181543016783305 std: 6.1242994535501545

Prediction loss over the data for 1 time step(s): mean: 15.29574383203187 std: 0.4934110913558678
Prediction bit flips over the data for 1 time step(s): mean: 0.44285714285714295 std: 0.014285714285714289
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2305
Number of samples in the charging valid s

Prediction loss over the data for 1 time step(s): mean: 4.55979905115077 std: 7.636920307225418
Prediction bit flips over the data for 1 time step(s): mean: 0.13201970443349753 std: 0.2211114898775257
Number of samples in the charging training set: 98
Number of samples in the discharging training set: 2307
Number of samples in the charging valid set: 3
Number of samples in the discharging valid set: 27
Regressor L1 over the charging data: mean: 10.5195958921728 std: 9.22713607089081
Regressor L1 over the discharging data: mean: 2.459285945615029 std: 6.410172074847473
Regressor L1 over the data: mean: 3.2653169402708064 std: 7.165364827587245

Prediction loss over the data for 1 time step(s): mean: 8.041729466853203 std: 8.345378426238122
Prediction bit flips over the data for 1 time step(s): mean: 0.23280423280423282 std: 0.24165012265882505
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2306
Number of samples in the charging val

Prediction loss over the data for 1 time step(s): mean: 14.533199418118256 std: 2.992910986000666
Prediction bit flips over the data for 1 time step(s): mean: 0.4207792207792209 std: 0.08665364840318066
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2311
Number of samples in the charging valid set: 0
Number of samples in the discharging valid set: 23
Regressor L1 over the charging data: mean: nan std: nan
Regressor L1 over the discharging data: mean: 0.4035324098928435 std: 0.618669691735666
Regressor L1 over the data: mean: 0.4035324098928435 std: 0.618669691735666

Prediction loss over the data for 1 time step(s): mean: 14.484003004317385 std: 2.523854986169467
Prediction bit flips over the data for 1 time step(s): mean: 0.4193548387096776 std: 0.07307308624116057
Number of samples in the charging training set: 101
Number of samples in the discharging training set: 2302
Number of samples in the charging valid set: 0
Number of sa

In [6]:
print(len(valid_prediction_losses))

105
