## Set up the dependencies

In [None]:
# for reading and validating data
import emeval.input.spec_details as eisd
import emeval.input.phone_view as eipv
import emeval.input.eval_view as eiev

In [None]:
# Visualization helpers
import emeval.viz.phone_view as ezpv
import emeval.viz.eval_view as ezev

In [None]:
# For plots
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
%matplotlib inline

In [None]:
# For maps
import folium
import branca.element as bre

In [None]:
# For easier debugging while working on modules
import importlib

In [None]:
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format
import arrow
import numpy as np

In [None]:
THIRTY_MINUTES = 30 * 60
TEN_MINUTES = 10 * 60

## The spec

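The spec defines what experiments were done, and over which time ranges. Once the experiment is complete, most of the structure is read back from the data, but we use the spec to validate that it all worked correctly. The spec also contains the ground truth for the legs. Here, we read the specs for three timelines: the unimodal car/bike trip to Los Altos (`unimodal_trip_car_bike_mtv_la`), the car/scooter trip to San Jose (`car_scooter_brex_san_jose`), and the train/bus/e-bike trip to UC Berkeley (`train_bus_ebike_mtv_ucb`).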

In [None]:
DATASTORE_URL = "http://cardshark.cs.berkeley.edu"
AUTHOR_EMAIL = "shankari@eecs.berkeley.edu"
sd_la = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "unimodal_trip_car_bike_mtv_la")
sd_sj = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "car_scooter_brex_san_jose")
sd_ucb = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "train_bus_ebike_mtv_ucb")

## The views

There are two main views for the data - the phone view and the evaluation view. 

### Phone view

In the phone view, the phone is primary, and then there is a tree that you can traverse to get the data that you want. Traversing that tree typically involves nested for loops; here's an example of loading the phone view and traversing it. You can replace the print statements with real code. When you are ready to check this in, please move the function to one of the python modules so that we can invoke it more generally.

In [None]:
importlib.reload(eipv)

In [None]:
pv_la = eipv.PhoneView(sd_la)

In [None]:
pv_sj = eipv.PhoneView(sd_sj)

In [None]:
pv_ucb = eipv.PhoneView(sd_ucb)

In [None]:
def get_transition_mask_android(df):
    """Mark the android motion activity rows where the sensed activity code changes.

    A row is flagged when its ``zzbhB`` value differs from the previous row's.
    Rows whose difference is NaN (the first row has no predecessor) are filled
    with 1 and are therefore flagged as transitions as well.

    :param df: DataFrame with a numeric ``zzbhB`` column
    :return: boolean Series aligned with ``df``
    """
    code_delta = df.zzbhB.diff()
    change_size = code_delta.abs().fillna(1)
    return change_size.gt(0)

In [None]:
def get_transition_mask_ios(df):
    """Mark the iOS motion activity rows where any of the moving-mode flags changes.

    Only the ``walking``, ``cycling``, ``running`` and ``automotive`` columns are
    compared; a row is flagged when at least one of them differs from the
    previous row. The first row is always flagged.

    :param df: DataFrame with the four mode columns listed above
    :return: 1-d boolean numpy array with one entry per row of ``df``
    """
    valid_modes = ["walking", "cycling", "running", "automotive"]
    if len(df) == 0:
        # The mask must be boolean even when empty: a float-typed empty array
        # is not interpreted by pandas as a boolean mask, so `df[mask]` would
        # drop the columns that callers read afterwards.
        return np.array([], dtype=bool)

    mode_flags = np.asarray(df[valid_modes])
    # One row of differences per consecutive pair of entries; any non-zero
    # difference in a row means that some mode flag flipped.
    changed = np.diff(mode_flags, axis=0).any(axis=1)
    return np.concatenate(([True], changed)).astype(bool)

In [None]:
def get_count_start_end_diff(sr, ma_df, jba_df, transition_mask_fn, start_query, end_query):
    """Compare the sensed activity transitions against one ground truth range.

    :param sr: mapping with the ground truth ``start_ts`` and ``end_ts`` of the range
    :param ma_df: not used by the computation (callers in this notebook also pass None)
    :param jba_df: motion activity DataFrame with a ``ts`` column
    :param transition_mask_fn: function returning a boolean mask over ``jba_df``
        that selects the transition rows
    :param start_query: ``DataFrame.query`` expression selecting the transitions
        that can mark the start of the range
    :param end_query: ``DataFrame.query`` expression selecting the transitions
        that can mark the end of the range
    :return: dict with ``count`` (transitions strictly inside the range) and the
        smallest absolute distance, in minutes, from the range start/end to a
        matching transition. If there are no transitions at all, both distances
        default to THIRTY_MINUTES; if transitions exist but none match a query,
        the corresponding distance is NaN.
    """
    transitions = jba_df[transition_mask_fn(jba_df)]
    print("Transition points = %s" % transitions.index)

    gt_start_ts = sr["start_ts"]
    gt_end_ts = sr["end_ts"]
    inside_range = (transitions.ts > gt_start_ts) & (transitions.ts < gt_end_ts)
    matching = transitions[inside_range]
    print("matching transitions = %s" % matching.index)
    count = len(matching)

    if len(transitions) == 0:
        # nothing was sensed at all, so fall back to the default distance
        start_diff_secs = THIRTY_MINUTES
        end_diff_secs = THIRTY_MINUTES
    else:
        start_diffs = (transitions.query(start_query).ts - gt_start_ts).abs()
        start_diff_secs = start_diffs.min()
        print("start_diff_series = %s" % start_diffs.index)
        end_diffs = (transitions.query(end_query).ts - gt_end_ts).abs()
        print("end_diff_series = %s" % end_diffs.index)
        end_diff_secs = end_diffs.min()
        closest_start = start_diffs[start_diffs == start_diff_secs]
        closest_end = end_diffs[end_diffs == end_diff_secs]
        print("Found mins at index start = %s, end = %s" %
              (closest_start.index, closest_end.index))

    return {
        "count": count,
        "expanded_start_diff_mins": start_diff_secs / 60,
        "expanded_end_diff_mins": end_diff_secs / 60
    }

In [None]:
def get_tradeoff_entries(pv):
    """Collect one tradeoff record per evaluated section of a phone view.

    Walks the phone view tree (OS -> phone -> evaluation range -> trip ->
    section). Phones whose role contains "control" are skipped, and so are
    sections whose ground truth mode is "STOPPED". For every remaining section,
    the sensed transitions of the whole evaluation range are compared against
    the section's ground truth via ``get_count_start_end_diff``.

    :param pv: phone view exposing ``map()`` and ``spec_details``
    :return: list of dicts with the phone, timeline, run, duration, role,
        battery drain, trip and section ids, plus the count/start/end
        differences for the section
    """
    # Ground truth mode -> value expected in the android ``zzbhB`` column
    android_mode_map = {"WALKING": 2, "BICYCLING": 1, "CAR": 0, "ESCOOTER": 0, "BUS": 0,
                        "TRAIN": 0, "SUBWAY": 0, "LIGHT_RAIL": 0}
    # Ground truth mode -> iOS boolean column expected to be set
    ios_mode_map = {"WALKING": "walking", "BICYCLING": "cycling", "CAR": "automotive",
                    "ESCOOTER": "automotive", "BUS": "automotive",
                    "TRAIN": "automotive", "SUBWAY": "automotive", "LIGHT_RAIL": "automotive"}
    android_valid_query = "zzbhB not in [4,5]"
    ios_valid_query = "automotive == True | cycling == True | running == True | walking == True | stationary == True"
    pad4, pad8, pad12 = 4 * ' ', 8 * ' ', 12 * ' '

    entries = []
    for phone_os, phone_map in pv.map().items():
        print(15 * "=*")
        print(phone_os, phone_map.keys())
        for phone_label, phone_detail_map in phone_map.items():
            print(pad4, 15 * "-*")
            print(pad4, phone_label, phone_detail_map.keys())
            if "control" in phone_detail_map["role"]:
                print("Ignoring %s phone %s since they are always on" % (phone_detail_map["role"], phone_label))
                continue
            # this spec does not have any calibration ranges, but evaluation ranges are actually cooler
            for r in phone_detail_map["evaluation_ranges"]:
                print(pad8, 30 * "=")
                print(pad8, r.keys())
                print(pad8, r["trip_id"], r["eval_common_trip_id"], r["eval_role"], len(r["evaluation_trip_ranges"]))
                battery_pct = r["battery_df"]["battery_level_pct"]
                # drain over the whole evaluation range: first reading minus last reading
                delta_battery = battery_pct.iloc[0] - battery_pct.iloc[-1]
                print("Battery starts at %d, ends at %d, drain = %d" % (battery_pct.iloc[0], battery_pct.iloc[-1], delta_battery))
                for tr in r["evaluation_trip_ranges"]:
                    for section in tr["evaluation_section_ranges"]:
                        print(pad12, section["trip_id"], section["trip_id_base"], tr["trip_id"])
                        section_ma_df = section["motion_activity_df"]
                        # Use the activity of the whole range so that transitions detected
                        # outside the ground truth bounds of the section are still found
                        range_ma_df = r['motion_activity_df']
                        if section["trip_id_base"] == "walk_start":
                            gt_mode = "WALKING"
                        else:
                            gt_leg = pv.spec_details.get_ground_truth_for_leg(tr["trip_id_base"], section["trip_id_base"])
                            gt_mode = gt_leg["mode"]
                        if gt_mode == "STOPPED":
                            # This is the "WAIT FOR X" leg which should largely be stationary, but can involve
                            # walking to the door, walking inside the vehicle, etc
                            # so we ignore it for now
                            print("Ignoring STOPPED = waiting for transit vehicle")
                            continue
                        if phone_os == "android":
                            sensed_code = android_mode_map[gt_mode]
                            section_diffs = get_count_start_end_diff(
                                section,
                                section_ma_df.query(android_valid_query),
                                range_ma_df.query(android_valid_query),
                                get_transition_mask_android,
                                "zzbhB == %s" % sensed_code,
                                "zzbhB != %s" % sensed_code)
                        else:
                            sensed_column = ios_mode_map[gt_mode]
                            section_diffs = get_count_start_end_diff(
                                section,
                                section_ma_df.query(ios_valid_query),
                                range_ma_df.query(ios_valid_query),
                                get_transition_mask_ios,
                                "%s == True" % sensed_column,
                                "%s == False" % sensed_column)
                            print(section_diffs)
                        entry = dict(
                            phone_os=phone_os,
                            phone_label=phone_label,
                            timeline=pv.spec_details.curr_spec["id"],
                            run=r["trip_run"],
                            duration=r["duration"],
                            role=r["eval_role_base"],
                            battery_drain=delta_battery,
                            trip_id=tr["trip_id"],
                            section_id=section["trip_id"])
                        entry.update(section_diffs)
                        entries.append(entry)

    return entries

In [None]:
importlib.reload(eisd)

In [None]:
# We are not going to look at battery life at the evaluation trip level; we will end with evaluation range
# since we want to capture the overall drain for the timeline
tradeoff_entries_list = []
tradeoff_entries_list.extend(get_tradeoff_entries(pv_la))
tradeoff_entries_list.extend(get_tradeoff_entries(pv_sj))
tradeoff_entries_list.extend(get_tradeoff_entries(pv_ucb))
tradeoff_df = pd.DataFrame(tradeoff_entries_list)

## Add in other entries to the dataframe to allow us to plot better

In [None]:
r2q_map = {"power_control": 0, "HAMFDC": 1, "MAHFDC": 1, "HAHFDC": 2, "accuracy_control": 3}
q2r_android_list = ["power_control", "HAMFDC", "HAHFDC", "accuracy_control"]
q2r_ios_list = ["power_control", "MAHFDC", "HAHFDC", "accuracy_control"]

In [None]:
# Make a number so that can get the plots to come out in order
tradeoff_df["quality"] = tradeoff_df.role.apply(lambda r: r2q_map[r])

## Number of transitions in a section

We should ideally have only one transition in every TRAVEL section.

In [None]:
tradeoff_df.query("timeline=='unimodal_trip_car_bike_mtv_la' & run == 1 & role == 'HAMFDC'").section_id

In [None]:
def plot_count_with_errors(ax_array, phone_os):
    """Plot the median transition count of every section as horizontal bars.

    Reads the notebook-level ``timeline_trip_gt`` (one axis per timeline, in
    iteration order) and ``tradeoff_df``. For every timeline, the sections of
    quality 1 are drawn first, followed by the sections of quality 2. Each bar
    is the median ``count`` over the rows of that section, with error bars
    reaching to the minimum and the maximum.

    :param ax_array: sequence of axes, indexed by the position of the timeline
    :param phone_os: value of the ``phone_os`` column to plot
    """
    for i, (tl, _trip_gt) in enumerate(timeline_trip_gt.items()):
        ax = ax_array[i]
        # one label per bar actually drawn, in drawing order
        tick_labels = []
        for q in range(1,3):
            curr_df = tradeoff_df.query("timeline == @tl & phone_os == @phone_os & quality == @q")
            for sid in curr_df.section_id.unique():
                section_counts = curr_df.query("section_id == @sid")["count"]
                median_count = section_counts.median()
                # xerr holds distances from the bar end and so must be
                # non-negative: how far below (min) and above (max) the median
                lower_error = median_count - section_counts.min()
                upper_error = section_counts.max() - median_count
                ax.barh(y=len(tick_labels), width=median_count,
                        xerr=[[lower_error], [upper_error]])
                tick_labels.append(sid)
        ax.set_yticks(range(len(tick_labels)))
        ax.set_yticklabels(tick_labels)

In [None]:
ifig, ax_array = plt.subplots(nrows=2,ncols=3,figsize=(16,16), sharex=False, sharey=False)
# One column of panels per timeline, in this order. plot_count_with_errors
# only uses the keys.
# NOTE(review): the values look like ground truth counts per timeline - confirm.
timeline_trip_gt = {"train_bus_ebike_mtv_ucb": 17,
                    "car_scooter_brex_san_jose": 8,
                    "unimodal_trip_car_bike_mtv_la": 6}

plot_count_with_errors(ax_array[0], "android")
plot_count_with_errors(ax_array[1], "ios")

# Every panel is a horizontal bar chart: sections on the y axis, number of
# sensed transitions on the x axis.
for ax_row, row_os in zip(ax_array, ["android", "ios"]):
    for ax, timeline_name in zip(ax_row, timeline_trip_gt):
        ax.set_title("%s (%s)" % (timeline_name, row_os))
        ax.set_xlabel("Number of transitions")

for ax in ax_array[:,0]:
    ax.set_ylabel("Section")

ifig.tight_layout(pad=1.5)

In [None]:
tradeoff_df.head(n=20)

In [None]:
# Blank out the end difference for the selected android HAHFDC sections of the
# return trip on the UCB timeline.
# NOTE(review): judging by the variable name, the phone ran out of battery on
# these sections, so the computed end difference is not meaningful - confirm.
out_of_battery_phones = tradeoff_df.query(
    "timeline=='train_bus_ebike_mtv_ucb' & role=='HAHFDC'"
    " & trip_id=='berkeley_to_mtv_SF_express_bus_0' & phone_os == 'android'"
    " & (section_id == 'commuter_rail_with_tunnels_0' | section_id == 'inner_suburb_downtown_walk_0')")
print(out_of_battery_phones.index)
tradeoff_df.loc[out_of_battery_phones.index, "expanded_end_diff_mins"] = float('nan')

In [None]:
ifig, ax_array = plt.subplots(nrows=2,ncols=3,figsize=(11,5.5), sharex=False, sharey=False)
timeline_list = ["train_bus_ebike_mtv_ucb", "car_scooter_brex_san_jose", "unimodal_trip_car_bike_mtv_la"]
for i, tl in enumerate(timeline_list):
    tradeoff_df.query("timeline == @tl & phone_os == 'android'").boxplot(ax = ax_array[0][i], column=["expanded_start_diff_mins"], by=["quality"])
    ax_array[0][i].set_title(tl)
    # tradeoff_df.query("timeline == @tl & phone_os == 'android'").boxplot(ax = ax_array[1][i], column=["expanded_end_diff_mins"], by=["quality"])
    # ax_array[1][i].set_title("")
    # tradeoff_df.query("timeline == @tl & phone_os == 'ios'").boxplot(ax = ax_array[2][i], column=["expanded_start_diff_mins"], by=["quality"])
    # ax_array[2][i].set_title("")
    tradeoff_df.query("timeline == @tl & phone_os == 'ios'").boxplot(ax = ax_array[1][i], column=["expanded_end_diff_mins"], by=["quality"])
    ax_array[1][i].set_title("")

    # print(android_ax_returned.shape, ios_ax_returned.shape)

for ax in ax_array[0]:
    ax.set_xticklabels(q2r_android_list[1:])
    ax.set_xlabel("")

# for ax in ax_array[1]:
#     ax.set_xticklabels(q2r_android_list[1:])
#     ax.set_xlabel("")

# for ax in ax_array[2]:
#     ax.set_xticklabels(q2r_ios_list[1:])
#     ax.set_xlabel("")

for ax in ax_array[1]:
    ax.set_xticklabels(q2r_ios_list[1:])
    ax.set_xlabel("")    

ax_array[0][0].set_ylabel("start diff (mins, android)")
# ax_array[1][0].set_ylabel("end diff (android)")
# ax_array[2][0].set_ylabel("start diff (ios)")
ax_array[1][0].set_ylabel("end diff (mins, ios)")

# ax_array[2][0].set_ylabel("Battery drain (ios)")
# ax_array[3][0].set_ylabel("Difference in trip counts (ios)")
ifig.suptitle("Ground truth section start and end times v/s sensed activity transitions at different quality levels")
# ifig.tight_layout()

### Anomaly checks

We can clearly see that there are several outliers with the start/end timestamps for the sections. Let us explore these in greater detail and see if we can find any patterns.

In [None]:
def fmt(ts):
    """Convert a timestamp accepted by ``arrow.get`` to America/Los_Angeles time."""
    return arrow.get(ts).to("America/Los_Angeles")


# Query templates; fill in the sensed mode with `%`, e.g.
# `android_gt_mode_start_query % 2` or `ios_gt_mode_start_query % "walking"`
android_gt_mode_start_query, android_gt_mode_end_query = "zzbhB == %s", "zzbhB != %s"
ios_gt_mode_start_query, ios_gt_mode_end_query = "%s == True", "%s == False"

# Filters applied to the motion activity entries before looking for transitions
valid_android_query = "zzbhB not in [4,5]"
valid_ios_query = " | ".join(
    "%s == True" % mode_column
    for mode_column in ["automotive", "cycling", "running", "walking", "stationary"])

#### Easy trip: unimodal, ios

MAHFDC does significantly worse than HAHFDC, and this is primarily because iOS never generates a transition between walk and bicycling on the return leg. It stays at walk for the entire return bicycling trip. So the bicycling start is nan, and the walk segment just before it has a very high start diff because the last walk transition is essentially the end of the car trip.

In [None]:
tradeoff_df.query("timeline == 'unimodal_trip_car_bike_mtv_la' & phone_os == 'ios'").boxplot(column=["expanded_start_diff_mins"], by=["quality"])

In [None]:
# Name the timeline explicitly instead of relying on `tl`, which is only
# defined as a leftover of the plotting loop over `timeline_list` and would
# silently select another timeline if that list were reordered.
tradeoff_df.query("timeline == 'unimodal_trip_car_bike_mtv_la' & phone_os == 'ios' & role == 'MAHFDC' & run == 2")

In [None]:
eval_range = pv_la.map()['ios']['ucb-sdb-ios-3']["evaluation_ranges"][2]
eval_range["motion_activity_df"]["fmt_time"] = eval_range["motion_activity_df"].ts.apply(lambda ts: fmt(ts))

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][0]
eval_section_range = eval_trip_range["evaluation_section_ranges"][0]

In [None]:
eval_section_range["trip_id"], fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
eval_range["motion_activity_df"].query(valid_ios_query).iloc[0:5]

In [None]:
# The last two arguments are (start_query, end_query), in that order. Pass the
# "walking == True" query as the start query, the same way get_tradeoff_entries
# does for a WALKING section, so that the result is comparable to tradeoff_df.
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_ios_query), get_transition_mask_ios,
                        ios_gt_mode_start_query % "walking", ios_gt_mode_end_query % "walking")

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][1]
eval_section_range = eval_trip_range["evaluation_section_ranges"][0]
fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
eval_range["motion_activity_df"].query(valid_ios_query).loc[8:220]

In [None]:
# The last two arguments are (start_query, end_query), in that order. Pass the
# "walking == True" query as the start query, the same way get_tradeoff_entries
# does for a WALKING section, so that the result is comparable to tradeoff_df.
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_ios_query), get_transition_mask_ios,
                        ios_gt_mode_start_query % "walking", ios_gt_mode_end_query % "walking")

#### Easy trip: unimodal, android

Checking some of the outliers in the HAHFDC case.
- case #1: turns out to be due to a bad timestamp. The bicycling activity is detected as starting from `10:34:51` which was actually the end of the previous trip. For this trip, we walked from `11:48:06` to `11:48:49` and biked after that. So the closest walk start transition was after the bike ride, and the closest walk end transition was the end of the prior car trip.

In [None]:
tradeoff_df.query("timeline == 'unimodal_trip_car_bike_mtv_la' & phone_os == 'android'").boxplot(column=["expanded_end_diff_mins"], by=["quality"])

In [None]:
tradeoff_df.query("timeline == 'unimodal_trip_car_bike_mtv_la' & phone_os == 'android' & role == 'HAHFDC' & run == 1")

In [None]:
eval_range = pv_la.map()['android']['ucb-sdb-android-2']["evaluation_ranges"][1]
eval_range["motion_activity_df"]["fmt_time"] = eval_range["motion_activity_df"].ts.apply(lambda ts: fmt(ts))

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][1]
eval_section_range = eval_trip_range["evaluation_section_ranges"][0]
eval_section_range["trip_id"], fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
eval_range["motion_activity_df"].query(valid_android_query).loc[56:72]

In [None]:
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_android_query), get_transition_mask_android,
                        android_gt_mode_start_query % 2, android_gt_mode_end_query % 2)

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][1]
eval_section_range = eval_trip_range["evaluation_section_ranges"][2]
fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_android_query), get_transition_mask_android,
                        android_gt_mode_start_query % 2, android_gt_mode_end_query % 2)

In [None]:
eval_range["motion_activity_df"].query(valid_android_query).loc[144:154]

In [None]:
fmt(eval_range["start_ts"]), fmt(eval_range["end_ts"])

#### Hard trip: multi-modal, ios

Checking some of the outliers in the MAHFDC case.
- case #1: turns out that the entire range from `18:12:28` -> `18:25:05` was detected as walking, so this is in fact correct

In [None]:
tradeoff_df.query("timeline == 'train_bus_ebike_mtv_ucb' & phone_os == 'ios'").boxplot(column=["expanded_start_diff_mins"], by=["quality"])

In [None]:
tradeoff_df.query("timeline == 'train_bus_ebike_mtv_ucb' & phone_os == 'ios' & role == 'MAHFDC' & run == 1")

In [None]:
eval_range = pv_ucb.map()['ios']['ucb-sdb-ios-3']["evaluation_ranges"][1]
eval_range["motion_activity_df"]["fmt_time"] = eval_range["motion_activity_df"].ts.apply(lambda ts: fmt(ts))

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][2]
eval_section_range = eval_trip_range["evaluation_section_ranges"][7]
eval_section_range["trip_id"], fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_ios_query), get_transition_mask_ios,
                        ios_gt_mode_start_query % "automotive", ios_gt_mode_end_query % "automotive")

In [None]:
eval_range["motion_activity_df"].query(valid_ios_query).loc[457:503]

#### Hard trip: multi-modal, android

Checking some of the outliers in the HAMFDC case.
- case #1: turns out that the entire range from `10:29:23` -> `14:15:19` was detected as walking, so we had no transition at the start of this trip section and this is correct

```
271 	1563989363.918000 	2 	82 	1563989363.918000 	2019-07-24T10:29:23.918000-07:00 	2.684241
272 	1564002919.547000 	2 	96 	1564002919.547000 	2019-07-24T14:15:19.547000-07:00 	6.449693
```

In [None]:
tradeoff_df.query("timeline == 'train_bus_ebike_mtv_ucb' & phone_os == 'android'").boxplot(column=["expanded_start_diff_mins"], by=["quality"])

In [None]:
tradeoff_df.query("timeline == 'train_bus_ebike_mtv_ucb' & phone_os == 'android' & role == 'HAMFDC' & run == 0")

In [None]:
eval_range = pv_ucb.map()['android']['ucb-sdb-android-3']["evaluation_ranges"][0]
eval_range["motion_activity_df"]["fmt_time"] = eval_range["motion_activity_df"].ts.apply(lambda ts: fmt(ts))

In [None]:
eval_trip_range = eval_range["evaluation_trip_ranges"][1]
eval_section_range = eval_trip_range["evaluation_section_ranges"][0]
eval_section_range["trip_id"], fmt(eval_section_range["start_ts"]), fmt(eval_section_range["end_ts"])

In [None]:
get_count_start_end_diff(eval_section_range, None, eval_range["motion_activity_df"].query(valid_android_query), get_transition_mask_android,
                        android_gt_mode_start_query % 2, android_gt_mode_end_query % 2)

In [None]:
eval_range["motion_activity_df"].query(valid_android_query).loc[250:300]