## Set up the dependencies

In [None]:
# for reading and validating data
import emeval.input.spec_details as eisd
import emeval.input.phone_view as eipv
import emeval.input.eval_view as eiev

In [None]:
# Visualization helpers
import emeval.viz.phone_view as ezpv
import emeval.viz.eval_view as ezev

In [None]:
# Analytics results
import emeval.metrics.segmentation as ems

In [None]:
# For plots
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
%matplotlib inline

import IPython.display as ipyd

In [None]:
# For maps
import folium
import branca.element as bre

In [None]:
# For easier debugging while working on modules
import importlib

In [None]:
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format
import arrow
import numpy as np

In [None]:
# Named durations, expressed in seconds (minutes * 60)
SECONDS_PER_MINUTE = 60
THIRTY_MINUTES = 30 * SECONDS_PER_MINUTE
TEN_MINUTES = 10 * SECONDS_PER_MINUTE

## The spec

The spec defines what experiments were done, and over which time ranges. Once the experiment is complete, most of the structure is read back from the data, but we use the spec to validate that it all worked correctly. The spec also contains the ground truth for the legs. Here, we read the specs for three timelines: `unimodal_trip_car_bike_mtv_la`, `car_scooter_brex_san_jose` and `train_bus_ebike_mtv_ucb` (the trip to UC Berkeley).

In [None]:
# Both values are passed unchanged to every FileSpecDetails below
DATASTORE_LOC = "bin/data/"
AUTHOR_EMAIL = "shankari@eecs.berkeley.edu"

# One spec per timeline, constructed in the same order as before: la, sj, ucb
sd_la, sd_sj, sd_ucb = [
    eisd.FileSpecDetails(DATASTORE_LOC, AUTHOR_EMAIL, spec_id)
    for spec_id in ("unimodal_trip_car_bike_mtv_la",
                    "car_scooter_brex_san_jose",
                    "train_bus_ebike_mtv_ucb")
]

## The views

There are two main views for the data - the phone view and the evaluation view. 

### Phone view

In the phone view, the phone is primary, and then there is a tree that you can traverse to get the data that you want. Traversing that tree typically involves nested for loops; here's an example of loading the phone view and traversing it. You can replace the print statements with real code. When you are ready to check this in, please move the function to one of the python modules so that we can invoke it more generally.

In [None]:
importlib.reload(eipv)

In [None]:
pv_la = eipv.PhoneView(sd_la)

In [None]:
pv_sj = eipv.PhoneView(sd_sj)

In [None]:
pv_ucb = eipv.PhoneView(sd_ucb)

In [None]:
# Integer activity codes found in android entries -> mode names
ANDROID_MODE_MAP = {
    0: "AUTOMOTIVE",
    1: "CYCLING",
    2: "WALKING",
    3: "STATIONARY",
}

def ANDROID_MAP_FN(t):
    """Return the mode name for the activity code stored under the "zzbhB" key of `t`.

    Raises KeyError if `t` has no "zzbhB" key or the code is not in ANDROID_MODE_MAP.
    """
    return ANDROID_MODE_MAP[t["zzbhB"]]

def IOS_MAP_FN(t):
    """Map an iOS entry to a single upper-cased mode name.

    `t` is converted to a pandas Series; the labels whose value compares equal
    to True are collected. If exactly one label is set, its upper-cased name is
    returned; otherwise (none or several set) the string "INVALID" is returned.
    """
    flags = pd.Series(t)
    set_labels = flags.loc[flags.eq(True)].index.tolist()
    if len(set_labels) != 1:
        # Zero or multiple modes flagged at once.
        # Do something more sophisticated here?
        return "INVALID"
    return set_labels[0].upper()

# Per-OS dispatch tables, keyed by the phone OS name
MAP_FNS = dict(android=ANDROID_MAP_FN, ios=IOS_MAP_FN)
TRANSITION_FNS = dict(android=ems.get_transition_mask_android,
                      ios=ems.get_transition_mask_ios)

In [None]:
importlib.reload(ems)

In [None]:
# Fill in the sensed section ranges for every phone view, in the same order as before
for curr_pv in (pv_la, pv_sj, pv_ucb):
    ems.fill_sensed_section_ranges(curr_pv)

In [None]:
importlib.reload(ems)

In [None]:
# Ground truth mode -> coarse base mode it is compared against
BASE_MODE = {
    "WALKING": "WALKING",
    # human-powered or light wheeled modes
    "BICYCLING": "CYCLING",
    "ESCOOTER": "CYCLING",
    # motorized modes
    "BUS": "AUTOMOTIVE",
    "TRAIN": "AUTOMOTIVE",
    "LIGHT_RAIL": "AUTOMOTIVE",
    "SUBWAY": "AUTOMOTIVE",
    "CAR": "AUTOMOTIVE",
}

In [None]:
def get_tradeoff_entries(pv):
    """Build one record per non-WAITING ground truth section in a phone view.

    Walks pv.map() (phone OS -> phone label -> phone details), skips phones whose
    role contains "control", and for every evaluation range / trip / section
    combines the identifying fields and the battery drain of the range with the
    dict returned by ems.get_mode_check_results.

    :param pv: phone view exposing map() and spec_details
    :return: list of dicts, suitable for pd.DataFrame(...)
    """
    phone_indent = 4 * ' '
    range_indent = 8 * ' '
    entries = []
    for phone_os, phone_map in pv.map().items():
        print(15 * "=*")
        print(phone_os, phone_map.keys())
        for phone_label, phone_detail_map in phone_map.items():
            print(phone_indent, 15 * "-*")
            print(phone_indent, phone_label, phone_detail_map.keys())
            phone_role = phone_detail_map["role"]
            if "control" in phone_role:
                print("Ignoring %s phone %s since they are always on" % (phone_role, phone_label))
                continue
            # We work with evaluation ranges here; calibration ranges are not used
            for eval_range in phone_detail_map["evaluation_ranges"]:
                print(range_indent, 30 * "=")
                print(range_indent, eval_range.keys())
                print(range_indent, eval_range["trip_id"], eval_range["eval_common_trip_id"],
                      eval_range["eval_role"], len(eval_range["evaluation_trip_ranges"]))
                # Battery drain is computed once per range: first reading minus last reading
                battery_pct = eval_range["battery_df"]["battery_level_pct"]
                battery_start, battery_end = battery_pct.iloc[0], battery_pct.iloc[-1]
                delta_battery = battery_start - battery_end
                print("Battery starts at %d, ends at %d, drain = %d" % (battery_start, battery_end, delta_battery))

                for eval_trip in eval_range["evaluation_trip_ranges"]:
                    matching_section_map = ems.find_matching_segments(
                        eval_trip["evaluation_section_ranges"], "trip_id",
                        eval_trip["sensed_section_ranges"])
                    print("For trip %s, found matching ranges %s" % (eval_trip["trip_id"], matching_section_map))
                    for section in eval_trip["evaluation_section_ranges"]:
                        section_gt_leg = pv.spec_details.get_ground_truth_for_leg(
                            eval_trip["trip_id_base"], section["trip_id_base"])
                        if section_gt_leg["type"] == "WAITING":
                            print("Skipping WAITING section %s %s with potential partway transitions" %
                                  (eval_trip["trip_id"], section["trip_id"]))
                            continue
                        result = ems.get_mode_check_results(section, section_gt_leg, matching_section_map)
                        # Key order matters: it determines the column order of the final dataframe
                        entry = dict(
                            phone_os=phone_os,
                            phone_label=phone_label,
                            timeline=pv.spec_details.curr_spec["id"],
                            range_id=eval_range["trip_id"],
                            run=eval_range["trip_run"],
                            duration=eval_range["duration"],
                            role=eval_range["eval_role_base"],
                            battery_drain=delta_battery,
                            section_count=len(eval_trip["sensed_section_ranges"]),
                            trip_id=eval_trip["trip_id"],
                            section_id=section["trip_id"],
                        )
                        entry.update(result)
                        entries.append(entry)

    return entries

In [None]:
importlib.reload(ems)

In [None]:
# Battery life is not examined at the evaluation trip level; we stop at the
# evaluation range since we want to capture the overall drain for the timeline.
# Entries are collected in the order: la, sj, ucb.
tradeoff_entries_list = [
    entry
    for curr_pv in (pv_la, pv_sj, pv_ucb)
    for entry in get_tradeoff_entries(curr_pv)
]
tradeoff_df = pd.DataFrame(tradeoff_entries_list)

## Add in other entries to the dataframe to allow us to plot better

In [None]:
# role name -> numeric quality code (used to order the plots)
r2q_map = {
    "power_control": 0,
    "HAMFDC": 1,
    "MAHFDC": 2,
    "HAHFDC": 3,
    "MAMFDC": 4,
    "accuracy_control": 5,
}
# numeric quality code -> tick label; the two control roles use shortened labels
q2r_map = {
    0: "power",
    1: "HAMFDC",
    2: "MAHFDC",
    3: "HAHFDC",
    4: "MAMFDC",
    5: "accuracy",
}

In [None]:
# Numeric quality code per row so that the plots come out in a fixed order;
# an unknown role raises KeyError
tradeoff_df["quality"] = tradeoff_df.role.apply(r2q_map.__getitem__)
# Ground truth duration in whole minutes (floor division by 60)
tradeoff_df["gt_duration_mins"] = tradeoff_df.gt_duration.floordiv(60)

## Timeline + section count variations

We should ideally have only one transition in every TRAVEL section

In [None]:
tradeoff_df.query("timeline=='unimodal_trip_car_bike_mtv_la' & run == 1 & role == 'HAMFDC'").section_id

In [None]:
tradeoff_df.head()

In [None]:
# Drop the two sections below for android phones only; every other row is kept.
# (An earlier variant of this filter additionally required gt_duration > 20 * 60.)
excluded_android_sections = ['commuter_rail_with_tunnels_0', 'inner_suburb_downtown_walk_0']
is_excluded = tradeoff_df.section_id.isin(excluded_android_sections) & (tradeoff_df.phone_os == 'android')
tradeoff_df_filtered = tradeoff_df[~is_excluded]
tradeoff_df_filtered.section_id.unique()

In [None]:
'tt' not in 'tt_city_escooter_city_bus_rapid_transit_0'

In [None]:
# Boxplots of matching_pct grouped by configured quality:
# one column per timeline, one row per phone OS (android on top, ios below).
timeline_list = ["train_bus_ebike_mtv_ucb", "car_scooter_brex_san_jose", "unimodal_trip_car_bike_mtv_la"]
os_list = ["android", "ios"]
# squeeze=False keeps ax_array two-dimensional even if one of the lists shrinks to a single entry
ifig, ax_array = plt.subplots(nrows=len(os_list), ncols=len(timeline_list), figsize=(9,6),
                              sharex=False, sharey=False, squeeze=False)
for i, tl in enumerate(timeline_list):
    for j, curr_os in enumerate(os_list):
        curr_df = tradeoff_df_filtered.query("timeline == @tl & phone_os == @curr_os")
        # number of entries that go into this panel
        print(len(curr_df))
        curr_df.boxplot(ax = ax_array[j][i], column=["matching_pct"], by=["quality"])
        # only the top row carries the timeline name
        ax_array[j][i].set_title(tl if j == 0 else "")

# Replace the numeric quality codes on the x axis with the role names
for ax in ax_array.flatten():
    ax.set_xticklabels([q2r_map[int(t.get_text())] for t in ax.get_xticklabels()])
    ax.set_xlabel("")

# The plotted column is matching_pct, so label the axes and the figure accordingly
ax_array[0][0].set_ylabel("matching_pct (android)")
ax_array[1][0].set_ylabel("matching_pct (ios)")
ifig.suptitle("matching_pct v/s configured quality over multiple timelines")

In [None]:
tradeoff_df_filtered.plot(x="gt_duration", y="matching_pct", kind='scatter')

In [None]:
tradeoff_df_filtered.query("matching_pct > 1").plot(x="gt_duration", y="matching_pct", kind='scatter')

In [None]:
tradeoff_df_filtered.query("matching_pct <= 1").plot(x="gt_duration", y="matching_pct", kind='scatter')

In [None]:
# Summarize matching_pct (mean / min / max) for every ground truth duration bucket
matching_pct_range_list = []
for duration_mins, duration_df in tradeoff_df_filtered.groupby("gt_duration_mins"):
    bucket_pct = duration_df.matching_pct
    bucket_stats = {"gt_duration_mins": duration_mins,
                    "mean": bucket_pct.mean(),
                    "min": bucket_pct.min(),
                    "max": bucket_pct.max()}
    print (duration_mins, bucket_stats["mean"], bucket_stats["min"], bucket_stats["max"])
    matching_pct_range_list.append(bucket_stats)
matching_pct_range_df = pd.DataFrame(matching_pct_range_list)

# Plot the mean with asymmetric error bars reaching down to the min and up to the max
ifig, ax = plt.subplots(1,1, figsize=(4,4), squeeze=True)
range_mean = matching_pct_range_df["mean"]
lower_err = range_mean - matching_pct_range_df["min"]
upper_err = matching_pct_range_df["max"] - range_mean
ax.errorbar(matching_pct_range_df.gt_duration_mins, y=range_mean, yerr=[lower_err, upper_err])

In [None]:
# Same error bar plot, restricted to duration buckets above 10 minutes
matching_pct_range_df_filtered = matching_pct_range_df[matching_pct_range_df.gt_duration_mins > 10]
ifig, ax = plt.subplots(1,1, figsize=(4,4), squeeze=True)
filtered_mean = matching_pct_range_df_filtered["mean"]
filtered_lower_err = filtered_mean - matching_pct_range_df_filtered["min"]
filtered_upper_err = matching_pct_range_df_filtered["max"] - filtered_mean
ax.errorbar(matching_pct_range_df_filtered.gt_duration_mins, y=filtered_mean,
            yerr=[filtered_lower_err, filtered_upper_err])

In [None]:
np.array(list(zip(np.repeat([1], 37), np.repeat([10], 37)))).shape

In [None]:
tradeoff_df_filtered.sort_values(by="matching_pct", ascending=False)

In [None]:
tradeoff_df.matching_pct.min(), tradeoff_df.matching_pct.max()

In [None]:
# Rows for the android HAHFDC phone on the berkeley_to_mtv_SF_express_bus_0 trip of the ucb timeline
out_of_battery_phones = tradeoff_df.query(
    "timeline=='train_bus_ebike_mtv_ucb' & role=='HAHFDC' & trip_id=='berkeley_to_mtv_SF_express_bus_0' & phone_os == 'android'")
# Blank out end_diff_mins for those rows; nothing is touched when no row matches
if len(out_of_battery_phones.index) > 0:
    tradeoff_df.loc[out_of_battery_phones.index, "end_diff_mins"] = float('nan')

### Anomaly checks

We can clearly see that there are several outliers with the start/end timestamps for the sections. Let us explore these in greater detail and see if we can find any patterns.

In [None]:
def fmt(ts):
    """Convert `ts` to an arrow object in the America/Los_Angeles timezone."""
    return arrow.get(ts).to("America/Los_Angeles")


def check_outlier(eval_range, trip_idx, section_id, base_mode):
    """Print and display debugging details for one ground truth section.

    Shows the ground truth section boundaries, the sensed sections of the trip,
    the sensed sections matched to this section, the fraction of the ground truth
    duration covered by matched sections whose mode is `base_mode`, and the motion
    activity around the section.

    Side effect: adds a "fmt_time" column to the motion_activity_df of both the
    evaluation range and the selected trip.

    :param eval_range: evaluation range containing "evaluation_trip_ranges" and "motion_activity_df"
    :param trip_idx: position of the trip within eval_range["evaluation_trip_ranges"]
    :param section_id: "trip_id" of the ground truth section to inspect
    :param base_mode: mode that a sensed section must have to count as matching
    :return: None; everything is printed or displayed
    """
    eval_trip = eval_range["evaluation_trip_ranges"][trip_idx]
    # Add a human-readable time column to the range-level and the trip-level activity
    for activity_owner in (eval_range, eval_trip):
        activity_df = activity_owner["motion_activity_df"]
        activity_df["fmt_time"] = activity_df.ts.apply(fmt)

    candidate_sections = [s for s in eval_trip["evaluation_section_ranges"] if s["trip_id"] == section_id]
    eval_section = candidate_sections[0]
    section_start_ts = eval_section["start_ts"]
    section_end_ts = eval_section["end_ts"]
    print(fmt(section_start_ts), "->", fmt(section_end_ts))

    sensed_summary = [(fmt(ssr["start_ts"]), fmt(ssr["end_ts"]), ssr["mode"])
                      for ssr in eval_trip["sensed_section_ranges"]]
    print(sensed_summary)

    matching_section_map = ems.find_matching_segments(
        eval_trip["evaluation_section_ranges"], "trip_id", eval_trip["sensed_section_ranges"])
    sensed_section_range = matching_section_map[section_id]["match"]
    print([(fmt(cm["start_ts"]), fmt(cm["end_ts"]), cm["mode"]) for cm in sensed_section_range])

    # Only matched sections sensed with the expected base mode count towards the match
    matching_sections = [s for s in sensed_section_range if s["mode"] == base_mode]
    print("For %s (%s -> %s) %s, matching_sections = %s" % 
        (eval_section["trip_id"], section_start_ts, section_end_ts, base_mode,
        matching_sections))
    matching_ts = sum(s["end_ts"] - s["start_ts"] for s in matching_sections)
    gt_duration = section_end_ts - section_start_ts
    print("matching_ts = %s, ground_truth ts = %s" % (matching_ts, gt_duration))
    matching_pct = matching_ts / gt_duration
    print(matching_pct)

    print("section activity head")
    ipyd.display(eval_section["motion_activity_df"].head(n=3))
    print("section activity tail")
    ipyd.display(eval_section["motion_activity_df"].tail(n=3))
    print("post-section end activity head")
    # the query refers to the local variable section_end_ts by name
    ipyd.display(eval_range["motion_activity_df"].query("@section_end_ts <= ts <= @section_end_ts + 30 * 60").head())

In [None]:
def check_outlier_expanded(eval_range, trip_idx, section_id, base_mode, spec_details=None):
    """Like check_outlier, but also display the section transitions for the trip.

    In addition to the ground truth / sensed section comparison, this recomputes
    the section transitions from the trip's motion activity extended by the 30
    minutes after the trip end, and displays them before and after the optional
    "trip ends in walking" adjustment.

    Side effect: adds a "fmt_time" column to the motion_activity_df of both the
    evaluation range and the selected trip.

    :param eval_range: evaluation range containing "evaluation_trip_ranges" and "motion_activity_df"
    :param trip_idx: position of the trip within eval_range["evaluation_trip_ranges"]
    :param section_id: "trip_id" of the ground truth section to inspect
    :param base_mode: mode that a sensed section must have to count as matching
    :param spec_details: object providing get_ground_truth_for_leg for the timeline that
        eval_range belongs to. Defaults to pv_ucb.spec_details, which is what this
        function previously always used.
    :return: None; everything is printed or displayed

    NOTE(review): the "android" keys are hard-coded below, so this only makes sense
    for android evaluation ranges - confirm before using it with ios data.
    """
    eval_trip = eval_range["evaluation_trip_ranges"][trip_idx]
    eval_range["motion_activity_df"]["fmt_time"] = eval_range["motion_activity_df"].ts.apply(fmt)
    eval_trip["motion_activity_df"]["fmt_time"] = eval_trip["motion_activity_df"].ts.apply(fmt)
    eval_section = [s for s in eval_trip["evaluation_section_ranges"] if s["trip_id"] == section_id][0]
    print(fmt(eval_section["start_ts"]), "->", fmt(eval_section["end_ts"]))
    print([(fmt(ssr["start_ts"]), fmt(ssr["end_ts"]), ssr["mode"]) for ssr in eval_trip["sensed_section_ranges"]])
    trip_ma_df = eval_trip["motion_activity_df"]
    # we may get some transitions after the trip ends
    # let's expand the activity range to account for that
    trip_end_ts = eval_trip["end_ts"]
    extended_ma_df = eval_range["motion_activity_df"].query("@trip_end_ts <= ts <= @trip_end_ts + 30 * 60")
    ma_df = pd.concat([trip_ma_df, extended_ma_df],
            axis="index")

    curr_trip_section_transitions = ems.find_section_transitions(
        ma_df.query(ems.VALID_QUERIES_NO_STILL["android"]), ems.TRANSITION_FNS["android"])

    ipyd.display(curr_trip_section_transitions)

    last_section = eval_trip["evaluation_section_ranges"][-1]
    if spec_details is None:
        # Backwards compatible default: the ground truth used to be read from the pv_ucb global
        spec_details = pv_ucb.spec_details
    last_section_gt = spec_details.get_ground_truth_for_leg(eval_trip["trip_id_base"], last_section["trip_id_base"])
    if last_section_gt["mode"] == "WALKING":
        # For trip that end in walking, we need to include still transitions as valid
        # otherwise, there is no end transition from walking to a valid mode
        if len(curr_trip_section_transitions) > 0:
            curr_last_transition_ts = curr_trip_section_transitions.iloc[-1].ts
        else:
            curr_last_transition_ts = 0
        print("Trip ending in walking found, checking for any final still transitions > %s" % curr_last_transition_ts)
        still_section_transitions = extended_ma_df.query("ts > @curr_last_transition_ts").query(ems.STILL_ENTRIES["android"])
        if len(still_section_transitions) > 0:
            # DataFrame.append was removed in pandas 2.0; concatenating the first row as a
            # one-row frame keeps its index label and adds the same single transition
            curr_trip_section_transitions = pd.concat(
                [curr_trip_section_transitions, still_section_transitions.iloc[[0]]])

    ipyd.display(curr_trip_section_transitions)

    matching_section_map = ems.find_matching_segments(eval_trip["evaluation_section_ranges"], "trip_id", eval_trip["sensed_section_ranges"])
    sensed_section_range = matching_section_map[section_id]["match"]
    print([(fmt(cm["start_ts"]), fmt(cm["end_ts"]), cm["mode"]) for cm in sensed_section_range])
    # Only matched sections sensed with the expected base mode count towards the match
    matching_sections = [s for s in sensed_section_range if s["mode"] == base_mode]
    print("For %s (%s -> %s) %s, matching_sections = %s" % 
        (eval_section["trip_id"], eval_section["start_ts"], eval_section["end_ts"], base_mode,
        matching_sections))
    matching_ts = sum([(s["end_ts"] - s["start_ts"]) for s in matching_sections])
    print("matching_ts = %s, ground_truth ts = %s" % (matching_ts, (eval_section["end_ts"] - eval_section["start_ts"])))
    matching_pct = matching_ts / (eval_section["end_ts"] - eval_section["start_ts"])
    print(matching_pct)
    print("section activity head")
    ipyd.display(eval_section["motion_activity_df"].head(n=3))
    print("section activity tail")
    ipyd.display(eval_section["motion_activity_df"].tail(n=3))
    section_end_ts = eval_section["end_ts"]
    print("post-section end activity head")
    ipyd.display(eval_range["motion_activity_df"].query("@section_end_ts <= ts <= @section_end_ts + 30 * 60").head())

#### sections which have matching pct > 1

This is mainly caused by matching a short ground truth section against a much longer sensed section of the same mode:

- Most of the highest values are from `walk_start` and `walk_end`. This is because we end up matching them with sections that correspond to the entire trip and not just the transitions. For example, `walk_end` is from `19:20:31 -> 19:20:57` but it matches the section from `19:01:53 -> 19:27:21` because it is all WALKING.

- looking at longer sections, the first "real" section is `walk to the bikeshare location_0`. Again, it was from `16:37:07 -> 2019-07-24T16:41:54` but we matched the entire `WALKING` range of `16:38:36 -> 17:21:13`

In [None]:
tradeoff_df.query("matching_pct > 0").sort_values(by="matching_pct")

In [None]:
check_outlier(pv_la.map()['ios']['ucb-sdb-ios-3']["evaluation_ranges"][0], 1, "walk_end_0", "WALKING")

In [None]:
check_outlier(pv_ucb.map()['ios']['ucb-sdb-ios-3']["evaluation_ranges"][0], 2, "walk to the bikeshare location_0", "WALKING")

#### sections which have 0 matching_pct

- suburb_city_driving_weekend_0: matches a walking trip, no motion activity until tracking actually stops. GT trip end for the `walk_start` section is `17:40:03`, first point in the motion activity df for the **range** is at `17:46:39`. The AUTOMOTIVE range GT end is `17:52:26`; the sensed range is from `18:33:45 -> 19:41:13`.
- similarly for `city_escooter`

   ```
   Ground truth = 16:18:07 -> 16:38:14
   (<Arrow [2019-07-22T16:11:09.955601-07:00]>, <Arrow [2019-07-22T16:59:30.826229-07:00]>, 'WALKING'
   (<Arrow [2019-07-22T16:59:30.826229-07:00]>, <Arrow [2019-07-22T17:01:30.321116-07:00]>, 'AUTOMOTIVE'
   (<Arrow [2019-07-22T17:01:30.321116-07:00]>, <Arrow [2019-07-22T17:02:54.217346-07:00]>, 'WALKING'
   (<Arrow [2019-07-22T17:02:54.217346-07:00]>, <Arrow [2019-07-22T17:34:33.386226-07:00]>, 'AUTOMOTIVE'
   (<Arrow [2019-07-22T17:34:33.386226-07:00]>, <Arrow [2019-07-22T17:46:59.568747-07:00]>, 'WALKING')
   ```

- for `commuter_rail_with_tunnels_0`

The phone ran out of battery during this section, so we need to exclude it.

- similarly for `inner_suburb_downtown_walk_0`

- for `suburb_city_driving_weekend_0`, classified as `CYCLING`


```

2019-07-27T17:40:03.318182-07:00 -> 2019-07-27T17:52:26.823849-07:00
[(<Arrow [2019-07-27T17:43:45.507000-07:00]>, <Arrow [2019-07-27T17:51:10.151000-07:00]>, 'CYCLING'
(<Arrow [2019-07-27T17:51:10.151000-07:00]>, <Arrow [2019-07-27T17:53:44.761000-07:00]>, 'AUTOMOTIVE')]
```

In [None]:
tradeoff_df.query("matching_pct == 0").head()

In [None]:
check_outlier(pv_la.map()['android']['ucb-sdb-android-3']["evaluation_ranges"][0], 0, "walk_start_0", "WALKING")

In [None]:
tradeoff_df.query("matching_pct == 0 & section_id != 'walk_start_0' and section_id != 'walk_end_0' & ((section_id != 'commuter_rail_with_tunnels_0' & section_id != 'inner_suburb_downtown_walk_0') | phone_os != 'android')")

In [None]:
check_outlier(pv_sj.map()['ios']['ucb-sdb-ios-3']["evaluation_ranges"][0], 1, "city_escooter_0", "CYCLING")

In [None]:
check_outlier_expanded(pv_ucb.map()['android']['ucb-sdb-android-2']["evaluation_ranges"][0], 2, "commuter_rail_with_tunnels_0", "AUTOMOTIVE")

In [None]:
check_outlier(pv_la.map()['android']['ucb-sdb-android-2']["evaluation_ranges"][0], 0, "suburb_city_driving_weekend_0", "AUTOMOTIVE")