## Classification Unimodal Metric Analysis

The goal of this notebook is to assess the accuracy of the classification assignment in the ML algorithm with regard to different travel modes.

## Set up the dependencies

In [None]:
# for reading and validating data
import emeval.input.spec_details as eisd
import emeval.input.phone_view as eipv
import emeval.input.eval_view as eiev

In [None]:
# Visualization helpers
import emeval.viz.phone_view as ezpv
import emeval.viz.eval_view as ezev
import emeval.viz.geojson as ezgj

In [None]:
# Analytics results
import emeval.metrics.segmentation as ems

In [None]:
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format
import arrow
import numpy as np

In [None]:
# For plots
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# For maps
import folium
import branca.element as bre

In [None]:
# For easier debugging while working on modules
import importlib

In [None]:
import arrow

## The spec

The spec defines what experiments were done, and over which time ranges. Once the experiment is complete, most of the structure is read back from the data, but we use the spec to validate that it all worked correctly. The spec also contains the ground truth for the legs. Here, we read the spec for 
* the trip to LA for a unimodal timeline analysis
* the trip to San Jose for a multimodal analysis
* the trip to ... for a ... analysis of data combined over a trip or section

In [None]:
# Author identity and datastore address passed to every spec lookup below
AUTHOR_EMAIL = "shankari@eecs.berkeley.edu"
DATASTORE_LOC = "http://localhost:8080"

# One ServerSpecDetails per timeline, constructed in the order LA, San Jose, UCB
sd_la, sd_sj, sd_ucb = [
    eisd.ServerSpecDetails(DATASTORE_LOC, AUTHOR_EMAIL, spec_id)
    for spec_id in ("unimodal_trip_car_bike_mtv_la",
                    "car_scooter_brex_san_jose",
                    "train_bus_ebike_mtv_ucb")
]

In [None]:
# Phone view for the LA timeline (spec id "unimodal_trip_car_bike_mtv_la").
# NOTE(review): presumably this reads the collected phone data back from the
# datastore and can be slow — confirm in emeval.input.phone_view.
pv_la = eipv.PhoneView(sd_la)

In [None]:
# Phone view for the San Jose timeline (spec id "car_scooter_brex_san_jose").
# NOTE(review): presumably this reads the collected phone data back from the
# datastore and can be slow — confirm in emeval.input.phone_view.
pv_sj = eipv.PhoneView(sd_sj)

In [None]:
# Phone view for the UCB timeline (spec id "train_bus_ebike_mtv_ucb").
# NOTE(review): presumably this reads the collected phone data back from the
# datastore and can be slow — confirm in emeval.input.phone_view.
pv_ucb = eipv.PhoneView(sd_ucb)

Get the sensed section data for each trip

In [None]:
# Fill in the sensed section ranges on every phone view. The later cells read
# tr["sensed_section_ranges"], so this has to run before them.
for phone_view in (pv_la, pv_sj, pv_ucb):
    ems.fill_sensed_section_ranges(phone_view)

`TODO:` We want a `sensed_section_mode` entry; I will check whether any existing notebooks have a good map for this.

In [None]:
def get_sensed_mode_entries(pv):
    """Collect one mode-check record per ground-truth section of a phone view.

    Walks every phone in ``pv.map()``, skipping phones whose role contains
    "control". For each evaluation trip range, the ground-truth sections are
    matched against the sensed sections, and every section whose ground-truth
    leg is not of type "WAITING" yields one record.

    Parameters
    ----------
    pv : phone view
        Must provide ``map()`` and ``spec_details``; the trip ranges are
        expected to already carry ``sensed_section_ranges``.

    Returns
    -------
    list of dict
        Each dict holds the identifying fields (phone, timeline, range, run,
        duration, role, section count, trip and section ids) merged with the
        dict returned by ``ems.get_mode_check_results``.
    """
    entries = []
    for os_name, phones in pv.map().items():
        for label, details in phones.items():
            role = details["role"]
            if "control" in role:
                print("Ignoring %s phone %s since they are always on" % (role, label))
                continue
            # only evaluation ranges are walked here; calibration ranges are not used
            for eval_range in details["evaluation_ranges"]:
                for trip in eval_range["evaluation_trip_ranges"]:
                    gt_sections = trip["evaluation_section_ranges"]
                    sensed_sections = trip["sensed_section_ranges"]
                    # match ground-truth sections to sensed sections once per trip
                    match_map = ems.find_matching_segments(gt_sections, "trip_id", sensed_sections)
                    for gt_section in gt_sections:
                        gt_leg = pv.spec_details.get_ground_truth_for_leg(
                            trip['trip_id_base'],
                            gt_section['trip_id_base'],
                            trip['start_ts'],
                            trip['end_ts'])
                        if gt_leg["type"] == "WAITING":
                            print("Skipping WAITING section %s %s with potential partway transitions" %
                                  (trip["trip_id"], gt_section["trip_id"]))
                            continue
                        # this calculates the metric for the mode
                        mode_check = ems.get_mode_check_results(gt_section, gt_leg, match_map)
                        entry = {
                            "phone_os": os_name,
                            "phone_label": label,
                            "timeline": pv.spec_details.curr_spec["id"],
                            "range_id": eval_range["trip_id"],
                            "run": eval_range["trip_run"],
                            "duration": eval_range["duration"],
                            "role": eval_range["eval_role_base"],
                            "section_count": len(sensed_sections),
                            "trip_id": trip["trip_id"],
                            "section_id": gt_section["trip_id"],
                        }
                        entry.update(mode_check)
                        entries.append(entry)

    return entries

In [None]:
# Flatten the per-timeline records (LA, San Jose, UCB, in that order) into one
# list, then build a single DataFrame from it
sensed_mode_entries_list = [
    entry
    for phone_view in (pv_la, pv_sj, pv_ucb)
    for entry in get_sensed_mode_entries(phone_view)
]
sensed_mode_df = pd.DataFrame(sensed_mode_entries_list)

## Data Frame

For the purpose of this study, we focus on the ... headers

In [None]:
# Preview the first rows of the per-section mode-check results
sensed_mode_df.head()

In [None]:
# Array of the distinct ground-truth travel modes found in the results
modes = sensed_mode_df["gt_mode"].unique()
modes

# Plot Mode vs. Matching Percentage data

#### First, we plot just the base modes

In [None]:
# One boxplot panel of matching_pct per ground-truth base mode.
# The column count comes from the data instead of a hard-coded 3, so a
# different mix of timelines neither overruns the axes array (IndexError)
# nor leaves blank panels.
base_modes = sensed_mode_df.gt_base_mode.unique()
# squeeze=False makes subplots return a 2-D axes grid even for a single panel
ifig, ax_grid = plt.subplots(nrows=1, ncols=max(len(base_modes), 1), figsize=(24,12), dpi=300,
                             sharex=False, sharey=False, squeeze=False)
ax_array = ax_grid[0]  # 1-D row of axes, indexed like the original ax_array
for i, mode in enumerate(base_modes):
    sensed_mode_df.query(f"gt_base_mode == '{mode}'").boxplot(ax=ax_array[i],
                                                              column=['matching_pct'],
                                                              by=["gt_base_mode"])

#### Now we plot all the available modes

In [None]:
# One boxplot panel of matching_pct per ground-truth mode, laid out 4 per row.
# The row count is derived from the number of modes instead of a hard-coded
# 2x4 grid, so more than 8 modes no longer raises IndexError.
all_modes = sensed_mode_df.gt_mode.unique()
n_cols = 4
n_rows = max(1, (len(all_modes) + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps ax_array 2-D even when there is only one row
ifig, ax_array = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(24,12), dpi=300,
                              sharex=False, sharey=False, squeeze=False)
flat_axes = ax_array.flatten()
for i, mode in enumerate(all_modes):
    sensed_mode_df.query(f"gt_mode == '{mode}'").boxplot(ax=flat_axes[i],
                                                         column=['matching_pct'],
                                                         by=["gt_mode"])
# hide the grid cells that have no mode to show
for unused_ax in flat_axes[len(all_modes):]:
    unused_ax.set_visible(False)

#### Finally, we see if any individual modes stick out among their respective base modes

In [None]:
# One panel per ground-truth base mode; within each panel, one box of
# matching_pct per ground-truth mode belonging to that base mode.
# The column count comes from the data instead of a hard-coded 3, so a
# different mix of timelines neither overruns the axes array (IndexError)
# nor leaves blank panels.
base_modes = sensed_mode_df.gt_base_mode.unique()
# squeeze=False makes subplots return a 2-D axes grid even for a single panel
ifig, ax_grid = plt.subplots(nrows=1, ncols=max(len(base_modes), 1), figsize=(24,12), dpi=300,
                             sharex=False, sharey=False, squeeze=False)
ax_array = ax_grid[0]  # 1-D row of axes, indexed like the original ax_array
for i, mode in enumerate(base_modes):
    sensed_mode_df.query(f"gt_base_mode == '{mode}'").boxplot(ax=ax_array[i],
                                                              column=['matching_pct'],
                                                              by=["gt_mode"])
# set the figure title last; the trailing ';' hides the Text repr in the cell output
ifig.suptitle("Boxplot grouped by gt_mode distributed by gt_base_mode");

## And for those who care about the numbers

In [None]:
# Summary statistics of the results, one captioned table per ground-truth base mode
for base_mode in sensed_mode_df.gt_base_mode.unique():
    base_mode_rows = sensed_mode_df.query(f"gt_base_mode == '{base_mode}'")
    base_mode_summary = base_mode_rows.describe()
    display(base_mode_summary.style.set_caption(f"BASE MODE: {base_mode}"))

In [None]:
# Summary statistics of the results, one captioned table per ground-truth mode
for gt_mode in sensed_mode_df.gt_mode.unique():
    mode_rows = sensed_mode_df.query(f"gt_mode == '{gt_mode}'")
    mode_summary = mode_rows.describe()
    display(mode_summary.style.set_caption(f"MODE: {gt_mode}"))

`TODO:` handle outliers

`TODO:` figure out what modes are most likely to get mixed up