In [None]:
# for reading and validating data
import emeval.input.spec_details as eisd
import emeval.input.phone_view as eipv
import emeval.input.eval_view as eiev

In [None]:
# Visualization helpers
import emeval.viz.phone_view as ezpv
import emeval.viz.eval_view as ezev
import emeval.viz.geojson as ezgj
import pandas as pd

In [None]:
# Metrics helpers
import emeval.metrics.dist_calculations as emd

In [None]:
# For computation
import numpy as np
import math
import scipy.stats as stats
import matplotlib.pyplot as plt

In [None]:
import geopandas as gpd
import shapely as shp
import folium

In [None]:
# Datastore endpoint and author identity passed to every SpecDetails below
DATASTORE_URL = "http://cardshark.cs.berkeley.edu"
AUTHOR_EMAIL = "shankari@eecs.berkeley.edu"
# One SpecDetails per spec; the third argument is the spec id
# (the same strings show up later as the "timeline" values)
sd_la = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "unimodal_trip_car_bike_mtv_la")
sd_sj = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "car_scooter_brex_san_jose")
sd_ucb = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "train_bus_ebike_mtv_ucb")
sd_ucb_reroute = eisd.SpecDetails(DATASTORE_URL, AUTHOR_EMAIL, "train_bus_ebike_sm_reroute_mtv_ucb")

In [None]:
import importlib
# Development aid: re-import the spec_details module in the running kernel.
# Objects already created from it (sd_la, sd_sj, ...) are not rebuilt by this call.
importlib.reload(eisd)

In [None]:
# Phone view built from sd_la
pv_la = eipv.PhoneView(sd_la)

In [None]:
# Phone view built from sd_sj
pv_sj = eipv.PhoneView(sd_sj)

In [None]:
# Phone view built from sd_ucb
pv_ucb = eipv.PhoneView(sd_ucb)

In [None]:
# Phone view built from sd_ucb_reroute
pv_ucb_reroute = eipv.PhoneView(sd_ucb_reroute)

In [None]:
import emeval.analysed.phone_view as eapv
# Development aid: re-import the module in the running kernel right after importing it
importlib.reload(eapv)

In [None]:
# Build one analysed view per phone view. Every call passes the same server URL
# ("http://localhost:8080") and the same three "analysis/..." keys; only the phone view differs.
# NOTE(review): presumably this requires a server listening on localhost:8080 — confirm before Run All.
av_la = eapv.create_analysed_view(pv_la, "http://localhost:8080", "analysis/recreated_location", "analysis/cleaned_trip", "analysis/cleaned_section")
av_sj = eapv.create_analysed_view(pv_sj, "http://localhost:8080", "analysis/recreated_location", "analysis/cleaned_trip", "analysis/cleaned_section")
av_ucb = eapv.create_analysed_view(pv_ucb, "http://localhost:8080", "analysis/recreated_location", "analysis/cleaned_trip", "analysis/cleaned_section")
av_ucb_reroute = eapv.create_analysed_view(pv_ucb_reroute, "http://localhost:8080", "analysis/recreated_location", "analysis/cleaned_trip", "analysis/cleaned_section")

In [None]:
# Spot check: write_fmt_time of every sensed trip range in the first evaluation range of ucb-sdb-ios-2 (av_sj)
[se["metadata"]["write_fmt_time"] for se in av_sj.map()["ios"]["ucb-sdb-ios-2"]["evaluation_ranges"][0]["sensed_trip_ranges"]]

### Validate distance calculations

Our x,y coordinates are in degrees (lon, lat). So when we calculate the distance between two points, it is also in degrees. In order for this to be meaningful, we need to convert it to a regular distance metric such as meters.

This is a complicated problem in general because our distance calculation applies 2-D spatial operations to a 3-D curved space. However, as documented in the shapely documentation, since our areas of interest are small, we can use a 2-D approximation and get reasonable results.

In order to get distances from degree-based calculations, we can use the following options:
- perform the calculations in degrees and then convert them to meters. As an approximation, we can use the fact that 360 degrees represents the circumference of the earth. Therefore `dist = degree_dist * (C/360)`
- convert degrees to x,y coordinates using utm (https://en.wikipedia.org/wiki/Universal_Transverse_Mercator_coordinate_system) and then calculate the distance
- since we calculate the distance from the ground truth linestring, calculate the closest ground truth point in (lon,lat) and then use the haversine formula (https://en.wikipedia.org/wiki/Haversine_formula) to calculate the distance between the two points

Let us quickly run all three calculations for three selected test cases and:
- check whether they are largely consistent
- compare with other distance calculators to see which are closer

### Results and method choice

We find that the `manual_utm` and `project` methods are pretty consistent, and are significantly different from the `circumference` method. The `circumference` method appears to be consistently greater than the other two and the difference appears to be around 25%. The manual checks also appear to be closer to the `manual_utm` and `project` values. The `manual_utm` and `project` values are consistently within ~ 5% of each other, so we could really use either one.

**We will use the utm approach** since it is correct, is consistent with the shapely documentation (https://shapely.readthedocs.io/en/stable/manual.html#coordinate-systems) and applicable to operations beyond distance calculation

> Even though the Earth is not flat – and for that matter not exactly spherical – there are many analytic problems that can be approached by transforming Earth features to a Cartesian plane, applying tried and true algorithms, and then transforming the results back to geographic coordinates. This practice is as old as the tradition of accurate paper maps.

## Spatial error calculation

In [None]:
def get_spatial_errors(pv):
    """
    Compute the distance of every sensed location from its ground truth route.

    Walks pv.map() (phone_os -> phone_label -> "evaluation_ranges" ->
    "evaluation_trip_ranges" -> "evaluation_section_ranges") and, for each
    section that has both a ground truth route and sensed locations, computes
    the distance between each filtered sensed point and the filtered ground
    truth linestring after converting both to UTM.

    :param pv: a view exposing map() and spec_details (e.g. the analysed views
        created in this notebook)
    :return: a single frame with the columns error, ts, geometry, phone_os,
        phone_label, role, timeline, run, trip_id and section_id, or an empty
        pd.DataFrame if no section produced any entries
    """
    # Collect the per-section frames and concatenate once at the end instead of
    # re-copying the accumulated frame for every section
    section_error_dfs = []

    for phone_os, phone_map in pv.map().items():
        for phone_label, phone_detail_map in phone_map.items():
            for (r_idx, r) in enumerate(phone_detail_map["evaluation_ranges"]):
                for (tr_idx, tr) in enumerate(r["evaluation_trip_ranges"]):
                    for (sr_idx, sr) in enumerate(tr["evaluation_section_ranges"]):
                        section_gt_leg = pv.spec_details.get_ground_truth_for_leg(tr["trip_id_base"], sr["trip_id_base"])
                        # GeoSeries of the shapes for this leg; the code below slices it
                        # by the "start_loc" and "end_loc" labels
                        section_gt_shapes = gpd.GeoSeries(eisd.SpecDetails.get_shapes_for_leg(section_gt_leg))
                        if len(section_gt_shapes) == 1:
                            # A single shape means there is no route to compare against
                            print("No ground truth route for %s %s, must be polygon, skipping..." % (tr["trip_id_base"], sr["trip_id_base"]))
                            assert section_gt_leg["type"] != "TRAVEL", "For %s, %s, %s, %s, %s found type %s" % (phone_os, phone_label, r_idx, tr_idx, sr_idx, section_gt_leg["type"])
                            continue
                        if len(sr['location_df']) == 0:
                            print("No sensed locations found, role = %s skipping..." % (r["eval_role_base"]))
                            # assert r["eval_role_base"] == "power_control", "Found no locations for %s, %s, %s, %s, %s" % (phone_os, phone_label, r_idx, tr_idx, sr_idx)
                            continue

                        print("Processing travel leg %s, %s, %s, %s, %s" %
                              (phone_os, phone_label, r["eval_role_base"], tr["trip_id_base"], sr["trip_id_base"]))
                        # This is a GeoDataFrame
                        section_geo_df = emd.to_geo_df(sr["location_df"])

                        # After this point, everything is in UTM so that 2-D inside/filtering operations work
                        utm_section_geo_df = emd.to_utm_df(section_geo_df)
                        utm_section_gt_shapes = section_gt_shapes.apply(lambda s: shp.ops.transform(emd.to_utm_coords, s))
                        filtered_us_gpdf = emd.filter_geo_df(utm_section_geo_df, utm_section_gt_shapes.loc["start_loc":"end_loc"])
                        filtered_gt_linestring = emd.filter_ground_truth_linestring(utm_section_gt_shapes)
                        meter_dist = filtered_us_gpdf.geometry.distance(filtered_gt_linestring)
                        ne = len(meter_dist)
                        # NOTE(review): "error" comes from the filtered frame while "ts" and
                        # "geometry" come from the unfiltered section_geo_df, and the repeated
                        # columns have length ne; confirm that filter_geo_df cannot drop rows
                        # in a way that makes these lengths disagree.
                        # The stored geometry is the pre-UTM one from section_geo_df.
                        curr_spatial_error_df = gpd.GeoDataFrame({"error": meter_dist,
                                                                  "ts": section_geo_df.ts,
                                                                  "geometry": section_geo_df.geometry,
                                                                  "phone_os": np.repeat(phone_os, ne),
                                                                  "phone_label": np.repeat(phone_label, ne),
                                                                  "role": np.repeat(r["eval_role_base"], ne),
                                                                  "timeline": np.repeat(pv.spec_details.CURR_SPEC_ID, ne),
                                                                  "run": np.repeat(r_idx, ne),
                                                                  "trip_id": np.repeat(tr["trip_id_base"], ne),
                                                                  "section_id": np.repeat(sr["trip_id_base"], ne)})
                        section_error_dfs.append(curr_spatial_error_df)

    # pd.concat raises on an empty list, so keep the original "empty frame" result
    if len(section_error_dfs) == 0:
        return pd.DataFrame()
    return pd.concat(section_error_dfs, axis="index")

In [None]:
# Compute the errors for every analysed view and stack them in one concat call
# (same order as before: la, sj, ucb, ucb_reroute), instead of repeatedly
# re-copying a frame that starts out empty
spatial_errors_df = pd.concat(
    [get_spatial_errors(av) for av in (av_la, av_sj, av_ucb, av_ucb_reroute)],
    axis="index")

In [None]:
# Preview the first rows of the combined error frame
spatial_errors_df.head()

In [None]:
# Let's merge the reroutes
# Assign the result back to the column: calling replace(..., inplace=True) on the
# attribute-accessed column is a chained in-place update, which does not modify
# spatial_errors_df when pandas Copy-on-Write is active.
spatial_errors_df["timeline"] = spatial_errors_df.timeline.replace("train_bus_ebike_sm_reroute_mtv_ucb", "train_bus_ebike_mtv_ucb")

In [None]:
# Role name -> numeric quality level (the position in the list is the level)
r2q_map = dict(zip(
    ["power_control", "HAMFDC", "MAHFDC", "HAHFDC", "accuracy_control"],
    range(5)))
# Numeric quality level -> short label used on plot axes
q2r_map = dict(enumerate(["power", "HAMFDC", "MAHFDC", "HAHFDC", "accuracy"]))

In [None]:
# Numeric quality level of each row's role (a role missing from r2q_map raises KeyError)
spatial_errors_df["quality"] = spatial_errors_df["role"].apply(r2q_map.__getitem__)
# Display label: the role with the '_control' substring removed
spatial_errors_df["label"] = spatial_errors_df["role"].apply(lambda role_name: role_name.replace('_control', ''))
# Timelines that remain after the reroute spec was merged into train_bus_ebike_mtv_ucb
timeline_list = [
    "train_bus_ebike_mtv_ucb",
    "car_scooter_brex_san_jose",
    "unimodal_trip_car_bike_mtv_la",
]

In [None]:
# Preview the last rows, now including the quality and label columns
spatial_errors_df.tail()

## Overall stats

In [None]:
# One panel per phone OS, error grouped by quality level, fliers hidden.
# Quality 0 (power_control) is excluded by the "quality > 0" filter.
ifig, ax_array = plt.subplots(nrows=1, ncols=2, figsize=(8, 2), sharey=True)

for ax, os_name in zip(ax_array, ["android", "ios"]):
    os_errors = spatial_errors_df.query("phone_os == @os_name & quality > 0")
    os_errors.boxplot(ax=ax, column=["error"], by=["quality"], showfliers=False)
    ax.set_title(os_name)

# Replace the numeric quality tick labels with the role labels
for ax in ax_array:
    role_labels = [q2r_map[int(tick.get_text())] for tick in ax.get_xticklabels()]
    ax.set_xticklabels(role_labels)
    ax.set_xlabel("")

ax_array[0].set_ylabel("Spatial error (meters)")
ifig.suptitle("Spatial trajectory error v/s quality (excluding outliers)", y=1.1)

In [None]:
# Same figure as the previous cell, but with the fliers shown.
# Quality 0 (power_control) is excluded by the "quality > 0" filter.
ifig, ax_array = plt.subplots(nrows=1, ncols=2, figsize=(8, 2), sharey=True)

for ax, os_name in zip(ax_array, ["android", "ios"]):
    os_errors = spatial_errors_df.query("phone_os == @os_name & quality > 0")
    os_errors.boxplot(ax=ax, column=["error"], by=["quality"])
    ax.set_title(os_name)

# Replace the numeric quality tick labels with the role labels
for ax in ax_array:
    role_labels = [q2r_map[int(tick.get_text())] for tick in ax.get_xticklabels()]
    ax.set_xticklabels(role_labels)
    ax.set_xlabel("")

ax_array[0].set_ylabel("Spatial error (meters)")
ifig.suptitle("Spatial trajectory error v/s quality", y=1.1)

### Split out results by timeline

In [None]:
# Grid of boxplots: one column per timeline, top row android, bottom row ios.
# Only the top row carries the timeline name as its title.
ifig, ax_array = plt.subplots(nrows=2, ncols=3, figsize=(12, 6), sharex=False, sharey=False)
timeline_list = ["train_bus_ebike_mtv_ucb", "car_scooter_brex_san_jose", "unimodal_trip_car_bike_mtv_la"]
for i, tl in enumerate(timeline_list):
    for row, (os_name, panel_title) in enumerate([("android", tl), ("ios", "")]):
        panel_errors = spatial_errors_df.query("timeline == @tl & phone_os == @os_name & quality > 0")
        panel_errors.boxplot(ax=ax_array[row][i], column=["error"], by=["quality"])
        ax_array[row][i].set_title(panel_title)

# Row-major walk: all of row 0, then all of row 1
for ax in ax_array.flat:
    role_labels = [q2r_map[int(tick.get_text())] for tick in ax.get_xticklabels()]
    ax.set_xticklabels(role_labels)
    ax.set_xlabel("")

ax_array[0][0].set_ylabel("Spatial error (android)")
ax_array[1][0].set_ylabel("Spatial error (iOS)")
ifig.suptitle("Spatial trajectory error v/s quality over multiple timelines")

### Split out results by section for the most complex timeline (train_bus_ebike_mtv_ucb)

In [None]:
# 2 x 4 grid for the single timeline below: columns are quality levels 1-4,
# row 2*i is android and row 2*i+1 is ios; each panel groups the error by section_id
ifig, ax_array = plt.subplots(nrows=2,ncols=4,figsize=(25,10), sharex=True, sharey=True)
timeline_list = ["train_bus_ebike_mtv_ucb"]
for i, tl in enumerate(timeline_list):
    for q in range(1,5):
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'android' & quality == @q")
        # Skip empty selections so that boxplot is not called without data
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i][q-1], column=["error"], by=["section_id"])
        ax_array[2*i][q-1].tick_params(axis="x", labelrotation=45)
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'ios' & quality == @q")
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i+1][q-1], column=["error"], by=["section_id"])
#        ax_array[i][].set_title("")

def make_acronym(s):
    """
    Shorten an underscore-separated name to the first letters of its parts.

    For example, "walk_to_bus" becomes "wtb". Empty parts (from doubled or
    trailing underscores) are skipped instead of raising IndexError.

    :param s: the underscore-separated string, e.g. a tick label
    :return: the acronym, or "" when s is empty or starts with an underscore
    """
    parts = s.split("_")
    # str.split always returns at least one element, so only the first part
    # needs checking; an empty first part (e.g. a blank tick label) maps to ""
    if len(parts[0]) == 0:
        return ""
    return "".join([part[0] for part in parts if len(part) > 0])

# Title each top-row panel with the role label and replace the bottom-row
# tick labels with their acronyms
for q in range(1,5):
    ax_array[0][q-1].set_title(q2r_map[q])
    curr_ticks = [t.get_text() for t in ax_array[1][q-1].get_xticklabels()]
    new_ticks = [make_acronym(t) for t in curr_ticks]
    ax_array[1][q-1].set_xticklabels(new_ticks)
    
# NOTE(review): this runs after the loop, so it prints the label -> acronym
# pairs of the last column (q == 4) only
print(list(zip(curr_ticks, new_ticks)))
# fig.text(0,0,"%s"% list(zip(curr_ticks, new_ticks)))

In [None]:
# One column per section of the timeline; row 2*i is android, row 2*i+1 is ios.
# Each panel groups the error by quality level (quality 0 is filtered out).
timeline_list = ["train_bus_ebike_mtv_ucb"]
for i, tl in enumerate(timeline_list):
    unique_sections = spatial_errors_df.query("timeline == @tl").section_id.unique()
    ifig, ax_array = plt.subplots(nrows=2,ncols=len(unique_sections),figsize=(40,10), sharex=True, sharey=False)
    for sid, s_name in enumerate(unique_sections):
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'android' & section_id == @s_name & quality > 0")
        # Skip empty selections so that boxplot is not called without data
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i][sid], column=["error"], by=["quality"])
        ax_array[2*i][sid].set_title(s_name)
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'ios' & section_id == @s_name & quality > 0")
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i+1][sid], column=["error"], by=["quality"])
        ax_array[2*i+1][sid].set_title("")
#        ax_array[i][].set_title("")

### Focus only on sections where the max error is > 1000 meters

In [None]:
# For each timeline, plot only the sections whose maximum error exceeds 1000:
# one column per such section, row 2*i android and row 2*i+1 ios, grouped by quality.
timeline_list = ["train_bus_ebike_mtv_ucb"]
for i, tl in enumerate(timeline_list):
    unique_sections = pd.Series(spatial_errors_df.query("timeline == @tl").section_id.unique())
    # Filter on the loop's timeline rather than a hard-coded name; tl is bound as a
    # lambda default so that @tl resolves in the lambda's own frame
    sections_with_outliers_mask = unique_sections.apply(lambda s_name, tl=tl: spatial_errors_df.query("timeline == @tl & section_id == @s_name").error.max() > 1000)
    sections_with_outliers = unique_sections[sections_with_outliers_mask]
    if len(sections_with_outliers) == 0:
        # plt.subplots rejects ncols=0, so there is nothing to draw for this timeline
        print("No sections with max error > 1000 for %s, skipping..." % tl)
        continue
    # squeeze=False keeps ax_array two-dimensional even when only one section matches
    ifig, ax_array = plt.subplots(nrows=2,ncols=len(sections_with_outliers),figsize=(17,4), sharex=True, sharey=False, squeeze=False)
    for sid, s_name in enumerate(sections_with_outliers):
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'android' & section_id == @s_name & quality > 0")
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i][sid], column=["error"], by=["quality"])
        ax_array[2*i][sid].set_title(s_name)
        ax_array[2*i][sid].set_xlabel("")
        sel_df = spatial_errors_df.query("timeline == @tl & phone_os == 'ios' & section_id == @s_name & quality > 0")
        if len(sel_df) > 0:
            sel_df.boxplot(ax = ax_array[2*i+1][sid], column=["error"], by=["quality"])
        ax_array[2*i+1][sid].set_title("")
        print([t.get_text() for t in ax_array[2*i+1][sid].get_xticklabels()])
        # Map the numeric quality tick labels to role labels, ignoring blank ticks
        ax_array[2*i+1][sid].set_xticklabels([q2r_map[int(t.get_text())] for t in ax_array[2*i+1][sid].get_xticklabels() if len(t.get_text()) > 0])
        ax_array[2*i+1][sid].set_xlabel("")
    # Blank out the figure-level title
    ifig.suptitle("")

### Validation of outliers

#### (express bus iOS, MAHFDC)

ok, so it looks like the error is non-trivial across all runs, but run #1 is the worst and is responsible for the majority of the outliers. And this is borne out by the map, where on run #1, we end up with points in San Leandro!!

In [None]:
# Runs with at least one ios / quality 2 (MAHFDC) / express_bus point whose error exceeds 500
spatial_errors_df.query("phone_os == 'ios' & quality == 2 & section_id == 'express_bus' & error > 500").run.unique()

In [None]:
# Error distribution per run for ios / quality 2 (MAHFDC) / express_bus
spatial_errors_df.query("phone_os == 'ios' & quality == 2 & section_id == 'express_bus'").boxplot(column="error", by="run")

In [None]:
# Map of the express_bus ground truth (green) against the sensed ios / quality 2 points of run 1 (red)
gt_leg = sd_ucb.get_ground_truth_for_leg("berkeley_to_mtv_SF_express_bus", "express_bus"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))
error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'ios' & quality == 2 & section_id == 'express_bus' & run == 1"))
# True only for the point(s) with the maximum error
# NOTE(review): presumably stickyfn selects which popups stay open — confirm in emeval.viz.geojson
gt_16k = lambda lr: lr["error"] == error_df.error.max()
folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color="red"), name="sensed_values").add_to(curr_map)
ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color="red", popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)
folium.LayerControl().add_to(curr_map)
curr_map

In [None]:
# Development aid: re-import the geojson visualization helpers in the running kernel
importlib.reload(ezgj)

In [None]:
# Map of the express_bus ground truth (green) against the sensed ios / quality 2 points
# of runs 0-2, one color per run
gt_leg = sd_ucb.get_ground_truth_for_leg("berkeley_to_mtv_SF_express_bus", "express_bus"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))

colors = ["red", "yellow", "blue"]
for run in range(3):
    error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'ios' & quality == 2 & section_id == 'express_bus' & run == @run"))
    # NOTE(review): the lambda looks up error_df when it is called, not when it is defined;
    # this is only per-run if get_fg_for_loc_df evaluates stickyfn before the next iteration — confirm
    gt_16k = lambda lr: lr["error"] == error_df.error.max()
    print("max error for run %d is %s" % (run, error_df.error.max()))
    folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color=colors[run]), name="sensed_values").add_to(curr_map)
    ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color=colors[run], popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)

folium.LayerControl().add_to(curr_map)
curr_map

#### (commuter rail aboveground android, HAMFDC)

Run 0: Multiple outliers at the start in San Jose. After that, everything is fine.

In [None]:
# Runs with at least one android / quality 1 (HAMFDC) / commuter_rail_aboveground point whose error exceeds 500
spatial_errors_df.query("phone_os == 'android' & quality == 1 & section_id == 'commuter_rail_aboveground' & error > 500").run.unique()

In [None]:
# Per-run boxplot restricted to the points whose error exceeds 500 (not the full distribution)
spatial_errors_df.query("phone_os == 'android' & quality == 1 & section_id == 'commuter_rail_aboveground' & error > 500").boxplot(column="error", by="run")

In [None]:
# Map of the commuter_rail_aboveground ground truth (green) against the sensed
# android / quality 1 points of run 0 (red)
gt_leg = sd_ucb.get_ground_truth_for_leg("mtv_to_berkeley_sf_bart", "commuter_rail_aboveground"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))

error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'android' & quality == 1 & section_id == 'commuter_rail_aboveground' & run == 0"))
# Largest error overall, largest below 10000 and largest below 1000
maxes = [error_df.error.max(), error_df[error_df.error < 10000].error.max(), error_df[error_df.error < 1000].error.max()]
# True for the points whose error is one of those three maxima
gt_16k = lambda lr: lr["error"] in maxes
folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color="red"), name="sensed_values").add_to(curr_map)
ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color="red", popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)

folium.LayerControl().add_to(curr_map)
curr_map

In [None]:
# Rows for android / quality 1 / commuter_rail_aboveground whose error exceeds 10000
spatial_errors_df.query("phone_os == 'android' & quality == 1 & section_id == 'commuter_rail_aboveground' & error > 10000")

#### (walk_to_bus android, HAMFDC, HAHFDC)

Huge zig zag when we get out of the BART station

In [None]:
# Runs with at least one android / quality 1 or 3 (HAMFDC, HAHFDC) / walk_to_bus point whose error exceeds 500
spatial_errors_df.query("phone_os == 'android' & (quality == 1 | quality == 3) & section_id == 'walk_to_bus' & error > 500").run.unique()

In [None]:
# The rows behind the previous cell: walk_to_bus points with error above 500
spatial_errors_df.query("phone_os == 'android' & (quality == 1 | quality == 3) & section_id == 'walk_to_bus' & error > 500")

In [None]:
# Error distribution per run for android / quality 1 or 3 / walk_to_bus
spatial_errors_df.query("phone_os == 'android' & (quality == 1 | quality == 3) & section_id == 'walk_to_bus'").boxplot(column="error", by="run")

In [None]:
# Largest error for android / quality 1 or 3 / walk_to_bus
spatial_errors_df.query("phone_os == 'android' & (quality == 1 | quality == 3) & section_id == 'walk_to_bus'").error.max()

In [None]:
# NOTE(review): error_df has not been reassigned in this section, so on a top-to-bottom
# run this shows the frame from the commuter_rail_aboveground map cell — confirm this is intended
error_df

In [None]:
# Drill into ucb-sdb-android-2: first evaluation range, first trip, section at index 6,
# then look up the ground truth leg for that trip/section pair
ucb_and_back = pv_ucb.map()["android"]["ucb-sdb-android-2"]["evaluation_ranges"][0]; ucb_and_back["trip_id"]
to_trip = ucb_and_back["evaluation_trip_ranges"][0]; print(to_trip["trip_id"])
wb_leg = to_trip["evaluation_section_ranges"][6]; print(wb_leg["trip_id"])
gt_leg = sd_ucb.get_ground_truth_for_leg(to_trip["trip_id_base"], wb_leg["trip_id_base"]); gt_leg["id"]

In [None]:
# Development aid: re-import the geojson visualization helpers in the running kernel
importlib.reload(ezgj)

In [None]:
# Map of the walk_to_bus ground truth (green) against the sensed android / quality 1 points (red).
# The query has no run filter, so points from every run are drawn together.
gt_leg = sd_ucb.get_ground_truth_for_leg("mtv_to_berkeley_sf_bart", "walk_to_bus"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))

error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'android' & quality == 1 & section_id == 'walk_to_bus'").sort_index(axis="index"))
# Largest error overall, largest below 16000 and largest below 5000
maxes = [error_df.error.max(), error_df[error_df.error < 16000].error.max(), error_df[error_df.error < 5000].error.max()]
# True for the points whose error is one of those three maxima
gt_16k = lambda lr: lr["error"] in maxes
print("Checking errors %s" % maxes)
folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color="red"), name="sensed_values").add_to(curr_map)
ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color="red", popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)

folium.LayerControl().add_to(curr_map)
curr_map

#### (light_rail_below_above_ground, android, accuracy_control)

ok, so it looks like the error is non-trivial across all runs, but run #1 is the worst and is responsible for the majority of the outliers. And this is borne out by the map, where on run #1, we end up with points in San Leandro!!

In [None]:
# Runs with at least one android / quality 4 (accuracy_control) / light_rail_below_above_ground
# point whose error exceeds 100
spatial_errors_df.query("phone_os == 'android' & quality == 4 & section_id == 'light_rail_below_above_ground' & error > 100").run.unique()

In [None]:
# spatial_errors_df.query("phone_os == 'android' & (quality == 4) & section_id == 'light_rail_below_above_ground'").boxplot(column="error", by="run")

In [None]:
# Drill into ucb-sdb-android-2: first evaluation range, trip at index 2, section at index 7,
# then look up the ground truth leg for that trip/section pair
ucb_and_back = pv_ucb.map()["android"]["ucb-sdb-android-2"]["evaluation_ranges"][0]; ucb_and_back["trip_id"]
back_trip = ucb_and_back["evaluation_trip_ranges"][2]; print(back_trip["trip_id"])
lt_leg = back_trip["evaluation_section_ranges"][7]; print(lt_leg["trip_id"])
gt_leg = sd_ucb.get_ground_truth_for_leg(back_trip["trip_id_base"], lt_leg["trip_id_base"]); gt_leg["id"]

In [None]:
import folium

In [None]:
# Map of the light_rail_below_above_ground ground truth (green) against the sensed
# android / accuracy_control points of runs 0-2, one color per run
gt_leg = sd_ucb.get_ground_truth_for_leg("berkeley_to_mtv_SF_express_bus", "light_rail_below_above_ground"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))

colors = ["red", "yellow", "blue"]
for run in range(3):
    # quality == 4 is accuracy_control, matching this section's heading and the
    # outlier query above (the previous quality == 2 selected MAHFDC instead)
    error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'android' & quality == 4 & section_id == 'light_rail_below_above_ground' & run == @run"))
    # True only for the point(s) with this run's maximum error
    gt_16k = lambda lr: lr["error"] == error_df.error.max()
    print("max error for run %d is %s" % (run, error_df.error.max()))
    folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color=colors[run]), name="sensed_values").add_to(curr_map)
    ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color=colors[run], popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)

folium.LayerControl().add_to(curr_map)
curr_map

In [None]:
# Rows for android / accuracy_control (quality 4) / light_rail_below_above_ground.
# quality == 4 matches this section's heading; the previous quality == 2 selected MAHFDC.
# `run` is whatever value the preceding map cell's loop left behind (2 after a full pass).
spatial_errors_df.query("phone_os == 'android' & quality == 4 & section_id == 'light_rail_below_above_ground' & run == @run")

#### (subway, android, HAMFDC)

This is the poster child for temporal accuracy tracking

In [None]:
# Rows for android / quality 1 or 3 (HAMFDC, HAHFDC) / subway_underground whose error exceeds 8000
spatial_errors_df.query("phone_os == 'android' & (quality == 1 | quality == 3) & section_id == 'subway_underground' & error > 8000")

In [None]:
# Section at index 5 of the first trip in the first evaluation range of ucb-sdb-android-3
# NOTE(review): bart_leg is not referenced again in the cells that follow — confirm it is still needed
bart_leg = pv_ucb.map()["android"]["ucb-sdb-android-3"]["evaluation_ranges"][0]["evaluation_trip_ranges"][0]["evaluation_section_ranges"][5]
# Ground truth leg for the subway_underground section; the trailing expression displays its id
gt_leg = sd_ucb.get_ground_truth_for_leg("mtv_to_berkeley_sf_bart", "subway_underground"); gt_leg["id"]

In [None]:
# Map of the subway_underground ground truth (green) against the sensed
# android / quality 3 (HAHFDC) points of run 2 (red)
gt_leg = sd_ucb.get_ground_truth_for_leg("mtv_to_berkeley_sf_bart", "subway_underground"); print(gt_leg["id"])
curr_map = ezgj.get_map_for_geojson(sd_ucb.get_geojson_for_leg(gt_leg), name="ground_truth")
ezgj.get_fg_for_loc_df(emd.linestring_to_geo_df(eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"]),
                       name="gt_points", color="green").add_to(curr_map)

# Popup text for a point: its index, df_idx, error and formatted timestamp
name_err_time = lambda lr: "%d: %d, %s, %s" % (lr["index"], lr["df_idx"], lr["error"], sd_ucb.fmt(lr["ts"], "MM-DD HH:mm:ss"))

error_df = emd.to_loc_df(spatial_errors_df.query("phone_os == 'android' & quality == 3 & section_id == 'subway_underground' & run == 2").sort_index(axis="index"))
# Largest error overall, largest below 16000 and largest below 5000
maxes = [error_df.error.max(), error_df[error_df.error < 16000].error.max(), error_df[error_df.error < 5000].error.max()]
# True for the points whose error is one of those three maxima
gt_16k = lambda lr: lr["error"] in maxes
print("Checking errors %s" % maxes)
folium.GeoJson(ezgj.get_geojson_for_loc_df(error_df, color="red"), name="sensed_values").add_to(curr_map)
ezgj.get_fg_for_loc_df(error_df, name="sensed_points", color="red", popupfn=name_err_time, stickyfn=gt_16k).add_to(curr_map)

folium.LayerControl().add_to(curr_map)
curr_map


In [None]:
# Check the is_simple flag of the subway_underground ground truth route shape
gt_leg = sd_ucb.get_ground_truth_for_leg("mtv_to_berkeley_sf_bart", "subway_underground"); gt_leg["id"]
eisd.SpecDetails.get_shapes_for_leg(gt_leg)["route"].is_simple

In [None]:
# Show selected positional slices of error_df stacked together
# NOTE(review): 65:75 already contains 70:75, so positions 70-74 appear twice —
# confirm whether a different slice was intended
pd.concat([
    error_df.iloc[40:50],
    error_df.iloc[55:60],
    error_df.iloc[65:75],
    error_df.iloc[70:75]])

In [None]:
import pyproj

In [None]:
# Source (EPSG:4326) and target (EPSG:3395) projections for the round trip below
# NOTE(review): the init="epsg:..." form and pyproj.transform are deprecated in newer
# pyproj releases — confirm against the installed version
latlonProj = pyproj.Proj(init="epsg:4326")
xyProj = pyproj.Proj(init="epsg:3395")

In [None]:
# Convert one point from latlonProj to xyProj; the trailing `xy` displays the result
xy = pyproj.transform(latlonProj, xyProj, -122.08355963230133, 37.39091642895306); xy

In [None]:
# Convert the result back from xyProj to latlonProj as a round-trip check
pyproj.transform(xyProj, latlonProj, xy[0], xy[1])

In [None]:
import pandas as pd

In [None]:
# Scratch frame for the concat experiment in the next cell
df = pd.DataFrame({"a": [1,2,3], "b": [4,5,6]}); df

In [None]:
# Experiment: add one row before and one row after df, then renumber the index from 0
pd.concat([pd.DataFrame([{"a": 10, "b": 14}]), df, pd.DataFrame([{"a": 20, "b": 24}])], axis='index').reset_index(drop=True)