# H9


**Obj:** Modal distances
<br>
To explore how many urban trips cover short distances.

## Questions

- [Q1](#Q1): What is the cumulative distribution of leg distances?
- [Q2](#Q2): How many car trips were short (e.g. less than 5km and less than 10km) and what are the traveller characteristics of those users?
- [Q3](#Q3): What are the negative experience factors of cyclists and users of public transport for the same short trip legs performed by car?


<a id='Q3' ></a>
### Q3: What are the negative experience factors of cyclists and users of public transport for the same short trip legs performed by car?

What is the potential for shifting to other modes?

In [None]:
# Import libraries

import os
import sys
import json
import time
from datetime import date, datetime

# numerical libraries
import pandas as pd
import numpy as np

# plotting libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [None]:
# Global settings: date cut-off and data locations
cutting_date = "2019-05-01"  # remove trips and data published before this date
meta_data_path = "../../data-campaigns/meta-data/"
input_path = "../../2019-12-16.out/"
out_path = "../../2019-12-16.out/hypothesis/H9/"
img_path = "../../2019-12-16.out/hypothesis/H9/"

# Plot appearance shared by the figures of this notebook
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directories up front. `exist_ok=True` keeps the cell safe to
# re-run and avoids a spurious "already exists" message when out_path and
# img_path point at the same directory; a FileExistsError is still raised if the
# path exists but is not a directory.
for directory in (out_path, img_path):
    os.makedirs(os.path.abspath(directory), exist_ok=True)

In [None]:
# Pickled input datasets, all stored under `input_path`
legs = "all_legs_merged_no_outlier_0.01.pkl"
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"
users_with_trips = "users_df_with_trips.pkl"

# Load every dataset into its own DataFrame, in the order listed above
legs_df, trips_users_df, trips_df, users_df_with_trips = (
    pd.read_pickle(input_path + filename)
    for filename in (legs, trips_users, trips, users_with_trips)
)

Procedure:
1. select trip legs performed by car;
2. get users that have performed at least a trip as per point 1. above;
3. among users from point 2., get the ones that have chosen at least one preferred transport mode within the transport categories: "biking", "public transport (short)", "public transport (long)";
4. for the users from point 3., select all trips that were not performed by car and look at the top negative experience factors.

In [None]:
legs_df.transp_category.unique()

In [None]:
# 1. ids of the trips having at least one leg in the "private_motorized" category
is_private_motorized = legs_df["transp_category"] == "private_motorized"
pm_tripids = legs_df.loc[is_private_motorized, "tripid"].unique()

In [None]:
# 2. ids of the users owning at least one leg of the trips selected in step 1
in_pm_trip = legs_df["tripid"].isin(pm_tripids)
pm_userids = legs_df.loc[in_pm_trip, "userid"].unique()

In [None]:
# 3. among users from point 2., get the ones that have chosen at least one preferred transport mode
#    within the transport categories: "biking", "public transport (short)", "public transport (long)"
# This cell only extracts the preferred modes of the users from point 2.
# An explicit copy is taken because new columns are assigned to this frame
# later on; assigning to a chained slice triggers SettingWithCopyWarning.
user_prefmots = users_df_with_trips.loc[
    users_df_with_trips["userid"].isin(pm_userids), ["userid", "preferedMots"]
].copy()

In [None]:
import itertools


def list_prefmots(pm):
    """Return the "Mot" value of every entry of `pm`, in iteration order.

    `pm` is an iterable of mappings that each provide a "Mot" key.
    """
    return [pmdict["Mot"] for pmdict in pm]

In [None]:
# Transport categories (according to the spreadsheet) with the mode codes of each
category_transp_mode_dict = {
    "walking": [2, 7, 8, 34, 37],
    "cycling_emerging_micromobility": [1, 16, 17, 18, 19, 31, 35],
    "public_transp_short_dist": [10, 11, 12, 15, 30],
    "public_transp_long_dist": [14, 13, 28, 33, 27],
    "private_motorized": [0, 9, 20, 21, 22, 23, 25, 26, 32, 36],
}

# Reverse lookup: mode code -> transport category
transp_mode_category_dict = {
    mode_code: category
    for category, mode_codes in category_transp_mode_dict.items()
    for mode_code in mode_codes
}

In [None]:
def map_prefmot_to_tranpcats(prefmot_list):
    """Return the set of transport categories covered by `prefmot_list`.

    Each mode is looked up in `transp_mode_category_dict`; modes missing from
    the lookup contribute the category "Unknown".
    """
    return {transp_mode_category_dict.get(pmot, "Unknown") for pmot in prefmot_list}

In [None]:
# Transport categories covered by the preferred modes of each user
prefmot_categories = user_prefmots["preferedMots"].apply(
    lambda pm: map_prefmot_to_tranpcats(list_prefmots(pm))
)
user_prefmots["prefmots_transp_categories"] = prefmot_categories

In [None]:
user_prefmots.head(3)

In [None]:
# Transport categories considered as alternatives to private motorized travel;
# each entry is one of the keys of category_transp_mode_dict.
ALTERNATIVE_TRANSPORT_CATEGORIES = [
    "cycling_emerging_micromobility",
    "public_transp_short_dist",
    "public_transp_long_dist",
]

In [None]:
def common_member(a, b):
    """Return True when `a` and `b` share at least one element, False otherwise."""
    return bool(set(a) & set(b))

In [None]:
def has_aternative_transp_categories(tc):
    """Return True when `tc` contains any of ALTERNATIVE_TRANSPORT_CATEGORIES."""
    return bool(set(tc) & set(ALTERNATIVE_TRANSPORT_CATEGORIES))

In [None]:
# Flag the users whose preferred modes include an alternative transport category
alternative_flags = user_prefmots["prefmots_transp_categories"].apply(
    has_aternative_transp_categories
)
user_prefmots["has_aternative_transp_categories"] = alternative_flags

In [None]:
user_prefmots.head(3)

In [None]:
# ids of the users flagged as having an alternative preferred transport category
has_alternative = user_prefmots["has_aternative_transp_categories"].eq(True)
alt_userids = user_prefmots.loc[has_alternative, "userid"].unique()

In [None]:
# users who performed a trip by car and also list bike or public transport
# among their preferred modes
common_users = set(pm_userids) & set(alt_userids)
n_common_users = len(common_users)
print(
    "Number of that have performed a trip by car but have also alternative preferred modes:",
    n_common_users,
)

In [None]:
# every leg travelled by the users selected above
by_common_user = legs_df["userid"].isin(common_users)
common_users_legs_df = legs_df[by_common_user]

In [None]:
common_users_legs_df.head(3)

In [None]:
# keep only the legs travelled with one of the alternative transport categories
uses_alternative = common_users_legs_df["transp_category"].isin(
    ALTERNATIVE_TRANSPORT_CATEGORIES
)
common_users_noncar_legs_df = common_users_legs_df[uses_alternative]

In [None]:
common_users_noncar_legs_df.head(3)

### Read Factors

In [None]:
# Load the experience factors
all_factors = pd.read_pickle(input_path + "all_factors.pkl")

# drop the records flagged neither as minus nor as plus
neither_flag = (all_factors["minus"] == False) & (all_factors["plus"] == False)
all_factors = all_factors[~neither_flag]

# drop the records flagged as both minus and plus (3% of obs)
both_flags = (all_factors["minus"] == True) & (all_factors["plus"] == True)
all_factors = all_factors[~both_flags]

print("all records:", len(all_factors))
print()

In [None]:
all_factors.columns

In [None]:
# factors reported on the non-car legs of the selected users
noncar_legids = common_users_noncar_legs_df["legid"].unique()
all_factors_users_noncar_legs = all_factors[all_factors["legid"].isin(noncar_legids)]

In [None]:
all_factors_users_noncar_legs.head(3)

In [None]:
# keep the negative experience factors reported on the legs not performed by car
is_negative = all_factors_users_noncar_legs["minus"].eq(True)
all_factors_minus_users_noncar_legs = all_factors_users_noncar_legs[is_negative]

In [None]:
# Attach the transport category of each leg to its negative factors.
# NOTE(review): this cell overwrites its own input, so running it a second time
# merges `transp_category` in again; re-run the previous cell first.
all_factors_minus_users_noncar_legs = all_factors_minus_users_noncar_legs.merge(
    legs_df[["legid", "transp_category"]], on="legid"
)

In [None]:
all_factors_minus_users_noncar_legs.head(3)

In [None]:
# number of records per (transport category, factor), most frequent first
factor_counts = all_factors_minus_users_noncar_legs.groupby(
    ["transp_category", "factor"]
).size()
results = factor_counts.sort_values(ascending=False).reset_index(name="nlegs")

results.head(5)

In [None]:
# Distinct transport categories present in the mode -> category lookup.
# NOTE(review): not referenced again in the visible part of the notebook.
transport_categories = list(set(transp_mode_category_dict.values()))

In [None]:
# Factor x transport-category table of leg counts.
# `results` holds one row per (category, factor), so summing keeps the counts
# unchanged; the explicit integer cast guarantees that the heatmap annotation
# format "d" works even when pivot_table / fillna leave float values behind.
heatmap_df = pd.pivot_table(
    results,
    values="nlegs",
    index=["factor"],
    columns=["transp_category"],
    aggfunc="sum",
    fill_value=0,
).astype(int)
heatmap_df.head(5)

In [None]:
# order the factors by decreasing count for cycling / micromobility
heatmap_df = heatmap_df.sort_values(
    by="cycling_emerging_micromobility", ascending=False
)
heatmap_df.head(5)

In [None]:
plt.figure(figsize=(12, 24))

# plot heatmap, annotating every cell with its integer count
sns.heatmap(heatmap_df, annot=True, fmt="d")

# set title and style
plt.title("Top negative factors for alternative trips")
plt.tight_layout()

# Save image. `bbox_to_anchor` is a legend option, not a savefig() argument:
# recent Matplotlib releases reject unknown savefig keywords, so it is dropped.
plt.savefig(img_path + "h9_q3.png", bbox_inches="tight")

### Identification of Similar Legs

In [None]:
# Load the legs dataset stored in "all_legs_final_ds_user_info_urban_class.pkl".
# NOTE(review): `all_legs_coords` is not referenced again in the visible part
# of the notebook; the coordinates used below come from `gps_cities`.
all_legs_coords_filename = "all_legs_final_ds_user_info_urban_class.pkl"
all_legs_coords = pd.read_pickle(os.path.join(input_path, all_legs_coords_filename))

In [None]:
# Load the per-leg start/end coordinates with their country and class columns
gps_cities_filename = "gps_cities.pkl"
gps_cities = pd.read_pickle(os.path.join(input_path, gps_cities_filename))

In [None]:
gps_cities.head(3)

In [None]:
gps_cities.columns

In [None]:
# Columns describing the two endpoints of each leg
endpoint_columns = [
    "legid",
    "StartLat",
    "StartLon",
    "country_start",
    "start_class",
    "EndLat",
    "EndLon",
    "country_end",
    "end_class",
]

# Harmonised names: <quantity>_<start|end>
harmonised_names = {
    "StartLat": "lat_start",
    "StartLon": "lon_start",
    "start_class": "class_start",
    "EndLat": "lat_end",
    "EndLon": "lon_end",
    "end_class": "class_end",
}

# One row per distinct combination of leg id and endpoint data
legs_coords_df = (
    gps_cities[endpoint_columns]
    .rename(columns=harmonised_names)
    .drop_duplicates(keep="first")
)
legs_coords_df.head(3)

### Coordinate rounding procedure

Following the information on the Wikipedia page [Decimal degree](https://en.wikipedia.org/w/index.php?title=Decimal_degrees&oldid=937245621#Precision) and the question on StackOverflow ["Measuring accuracy of latitude and longitude?"](https://gis.stackexchange.com/q/8650/18292), we have that:
> The third decimal place is worth up to 110 m: it can identify a large agricultural field or institutional campus.
```
3        0.001            111  m
```

We will proceed like this: we consider each point (lat, lon) to be represented by a square given with the following vertices:
* `A (lat-0.002, lon+0.002)`
* `B (lat+0.002, lon+0.002)`
* `C (lat+0.002, lon-0.002)`
* `D (lat-0.002, lon-0.002)`

In this way each point is effectively transformed into a square - or rather a curved square, since each side is an arc - with sides of length 0.004 degrees.

If the squares representing two points intersect we consider the points equal. In this way two points considered equal are distant at most:
```
sqrt(2)·(0.004 deg)·(111.32 km/deg) = 0.62972 km ~ 630 m
```

Graphical example:
![coordinate_intersection.png](https://i.imgur.com/fSh5ISh.png)

In [None]:
# Find if two rectangles overlap
# https://www.geeksforgeeks.org/find-two-rectangles-overlap/


class Point:
    """A 2D point; `equivalent_points` stores a latitude in `x` and a longitude in `y`."""

    def __init__(self, x, y):
        self.x = x
        self.y = y


def rect_overlap(l1, r1, l2, r2):
    """Return True if the rectangles (l1, r1) and (l2, r2) overlap.

    Each rectangle is described by two opposite corners: `l` has the minimum x
    and maximum y, `r` has the maximum x and minimum y. Rectangles that only
    touch along an edge or a corner count as overlapping.
    """
    # One rectangle lies entirely beyond the other along the x axis
    if l1.x > r2.x or l2.x > r1.x:
        return False

    # One rectangle lies entirely beyond the other along the y axis
    if l1.y < r2.y or l2.y < r1.y:
        return False

    return True


def equivalent_points(p1lat, p1lon, p2lat, p2lon, half_side=0.002):
    """Return True if the squares centred on the two points overlap.

    Each point is replaced by the square spanning `half_side` degrees on either
    side of its latitude and longitude; two points are considered equivalent
    when their squares intersect.

    Parameters
    ----------
    p1lat, p1lon : float
        Latitude and longitude of the first point, in degrees.
    p2lat, p2lon : float
        Latitude and longitude of the second point, in degrees.
    half_side : float, optional
        Half of the side of the square, in degrees (default 0.002).

    Returns
    -------
    bool
    """
    # Only two opposite corners of each square are needed by rect_overlap:
    # (lat - half_side, lon + half_side) and (lat + half_side, lon - half_side)
    l1 = Point(p1lat - half_side, p1lon + half_side)
    r1 = Point(p1lat + half_side, p1lon - half_side)

    l2 = Point(p2lat - half_side, p2lon + half_side)
    r2 = Point(p2lat + half_side, p2lon - half_side)

    return rect_overlap(l1, r1, l2, r2)

In [None]:
equivalent_points(1.0, 1.0, 1.00405, 1.004)

In [None]:
(
    legs_coords_df[
        ["legid", "country_start", "country_end", "class_start", "class_end"]
    ]
    .fillna("NONE")
    .groupby(["country_start", "country_end", "class_start", "class_end"])
    .size()
    .sort_values(ascending=False)
    .reset_index()
).head(10)

In [None]:
legs_coords_df.groupby(["country_start"]).size().sort_values(
    ascending=False
).reset_index().head(10)

In [None]:
legs_coords_df.groupby(["country_start", "class_start"]).size().sort_values(
    ascending=False
).reset_index().head(10)

In [None]:
legs_coords_df.groupby(["country_end", "class_end"]).size().sort_values(
    ascending=False
).reset_index().head(10)

In [None]:
# Countries seen at either endpoint; the "NONE" placeholder for missing values
# is removed from the final set
start_countries = set(legs_coords_df.country_start.fillna("NONE").unique())
end_countries = set(legs_coords_df.country_end.fillna("NONE").unique())
countries = start_countries | end_countries
countries.discard("NONE")
print("Number of different countries:", len(countries))
print(countries)

# Point classes seen at either endpoint, built the same way
start_classes = set(legs_coords_df.class_start.fillna("NONE").unique())
end_classes = set(legs_coords_df.class_end.fillna("NONE").unique())
point_classes = start_classes | end_classes
point_classes.discard("NONE")
print("Number of classes:", len(point_classes))
print(point_classes)

In [None]:
legs_coords_df.columns

In [None]:
PRINT_NROWS = 1000000


def select_legs(coords1_df, coords2_df, country, pc):
    """Yield `(legid1, legid2)` pairs of equivalent points.

    Both frames are restricted to the rows whose "country" is `country` or
    "NONE" and whose "class" is `pc` or "NONE". Every row of the first
    selection is compared with every row of the second one; a pair is yielded
    when the first leg id is greater than the second and `equivalent_points`
    accepts the two (lat, lon) positions.

    A summary line is printed first, then a "." every PRINT_NROWS comparisons
    (with a space every ten dots) and a newline once all pairs are consumed.
    """

    def candidates(coords_df):
        # rows of the requested country / class; "NONE" placeholders always qualify
        country_ok = (coords_df["country"] == country) | (
            coords_df["country"] == "NONE"
        )
        class_ok = (coords_df["class"] == pc) | (coords_df["class"] == "NONE")
        return coords_df.loc[country_ok & class_ok].drop_duplicates()

    first_df = candidates(coords1_df)
    second_df = candidates(coords2_df)

    n_first = first_df.legid.nunique()
    n_second = second_df.legid.nunique()
    summary = "- Points 1: {}, Points 2: {}, To Process: {} - ".format(
        n_first, n_second, n_first * n_second
    )
    print(summary, end="")

    if n_first > 0 and n_second > 0:
        compared = 0
        for first in first_df.itertuples():
            for second in second_df.itertuples():
                compared += 1

                # progress marks
                if compared % PRINT_NROWS == 0:
                    print(".", end="")
                if compared % (10 * PRINT_NROWS) == 0:
                    print(" ", end="")

                # each unordered pair is reported once, and never a leg with itself
                if not first.legid > second.legid:
                    continue
                if equivalent_points(first.lat, first.lon, second.lat, second.lon):
                    yield (first.legid, second.legid)

    print()

In [None]:
# COMPUTE_MATCHING_LEGS: True recomputes the matching points and writes them to
# a CSV file; False loads previously saved matches from disk instead.
# BIG_MEMORY: when computing, collect the matches in memory instead of writing
# them row by row; when loading, selects which CSV file is read.
COMPUTE_MATCHING_LEGS = False
BIG_MEMORY = False

In [None]:
%%time

import csv

# Coordinates of the start point of each leg, with missing country / class
# replaced by the "NONE" placeholder expected by select_legs
legs_start_coords_df = legs_coords_df[
    ["legid", "lat_start", "lon_start", "country_start", "class_start"]
].copy()
legs_start_coords_df["country_start"] = legs_start_coords_df["country_start"].fillna(
    "NONE"
)
legs_start_coords_df["class_start"] = legs_start_coords_df["class_start"].fillna("NONE")
legs_start_coords_df.columns = ["legid", "lat", "lon", "country", "class"]

# Same layout for the end point of each leg
legs_end_coords_df = legs_coords_df[
    ["legid", "lat_end", "lon_end", "country_end", "class_end"]
].copy()
legs_end_coords_df["country_end"] = legs_end_coords_df["country_end"].fillna("NONE")
legs_end_coords_df["class_end"] = legs_end_coords_df["class_end"].fillna("NONE")
legs_end_coords_df.columns = ["legid", "lat", "lon", "country", "class"]

MATCHING_POINTS_HEADER = ["legid1", "type1", "legid2", "type2"]

matching_points = []
if COMPUTE_MATCHING_LEGS:
    matching_points_filename = "matching_points.csv"

    # Endpoint combinations to compare: (frame 1, type 1, frame 2, type 2)
    endpoint_combinations = [
        (legs_start_coords_df, "start", legs_start_coords_df, "start"),
        (legs_start_coords_df, "start", legs_end_coords_df, "end"),
        (legs_end_coords_df, "end", legs_end_coords_df, "end"),
    ]

    # newline="" is what the csv module expects for files it writes to
    with open(
        os.path.join(input_path, matching_points_filename), "w", newline=""
    ) as outfp:
        writer = csv.writer(outfp)

        # the header is written once, not once per (country, class)
        writer.writerow(MATCHING_POINTS_HEADER)

        for country in sorted(countries):
            for pc in sorted(point_classes):

                print("Processing: {} ({})".format(country, pc))

                for coords1_df, type1, coords2_df, type2 in endpoint_combinations:
                    for legid1, legid2 in select_legs(
                        coords1_df, coords2_df, country, pc
                    ):
                        match = (legid1, type1, legid2, type2)
                        if BIG_MEMORY:
                            matching_points.append(match)
                        else:
                            writer.writerow(match)

        # matches collected in memory are written once, after all the
        # comparisons, so that no row is duplicated in the file
        if BIG_MEMORY:
            writer.writerows(matching_points)

    # NOTE(review): with BIG_MEMORY off, `matching_points` stays empty after a
    # computation run; the matches are only available from the CSV file.

else:
    if BIG_MEMORY:
        matching_points_filename = "matching_points.csv"
    else:
        matching_points_filename = "matching_points_10M.csv"

    with open(
        os.path.join(input_path, matching_points_filename), "r", newline=""
    ) as infp:
        reader = csv.reader(infp)

        # skip header
        next(reader, None)

        # files written by earlier runs repeat the header for every
        # (country, class) combination: drop those rows as well
        matching_points = [line for line in reader if line != MATCHING_POINTS_HEADER]

In [None]:
print("Number of matching points pairs:", len(matching_points))

In [None]:
matching_points[:5]

In [None]:
# ids of all the legs belonging to trips with a private motorized leg
in_pm_trip = legs_df["tripid"].isin(pm_tripids)
pm_legids = legs_df.loc[in_pm_trip, "legid"].unique()

In [None]:
import itertools

# every leg id appearing on either side of a matching pair
matching_points_legids = {
    legid for el in matching_points for legid in (el[0], el[2])
}

In [None]:
# leg ids that have a matching point and belong to a private motorized trip
common_legids = matching_points_legids.intersection(pm_legids)
n_common_legids = len(common_legids)
print("Number of legids in common_legids:", n_common_legids)

In [None]:
from collections import defaultdict

# legid -> list of matching leg ids, one dictionary per endpoint combination
# (ss: start/start, se: start/end, es: end/start, ee: end/end)
ss_points, se_points, es_points, ee_points = (defaultdict(list) for _ in range(4))

for legid1, type1, legid2, type2 in matching_points:
    endpoint_types = (type1, type2)
    if endpoint_types == ("start", "start"):
        ss_points[legid1].append(legid2)
    elif endpoint_types == ("start", "end"):
        # registered in both directions
        se_points[legid1].append(legid2)
        es_points[legid2].append(legid1)
    elif endpoint_types == ("end", "end"):
        ee_points[legid1].append(legid2)
    else:
        print("Unepected types: ({}, {}, {}, {})".format(legid1, type1, legid2, type2))

In [None]:
from itertools import islice


def take(iterable, n):
    """Return the first `n` items of `iterable` as a list."""
    head = islice(iterable, n)
    return list(head)

In [None]:
# Preview: for each dictionary, the first five keys with their number of matches
for label, points in (
    ("ss_points:", ss_points),
    ("se_points:", se_points),
    ("es_points:", es_points),
    ("ee_points:", ee_points),
):
    print(label, end="")
    print([(k, len(v)) for k, v in take(points.items(), 5)])

In [None]:
# Two legs match when both their start points and their end points are
# equivalent. select_legs only yields pairs with legid1 > legid2, so each match
# is stored in one direction; it is registered here for both legs, otherwise a
# leg would never find the matching legs that have a greater id.
matched_ids = defaultdict(set)

for legid, matching_start_ids in ss_points.items():
    # .get() avoids inserting empty entries into the ee_points defaultdict
    matching_end_ids = ee_points.get(legid, ())

    for other_legid in set(matching_start_ids).intersection(matching_end_ids):
        matched_ids[legid].add(other_legid)
        matched_ids[other_legid].add(legid)

# legid -> sorted list of matching leg ids (no duplicates)
matching_legs = defaultdict(list)
for legid, other_legids in matched_ids.items():
    matching_legs[legid].extend(sorted(other_legids))

In [None]:
print("Number of matching legs (start-end):", len(matching_legs))

In [None]:
from pprint import pprint

print("matching_legs:", end="")
pprint([(k, v) for k, v in take(matching_legs.items(), 5)])

In [None]:
legs_df.loc[legs_df["legid"] == "#24:23124"].userid

In [None]:
legs_coords_df.loc[legs_coords_df["legid"] == "#24:23124"]

In [None]:
legs_df.loc[legs_df["legid"] == matching_legs["#24:23124"][0]].userid

In [None]:
legs_coords_df.loc[legs_coords_df["legid"] == matching_legs["#24:23124"][0]]

In [None]:
# private motorized legs that have at least one matching leg
is_private_motorized_leg = legs_df["transp_category"] == "private_motorized"
has_matching_leg = legs_df["legid"].isin(set(matching_legs))
pm_selected_legs = legs_df.loc[is_private_motorized_leg & has_matching_leg]

In [None]:
print("Number of selected private motorized legs:", pm_selected_legs.legid.nunique())

In [None]:
# ids of all the legs matching one of the selected private motorized legs
selected_matching_legs = {
    matched_legid
    for pm_legid in set(pm_selected_legs.legid.unique())
    for matched_legid in matching_legs[pm_legid]
}

In [None]:
take(selected_matching_legs, 5)

In [None]:
print("Number of matching legs: ", len(selected_matching_legs))

In [None]:
# matching legs travelled with one of the alternative transport categories
uses_alternative_category = legs_df["transp_category"].isin(
    ALTERNATIVE_TRANSPORT_CATEGORIES
)
is_selected_match = legs_df["legid"].isin(selected_matching_legs)
alternative_selected_legs = legs_df.loc[uses_alternative_category & is_selected_match]

In [None]:
alternative_selected_legs.head(3)

In [None]:
# factors reported on the selected alternative legs
alternative_legids = alternative_selected_legs["legid"].unique()
all_factors_alternative_selected_legs = all_factors[
    all_factors["legid"].isin(alternative_legids)
]

In [None]:
# keep only the negative experience factors
is_negative_factor = all_factors_alternative_selected_legs["minus"].eq(True)
all_factors_minus_alternative_selected_legs = all_factors_alternative_selected_legs[
    is_negative_factor
]

In [None]:
# Attach the transport category of each leg to its negative factors.
# NOTE(review): this cell overwrites its own input, so running it a second time
# merges `transp_category` in again; re-run the previous cell first.
all_factors_minus_alternative_selected_legs = all_factors_minus_alternative_selected_legs.merge(
    legs_df[["legid", "transp_category"]], on="legid"
)

In [None]:
# number of records per (transport category, factor), most frequent first
alternative_factor_counts = all_factors_minus_alternative_selected_legs.groupby(
    ["transp_category", "factor"]
).size()
res = alternative_factor_counts.sort_values(ascending=False).reset_index(name="nlegs")

In [None]:
res