# Case Study


**Obj:** Modal distances
<br>
To explore how many urban trips cover short distances.

## Questions

- [Q3](#Q3): What are the negative experience factors of cyclists and users of public transport for the same short trip legs performed by car?


<a id='Q3' ></a>
### Q3: What are the negative experience factors of cyclists and users of public transport for the same short trip legs performed by car?

What is the potential for shifting to other modes?

In [None]:
# Import libraries

import os
import re
import sys
import csv
import json
import time
import pathlib
from datetime import date, datetime

# numerical libraries
import pandas as pd
import numpy as np

# plotting libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [None]:
# Global settings shared by the whole notebook.
cutting_date = "2019-05-01"  # remove trips and data published before this date

# Data locations (relative to the notebook's working directory).
input_path = pathlib.Path("../../2019-12-16.out/")
out_path = pathlib.Path("../../2019-12-16.out/h9/")
meta_data_path = pathlib.Path("../../data-campaigns/meta-data/")
matching_points_path = input_path.joinpath("matching_points")
# img_path = pathlib.Path("../../2019-12-16.out/h9/img/")

# Plot defaults: title padding, font size and figure size, then the seaborn grid style.
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directory (with any missing parents). A pre-existing
# directory is reported on stderr and otherwise tolerated.
try:
    os.makedirs(os.path.abspath(out_path))
except FileExistsError:
    print(f"Directory '{out_path}' already exists", file=sys.stderr)

# try:
#    os.makedirs(os.path.abspath(img_path))
# except FileExistsError:
#     print("Directory '{}' already exists".format(img_path), file=sys.stderr)

In [None]:
# Names of the pickled input datasets, all located under input_path.
legs, trips_users, trips, users_with_trips = (
    "all_legs_merged_no_outlier_0.01.pkl",
    "trips_users_df.pkl",
    "trips_df.pkl",
    "users_df_with_trips.pkl",
)

# Load every dataset, in the same order as the names above.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
legs_df, trips_users_df, trips_df, users_df_with_trips = (
    pd.read_pickle(input_path / pkl_name)
    for pkl_name in (legs, trips_users, trips, users_with_trips)
)

In [None]:
# regex in glob
# https://stackoverflow.com/q/13031989/2377454
def reglob(pattern, path, invert=False):
    """List the entries of *path* whose names match (or do not match) a regex.

    Names are tested with ``re.match``, so the pattern is anchored at the
    start of the name only. Matching entries are returned unless *invert*
    is anything other than the literal ``False``, in which case the
    non-matching entries are returned instead.

    Each result is built as ``path / name``, so *path* must support the
    ``/`` operator (e.g. a ``pathlib.Path``). The order follows
    ``os.listdir`` and is therefore arbitrary.
    """
    regex = re.compile(pattern)
    keep_matching = invert is False

    return [
        path / name
        for name in os.listdir(path)
        if bool(regex.match(name)) == keep_matching
    ]

## Leg Selection

Procedure:
1. select trip legs performed by car;
2. get users that have performed at least a trip as per point 1. above;
3. among users from point 2., get the ones that have chosen at least one preferred transport mode within the transport categories: "biking", "public transport (short)", "public transport (long)";
4. for the users from point 3., select all trips that were not performed by car and look at the top negative experience factors.

In [None]:
# Dataset overview: number of distinct legs, trips and users in legs_df.
print("Total number of legs'", legs_df.legid.nunique())
print("Total number of trips'", legs_df.tripid.nunique())
# This line counts distinct user ids, so it is labelled as users (it used to say "trips").
print("Total number of users'", legs_df.userid.nunique())

In [None]:
# Distinct transport categories that occur in the legs dataset.
transport_categories = legs_df["transp_category"].unique()

In [None]:
# 1. select trips with at least one leg that is private motorized
pm_tripids = legs_df.loc[
    legs_df["transp_category"].eq("private_motorized"), "tripid"
].unique()

print(
    "Number of trips with at least one leg being 'private_motorized'",
    pm_tripids.shape,
)

In [None]:
# 2. select users that have performed at least a trip as per point 1. above;
pm_userids = legs_df.loc[legs_df["tripid"].isin(pm_tripids), "userid"].unique()
print(
    "Number of user with at least one trip being 'private_motorized'",
    pm_userids.shape,
)

In [None]:
# Every leg that belongs to a trip containing at least one private-motorized leg.
pm_legids = legs_df.loc[legs_df["tripid"].isin(pm_tripids)].legid.unique()
print(
    # A stray "0" was removed from this message ("where 0at least").
    "Number of total legs in trips where at least one trip being 'private_motorized'",
    pm_legids.shape,
)

In [None]:
# 3. among users from point 2., get the ones that have chosen at least one preferred transport mode
#    within the transport categories: "biking", "public transport (short)", "public transport (long)"
#
# Rows and columns are selected in a single .loc call and the result is copied:
# new columns are assigned to user_prefmots further down, and assigning to the
# result of chained indexing triggers pandas' SettingWithCopyWarning.
user_prefmots = users_df_with_trips.loc[
    users_df_with_trips["userid"].isin(pm_userids), ["userid", "preferedMots"]
].copy()

In [None]:
import itertools


def list_prefmots(pm):
    """Return the ``"Mot"`` value of every mapping in *pm*, in order.

    *pm* is an iterable of dict-like objects; each must have a ``"Mot"`` key.
    """
    return [entry["Mot"] for entry in pm]

In [None]:
# create transport category according to spreadsheet
# Maps each transport category to the list of transport-mode codes it contains.
category_transp_mode_dict = {
    "walking": [2, 7, 8, 34, 37],
    "cycling_emerging_micromobility": [1, 16, 17, 18, 19, 31, 35],
    "public_transp_short_dist": [10, 11, 12, 15, 30],
    "public_transp_long_dist": [14, 13, 28, 33, 27],
    "private_motorized": [0, 9, 20, 21, 22, 23, 25, 26, 32, 36],
}

# Inverse lookup: transport-mode code -> transport category.
transp_mode_category_dict = {
    mode_code: category
    for category, mode_codes in category_transp_mode_dict.items()
    for mode_code in mode_codes
}

In [None]:
def map_prefmot_to_tranpcats(prefmot_list):
    """Return the set of transport categories for the given transport-mode codes.

    Each code is looked up in ``transp_mode_category_dict``; codes missing
    from that mapping contribute the category ``"Unknown"``.
    """
    return {transp_mode_category_dict.get(code, "Unknown") for code in prefmot_list}

In [None]:
# For every user, turn the preferred-mode records into the set of transport
# categories they fall in (the "Mot" code of each record is mapped to its category).
user_prefmots["prefmots_transp_categories"] = user_prefmots["preferedMots"].apply(
    lambda pm: map_prefmot_to_tranpcats([pmdict["Mot"] for pmdict in pm])
)

In [None]:
user_prefmots.head(3)

In [None]:
# Transport categories treated as alternatives to the car in this analysis
# (the keys of category_transp_mode_dict other than walking and private motorized).
ALTERNATIVE_TRANSPORT_CATEGORIES = [
    "cycling_emerging_micromobility",
    "public_transp_short_dist",
    "public_transp_long_dist",
]

In [None]:
def common_member(a, b):
    """Return True if iterables *a* and *b* share at least one element, else False."""
    shared = set(a) & set(b)
    return bool(shared)

In [None]:
def has_aternative_transp_categories(tc):
    """Return True if *tc* contains any of ALTERNATIVE_TRANSPORT_CATEGORIES."""
    overlap = set(tc) & set(ALTERNATIVE_TRANSPORT_CATEGORIES)
    return bool(overlap)

In [None]:
# Flag users whose preferred modes include at least one alternative (non-car) category.
user_prefmots["has_aternative_transp_categories"] = user_prefmots[
    "prefmots_transp_categories"
].apply(has_aternative_transp_categories)

In [None]:
user_prefmots.head(3)

In [None]:
# Ids of the users flagged as having at least one alternative preferred mode.
alt_userids = user_prefmots.loc[
    user_prefmots["has_aternative_transp_categories"].eq(True), "userid"
].unique()

print(
    "Number of users that have at least one preferred alternative mode of transport mode:",
    alt_userids.shape,
)

In [None]:
# Trips performed by users that have at least one alternative preferred mode.
alt_tripids = legs_df.loc[legs_df["userid"].isin(alt_userids)].tripid.unique()
print(
    # The trailing space keeps the two concatenated literals from fusing
    # into "alternativemode".
    "Number of trips of users that have at least one preferred alternative "
    "mode of transport mode:",
    alt_tripids.shape,
)

In [None]:
# Legs performed by users that have at least one alternative preferred mode.
alt_legids = legs_df.loc[legs_df["userid"].isin(alt_userids)].legid.unique()
print(
    # The trailing space keeps the two concatenated literals from fusing
    # into "alternativemode".
    "Number of legs of users that have at least one preferred alternative "
    "mode of transport mode:",
    alt_legids.shape,
)

In [None]:
# Users that travelled by car at least once and also list bike / public
# transport among their preferred modes.
common_users = set(pm_userids) & set(alt_userids)
print(
    "Number of users that have performed a trip by car but have also alternative preferred modes:",
    len(common_users),
)

In [None]:
# Number of distinct trips made by the common users
# (despite its name, this variable holds a count, not the ids).
common_users_trips_ids = legs_df.loc[
    legs_df["userid"].isin(common_users), "tripid"
].nunique()
print(
    "Number of trips of users that have performed a trip by car "
    "but have also alternative preferred modes:",
    common_users_trips_ids,
)

In [None]:
# All legs (any transport category) made by the common users.
common_users_legs_df = legs_df[legs_df["userid"].isin(common_users)]
print(
    "Number of legs of users that have performed a trip by car but have also alternative preferred modes:",
    common_users_legs_df.shape,
)

In [None]:
common_users_legs_df.head(3)

In [None]:
# Keep only the common users' legs travelled with an alternative category.
common_users_noncar_legs_df = common_users_legs_df[
    common_users_legs_df["transp_category"].isin(ALTERNATIVE_TRANSPORT_CATEGORIES)
]
print("Common legs non-car legs", common_users_noncar_legs_df.shape)

In [None]:
common_users_noncar_legs_df.head(3)

In [None]:
common_users_noncar_legs_df.userid.nunique()

### Read Factors

In [None]:
### read data for reliability
all_factors = pd.read_pickle(input_path / "all_factors.pkl")

# Drop records where both flags are False (minus=F and plus=F):
# keep a row when at least one of the two flags is not False.
all_factors = all_factors[
    (all_factors["minus"] != False) | (all_factors["plus"] != False)
]

# Drop records where both flags are True (minus=T and plus=T, 3% of obs):
# keep a row when at least one of the two flags is not True.
all_factors = all_factors[
    (all_factors["minus"] != True) | (all_factors["plus"] != True)
]

print("all records:", len(all_factors))
print()

In [None]:
all_factors.columns

In [None]:
# Factor records attached to the non-car legs of the common users.
all_factors_users_noncar_legs = all_factors[
    all_factors["legid"].isin(common_users_noncar_legs_df["legid"].unique())
]

In [None]:
all_factors_users_noncar_legs.head(3)

In [None]:
# select all trips that were not performed by car and look at the top negative experience factor
# (negative factors are the records whose "minus" flag is True)
all_factors_minus_users_noncar_legs = all_factors_users_noncar_legs[
    all_factors_users_noncar_legs["minus"].eq(True)
]

In [None]:
# Attach each leg's transport category to its negative-factor records.
all_factors_minus_users_noncar_legs = pd.merge(
    all_factors_minus_users_noncar_legs,
    legs_df[["legid", "transp_category"]],
    on="legid",
)

In [None]:
all_factors_minus_users_noncar_legs.head(3)

In [None]:
# Count negative-factor records per (transport category, factor), most frequent first.
results = (
    all_factors_minus_users_noncar_legs.groupby(["transp_category", "factor"])
    .size()
    .sort_values(ascending=False)
    .rename("nlegs")
    .reset_index()
)

results.head(3)

In [None]:
# Distinct category names of the code -> category mapping
# (this rebinds the transport_categories variable computed from legs_df earlier).
transport_categories = list({*transp_mode_category_dict.values()})

In [None]:
# Destination of the per-category negative-factor counts.
results_filename = "all_negative_factors_alternative.csv"
results_path = out_path.joinpath(results_filename)
print(results_path)

In [None]:
# Write the counts with a header row and without the index column
# (header=True is already the to_csv default).
results.to_csv(results_path, index=False)

### Identification of Similar Legs

In [None]:
# Regex alternation of country codes; it is interpolated into the
# matching-points file-name pattern used to select the CSV files to load.
COUNTRIES = "(ITA|BEL)"

In [None]:
# load files
# Collect the matching-points CSV files of the selected countries, sorted by path.
# The pattern is a raw string so that "\." reaches the regex engine as an escaped
# dot instead of being an invalid string escape sequence.
matching_points_files = sorted(
    reglob(
        # r"matching_points_\w{3}_(rural|sub-urban|urban)\.csv",
        r"matching_points_{c}_(rural|sub-urban|urban)\.csv".format(c=COUNTRIES),
        matching_points_path,
    )
)

In [None]:
matching_points_files

In [None]:
# Read all rows (header excluded) of every matching-points file into one list.
matching_points = []
for csvfile in matching_points_files:
    print("csvfile: ", csvfile)

    # newline="" is what the csv module requires of file objects it reads, so
    # that it can handle line endings (also inside quoted fields) by itself.
    with open(csvfile, "r", newline="") as csvfp:
        reader = csv.reader(csvfp, delimiter=",")

        # skip header; the default keeps an empty file from raising StopIteration
        next(reader, None)

        matching_points.extend(reader)

In [None]:
# Legs dataset that carries user info and the urban classification.
all_legs_coords_filename = "all_legs_final_ds_user_info_urban_class.pkl"
all_legs_coords = pd.read_pickle(input_path / all_legs_coords_filename)

In [None]:
# Per-leg GPS data, read from the input folder.
gps_cities_filename = "gps_cities.pkl"
gps_cities = pd.read_pickle(input_path / gps_cities_filename)

In [None]:
gps_cities.head(3)

In [None]:
gps_cities.columns

In [None]:
# Per-leg start/end coordinates, countries and classes, with the columns
# renamed to a uniform <quantity>_<start|end> scheme and duplicate rows dropped.
legs_coords_df = (
    gps_cities[
        [
            "legid",
            "StartLat",
            "StartLon",
            "country_start",
            "start_class",
            "EndLat",
            "EndLon",
            "country_end",
            "end_class",
        ]
    ]
    .rename(
        columns={
            "StartLat": "lat_start",
            "StartLon": "lon_start",
            "start_class": "class_start",
            "EndLat": "lat_end",
            "EndLon": "lon_end",
            "end_class": "class_end",
        }
    )
    .drop_duplicates(keep="first")
)
legs_coords_df.head(3)

In [None]:
from collections import defaultdict

# Index the matching points by the kind of endpoints that match:
#   ss_points: start of legid1 matches start of legid2
#   se_points: start of legid1 matches end of legid2
#   es_points: the same start/end matches, keyed by the leg whose end matched
#   ee_points: end of legid1 matches end of legid2
ss_points = defaultdict(list)
se_points = defaultdict(list)
es_points = defaultdict(list)
ee_points = defaultdict(list)

# Each row must have exactly four fields: legid1, type1, legid2, type2.
for legid1, type1, legid2, type2 in matching_points:
    if type1 == "start" and type2 == "start":
        ss_points[legid1].append(legid2)
    elif type1 == "start" and type2 == "end":
        se_points[legid1].append(legid2)
        es_points[legid2].append(legid1)
    elif type1 == "end" and type2 == "end":
        ee_points[legid1].append(legid2)
    else:
        # Any other combination, ("end", "start") included, is only reported.
        print("Unexpected types: ({}, {}, {}, {})".format(legid1, type1, legid2, type2))

In [None]:
from itertools import islice


def take(iterable, n):
    """Return a list with the first *n* items of *iterable* (fewer if it is shorter)."""
    head = islice(iterable, n)
    return [*head]

In [None]:
# For each index, show up to five (legid, number of matches) pairs.
for _label, _points in (
    ("ss_points", ss_points),
    ("se_points", se_points),
    ("es_points", es_points),
    ("ee_points", ee_points),
):
    print(f"{_label}:", end="")
    print([(k, len(v)) for k, v in take(_points.items(), 5)])

    # Blank line between the indexes, but not after the last one.
    if _label != "ee_points":
        print()

In [None]:
# A leg "matches" another leg when both its start point and its end point match
# that leg's start and end points: intersect the start-start and end-end matches.
matching_legs = defaultdict(list)

for legid, matching_start_ids in ss_points.items():
    # .get() instead of indexing: ee_points is a defaultdict, and indexing it
    # would insert an empty list for every leg that has no end-end match.
    matching_end_ids = ee_points.get(legid, ())

    common_matches = set(matching_start_ids).intersection(matching_end_ids)
    if common_matches:
        matching_legs[legid].extend(common_matches)

In [None]:
print("Number of matching legs (start-end):", len(matching_legs))

In [None]:
from pprint import pprint

print("matching_legs:", end="")
pprint([(k, v) for k, v in take(matching_legs.items(), 5)])

In [None]:
legs_df.loc[legs_df["legid"] == "#24:23124"].userid

In [None]:
legs_coords_df.loc[legs_coords_df["legid"] == "#24:23124"]

In [None]:
# Private-motorized legs that have at least one matching leg.
pm_selected_legs = legs_df[
    legs_df["transp_category"].eq("private_motorized")
    & legs_df["legid"].isin(set(matching_legs))
]

In [None]:
print("Number of selected private motorized legs:", pm_selected_legs.legid.nunique())

In [None]:
# Union of all legs that match one of the selected private-motorized legs.
selected_matching_legs = {
    matched_legid
    for pm_legid in set(pm_selected_legs.legid.unique())
    for matched_legid in matching_legs[pm_legid]
}

In [None]:
take(selected_matching_legs, 5)

In [None]:
print("Number of matching legs: ", len(selected_matching_legs))

In [None]:
# Among the matching legs, keep those travelled with an alternative category.
alternative_selected_legs = legs_df[
    legs_df["transp_category"].isin(ALTERNATIVE_TRANSPORT_CATEGORIES)
    & legs_df["legid"].isin(selected_matching_legs)
]

In [None]:
alternative_selected_legs.head(3)

In [None]:
# Factor records attached to the selected alternative legs.
all_factors_alternative_selected_legs = all_factors[
    all_factors["legid"].isin(alternative_selected_legs["legid"].unique())
]

In [None]:
# select all negative experience factor
# (records whose "minus" flag is True)
all_factors_minus_alternative_selected_legs = all_factors_alternative_selected_legs[
    all_factors_alternative_selected_legs["minus"].eq(True)
]

In [None]:
# Attach each leg's transport category to its negative-factor records.
all_factors_minus_alternative_selected_legs = pd.merge(
    all_factors_minus_alternative_selected_legs,
    legs_df[["legid", "transp_category"]],
    on="legid",
)

In [None]:
# Count negative-factor records per (transport category, factor) for the
# alternative legs that match a car leg, most frequent first.
final_results = (
    all_factors_minus_alternative_selected_legs.groupby(["transp_category", "factor"])
    .size()
    .sort_values(ascending=False)
    .rename("nlegs")
    .reset_index()
)
final_results.head(10)

In [None]:
# Save the comparison table in the output folder, without the index column.
outfile_name = "h9_experiencefactors_comparison.csv"
outfile_path = out_path.joinpath(outfile_name)
final_results.to_csv(outfile_path, index=False)