# H02

**Obj:** Reliable door-to-door time
<br> To explore how VTT is influenced by the reliability of the planned travel choice.

## Questions

- [Q1](#Q1): How is the reliability of travel time related to worthwhile assessments?
- [Q2](#Q2): How often is reliability of travel time selected as an experience factor in comparison to other factors?

**Note:** all analyses should be run for all users and also broken down by gender and by country

**todo from work plan:** Q1, Q2, Q3

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math
import operator

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the browser window.
# The closing tag was previously truncated ("</style"), leaving malformed HTML.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global variables: locations of the inputs and outputs used by this notebook.
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the pickled legs table; it is joined to input_path when loaded.
legs = "all_legs_merged_no_outlier_0.01.pkl"
input_path = "../../2019-12-16.out/"
# Tables and figures for this hypothesis are written to the same directory.
out_path = "../../2019-12-16.out/hypothesis/H2/"
img_path = "../../2019-12-16.out/hypothesis/H2/"

# Graphical parameters shared by every figure below.
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directory (including parents). If it is already there,
# say so on stderr and carry on.
h2_out_dir = os.path.abspath(out_path)
try:
    os.makedirs(h2_out_dir)
except FileExistsError:
    sys.stderr.write("Directory '{}' already exists\n".format(out_path))

In [None]:
# Legs and trips tables produced by the pre-processing step.
all_legs = pd.read_pickle(input_path + legs)
trips_df = pd.read_pickle(input_path + "trips_df.pkl")
# Optional inputs / filters, currently disabled:
# trips_users_df = pd.read_pickle(input_path + 'trips_users_df.pkl')
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]

# transport categories: the JSON maps each category to the list of its modes
with open(input_path + "category_transp_mode_dict.json") as f:
    category_transp_mode_dict = json.load(f)

# Reverse lookup: transport mode -> category.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

#### remove "unknown" as transport category (?)

# Size of the working data set.
print("Legs:", len(all_legs))
print("Trips: ", len(all_legs["tripid"].unique()))
print("Users:", len(all_legs["userid"].unique()))
print()

In [None]:
### read data for reliability
# One row per (leg, experience factor) with boolean "plus"/"minus" flags.
all_factors = pd.read_pickle(input_path + "all_factors.pkl")

# delete legs with minus=F and plus=F
all_factors = all_factors[
    ~((all_factors["minus"] == False) & (all_factors["plus"] == False))
]

# delete legs with minus=T and plus=T (3% of obs)
all_factors = all_factors[
    ~((all_factors["minus"] == True) & (all_factors["plus"] == True))
]

print("all records:", len(all_factors))
print()

## select only legs that have reliability
reliability_legs = all_factors[all_factors["factor"] == "Reliability_Of_Travel_Time"]
# select only useful cols
reliability_legs = reliability_legs[
    [
        "correctedModeOfTransport_str",
        "legid",
        "minus",
        "plus",
        "tripid",
        "factor",
        "legStartDay",
    ]
]

# add info
# Inner join (pandas default): factor rows whose legid is missing from
# all_legs are dropped here.
reliability_legs = reliability_legs.merge(
    all_legs[
        [
            "legid",
            "wastedTime",
            "gender",
            "age",
            "onCampaigns",
            "transp_category",
            "we_vs_wd",
        ]
    ],
    on="legid",
)

## add purpose
# read purposes -> trip_obj_grouped.pkl
trip_objs = pd.read_pickle(input_path + "trip_objs_grouped.pkl")
# add the trip purpose (objective_str) to every leg of the trip
reliability_legs = reliability_legs.merge(
    trip_objs[["tripid", "objective_str"]], on="tripid"
)

# select useful wastedTime: keep ratings strictly between 0 and 6.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the merged frame (SettingWithCopyWarning).
reliability_legs = reliability_legs[
    (reliability_legs.wastedTime > 0) & (reliability_legs.wastedTime < 6)
].copy()
# round the rating to the nearest integer (vectorised instead of per-row apply)
reliability_legs["wastedTime"] = reliability_legs["wastedTime"].round(0)

# remove legs with "None" transport category; copy again so later cells can
# add columns to reliability_legs without the same warning
reliability_legs = reliability_legs[reliability_legs.transp_category.notna()].copy()

# checks
print("total legs with reliability:", len(reliability_legs))
xx = reliability_legs[
    (reliability_legs["minus"] == False) & (reliability_legs["plus"] == True)
]
print("only plus: ", len(xx))
xx = reliability_legs[
    (reliability_legs["minus"] == True) & (reliability_legs["plus"] == False)
]
print("only minus: ", len(xx))


def find_impact(plus, minus):
    """Name the single direction in which a factor was flagged.

    Returns "plus" when only ``plus`` is set, "minus" when only ``minus`` is
    set, and None when both or neither are set.
    """
    # Compare with == rather than `is`, so flags that are not the Python
    # True/False singletons but compare equal to them are still recognised.
    only_plus = (plus == True) & (minus == False)
    only_minus = (plus == False) & (minus == True)

    if only_plus:
        return "plus"
    return "minus" if only_minus else None


# Tag every leg with the direction ("plus" / "minus") of its reliability flag.
reliability_legs["impact"] = [
    find_impact(plus_flag, minus_flag)
    for plus_flag, minus_flag in zip(
        reliability_legs["plus"], reliability_legs["minus"]
    )
]

# Quick sanity check, disabled:
# reliability_legs.groupby('impact').size()


# Preview the resulting table.
reliability_legs.head()

In [None]:
# define for plots
age_range = list(all_legs.age.unique())

# assign 'CHE' to the class Other (AAA)
# reliability_legs copied its onCampaigns column from all_legs in an earlier
# cell, so it is recoded here as well. Otherwise its "CHE" legs would never
# match an entry of top10 and would drop out of the per-country plots.
for _frame in (all_legs, reliability_legs):
    _frame["onCampaigns"] = _frame["onCampaigns"].apply(
        lambda x: "AAA" if x == "CHE" else x
    )
# campaign codes present in all_legs after the recode
top10 = list(all_legs.onCampaigns.unique())

# transp_category list (categories that actually occur in reliability_legs)
tc_lst = reliability_legs.transp_category.unique()

# gender list
gender_lst = ["Male", "Female"]

# purpose
obj_lst = list(reliability_legs.objective_str.unique())

# impact
impact_lst = ["plus", "minus"]

<a id='Q1' ></a>
### Q1: How is the reliability of travel time related to worthwhile assessments?

Explore if and how reliability as an experience factor affects worthwhileness levels.
<br/> Study whether the impact of these experience factors changes according to mode, purpose, territory/country, and weekday/weekend.

**Variables:**
worthwhileness levels: `wastedTime`
<br/> reliability: getting there factor, `Reliability_Of_Travel_Time`

In [None]:
## ALL + GENDER
# Three panels sharing the y axis: all users, then one panel per gender.
# Each bar is the share of a worthwhileness rating within its impact group
# ("plus" or "minus"), so the bars of one hue sum to 1 within a panel.

nrows = 1
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5), sharey=True)
axes = axes.ravel()

tmp = (
    reliability_legs.groupby(["wastedTime", "impact"]).size().reset_index(name="count")
)

totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
# Vectorised normalisation: "minus" rows are divided by totM, all others by totP.
tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)


sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=axes[0])
axes[0].legend(fontsize="x-small")
# axes[0].set_xlabel(fontsize=12)
axes[0].set_title("All", fontsize=14)

gender_lst = ["Male", "Female"]
for i, gender in enumerate(gender_lst):
    ax = axes[i + 1]
    tmp = (
        reliability_legs[reliability_legs.gender == gender]
        .groupby(["wastedTime", "impact"])
        .size()
        .reset_index(name="count")
    )

    totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
    totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
    tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)

    sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=ax)
    # The first panel already carries the legend; remove the one seaborn adds here.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_ylabel(None)
    # ax.set_xlabels(fontsize=12)
    ax.set_title(gender, fontsize=14)

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h2_q1_all_gender.png", bbox_inches="tight")

In [None]:
### BY TRANSPORT CATEGORY
# One panel per transport category in tc_lst (the 2x3 grid holds at most six).
# Bars show the share of each worthwhileness rating within its impact group.

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for idx, c in enumerate(tc_lst):
    ax = axes[idx]

    tmp = reliability_legs[reliability_legs.transp_category == c]
    tmp = tmp.groupby(["wastedTime", "impact"]).size().reset_index(name="count")

    totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
    totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
    # Vectorised normalisation: "minus" rows are divided by totM, all others by totP.
    tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)

    sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=ax)
    ax.set_title(c, fontsize=14)
    ax.set_ylabel("relative count", fontsize=12)
    ax.set_xlabel("worthwhileness ratings", fontsize=12)
    ax.tick_params(axis="both", labelsize=12)

    # Only the first panel keeps a legend; the others have theirs removed.
    if idx == 0:
        ax.legend(fontsize="x-small", loc="upper left")
    elif ax.get_legend() is not None:
        ax.get_legend().remove()

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h2_q1_tc.png", bbox_inches="tight")

In [None]:
### BY COUNTRY
# One panel per campaign code in top10 (the 2x5 grid holds at most ten).
# Bars show the share of each worthwhileness rating within its impact group.

nrows = 2
ncols = 5
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for idx, c in enumerate(top10):
    ax = axes[idx]

    tmp = reliability_legs[reliability_legs.onCampaigns == c]
    tmp = tmp.groupby(["wastedTime", "impact"]).size().reset_index(name="count")

    totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
    totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
    # Vectorised normalisation: "minus" rows are divided by totM, all others by
    # totP. Unlike a row-wise apply, this also yields a plain (empty) column
    # when a campaign has no reliability legs.
    tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)

    sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=ax)
    ax.set_title(c, fontsize=14)
    ax.set_ylabel("relative count", fontsize=12)
    ax.set_xlabel("worthwhileness ratings", fontsize=12)
    ax.tick_params(axis="both", labelsize=12)

    # Only the first panel keeps a legend; the others have theirs removed.
    if idx == 0:
        ax.legend(fontsize="x-small", loc="upper left")
    elif ax.get_legend() is not None:
        ax.get_legend().remove()

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h2_q1_country.png", bbox_inches="tight")

In [None]:
### BY PURPOSE
# One panel per trip purpose in obj_lst (the 2x3 grid holds at most six).
# Bars show the share of each worthwhileness rating within its impact group.

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, purpose in enumerate(obj_lst):
    ax = axes[i]

    tmp = reliability_legs[reliability_legs.objective_str == purpose]
    tmp = tmp.groupby(["wastedTime", "impact"]).size().reset_index(name="count")

    totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
    totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
    # Vectorised normalisation: "minus" rows are divided by totM, all others by totP.
    tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)

    sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=ax)
    ax.set_title(purpose, fontsize=14)
    ax.set_ylabel("relative count", fontsize=12)
    ax.set_xlabel("worthwhileness ratings", fontsize=12)
    ax.tick_params(axis="both", labelsize=12)

    # Only the first panel keeps a legend; the others have theirs removed.
    if i == 0:
        ax.legend(fontsize="x-small", loc="upper left")
    elif ax.get_legend() is not None:
        ax.get_legend().remove()


fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h2_q1_purpose.png", bbox_inches="tight")

In [None]:
### BY WEEKEND AND WEEKDAY
# Two panels: legs whose we_vs_wd is "Working_day" and those with "Weekend".
# Bars show the share of each worthwhileness rating within its impact group.

day_lst = ["Working_day", "Weekend"]

nrows = 1
ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5), sharey=True)
axes = axes.ravel()

for i, day_type in enumerate(day_lst):
    ax = axes[i]

    tmp = reliability_legs[reliability_legs.we_vs_wd == day_type]
    tmp = tmp.groupby(["wastedTime", "impact"]).size().reset_index(name="count")

    totM = tmp.loc[tmp["impact"] == "minus", "count"].sum()
    totP = tmp.loc[tmp["impact"] == "plus", "count"].sum()
    # Vectorised normalisation: "minus" rows are divided by totM, all others by totP.
    tmp["rel_count"] = tmp["count"] / np.where(tmp["impact"] == "minus", totM, totP)

    sns.barplot(x="wastedTime", y="rel_count", hue="impact", data=tmp, ax=ax)
    ax.set_title(day_type, fontsize=14)
    ax.set_ylabel("relative count", fontsize=12)
    ax.set_xlabel("worthwhileness ratings", fontsize=12)
    ax.tick_params(axis="both", labelsize=12)

    # Only the first panel keeps a legend; the other has its legend removed.
    if i == 0:
        ax.legend(fontsize="x-small", loc="upper left")
    elif ax.get_legend() is not None:
        ax.get_legend().remove()


fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h2_q1_we_vs_wd.png", bbox_inches="tight")

<a id='Q2' ></a>
### Q2: How often is reliability of travel time selected as an experience factor in comparison to other factors?

In [None]:
def _legs_per_factor(records):
    """Count the rows of *records* per factor, most frequent first.

    Returns a frame with the columns "factor" and "nlegs".
    """
    per_factor = records.groupby("factor").size()
    ranked = per_factor.sort_values(ascending=False)
    return ranked.reset_index(name="nlegs")


## ALL: every record, whatever its direction
tmp_all = _legs_per_factor(all_factors)
tmp_all.to_csv(out_path + "h2_q2_table.csv", index=False)

## PLUS: records flagged "plus" only
tmp_P = all_factors[(all_factors["minus"] == False) & (all_factors["plus"] == True)]
tmp = _legs_per_factor(tmp_P)
tmp.to_csv(out_path + "h2_q2_table_plus.csv", index=False)

## MINUS: records flagged "minus" only
tmp_M = all_factors[(all_factors["minus"] == True) & (all_factors["plus"] == False)]
tmp = _legs_per_factor(tmp_M)
tmp.to_csv(out_path + "h2_q2_table_minus.csv", index=False)

# Preview of the last table written (the "minus" one).
tmp.head()

In [None]:
## BY TRANSPORT CATEGORY
# Writes one CSV per transport category holding its ten most frequent factors.

# Attach the transport category of each leg to the factor records.
leg_categories = all_legs[["legid", "transp_category"]]
all_factors_tc = all_factors.merge(leg_categories, on="legid")

for tc in tc_lst:
    # records belonging to this category
    tmp_tc = all_factors_tc.loc[all_factors_tc["transp_category"] == tc]

    # factors ranked by number of records; keep the first ten
    ranked = tmp_tc.groupby("factor").size().sort_values(ascending=False)
    tmp = ranked.head(10).reset_index(name="nlegs")

    # save
    csv_name = "h2_q2_table_top10_" + tc + ".csv"
    tmp.to_csv(out_path + csv_name, index=False)