# H1

**Obj:** Door-to-door time
<br> To explore how the choice of modes and route is influenced by the door-to-door travel time and experience.

## Questions

- [Q1](#Q1): Is there a correlation between the number of legs and the mood?
- [Q2](#Q2): Is there a correlation between trip duration, duration of transfers (i.e. waiting time) and the mood?
- [Q3](#Q3): What is the actual travel time compared to non travel time (e.g. transfer time) per trip (connectivity ratio)?
- [Q4](#Q4): What are the most common types of transfers between modes?  i.e. active <-> pt, private car <-> pt, or between transport mode categories
- [Q5](#Q5): Which types of transfer are more significant in terms of high or low levels of mood?
- [Q6](#Q6): What is the most common main transport mode by variation of door-to-door travel time?
- [Q7](#Q7): What are the main ranges of door-to-door travel time and distance traveled? (distribution of trip durations and distances)
- [Q8](#Q8): How is the choice of mode influenced by travel time, against routeRANK alternatives e.g. do users always go for the shortest travel time?

**Note:** all analyses should be carried out for all users, and also broken down by gender and by country.



In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math
import operator

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the page. The closing </style> tag
# needs its ">" for the injected snippet to be well-formed HTML.
display(HTML("<style>.container { width:95% !important; }</style>"))

**READ DATA**

In [None]:
# --- Paths -------------------------------------------------------------------
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the merged-legs pickle; it is loaded from `input_path`.
legs = "all_legs_merged_no_outlier_0.01.pkl"
input_path = "../../2019-12-16.out/"
# Tables (`out_path`) and figures (`img_path`) currently share one folder.
out_path = "../../2019-12-16.out/hypothesis/H1/"
img_path = "../../2019-12-16.out/hypothesis/H1/"

# --- Plot defaults -----------------------------------------------------------
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output folders up front. `exist_ok=True` makes re-running this
# cell a no-op, while a path that exists but is NOT a directory still raises
# instead of being reported as "already exists". `img_path` is created too,
# because the figures are saved there.
for directory in (out_path, img_path):
    os.makedirs(os.path.abspath(directory), exist_ok=True)

In [None]:
# NOTE(review): unpickling can run arbitrary code; load only pickles written by
# this project's own pipeline.
all_legs = pd.read_pickle(input_path + legs)
# trips_users_df = pd.read_pickle(input_path + 'trips_users_df.pkl')
trips_df = pd.read_pickle(input_path + "trips_df.pkl")
## select only trips in all_legs
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]

# transport categories: every key of the JSON maps to an iterable of modes
with open(input_path + "category_transp_mode_dict.json", "r") as f:
    category_transp_mode_dict = json.load(f)

# reverse lookup: transport mode -> category
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

#### remove "unknown" as transport category (?)

# Overall size of the data set (dropna=False counts a missing id as a value,
# exactly like len(unique()) does).
print("Legs:", len(all_legs))
print("Trips: ", all_legs.tripid.nunique(dropna=False))
print("Users:", all_legs.userid.nunique(dropna=False))
print()

## Divide between male and female users (plus the "Other" group)
all_legs_M = all_legs.loc[all_legs.gender == "Male"]
print("Legs of male users:", len(all_legs_M))
print("Male users:", all_legs_M.userid.nunique(dropna=False))
print()

all_legs_F = all_legs.loc[all_legs.gender == "Female"]
print("Legs of female users:", len(all_legs_F))
print("Female users:", all_legs_F.userid.nunique(dropna=False))
print()

all_legs_O = all_legs.loc[all_legs.gender == "Other"]
print("Legs of other users:", len(all_legs_O))
print("Other users:", all_legs_O.userid.nunique(dropna=False))

In [None]:
# Groups used to lay out the per-age and per-country panels further down.
# Both lists keep the order in which the values first appear in `all_legs`.
age_range = list(all_legs.age.unique())


def _che_to_other(campaign):
    """Return 'AAA' (the class Other) for 'CHE'; any other label is unchanged."""
    return "AAA" if campaign == "CHE" else campaign


all_legs["onCampaigns"] = all_legs["onCampaigns"].map(_che_to_other)
top10 = list(all_legs.onCampaigns.unique())

<a id='Q1' ></a>
### Q1: Is there a correlation between the number of legs and the mood?
Indicates if smoothness of travel, which relates to less unwanted travel efforts, affects the mood.
<br> Mood relates to the overall question "How did you feel about this trip".

**Note:** we consider the number of legs associated with each trip and the related mood (variable `overallScore`).

In [None]:
# Number of rows of class "Leg" recorded for every trip, largest first.
only_legs = all_legs.loc[all_legs["class"] == "Leg"]
leg_counts = only_legs.groupby("tripid").size().reset_index(name="count")
leg_counts = leg_counts.sort_values(by="count", ascending=False)

# Attach the mood of the trip and keep only scores strictly between 0 and 6.
scored = leg_counts.merge(trips_df[["tripid", "overallScore"]], on="tripid")
has_valid_score = scored.overallScore.gt(0) & scored.overallScore.lt(6)
scored = scored[has_valid_score]

# Attach gender, country and age. The merge yields one row per row of
# `all_legs`, so identical rows are collapsed afterwards.
traveller_cols = ["tripid", "gender", "onCampaigns", "age"]
with_traveller = scored.merge(all_legs[traveller_cols], on="tripid").drop_duplicates()

## filter number of legs!! (trips with more than 10 legs are left out)
legs_per_trip = with_traveller[with_traveller["count"] <= 10]
legs_per_trip.head()

In [None]:
# Box plots of mood against the number of legs: all users first, then one
# panel per gender.
gender_lst = ["All", "Male", "Female"]
ncols = len(gender_lst)
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(18, 5), sharey=True)
axes = axes.ravel()

for i, (ax, gender) in enumerate(zip(axes, gender_lst)):
    # the first panel ("All") uses the unfiltered table
    if i == 0:
        panel_data = legs_per_trip
    else:
        panel_data = legs_per_trip[legs_per_trip.gender == gender]

    sns.boxplot(x="count", y="overallScore", data=panel_data, ax=ax)
    ax.set_title(gender)
    ax.set_xlabel("Legs per trip")
    if i > 0:
        # the y axis is shared, so only the first panel keeps its label
        ax.set_ylabel("")

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q1_all_gender.png", bbox_inches="tight")

In [None]:
def _avg_score_by_legs(frame, column):
    """Mean `overallScore` per leg count in `frame`, stored under `column`."""
    return frame.groupby("count")["overallScore"].mean().reset_index(name=column)


# One column of averages for all users, then one per gender. The merges are
# inner joins, so only leg counts present in every group are kept.
avg_trips = _avg_score_by_legs(legs_per_trip, "all_avg")
for gender, column in (("Male", "male_avg"), ("Female", "female_avg")):
    gender_rows = legs_per_trip[legs_per_trip.gender == gender]
    avg_trips = avg_trips.merge(_avg_score_by_legs(gender_rows, column), on="count")

for column, label in (("all_avg", "all"), ("male_avg", "male"), ("female_avg", "female")):
    plt.plot(avg_trips["count"], avg_trips[column], label=label)
plt.legend()

plt.tight_layout()
# Saving is disabled; the file name below is the one already used for the
# box-plot figure of Q1.
# plt.savefig(img_path + "h1_q1_all_gender.png", bbox_to_anchor=True, bbox_inches="tight")

In [None]:
# country: one box plot of mood against number of legs per campaign country

ncols = 5
nrows = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 10))
axes = axes.ravel()

# zip() stops at the shorter sequence: fewer countries than panels no longer
# raises IndexError, and countries beyond the grid size are left out as before.
for ax, country in zip(axes, top10):
    tmp = legs_per_trip[legs_per_trip.onCampaigns == country]
    sns.boxplot(x="count", y="overallScore", data=tmp, ax=ax)
    ax.set_title(country)
    ax.set_yticks(range(1, 6))
    ax.set_xlabel("Legs per trip")

# hide the panels that received no country
for ax in axes[len(top10):]:
    ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q1_country.png", bbox_inches="tight")

In [None]:
# Average mood per number of legs, one line per country.
# `avg_trips_country` collects the per-country averages side by side (inner
# join on the leg count, as before).
avg_trips_country = pd.DataFrame()
for c in top10:
    tmp = legs_per_trip[legs_per_trip.onCampaigns == c]
    tmp_group = tmp.groupby("count")["overallScore"].mean().reset_index(name="avg_" + c)

    # The first country seeds the table. An explicit check replaces the bare
    # `except:` that also hid any unrelated merge error.
    if "count" in avg_trips_country.columns:
        avg_trips_country = avg_trips_country.merge(tmp_group, on="count")
    else:
        avg_trips_country = tmp_group

    # Plot the country's own averages: drawing from the running inner join
    # truncated each line to the leg counts shared with the countries seen so
    # far, which made the figure depend on the iteration order.
    plt.plot(tmp_group["count"], tmp_group["avg_" + c], label=c)

plt.legend(ncol=2)

In [None]:
# BY AGE: one box plot of mood against number of legs per age group

ncols = 4
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5))
axes = axes.ravel()

# zip() stops at the shorter sequence: fewer age groups than panels no longer
# raises IndexError, and groups beyond the grid size are left out as before.
for ax, age in zip(axes, age_range):
    tmp = legs_per_trip[legs_per_trip.age == age]
    sns.boxplot(x="count", y="overallScore", data=tmp, ax=ax)
    ax.set_title(age)
    ax.set_yticks(range(1, 6))
    ax.set_xlabel("Legs per trip")

# hide the panels that received no age group
for ax in axes[len(age_range):]:
    ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q1_age.png", bbox_inches="tight")

<a id='Q2' ></a>
### Q2: Is there a correlation between trip duration, duration of transfers (i.e. waiting time) and the mood?

Indicates if smoothness of travel, which relates to less unwanted travel efforts, affects the mood.
<br>Duration of transfers is the total waiting time for the whole trip.

In [None]:
# Trips whose mood score lies strictly between 0 and 6.
trips_with_mood = trips_df[(trips_df.overallScore > 0) & (trips_df.overallScore < 6)]

# Rows of those trips, split into legs and waiting events.
# NOTE: `legs` is rebound here from the pickle file name (globals cell) to a
# DataFrame; re-run the globals cell before reloading the data.
in_scored_trip = all_legs.tripid.isin(trips_with_mood.tripid.unique())
legs = all_legs[(all_legs["class"] == "Leg") & in_scored_trip]
wevent = all_legs[(all_legs["class"] == "WaitingEvent") & in_scored_trip]


def _duration_with_mood(events):
    """Sum `inferred_leg_duration_min` per trip and attach mood and traveller info.

    The gender/country/age columns are always taken from `legs`, also when
    `events` holds the waiting events.
    """
    totals = (
        events.groupby("tripid")["inferred_leg_duration_min"]
        .sum()
        .reset_index(name="trip_duration")
    )
    totals = totals.merge(
        trips_with_mood[["tripid", "overallScore"]], on="tripid"
    ).drop_duplicates()
    return totals.merge(
        legs[["tripid", "gender", "onCampaigns", "age"]], on="tripid"
    ).drop_duplicates()


trip_dur_leg = _duration_with_mood(legs)
trip_dur_we = _duration_with_mood(wevent)

In [None]:
# Average duration of legs and of waiting events per mood score: all users
# first, then one panel per gender (labels come from `gender_lst`).
ncols = 3
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5), sharey=True)
axes = axes.ravel()

for i, (ax, gender) in enumerate(zip(axes, gender_lst)):
    # the first panel ("All") uses the unfiltered tables
    if i == 0:
        leg_rows, we_rows = trip_dur_leg, trip_dur_we
    else:
        leg_rows = trip_dur_leg[trip_dur_leg.gender == gender]
        we_rows = trip_dur_we[trip_dur_we.gender == gender]

    avg_dur = leg_rows.groupby("overallScore")["trip_duration"].mean().reset_index()
    avg_dur_we = we_rows.groupby("overallScore")["trip_duration"].mean().reset_index()

    ax.plot(avg_dur.overallScore, avg_dur.trip_duration, label="leg")
    ax.plot(avg_dur_we.overallScore, avg_dur_we.trip_duration, label="waiting event")
    ax.set_title(gender)
    ax.set_xlabel("Mood")

    if i == 0:
        # the y axis is shared: label and legend appear on the first panel only
        ax.set_ylabel("Average trip duration (min)")
        ax.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q2_all_gender.png", bbox_inches="tight")

In [None]:
# country: average duration of legs and waiting events per mood score

ncols = 5
nrows = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8))
axes = axes.ravel()

# zip() stops at the shorter sequence: fewer countries than panels no longer
# raises IndexError, and countries beyond the grid size are left out as before.
for i, (ax, country) in enumerate(zip(axes, top10)):
    tmp_leg = trip_dur_leg[trip_dur_leg.onCampaigns == country]
    tmp_we = trip_dur_we[trip_dur_we.onCampaigns == country]

    avg_dur = tmp_leg.groupby("overallScore")["trip_duration"].mean().reset_index()
    ax.plot(avg_dur.overallScore, avg_dur.trip_duration, label="leg")

    avg_dur_we = tmp_we.groupby("overallScore")["trip_duration"].mean().reset_index()
    ax.plot(avg_dur_we.overallScore, avg_dur_we.trip_duration, label="waiting event")

    ax.set_title(country)
    ax.set_xlabel("Mood")
    ax.set_ylabel("Avg duration (min)")
    ax.set_xticks(range(1, 6))

    if i == 0:
        # one legend is enough: every panel uses the same two series
        ax.legend(loc="lower right", fontsize="x-small")

# hide the panels that received no country
for ax in axes[len(top10):]:
    ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q2_country.png", bbox_inches="tight")

In [None]:
# BY AGE: average duration of legs and waiting events per mood score

ncols = 4
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5))
axes = axes.ravel()

# zip() stops at the shorter sequence: fewer age groups than panels no longer
# raises IndexError, and groups beyond the grid size are left out as before.
for i, (ax, age) in enumerate(zip(axes, age_range)):
    tmp_leg = trip_dur_leg[trip_dur_leg.age == age]
    tmp_we = trip_dur_we[trip_dur_we.age == age]

    avg_dur = tmp_leg.groupby("overallScore")["trip_duration"].mean().reset_index()
    ax.plot(avg_dur.overallScore, avg_dur.trip_duration, label="leg")

    avg_dur_we = tmp_we.groupby("overallScore")["trip_duration"].mean().reset_index()
    ax.plot(avg_dur_we.overallScore, avg_dur_we.trip_duration, label="waiting event")

    ax.set_title(age)
    ax.set_xlabel("Mood")
    ax.set_ylabel("Avg duration (min)")
    ax.set_xticks(range(1, 6))

    if i == 0:
        # one legend is enough: every panel uses the same two series
        ax.legend(loc="lower right", fontsize="x-small")

# hide the panels that received no age group
for ax in axes[len(age_range):]:
    ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is a legend-placement option, not a savefig keyword, and
# recent Matplotlib releases reject unknown savefig keywords; it is dropped.
plt.savefig(img_path + "h1_q2_age.png", bbox_inches="tight")

In [None]:
from scipy.stats import pearsonr, chi2_contingency

# Pearson correlation between the mood score of a trip and the summed duration
# of its legs; only the coefficient (first element of the result) is kept.
pearson_result = pearsonr(trip_dur_leg["overallScore"], trip_dur_leg["trip_duration"])
pearson_corr = pearson_result[0]
print("Correlation: ", pearson_corr)


def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical series.

    The contingency table of ``x`` and ``y`` is built with ``pd.crosstab``, its
    chi-square statistic is computed with ``chi2_contingency``, and both phi²
    and the table dimensions are corrected by the sample size before V is
    derived.

    Args:
        x: Series (or array-like accepted by ``pd.crosstab``) of categories.
        y: Series of categories, aligned with ``x``.

    Returns:
        A value in [0, 1]. ``0.0`` is returned for degenerate input where the
        statistic is undefined: fewer than two observed categories in either
        variable, or a corrected dimension of zero (as many categories as
        observations). Previously these cases produced NaN via a 0/0 division.
    """
    confusion_matrix = pd.crosstab(x, y)
    r, k = confusion_matrix.shape
    # A variable with a single (or no) observed category cannot be associated
    # with anything; this also covers empty input and n == 1.
    if min(r, k) < 2:
        return 0.0

    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    # bias correction of phi² and of the number of rows/columns
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # happens when n equals the number of categories of a variable
        return 0.0
    return np.sqrt(phi2corr / denominator)


# Cramér's V lies in [0, 1]: 0 means no association, 1 means full association.
# It is symmetric, i.e. swapping the two arguments gives the same value.
# NOTE(review): `trip_duration` is a sum of minutes, so the contingency table
# may get one column per distinct duration; consider binning it first — confirm
# whether that is intended.
mood_scores = trip_dur_leg["overallScore"]
trip_minutes = trip_dur_leg["trip_duration"]
cramerv_ass = cramers_v(mood_scores, trip_minutes)
print("Cramer's v: ", cramerv_ass)

<a id='Q5' ></a>
### Q5: Which types of transfer are more significant in terms of high or low levels of mood?

**Types of transfer:** most common combinations of transport mode changes within a trip i.e. bike to train, car to walking etc.

**OSS:** consider *only* wastedTime = 1 and wastedTime = 5

In [None]:
def _nearest_int(score):
    """Round `score` to zero decimals and return it as a Python int."""
    return int(round(score, 0))


# Store wastedTime as whole numbers so it can be compared with 1 and 5 below.
all_legs["wastedTime"] = all_legs["wastedTime"].map(_nearest_int)

In [None]:
# Select the legs used for the transfer analysis. The printed labels now state
# what is actually counted: `len()` of a frame is a number of leg rows, not a
# number of trips.

# remove legs with unknown as transport mode
trips_more2legs = all_legs[all_legs.correctedModeOfTransport_str != "unknown"]
print("Legs with valid mode of transport:", len(trips_more2legs))

# keep trips with at least 2 such legs
trip_count = trips_more2legs.groupby("tripid").size().reset_index(name="legs_per_trip")
trip_count = trip_count[trip_count["legs_per_trip"] > 1]
trips_more2legs = trips_more2legs[
    trips_more2legs.tripid.isin(trip_count.tripid.unique())
]
print("Legs from trips with 2 or more legs:", len(trips_more2legs))

# trips in which at least one leg has wt=1
wt1_tripids = trips_more2legs[trips_more2legs.wastedTime == 1].tripid.unique()
print("Trips with wt=1", len(wt1_tripids))

# trips in which at least one leg has wt=5
wt5_tripids = trips_more2legs[trips_more2legs.wastedTime == 5].tripid.unique()
print("Trips with wt=5", len(wt5_tripids))

# take only trips with legs with different transport modes
trip_count_distinct_modes = (
    trips_more2legs.groupby("tripid")["correctedModeOfTransport_str"]
    .nunique()
    .reset_index(name="distinct_modes")
)
trip_count_distinct_modes = trip_count_distinct_modes[
    trip_count_distinct_modes.distinct_modes > 1
]

trip_morelegs_ids = trip_count_distinct_modes.tripid.unique()
has_mixed_modes = trips_more2legs.tripid.isin(trip_morelegs_ids)

#### WT 1
wt1 = trips_more2legs[has_mixed_modes & trips_more2legs.tripid.isin(wt1_tripids)]
print(
    "--- WT1: Legs from trips with 2 or more legs and different transport modes and wt=1:",
    len(wt1),
)

#### WT 5
wt5 = trips_more2legs[has_mixed_modes & trips_more2legs.tripid.isin(wt5_tripids)]
print(
    "--- WT5: Legs from trips with 2 or more legs and different transport modes and wt=5:",
    len(wt5),
)

In [None]:
#### WT1
# Count every change of transport mode between consecutive legs of a trip.
#   type_transfer:         (from_mode, to_mode) -> number of occurrences
#   type_transfer_tripids: (from_mode, to_mode) -> trip ids, one per occurrence

type_transfer = {}
type_transfer_tripids = {}

# groupby(sort=False) visits the trips in order of first appearance and avoids
# re-filtering the whole frame once per trip.
for tid, trip_legs in wt1.groupby("tripid", sort=False):
    modes = trip_legs.sort_values(by="startDate_formated")[
        "correctedModeOfTransport_str"
    ].tolist()

    for current_tm, next_tm in zip(modes, modes[1:]):
        if current_tm == next_tm:
            continue
        transfer = (current_tm, next_tm)
        # update the count
        type_transfer[transfer] = type_transfer.get(transfer, 0) + 1
        # update the list of tripids; the first trip of every transfer type is
        # recorded too (it used to be dropped when the list was created)
        type_transfer_tripids.setdefault(transfer, []).append(tid)

# NOTE: "n_trips" holds occurrences, so a trip repeating the same transfer
# contributes more than once.
top_type_t_wt1 = sorted(type_transfer.items(), key=operator.itemgetter(1), reverse=True)
top_type_t_df_wt1 = pd.DataFrame(top_type_t_wt1, columns=["transfer_type", "n_trips"])
print("different combinations of type of transfer ", len(top_type_t_df_wt1))

# save table
top_type_t_df_wt1.to_csv(out_path + "top_type_tranfer_wt1.csv", index=False)

top_type_t_df_wt1.head(10)

In [None]:
#### WT5
# Count every change of transport mode between consecutive legs of a trip.
#   type_transfer:         (from_mode, to_mode) -> number of occurrences
#   type_transfer_tripids: (from_mode, to_mode) -> trip ids, one per occurrence

type_transfer = {}
type_transfer_tripids = {}

# groupby(sort=False) visits the trips in order of first appearance and avoids
# re-filtering the whole frame once per trip.
for tid, trip_legs in wt5.groupby("tripid", sort=False):
    modes = trip_legs.sort_values(by="startDate_formated")[
        "correctedModeOfTransport_str"
    ].tolist()

    for current_tm, next_tm in zip(modes, modes[1:]):
        if current_tm == next_tm:
            continue
        transfer = (current_tm, next_tm)
        # update the count
        type_transfer[transfer] = type_transfer.get(transfer, 0) + 1
        # update the list of tripids; the first trip of every transfer type is
        # recorded too (it used to be dropped when the list was created)
        type_transfer_tripids.setdefault(transfer, []).append(tid)

# NOTE: "n_trips" holds occurrences, so a trip repeating the same transfer
# contributes more than once.
top_type_t_wt5 = sorted(type_transfer.items(), key=operator.itemgetter(1), reverse=True)
top_type_t_df_wt5 = pd.DataFrame(top_type_t_wt5, columns=["transfer_type", "n_trips"])
print("different combinations of type of transfer ", len(top_type_t_df_wt5))

# save table
top_type_t_df_wt5.to_csv(out_path + "top_type_tranfer_wt5.csv", index=False)

top_type_t_df_wt5.head(10)