# H12

**Obj:** Travel activities
<br> To explore how VTT is influenced by the range and diversity of activities while travelling.

## Questions

- [Q1](#Q1): Is there a correlation between the number of different activities and worthwhileness ratings? Is a higher number of activities correlated with a more negative assessment of worthwhileness?
- [Q2](#Q2): What are the activities that are more frequently associated with each kind and level of worthwhileness value? (same as H14)
- [Q3](#Q3): What activities are more likely to be connected to work-related travel purposes vs leisure?

In [None]:
# Import libraries

import os
import sys
import json
import time
from datetime import date, datetime

# numerical libraries
import pandas as pd
import numpy as np

# plotting libraries
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

%matplotlib inline

# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container. The closing tag must be a complete `</style>`,
# otherwise the injected markup is malformed.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# global variables
# NOTE(review): `cutting_date` is not referenced by any cell visible in this
# notebook — presumably the filtering happened upstream; confirm before removing.
cutting_date = "2019-05-01"  # remove trips and data published before this date
meta_data_path = "../../data-campaigns/meta-data/"
# folder holding the pickled input datasets read by the cells below
input_path = "../../2019-12-16.out/"
# tables (CSV) and figures (PNG) are written to the same H12 folder
out_path = "../../2019-12-16.out/hypothesis/H12/"
img_path = "../../2019-12-16.out/hypothesis/H12/"

# Graphical parameters
rcParams["axes.titlepad"] = 45  # extra padding between axes and title
rcParams["font.size"] = 16
rcParams["figure.figsize"] = 12, 8  # default figure size (width, height) in inches
sns.set_style("whitegrid")

In [None]:
# Create the output folders up front. `exist_ok=True` makes the cell safe to
# re-run and removes the "already exists" message that was always emitted
# because out_path and img_path point to the same directory.
for directory in (out_path, img_path):
    os.makedirs(os.path.abspath(directory), exist_ok=True)

In [None]:
# Pickled inputs, all stored under `input_path`
legs = "all_legs_merged_no_outlier_0.01.pkl"
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"
users_with_trips = "users_df_with_trips.pkl"

# Load the four datasets in one pass (order matches the unpacking targets).
# NOTE: pickle files must come from a trusted source.
legs_df, trips_users_df, trips_df, users_df_with_trips = (
    pd.read_pickle(input_path + filename)
    for filename in (legs, trips_users, trips, users_with_trips)
)

### Read activity data

In [None]:
# read data: one row per recorded activity
all_gen_act = pd.read_pickle(input_path + "all_gen_act.pkl")

# add info: attach the worthwhileness rating (wastedTime) of the leg
all_gen_act = all_gen_act.merge(legs_df[["legid", "wastedTime"]], on="legid")

# keep only ratings strictly between 0 and 6 and round them to whole numbers.
# Working on an explicit copy avoids assigning into a filtered slice
# (SettingWithCopyWarning); the vectorised `.round()` replaces a per-row apply.
has_valid_rating = (all_gen_act["wastedTime"] > 0) & (all_gen_act["wastedTime"] < 6)
all_gen_act = all_gen_act.loc[has_valid_rating].copy()
all_gen_act["wastedTime"] = all_gen_act["wastedTime"].round()

# add values from trip, ignoring entries whose element is "Unknown"
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")
values_from_trip = values_from_trip[values_from_trip.valueFromTrip != "Unknown"]

# wide format: one row per leg, one column per worthwhileness element
values_from_trip_pivot = (
    values_from_trip[["legid", "value", "valueFromTrip"]]
    .pivot(index="legid", columns="valueFromTrip", values="value")
    .reset_index()
)

# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
productivity = values_from_trip_pivot[["Paid_work", "Personal_tasks"]].max(axis=1)
values_from_trip_pivot = values_from_trip_pivot.drop(
    columns=["Paid_work", "Personal_tasks"]
).assign(Productivity=productivity)

all_gen_act = all_gen_act.merge(values_from_trip_pivot, on="legid").drop_duplicates()
print("shape", all_gen_act.shape)
print("unique legs", all_gen_act.legid.nunique())
all_gen_act.head()

In [None]:
# For each activity code, count the distinct values found in every other column
all_gen_act.groupby("code").nunique()

<a id='Q1' ></a>
### Q1: Is there a correlation between the number of different activities and worthwhileness ratings? Is a higher number of activities correlated with a more negative assessment of worthwhileness?

Correlation between worthwhileness/worthwhileness elements and activities undertaken while travelling. Differentiate between activities that are smartphone-enabled and those that are not. Does the impact of these activities change according to mode, purpose, territory/country, etc.?

In [None]:
# Number of distinct activity codes recorded for each trip
num_activities = (
    all_gen_act.groupby("tripid")["code"]
    .nunique()
    .rename("num_activities")
    .reset_index()
)
num_activities.head(5)

In [None]:
# double check count
# Manual spot check: list the (tripid, code) rows of two hard-coded trips so the
# values in `num_activities` can be compared by eye.
xx = all_gen_act[["tripid", "code"]]
xx.loc[xx["tripid"].isin(["#30:10007", "#30:10009"])]

In [None]:
# Attach the per-trip activity count to every activity row and keep only the
# trip id, the count and the worthwhileness rating.
# NOTE(review): drop_duplicates() runs BEFORE the column selection, so `na_wt`
# can still contain repeated (tripid, num_activities, wastedTime) rows — the
# averages computed from it below are therefore weighted by the number of
# activity rows. Confirm this weighting is intended.
na_wt = all_gen_act.merge(num_activities, on="tripid").drop_duplicates()[
    ["tripid", "num_activities", "wastedTime"]
]
na_wt.head(3)

In [None]:
# Compare the number of distinct trips in the legs dataset with the number of
# distinct trips that have at least one activity row.
print("Number of trips: ", legs_df.tripid.nunique())
print("Number of trips with at least 1 activity: ", all_gen_act.tripid.nunique())

In [None]:
# Trips that have activity rows but do not appear in legs_df
activity_trip_ids = set(all_gen_act.tripid.unique())
legs_trip_ids = set(legs_df.tripid.unique())
removed_trips_with_activity = activity_trip_ids - legs_trip_ids
print(
    "Number of removed trips with at least 1 activity: ",
    len(removed_trips_with_activity),
)

# Keep a single row per trip, discarding the trips identified above
is_removed_trip = all_gen_act.tripid.isin(removed_trips_with_activity)
all_gen_act_clean = (
    all_gen_act[~is_removed_trip].drop_duplicates(subset="tripid").copy()
)
print(
    "Number of trips with at least 1 activity (clean): ",
    all_gen_act_clean.tripid.nunique(),
)

# Per-trip table: trip id, number of distinct activities, rating
trip_columns = ["tripid", "num_activities", "wastedTime"]
na_tmp = all_gen_act_clean.merge(num_activities, on="tripid")[
    trip_columns
].drop_duplicates()

In [None]:
# Trips present in legs_df for which no activity row survived the cleaning
trips_zero_act = set(legs_df.tripid.unique()) - set(all_gen_act_clean.tripid.unique())
print("Trips with 0 activities: {}".format(len(trips_zero_act)))

# number of activities -> number of trips (key 0 = trips without activities).
# The per-trip table is `na_tmp`; the name `na_wt_clean` used here before was
# never defined and raised a NameError on a fresh kernel.
na = {0: len(trips_zero_act)}
na.update(na_tmp.groupby("num_activities")["tripid"].count().to_dict())

print("Total trips: {}".format(sum(na.values())))

In [None]:
from collections import defaultdict

# Collapse the histogram: counts below 5 keep their own bucket (keyed by the
# count as a string), everything else is summed into a single "5+" bucket.
na_agg = defaultdict(int)

for n_act, n_trips in na.items():
    bucket = str(n_act) if n_act < 5 else "5+"
    na_agg[bucket] += n_trips

In [None]:
# Bar chart: number of trips per (aggregated) number of activities.
fig, ax = plt.subplots(figsize=(7, 5))

# Materialise the dict views so matplotlib receives plain sequences.
buckets = list(na_agg.keys())
trip_counts = list(na_agg.values())

# No `label=`: the former `label=na.keys()` had a different length than the
# bars (newer matplotlib rejects that) and no legend is drawn anyway.
ax.bar(buckets, trip_counts, width=1.0, color="g")
ax.set_xticks(buckets)
ax.set_xlabel("Number of activities")
ax.set_yticks(range(0, 37500, 5000))
ax.set_ylabel("Number of trips")

print("Image path: ", img_path + "h12_q1_activities.png")
# `bbox_to_anchor` is a legend option, not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q1_activities.png", bbox_inches="tight")

In [None]:
# initial plots: number of activities and wastedTime distribution

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
axes = axes.ravel()

# Left panel: trips per number of activities.
activity_counts = list(na.keys())
axes[0].bar(activity_counts, list(na.values()), width=1.0, color="g")
# Ticks belong to the LEFT panel (they were previously set on axes[1] and then
# overwritten); one tick per observed activity count.
axes[0].set_xticks(sorted(activity_counts))
axes[0].set_xlabel("Number of activities")
axes[0].set_ylabel("Number of trips")
# `na` counts trips, so the title says trips (it used to say "Legs").
axes[0].set_title("Trips per number of activities")

# Right panel: how many rows of `na_wt` carry each worthwhileness rating.
rating_counts = na_wt.groupby("wastedTime").size().reset_index(name="count")
axes[1].bar(rating_counts["wastedTime"], rating_counts["count"])
axes[1].set_xticks(range(1, 6))
axes[1].set_title("Worthwhileness ratings distribution")
axes[1].set_xlabel("worthwhileness ratings")
axes[1].set_ylabel("number of legs")

plt.tight_layout()
# `bbox_to_anchor` is not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q1_activities_wt.png", bbox_inches="tight")

In [None]:
# Sanity check: total number of trips across all activity-count buckets
sum(na.values())

In [None]:
# Distinct (tripid, rating) pairs taken from legs whose rating lies strictly
# between 0 and 6, with the rating rounded to a whole number.
rating_in_range = (legs_df.wastedTime > 0) & (legs_df.wastedTime < 6)
alltrips_wt = legs_df.loc[rating_in_range, ["tripid", "wastedTime"]].drop_duplicates()
alltrips_wt["wastedTime"] = alltrips_wt["wastedTime"].round(0)

# Mean and standard deviation of the rating, rounded to one decimal
alltrips_wt_mean = round(alltrips_wt["wastedTime"].mean(), 1)
alltrips_wt_std = round(alltrips_wt["wastedTime"].std(), 1)
summary_template = "wastedTime (all trips): {:.1f} +/- {:.1f}"
print(summary_template.format(alltrips_wt_mean, alltrips_wt_std))

In [None]:
# select trips with zero activities
trips_zero_act_wt = alltrips_wt.loc[alltrips_wt["tripid"].isin(trips_zero_act)]

# Mean and standard deviation of the rating for this subset, one decimal
trips_zero_act_wt_mean = round(trips_zero_act_wt.wastedTime.mean(), 1)
trips_zero_act_wt_std = round(trips_zero_act_wt.wastedTime.std(), 1)
# The label used to read "(all trips)" (copy-paste from the previous cell),
# which made the two printed results indistinguishable.
print(
    "wastedTime (trips with 0 activities): {:.1f} +/- {:.1f}".format(
        trips_zero_act_wt_mean, trips_zero_act_wt_std
    )
)

In [None]:
###### Remove legs with 9 and 10 activities
# Rows whose trip has 9 or 10 distinct activities are excluded from the
# analysis below. `na_wt` is overwritten in place by this cell.
# NOTE(review): the reason for excluding exactly 9 and 10 is not stated in the
# notebook — presumably too few observations; confirm.

na_wt = na_wt[~na_wt.num_activities.isin([9, 10])]

In [None]:

# Mean worthwhileness rating for each number of activities (computed once and
# reused for both the scatter plot and the lookup dictionary).
mean_wt_by_num_activities = na_wt.groupby("num_activities")["wastedTime"].mean()

fig = mean_wt_by_num_activities.reset_index().plot.scatter(
    x="num_activities", y="wastedTime"
)

# {number of activities: mean rating}, reused by the gender comparison plot
na_wt_all_data = mean_wt_by_num_activities.to_dict()

In [None]:
# Same scatter of mean rating per number of activities, with error bars.
# NOTE(review): `yerr` is a single number — the standard deviation of ALL
# ratings in `na_wt` — so every point gets the same error bar; a per-group
# standard deviation may have been intended. Confirm.
fig = (
    na_wt[["num_activities", "wastedTime"]]
    .groupby("num_activities")
    .mean()
    .reset_index()
    .plot.scatter(x="num_activities", y="wastedTime", yerr=na_wt["wastedTime"].std())
)

#### Analysis by gender

In [None]:
# Keep only the user id and gender columns of the users table
users_trips_gender = users_df_with_trips[["userid", "gender"]]
users_trips_gender.head(3)

In [None]:
# Attach gender to each trip (joined on userid), then attach that trip-level
# information to every activity row (joined on tripid).
trips_users_gender = trips_users_df.merge(users_trips_gender, on="userid")
alltrips_act_gender = all_gen_act.merge(trips_users_gender, on="tripid")
alltrips_act_gender.head(3)

In [None]:
# Split the activity rows by gender; rows with any other gender value are in
# neither subset.
alltrips_act_male = alltrips_act_gender.loc[alltrips_act_gender["gender"] == "Male"]
alltrips_act_female = alltrips_act_gender.loc[alltrips_act_gender["gender"] == "Female"]

In [None]:
# Preview of the male subset
alltrips_act_male.head(3)

In [None]:
# Preview of the female subset
alltrips_act_female.head(3)

In [None]:
# Reminder of the per-trip activity counts merged in the next cells
num_activities.head(3)

In [None]:
# Male activity rows with the per-trip activity count attached
male_with_counts = alltrips_act_male.merge(num_activities, on="tripid").drop_duplicates()

# keep the analysis columns and drop trips with 9 or 10 activities
na_wt_male = male_with_counts.loc[
    ~male_with_counts["num_activities"].isin([9, 10]),
    ["tripid", "gender", "num_activities", "wastedTime"],
]
na_wt_male.head(3)

In [None]:
# Female activity rows with the per-trip activity count attached
female_with_counts = alltrips_act_female.merge(
    num_activities, on="tripid"
).drop_duplicates()

# keep the analysis columns and drop trips with 9 or 10 activities
na_wt_female = female_with_counts.loc[
    ~female_with_counts["num_activities"].isin([9, 10]),
    ["tripid", "gender", "num_activities", "wastedTime"],
]
na_wt_female.head(3)

In [None]:
# {number of activities: mean rating} for the male subset
na_wt_male_data = na_wt_male.groupby("num_activities")["wastedTime"].mean().to_dict()
na_wt_male_data

In [None]:
# {number of activities: mean rating} for the female subset
na_wt_female_data = (
    na_wt_female.groupby("num_activities")["wastedTime"].mean().to_dict()
)
na_wt_female_data

In [None]:
# Mean rating per number of activities: all users vs male vs female.
fig = plt.figure()
ax = fig.add_subplot(111)

# (data, colour, marker, legend label) for each series drawn on the same axes
gender_series = [
    (na_wt_all_data, "black", "x", "All"),
    (na_wt_male_data, "b", "s", "Male"),
    (na_wt_female_data, "r", "d", "Female"),
]
for series, color, marker, label in gender_series:
    # dict views are materialised so matplotlib receives plain sequences
    ax.scatter(
        list(series.keys()),
        list(series.values()),
        c=color,
        marker=marker,
        label=label,
    )
ax.set_xticks(range(1, 9))

plt.title(
    "Worthwhileness rating versus number of activities undertaken while travelling (by gender)"
)
plt.xlabel("Number of activities")
plt.ylabel("Average worthwhileness rating")

plt.legend(loc="upper left")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q1_all_gender.png", bbox_inches="tight")

#### Analysis by transport category

In [None]:
# Distinct transport categories present in the activity data
all_gen_act.transp_category.unique()

In [None]:
# Reminder of the per-trip activity counts merged in the next cell
num_activities.head(3)

In [None]:
# Transport categories analysed, in plotting order
transport_categories = [
    "walking",
    "cycling_emerging_micromobility",
    "public_transp_long_dist",
    "public_transp_short_dist",
    "private_motorized",
]

tc_columns = ["tripid", "transp_category", "num_activities", "wastedTime"]

na_wt_tc = {}  # category -> filtered rows
na_wt_tc_data = {}  # category -> {number of activities: mean rating}
for tc in transport_categories:
    # activity rows of this category with the per-trip activity count attached
    tc_rows = all_gen_act[all_gen_act["transp_category"] == tc]
    tc_counts = tc_rows.merge(num_activities, on="tripid").drop_duplicates()[
        tc_columns
    ]

    # remove legs with 9 and 10
    tc_counts = tc_counts[~tc_counts["num_activities"].isin([9, 10])]

    na_wt_tc[tc] = tc_counts
    na_wt_tc_data[tc] = (
        tc_counts.groupby("num_activities")["wastedTime"].mean().to_dict()
    )

In [None]:
# One scatter panel per transport category: mean rating vs number of activities.
fig, axs = plt.subplots(2, 3, sharey=True, figsize=(15, 8))
panels = axs.ravel()  # row-major order, same as iterating (row, col) pairs
colors = ["blue", "orange", "green", "red", "purple"]

for tc, ax, color in zip(transport_categories, panels, colors):
    series = na_wt_tc_data[tc]
    ax.set_title(tc)
    ax.scatter(list(series.keys()), list(series.values()), c=color)
    # single tick definition (an earlier 0-9 setting was immediately overwritten)
    ax.set_xticks(range(1, 9))
    ax.set_xlabel("Number of activities")

# the y axis is shared, so one label per row is enough
for row_axes in axs:
    row_axes[0].set_ylabel("Average worthwhileness rating")

# the grid has more panels than categories: hide the unused ones
for unused_ax in panels[len(transport_categories):]:
    unused_ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q1_tc.png", bbox_inches="tight")

<a id='Q2' ></a>
### Q2: What are the activities that are more frequently associated with each kind and level of worthwhileness value?

To explore which activities relate to high and low worthwhileness values. Can we create charts/graphs presenting favourite activities?

In [None]:
# Activity rows restricted to the columns needed for Q2: trip id, activity
# code, rating and the three worthwhileness elements.
act_wv = all_gen_act[
    ["tripid", "code", "wastedTime", "Enjoyment", "Fitness", "Productivity"]
]
act_wv.head(3)

In [None]:
# Worthwhileness elements and the levels each of them can take
ww_el = ["Enjoyment", "Fitness", "Productivity"]
ww_val = [0.0, 1.0, 2.0]

# Short column label ("<initial>_<level>", e.g. "E_0") -> (element, level)
worthwhile_elements = {
    "{element}_{level}".format(element=el[0], level=int(val)): (el, val)
    for el in ww_el
    for val in ww_val
}


def get_top_activities(act_wv):
    """Count, per element and level, the distinct trips of every activity.

    For each element in ``ww_el`` and each level in ``ww_val`` the rows of
    ``act_wv`` whose element column equals the level are grouped by ``code``
    and the number of distinct ``tripid`` values is counted.

    Returns a nested dict ``{element: {level: {code: n_trips}}}`` whose
    innermost dict is ordered by decreasing number of trips.
    """
    return {
        element: {
            level: (
                act_wv[act_wv[element] == level]
                .groupby("code", group_keys=False)["tripid"]
                .nunique()
                .sort_values(ascending=False)
                .to_dict()
            )
            for level in ww_val
        }
        for element in ww_el
    }

In [None]:
# Trip counts per activity for every element/level, over all activity rows
top_activities = get_top_activities(act_wv)
# Row labels for the heatmap: the activity codes found at Enjoyment level 0.0
activities = top_activities["Enjoyment"][0.0].keys()

In [None]:
def get_heatmap(top_activities, elements=None, levels=None):
    """Turn the nested counts of ``get_top_activities`` into a row matrix.

    Parameters
    ----------
    top_activities : dict
        ``{element: {level: {activity: n_trips}}}``.
    elements : list of str, optional
        Worthwhileness elements, in column order. Defaults to the notebook-level
        ``ww_el``.
    levels : list of float, optional
        Levels of each element, in column order. Defaults to the notebook-level
        ``ww_val``.

    Returns
    -------
    list of list
        One row per activity listed under ``top_activities["Enjoyment"][0.0]``
        (in that order) and one column per (element, level) pair; a missing
        count is reported as 0.
    """
    elements = ww_el if elements is None else elements
    levels = ww_val if levels is None else levels

    # Only the activities seen at Enjoyment level 0.0 become rows.
    activities = top_activities["Enjoyment"][0.0].keys()

    # Columns are keyed by "<initial>_<level>", mirroring the column labels
    # used when the matrix is wrapped in a DataFrame.
    columns = {}
    for el in elements:
        for val in levels:
            key = "{element}_{level}".format(element=el[0], level=int(val))
            columns[key] = (el, val)

    heatmap = []
    for act in activities:
        # Explicit .get() fallbacks replace the former bare `except:`, which
        # also hid unrelated errors (including KeyboardInterrupt).
        row = [
            top_activities.get(el, {}).get(val, {}).get(act, 0)
            for el, val in columns.values()
        ]
        heatmap.append(row)

    return heatmap

In [None]:
# Matrix of trip counts: one row per activity, one column per element/level
heatmap = get_heatmap(top_activities)
# Rows are labelled with `activities`, which comes from the same
# `top_activities` dict and therefore has the same order as the matrix rows.
heatmap_df = pd.DataFrame(heatmap, columns=worthwhile_elements.keys(), index=activities)
heatmap_df

In [None]:
# Matplotlib: How to remove white lines in the heatmap
# https://stackoverflow.com/a/45633288/2377454
plt.rcParams["axes.grid"] = False

# fmt="d" prints the cell annotations as integers (the cells are trip counts)
sns.heatmap(heatmap_df, annot=True, fmt="d")
plt.title("Activities vs Worthwhileness factors")

# reactivate grids in graphs
plt.rcParams["axes.grid"] = True
plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q2_heatmap_all.png", bbox_inches="tight")

#### Analysis by gender

In [None]:
# Columns needed for the gender heatmaps
gender_columns = [
    "tripid",
    "gender",
    "code",
    "wastedTime",
    "Enjoyment",
    "Fitness",
    "Productivity",
]

# male
act_wv_male = alltrips_act_male[gender_columns]
# female
act_wv_female = alltrips_act_female[gender_columns]

In [None]:
# The rows returned by get_heatmap() follow the activity order of the dict it
# receives (top_activities_<gender>["Enjoyment"][0.0]). Each frame is therefore
# indexed with ITS OWN activity list; reusing the overall `activities` list
# could attach the wrong label to a row (or fail on a length mismatch).

# create heatmaps - male
top_activities_male = get_top_activities(act_wv_male)
heatmap_male = get_heatmap(top_activities_male)
heatmap_male_df = pd.DataFrame(
    heatmap_male,
    columns=list(worthwhile_elements.keys()),
    index=list(top_activities_male["Enjoyment"][0.0].keys()),
)

# create heatmaps - female
top_activities_female = get_top_activities(act_wv_female)
heatmap_female = get_heatmap(top_activities_female)
heatmap_female_df = pd.DataFrame(
    heatmap_female,
    columns=list(worthwhile_elements.keys()),
    index=list(top_activities_female["Enjoyment"][0.0].keys()),
)

In [None]:
# Side-by-side heatmaps of trip counts per activity: male (left), female (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
axes = axes.ravel()

gender_panels = [
    (heatmap_male_df, "Activities vs Worthwhileness factors - Male"),
    (heatmap_female_df, "Activities vs Worthwhileness factors - Female"),
]
for ax, (frame, title) in zip(axes, gender_panels):
    sns.heatmap(frame, annot=True, fmt="d", ax=ax)
    ax.set_title(title)

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q2_heatmap_gender.png", bbox_inches="tight")

#### Analysis by transport category

In [None]:
# Same selection as `act_wv`, plus the transport category of each row, used to
# build one heatmap per category.
act_wv_all_tc = all_gen_act[
    [
        "tripid",
        "code",
        "transp_category",
        "wastedTime",
        "Enjoyment",
        "Fitness",
        "Productivity",
    ]
]
act_wv_all_tc.head(3)

In [None]:
# One heatmap of trip counts per activity for each transport category.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))
axes = axes.ravel()

for ax, tc in zip(axes, transport_categories):
    # activity rows of this category only
    act_wv_tc_tmp = act_wv_all_tc.loc[act_wv_all_tc["transp_category"] == tc]
    top_activities_tc = get_top_activities(act_wv_tc_tmp)
    # row labels come from the same dict that get_heatmap() iterates over
    activities_tc = top_activities_tc["Enjoyment"][0.0].keys()
    hm = pd.DataFrame(
        get_heatmap(top_activities_tc),
        columns=worthwhile_elements.keys(),
        index=activities_tc,
    )

    sns.heatmap(hm, annot=True, fmt="d", ax=ax)
    ax.set_title(tc, fontsize=14)

# the grid has more panels than categories: hide the unused ones
for unused_ax in axes[len(transport_categories):]:
    unused_ax.set_visible(False)

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig argument, so it is dropped.
plt.savefig(img_path + "h12_q2_heatmap_tc.png", bbox_inches="tight")

<a id='Q3' ></a>
### Q3: What activities are more likely to be connected to work-related travel purposes vs leisure?

Related to question H12-Q2 above

In [None]:
# Trip purposes, loaded from trip_objs_grouped.pkl (the cells below read its
# `objective_str` column).
trip_objs = pd.read_pickle(input_path + "trip_objs_grouped.pkl")

# add activities: join every activity row with the purpose data of its trip
act_purposes = all_gen_act.merge(trip_objs, on="tripid").drop_duplicates()
act_purposes.head()

In [None]:
def _count_trips_per_activity(purpose_rows):
    """Return a table of distinct trips per activity code, largest first."""
    trips_per_code = purpose_rows.groupby("code", group_keys=False)["tripid"].nunique()
    return trips_per_code.sort_values(ascending=False).reset_index(name="ntrips")


### work
act_work = act_purposes[act_purposes["objective_str"] == "Work"]
table_act_work = _count_trips_per_activity(act_work)
table_act_work.to_csv(out_path + "h12_q3_table_act_work.csv", index=False)

### leisure / hobby
act_leisure = act_purposes[act_purposes["objective_str"] == "Leisure_Hobby"]
table_act_leisure = _count_trips_per_activity(act_leisure)
table_act_leisure.to_csv(out_path + "h12_q3_table_act_leisure.csv", index=False)

In [None]:
# Activities ranked by number of distinct leisure/hobby trips
table_act_leisure

In [None]:
# Activities ranked by number of distinct work trips
table_act_work