# H10

**Obj:** Proportion of worthwhile time
<br> To explore how VTT is influenced by the share of worthwhile travel time out of total travel time.

## Questions

- [Q1](#Q1): What is the percentage of worthwhile time (positive values) compared to total travel time of a trip?
- [Q2](#Q2): What is the distribution of the four types of worthwhileness values associated to trip legs?
- [Q3](#Q3): What is the distribution of mood associated to trips?
- [Q4](#Q4): What is the correlation between worthwhileness ratings and the ratings for worthwhileness elements?
- [Q5](#Q5): Is there a correlation between the mood ratings at trip level and the worthwhile ratings at trip leg level?
- [Q6](#Q6): How do worthwhileness ratings and worthwhileness elements change between weekdays and weekends?
- [Q7](#Q7): Assess if worthwhileness ratings and worthwhileness elements change according to countries
- [Q8](#Q8): Assess  if worthwhileness ratings and worthwhileness elements change if a trip is made in an urban, sub-urban or rural area

**Note:** all analyses should be done for all users and also by gender

**VTT: "Value of Travel Time"**

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the page. The closing tag was missing
# its final ">" and only worked thanks to lenient HTML parsing.
display(HTML("<style>.container { width:95% !important; }</style>"))

**READ DATA**

In [None]:
# --- Input / output locations (relative to the notebook) ---
input_path = "../../2019-12-16.out/"
meta_data_path = "../../data-campaigns/meta-data/"
legs = "all_legs_merged_no_outlier_0.01.pkl"  # pickled legs table, read from input_path

# Tables and figures of this hypothesis are written to the same folder.
out_path = "../../2019-12-16.out/hypothesis/H10/"
img_path = "../../2019-12-16.out/hypothesis/H10/"

# --- Plot defaults shared by every figure ---
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output folder; if it is already there, only report it on stderr.
out_dir_abs = os.path.abspath(out_path)
try:
    os.makedirs(out_dir_abs)
except FileExistsError:
    print("Directory '{}' already exists".format(out_path), file=sys.stderr)

In [None]:
# Load the leg-level table and the trip-level table from the input folder.
all_legs = pd.read_pickle(input_path + legs)
trips_df = pd.read_pickle(input_path + "trips_df.pkl")

# Transport categories: the JSON maps each category to the modes it contains.
with open(input_path + "category_transp_mode_dict.json", "r") as f:
    category_transp_mode_dict = json.load(f)

# Reverse lookup: transport mode -> category.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

# TODO: decide whether "unknown" should be removed as a transport category.

# Overall sample size.
print("Legs:", all_legs.shape[0])
print("Trips: ", len(all_legs.tripid.unique()))
print("Users:", len(all_legs.userid.unique()))

# Split the legs by the gender of the user and report the size of each subset.
all_legs_M = all_legs[all_legs.gender == "Male"]
all_legs_F = all_legs[all_legs.gender == "Female"]
all_legs_O = all_legs[all_legs.gender == "Other"]

for gender_name, gender_legs in (
    ("Male", all_legs_M),
    ("Female", all_legs_F),
    ("Other", all_legs_O),
):
    print()
    print("Legs of {} users:".format(gender_name.lower()), gender_legs.shape[0])
    print("{} users:".format(gender_name), len(gender_legs.userid.unique()))

In [None]:
# Category lists reused by the per-age and per-country plots.
age_range = list(all_legs.age.unique())


def _che_to_aaa(campaign_code):
    """Relabel the 'CHE' campaign as 'AAA' (the "Other" class); keep the rest."""
    return "AAA" if campaign_code == "CHE" else campaign_code


all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(_che_to_aaa)
# NOTE(review): despite its name, top10 holds every distinct campaign code in
# all_legs -- confirm the data really contains ten of them.
top10 = list(all_legs.onCampaigns.unique())

<a id='Q1' ></a>
### Q1: What is the percentage of worthwhile time (positive values) compared to total travel time of a trip?
How do users rate trip legs in terms of worthwhileness?


**Variables:**
- total travel time: `inferred_leg_duration_min`
- worthwhile time: `wastedTime`
- `wastedTime_withtime`: also compute the worthwhileness weighted by travelled time. Given two legs with ratings $l_1$, $l_2$ and durations $t_1$, $t_2$, the time-weighted rating is $\frac{l_1 t_1 + l_2 t_2}{t_1 + t_2}$

In [None]:
# 1. wastedTime is the 1-5 star rating of a leg. Merged legs can carry
# fractional values, so keep every rating in the interval (0, 5].
rating_in_range = (all_legs["wastedTime"] > 0) & (all_legs["wastedTime"] <= 5)

# .copy() makes an independent frame, so the assignment below does not write
# into a slice of all_legs (avoids pandas' SettingWithCopyWarning).
all_legs_tmp = all_legs[rating_in_range].copy()

# Cast to float only.
# NOTE(review): the ratings are NOT rounded to integers here, although the
# original comment asked for it and the Q4/Q5 cells round wastedTime with
# np.round -- confirm which behaviour is intended before changing results.
all_legs_tmp["wastedTime"] = all_legs_tmp["wastedTime"].astype(float)

print("useful legs:", len(all_legs_tmp))

In [None]:
# One row per trip: total travel time (sum of the leg durations) and the
# average worthwhileness rating (mean of the leg ratings).
legs_by_trip = all_legs_tmp.groupby("tripid")

tmp_wt = legs_by_trip["wastedTime"].mean().reset_index(name="avg_wt")
trips_tt_wt = (
    legs_by_trip["inferred_leg_duration_min"]
    .sum()
    .reset_index(name="total_tt")
    .merge(tmp_wt, on="tripid")
)

# Trips whose total duration is not positive are discarded.
has_duration = trips_tt_wt["total_tt"] > 0
trips_tt_wt = trips_tt_wt[has_duration]
print("useful trips:", trips_tt_wt.shape)
trips_tt_wt.head()

In [None]:
# Histogram of the total travel time per trip.
# TODO: plot the mean
# The 33% and 66% quantiles separate short / medium / long trips.
dist_segs = trips_tt_wt["total_tt"].quantile([0.33, 0.66]).values
medium_threshold, long_threshold = dist_segs
print("medium_threshold:", medium_threshold)
print("long_threshold:", long_threshold)
print()

# 2-minute wide bins, starting at zero; their number grows with long_threshold.
n_bin_edges = math.ceil(long_threshold * 4)
bin_edges = list(range(0, 2 * n_bin_edges, 2))

hist = trips_tt_wt.hist(column="total_tt", bins=bin_edges)
plt.title("Histogram of total travel time")
plt.xlabel("Minutes")
plt.ylabel("Number of trips")
plt.tight_layout()

plt.savefig(img_path + "h10_q1_hist_tot_tt.png")

In [None]:
# Mean and standard deviation of the total travel time per trip.
trip_tt_mean = trips_tt_wt["total_tt"].mean()
trip_tt_std = trips_tt_wt["total_tt"].std()
# The message used to read "+/, " (typo); print it as "mean +/- std".
print("mean and std: {} +/- {}".format(trip_tt_mean, trip_tt_std))


# assign a class of 0 (short), 1 (medium), 2 (long)
def classify_traveltime(tt):
    """Map a total travel time to a class: 0 (short), 1 (medium) or 2 (long).

    The cut points are the module-level ``medium_threshold`` and
    ``long_threshold``. Anything that is neither short nor medium -- including
    negative values -- falls into class 2.
    """
    is_short = 0 <= tt <= medium_threshold
    is_medium = medium_threshold < tt <= long_threshold
    if is_short:
        return 0
    return 1 if is_medium else 2


# Label every trip with its travel-time class (0 short, 1 medium, 2 long).
tt_classes = trips_tt_wt["total_tt"].map(classify_traveltime)
trips_tt_wt["total_tt_class"] = tt_classes
trips_tt_wt.head()

In [None]:
# Median of the total travel time per trip.
trips_tt_wt["total_tt"].median()

In [None]:
# Column means for the long trips (class 2). numeric_only=True keeps the
# non-numeric columns out of the mean; without it recent pandas versions raise
# on such columns instead of silently dropping them.
# NOTE(review): presumably tripid is a string identifier -- confirm.
trips_tt_wt[trips_tt_wt["total_tt_class"] == 2].mean(numeric_only=True)

In [None]:
# trips_tt_wt2 (per-trip aggregates joined with the transport category of each
# leg) was only created in a later cell, so this cell failed with NameError on
# a fresh top-to-bottom run. Build it here with the same merge used later.
trips_tt_wt2 = trips_tt_wt.merge(
    all_legs_tmp[["tripid", "transp_category"]], on="tripid", how="inner"
)

# A few cycling trips whose average rating lies strictly between 3 and 4.
is_cycling = trips_tt_wt2["transp_category"] == "cycling_emerging_micromobility"
rated_3_to_4 = (trips_tt_wt2["avg_wt"] > 3.0) & (trips_tt_wt2["avg_wt"] < 4.0)
trips_tt_wt2.loc[is_cycling & rated_3_to_4].head(3)

In [None]:
# Leg-level table (trip id, transport category, rating), copied so that later
# edits do not touch all_legs_tmp.
legs_wt = all_legs_tmp[["tripid", "transp_category", "wastedTime"]].copy()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
axes = axes.ravel()

# Left panel: mean rating per travel-time class (0 short, 1 medium, 2 long).
tmp = trips_tt_wt.groupby("total_tt_class")["avg_wt"].mean().reset_index()
axes[0].scatter(tmp.total_tt_class, tmp.avg_wt, lw=6)
axes[0].set_xlabel("Time travel")
axes[0].set_ylabel("worthwhile ratings")
axes[0].set_title("Scatterplot for short, medium, long trips", fontsize=14)
axes[0].set_xticks(range(3))
axes[0].set_xticklabels(["Short", "Medium", "Long"])

# Right panel: mean leg rating per transport category, highest first.
tc2label = {
    "cycling_emerging_micromobility": "Cycling",
    "private_motorized": "Private Motorized",
    "public_transp_long_dist": "PT Long dist",
    "public_transp_short_dist": "PT Short dist",
    "walking": "Walking",
}
avg_wt_tc = (
    legs_wt.groupby("transp_category")["wastedTime"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
axes[1].scatter(avg_wt_tc.transp_category, avg_wt_tc.wastedTime, lw=6)
axes[1].set_xlabel("Transport categories")
axes[1].set_ylabel("Worthwhileness ratings")
axes[1].set_title("Scatterplot for each transport category", fontsize=14)
# Ticks and labels follow the categories actually plotted. The labels used to
# be read from an undefined `tmp2`; unknown categories fall back to their raw
# name instead of raising KeyError.
axes[1].set_xticks(range(len(avg_wt_tc)))
axes[1].set_xticklabels(
    [tc2label.get(k, k) for k in avg_wt_tc.transp_category], rotation=45
)

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q1_scatter_distance_tc.png", bbox_inches="tight")

In [None]:
# Mean leg rating per transport category, sorted from highest to lowest.
avg_wt_tc

In [None]:
# Display the leg-level ratings table (this cell itself does no rounding).
legs_wt

In [None]:
# Cycling rows of the trip/category table, with the average rating rounded to
# whole stars. The subset used to be taken from avg_wt_tc with a mask built
# on trips_tt_wt2 (misaligned index, and avg_wt_tc has no "avg_wt" column);
# it now mirrors the public-transport subset built further down.
cycling = trips_tt_wt2.loc[
    trips_tt_wt2["transp_category"] == "cycling_emerging_micromobility"
].copy()
cycling["avg_wt"] = cycling["avg_wt"].apply(round)
cycling.head()

In [None]:
# Mean of the avg_wt column of the cycling subset.
cycling["avg_wt"].mean()

In [None]:
# Number of cycling rows per rating value, ordered by increasing rating.
cycling_freq = cycling["avg_wt"].value_counts().sort_index().to_list()

In [None]:
# Short-distance public transport rows, with the rating rounded to whole stars.
is_pt_short = trips_tt_wt2["transp_category"] == "public_transp_short_dist"
pt_short = trips_tt_wt2[is_pt_short].copy()
pt_short["avg_wt"] = pt_short["avg_wt"].map(round)
pt_short.head()

In [None]:
# Mean of the avg_wt column of the short-distance public transport subset.
pt_short["avg_wt"].mean()

In [None]:
# Number of short-distance PT rows per rating value, ordered by increasing rating.
pt_short_freq = pt_short["avg_wt"].value_counts().sort_index().to_list()

In [None]:
# Sample size of the short-distance public transport subset.
npt_short = len(pt_short)

In [None]:
# Sample size of the cycling subset.
ncycling = len(cycling)

In [None]:
import scipy
from scipy.stats import chisquare

# chisquare() is a goodness-of-fit test: the expected frequencies must add up
# to the same total as the observed ones (recent SciPy versions raise
# otherwise). The two subsets have different sizes, so the short-distance PT
# counts are rescaled to the cycling total before the comparison.
observed_freq = np.asarray(cycling_freq, dtype=float)
expected_freq = np.asarray(pt_short_freq, dtype=float)
expected_freq = expected_freq * observed_freq.sum() / expected_freq.sum()

# NOTE(review): both lists must cover the same rating values in the same
# order, and ddof=1 is kept from the original call -- confirm both. A
# contingency-table test may be a better fit for comparing two samples.
stat, pval = chisquare(observed_freq, expected_freq, ddof=1)
pval

In [None]:
# NOTE(review): `tmp2` is not assigned anywhere in the visible part of this
# notebook, so on a fresh kernel this cell raises NameError -- confirm intent.
tmp2

In [None]:
# Boxplots of the average trip rating by travel-time class and by transport category.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
axes = axes.ravel()

sns.boxplot(x="total_tt_class", y="avg_wt", data=trips_tt_wt, ax=axes[0])
axes[0].set_xlabel("Time travel")
axes[0].set_ylabel("worthwhile ratings")
axes[0].set_title("Boxplots for short, medium, long trips", fontsize=14)
axes[0].set_xticks(range(3))
axes[0].set_xticklabels(["Short", "Medium", "Long"])

# One row per (trip, leg): the trip aggregates joined with each leg's category.
trips_tt_wt2 = trips_tt_wt.merge(
    all_legs_tmp[["tripid", "transp_category"]], on="tripid", how="inner"
)

# The tick labels used to be a hard-coded list that only matched the boxes if
# the categories happened to appear in that exact order. Fix the box order
# explicitly and derive the labels from it.
tc_short_label = {
    "walking": "Walking",
    "cycling_emerging_micromobility": "Cycling",
    "public_transp_short_dist": "Public_short",
    "private_motorized": "Private",
    "public_transp_long_dist": "Public_long",
}
tc_order = [c for c in trips_tt_wt2["transp_category"].unique() if pd.notnull(c)]

sns.boxplot(
    x="transp_category", y="avg_wt", data=trips_tt_wt2, order=tc_order, ax=axes[1]
)
axes[1].set_xlabel("Transport categories")
axes[1].set_ylabel("worthwhile ratings")
axes[1].set_title("Boxplots for each transport category", fontsize=14)
axes[1].set_xticks(range(len(tc_order)))
axes[1].set_xticklabels([tc_short_label.get(c, c) for c in tc_order], rotation=45)

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q1_box_distance_tc.png", bbox_inches="tight")

In [None]:
## Boxplots of the average trip rating by gender, country and age.
# Attach the user attributes to each trip (one row per trip after de-duplication).
trips_tt_wt3 = trips_tt_wt.merge(
    all_legs[["tripid", "userid", "gender", "onCampaigns", "age"]],
    on="tripid",
    how="inner",
).drop_duplicates()
trips_tt_wt3 = trips_tt_wt3[trips_tt_wt3.gender != "Other"]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 7))
axes = axes.ravel()

# gender
sns.boxplot(x="gender", y="avg_wt", data=trips_tt_wt3, ax=axes[0])
axes[0].set_title("Boxplot of worthwhileness ratings for gender", fontsize=14)
axes[0].set_ylabel("worthwhile ratings")

# country (onCampaigns)
sns.boxplot(x="onCampaigns", y="avg_wt", data=trips_tt_wt3, ax=axes[1])
axes[1].tick_params(labelrotation=45, axis="x")
axes[1].set_title("Boxplot of worthwhileness ratings for top 10 countries", fontsize=14)
axes[1].set_ylabel("worthwhile ratings")

# age
sns.boxplot(x="age", y="avg_wt", data=trips_tt_wt3, ax=axes[2])
# Rotate the labels on the age panel explicitly instead of relying on
# plt.xticks() acting on whichever axes is current.
axes[2].tick_params(labelrotation=45, axis="x")
axes[2].set_title("Boxplot of worthwhileness ratings for age range", fontsize=14)
axes[2].set_ylabel("worthwhile ratings")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q1_gender_country.png", bbox_inches="tight")

In [None]:
## Mean of the average trip rating by gender, country and age (scatter plots).
trips_tt_wt3 = trips_tt_wt.merge(
    all_legs[["tripid", "userid", "gender", "onCampaigns", "age"]],
    on="tripid",
    how="inner",
).drop_duplicates()
trips_tt_wt3 = trips_tt_wt3[trips_tt_wt3.gender != "Other"]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 7))
axes = axes.ravel()

# The panels show group means, so the titles say "Mean" (they used to be
# copied from the boxplot cell and said "Boxplot").

# gender
tmp = trips_tt_wt3.groupby("gender")["avg_wt"].mean().reset_index()
axes[0].scatter(tmp.gender, tmp.avg_wt, lw=6)
axes[0].set_title("Mean worthwhileness ratings for gender", fontsize=14)
axes[0].set_ylabel("worthwhile ratings")

# country (onCampaigns)
tmp = trips_tt_wt3.groupby("onCampaigns")["avg_wt"].mean().reset_index()
axes[1].scatter(tmp.onCampaigns, tmp.avg_wt, lw=6)
axes[1].tick_params(labelrotation=45, axis="x")
axes[1].set_title("Mean worthwhileness ratings for top 10 countries", fontsize=14)
axes[1].set_ylabel("worthwhile ratings")

# age
tmp = trips_tt_wt3.groupby("age")["avg_wt"].mean().reset_index()
axes[2].scatter(tmp.age, tmp.avg_wt, lw=6)
axes[2].tick_params(labelrotation=45, axis="x")
axes[2].set_title("Mean worthwhileness ratings for age range", fontsize=14)
axes[2].set_ylabel("worthwhile ratings")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q1_gender_country_scatter.png", bbox_inches="tight")

<a id='Q2' ></a>
### Q2:  What is the distribution of the four types of worthwhileness values associated to trip legs?

How do users rate trip legs in terms of the 3 dimensions of worthwhileness and the further subdimension of productivity?

**Variables:**
- 4 types of worthwhileness: Enjoyment, Fitness, Payed_work, Personal_task - **leg level** (not onBoarding)

**?** Is the "subdimension of productivity" paid work? And are the 3 dimensions PEF?


In [None]:
## Load the worthwhileness-element values, one row per (leg, element).
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")

# Discard the rows whose element is "Unknown".
is_known_element = values_from_trip.valueFromTrip != "Unknown"

# Attach user, country and age of each leg.
leg_attributes = all_legs[["legid", "userid", "onCampaigns", "age"]]
values_from_trip = (
    values_from_trip[is_known_element]
    .merge(leg_attributes, on="legid")
    .drop_duplicates()
)

# Day of the week on which the leg started, as returned by .weekday().
values_from_trip["weekday"] = values_from_trip["legStartDay"].map(
    lambda start_day: start_day.weekday()
)
values_from_trip.head()

In [None]:
# Leg ids of male, female and other users.
legsM = list(all_legs_M.legid.unique())
legsF = list(all_legs_F.legid.unique())
legsO = list(all_legs_O.legid.unique())

# Restrict the element values to the legs of each gender.
values_from_trip_M = values_from_trip[values_from_trip.legid.isin(legsM)]
values_from_trip_F = values_from_trip[values_from_trip.legid.isin(legsF)]
values_from_trip_O = values_from_trip[values_from_trip.legid.isin(legsO)]


def _count_element_values(values_df):
    """Count the rows of values_df for every (valueFromTrip, value) pair."""
    pair_sizes = values_df.groupby(["valueFromTrip", "value"]).size()
    return pair_sizes.reset_index(name="count")


# Count tables used by the plots.
values_count = _count_element_values(values_from_trip)
values_count_M = _count_element_values(values_from_trip_M)
values_count_F = _count_element_values(values_from_trip_F)
values_count_O = _count_element_values(values_from_trip_O)

values_count.head()

In [None]:
### ALL + GENDER


def plot_h10_q2(values_count, axid, title_str):
    """Draw one grouped bar chart of worthwhileness-element counts.

    The bars are grouped by ``valueFromTrip`` and coloured by ``value``. The
    chart is drawn on ``axes[axid]``, where ``axes`` is the module-level array
    of subplots created by the calling cell.

    Args:
        values_count: table with the columns valueFromTrip, value and count.
        axid: position of the target subplot in ``axes``.
        title_str: title of the subplot.
    """
    target_ax = axes[axid]
    sns.barplot(
        data=values_count, x="valueFromTrip", y="count", hue="value", ax=target_ax
    )
    # The per-subplot legend is given no entries.
    target_ax.legend("")
    # Fixed short names for the four element groups on the x axis.
    target_ax.set_xticks(range(4))
    target_ax.set_xticklabels(["E", "F", "Pw", "Pt"])
    target_ax.tick_params(axis="both", labelsize=12)
    target_ax.set_title(title_str, fontsize=14)
    target_ax.set_xlabel("")
    target_ax.set_ylabel("")


# Three panels: all users, male, female. ncols used to be set to 4 while the
# grid was created with 3 columns; keep the two in sync.
ncols = 3
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(15, 7))
axes = axes.ravel()

plot_h10_q2(values_count, axid=0, title_str="All")
# One figure-level legend, taken from the first panel, serves all panels.
fig.legend(loc="best", fontsize="x-small")
plot_h10_q2(values_count_M, axid=1, title_str="Male")
plot_h10_q2(values_count_F, axid=2, title_str="Female")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q2_gender.png", bbox_inches="tight")

In [None]:
### BY COUNTRY
# One panel per campaign code in top10 (the grid holds at most 2 x 5 panels).
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 7))
axes = axes.ravel()

for axid, c in enumerate(top10):
    tmp = values_from_trip[values_from_trip.onCampaigns == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    if axid == 0:
        # One figure-level legend, taken from the first panel.
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q2_country.png", bbox_inches="tight")

In [None]:
### BY AGE
# One panel per age class in age_range (the grid holds at most 4 panels).
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5))
axes = axes.ravel()

for axid, c in enumerate(age_range):
    tmp = values_from_trip[values_from_trip.age == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    if axid == 0:
        # One figure-level legend, taken from the first panel.
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q2_age.png", bbox_inches="tight")

<a id='Q3' ></a>
### Q3: What is the distribution of mood associated to trips?

How users rate trips in terms of mood? For modal split analysis we can use (as suggested) the main mode of transport based on the distance travelled.

**mood:** `overallScore`

In [None]:
# Keep the trips whose mood score lies strictly between 0 and 6.
has_valid_mood = (trips_df.overallScore > 0) & (trips_df.overallScore < 6)
overall_df = trips_df[has_valid_mood]

# Number of trips per mood score.
overall_df_grouped = overall_df.groupby("overallScore").size().reset_index(name="count")

print(len(overall_df))
overall_df["overallScore"].describe()

In [None]:
# Bar chart of the number of trips per mood score, each bar annotated with its height.
ax = sns.barplot(data=overall_df_grouped, x="overallScore", y="count")
ax.set(xlabel="Trip Mood", ylabel="count")
ax.set_title("Mood distribution")

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.0
    ax.annotate(
        "{:.2f}".format(bar_height),
        (bar_center, bar_height),
        xytext=(0, 20),  # place the text 20 points above the top of the bar
        textcoords="offset points",
        ha="center",
        va="center",
        fontsize=10,
        color="black",
    )

plt.tight_layout()
plt.savefig(img_path + "h10_q3_all.png")

In [None]:
# Mode of transport of a trip: the mode of its longest leg (largest trueDistance).
# The results are also split by transport category (TC).

# Largest leg distance of every trip.
long_trips = (
    all_legs.groupby("tripid")["trueDistance"].max().reset_index(name="trueDistance")
)
# Legs matching (tripid, largest distance), with their category and user attributes.
# NOTE(review): legs tying for the largest distance would yield several rows
# for the same trip -- confirm whether that can happen in the data.
tc = all_legs.merge(long_trips, on=["tripid", "trueDistance"], how="right")[
    ["tripid", "transp_category", "userid", "gender", "onCampaigns", "age"]
]
long_trips = long_trips.merge(tc, on="tripid")
# Keep only the trips that have a valid mood score (inner join with overall_df).
long_trips = long_trips.merge(overall_df[["tripid", "overallScore"]], on="tripid")

print(len(long_trips))
# The gender "Other" is left out of the plots that follow.
long_trips = long_trips[long_trips.gender != "Other"]
long_trips.head()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
# The result of ravel() used to be discarded; assign it as in the other cells.
axes = axes.ravel()

# Trips per (mood score, gender), as a count and as a share of all rows of long_trips.
tmp = long_trips.groupby(["overallScore", "gender"]).size().reset_index(name="count")
tmp["rel_count"] = tmp["count"] / len(long_trips)

sns.barplot(data=tmp, x="overallScore", y="count", hue="gender", ax=axes[0])
axes[0].set_title("Absolute count")
axes[0].legend("")
# A single figure-level legend on the right serves both panels.
fig.legend(loc="center right")

sns.barplot(data=tmp, x="overallScore", y="rel_count", hue="gender", ax=axes[1])
axes[1].set_title("Relative count")
axes[1].legend("")

plt.subplots_adjust(right=0.85)

# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q3_gender.png", bbox_inches="tight")

### Interpretation:
# a relative count of 0.15 for (3, Male) means that 15% of the rows of
# long_trips belong to male users and have a mood score of 3.

In [None]:
## Boxplots of the trip mood by transport category, country and age.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 7))
axes = axes.ravel()

# The tick labels used to be a hard-coded list that only matched the boxes if
# the categories happened to appear in that exact order. Fix the box order
# explicitly and derive the labels from it.
tc_short_label = {
    "walking": "Walking",
    "cycling_emerging_micromobility": "Cycling",
    "public_transp_short_dist": "Public_short",
    "private_motorized": "Private",
    "public_transp_long_dist": "Public_long",
}
tc_order = [c for c in long_trips["transp_category"].unique() if pd.notnull(c)]

sns.boxplot(
    x="transp_category", y="overallScore", data=long_trips, order=tc_order, ax=axes[0]
)
axes[0].set_title("Boxplots of mood for each transport category", fontsize=14)
axes[0].set_ylabel("Mood")
axes[0].tick_params(axis="x", labelsize=12, rotation=45)
axes[0].tick_params(axis="y", labelsize=12)
axes[0].set_xticks(range(len(tc_order)))
axes[0].set_xticklabels([tc_short_label.get(c, c) for c in tc_order])

# country (onCampaigns)
sns.boxplot(x="onCampaigns", y="overallScore", data=long_trips, ax=axes[1])
axes[1].set_title("Boxplot of the mood for top 10 countries", fontsize=14)
axes[1].tick_params(axis="x", labelsize=12, rotation=45)
axes[1].tick_params(axis="y", labelsize=12)
axes[1].set_ylabel("Mood")

# age
sns.boxplot(x="age", y="overallScore", data=long_trips, ax=axes[2])
axes[2].set_title("Boxplot of the mood for age groups", fontsize=14)
axes[2].tick_params(axis="x", labelsize=12, rotation=45)
axes[2].tick_params(axis="y", labelsize=12)
axes[2].set_ylabel("Mood")

# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q3_tc_country.png", bbox_inches="tight")

In [None]:
## Mean trip mood by transport category, country and age (scatter plots).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 7))
axes = axes.ravel()

tc_short_label = {
    "walking": "Walking",
    "cycling_emerging_micromobility": "Cycling",
    "public_transp_short_dist": "Public_short",
    "private_motorized": "Private",
    "public_transp_long_dist": "Public_long",
}

# groupby() returns the categories in sorted order, so the points are plotted
# alphabetically. The old hard-coded labels (Walking, Public_short, ...) did
# not follow that order and mislabelled the points; derive them from the data.
tmp = long_trips.groupby("transp_category")["overallScore"].mean().reset_index()
axes[0].scatter(tmp.transp_category, tmp.overallScore, lw=6)
axes[0].set_title("Mean mood for each transport category", fontsize=14)
axes[0].set_ylabel("Mood")
axes[0].tick_params(axis="x", labelsize=12, rotation=45)
axes[0].tick_params(axis="y", labelsize=12)
axes[0].set_xticks(range(len(tmp)))
axes[0].set_xticklabels([tc_short_label.get(c, c) for c in tmp.transp_category])

# country (onCampaigns)
tmp = long_trips.groupby("onCampaigns")["overallScore"].mean().reset_index()
axes[1].scatter(tmp.onCampaigns, tmp.overallScore, lw=6)
axes[1].set_title("Mean mood for top 10 countries", fontsize=14)
axes[1].tick_params(axis="x", labelsize=12, rotation=45)
axes[1].tick_params(axis="y", labelsize=12)
axes[1].set_ylabel("Mood")

# age
tmp = long_trips.groupby("age")["overallScore"].mean().reset_index()
axes[2].scatter(tmp.age, tmp.overallScore, lw=6)
axes[2].set_title("Mean mood for age groups", fontsize=14)
axes[2].tick_params(axis="x", labelsize=12, rotation=45)
axes[2].tick_params(axis="y", labelsize=12)
axes[2].set_ylabel("Mood")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q3_tc_country_scatter.png", bbox_inches="tight")

<a id='Q4' ></a>
### Q4: What is the correlation between worthwhileness ratings and the ratings for worthwhileness elements?

Is the difference between none and some more or less significant than the difference between some and high?

Correlation between `wastedTime` and PEF elements

In [None]:
# Element values (one row per leg and element) and trip purposes.
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")
trip_objs = pd.read_pickle(input_path + "trip_objs_grouped.pkl")

# Attach the rating, the user attributes and the transport category of each leg.
leg_columns = [
    "legid",
    "wastedTime",
    "userid",
    "gender",
    "onCampaigns",
    "age",
    "transp_category",
]
values_from_trip = values_from_trip.merge(
    all_legs[leg_columns], on="legid"
).drop_duplicates()

# Keep the rows with a rating strictly between 0 and 6, a known element and
# a transport category.
keep_row = (
    (values_from_trip["wastedTime"] > 0)
    & (values_from_trip["wastedTime"] < 6)
    & (values_from_trip.valueFromTrip != "Unknown")
    & values_from_trip.transp_category.notna()
)
values_from_trip = values_from_trip[keep_row].copy()

# Round the ratings to the nearest whole value.
values_from_trip["wastedTime"] = np.round(values_from_trip["wastedTime"])

# Add the purpose of the trip.
values_from_trip = values_from_trip.merge(
    trip_objs[["tripid", "objective_str"]], on="tripid"
)

values_from_trip.head()

In [None]:
## ALL + GENDER
# Mean element value for every (rating, element) pair: all users, then by gender.
nrows = 1
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5))
axes = axes.ravel()

# Panel 0: all users; this panel carries the legend.
tmp = (
    values_from_trip.groupby(["wastedTime", "valueFromTrip"])["value"]
    .mean()
    .reset_index(name="average")
)
sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=axes[0])
axes[0].legend(fontsize="x-small")
axes[0].set_title("All", fontsize=14)

# Remaining panels: one per gender, without legend and y label.
gender_lst = ["Male", "Female"]
for ax, gender in zip(axes[1:], gender_lst):
    tmp = (
        values_from_trip[values_from_trip.gender == gender]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    ax.legend("")
    ax.set_ylabel(None)
    ax.set_title(gender, fontsize=14)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_all_gender.png", bbox_inches="tight")

In [None]:
## ALL
# Mean element value for every (rating, element) pair, all users, in one chart.
tmp = (
    values_from_trip.groupby(["wastedTime", "valueFromTrip"])["value"]
    .mean()
    .reset_index(name="average")
)
graph = sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp)
graph.set(
    xlabel="Worthwhilness rating", ylabel="Average value for Worthwhilness Element"
)
graph.legend(title="Worthwhilness Element")
# Tighten the figure this chart was drawn on; `fig` still pointed at the
# figure of an earlier cell.
graph.figure.tight_layout()

In [None]:
## BY COUNTRY
# One panel per campaign code: mean element value per (rating, element) pair.
nrows = 2
ncols = 5
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, country in enumerate(top10):
    ax = axes[i]
    tmp = (
        values_from_trip[values_from_trip.onCampaigns == country]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(country, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_country.png", bbox_inches="tight")

In [None]:
## BY AGE
# One panel per age class: mean element value per (rating, element) pair.
nrows = 1
ncols = 4
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5), sharey=True)
axes = axes.ravel()

for i, age_class in enumerate(age_range):
    ax = axes[i]
    tmp = (
        values_from_trip[values_from_trip.age == age_class]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(age_class, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_age.png", bbox_inches="tight")

In [None]:
### BY TRANSPORT CATEGORY
# One panel per transport category: mean element value per (rating, element) pair.
tc_lst = list(values_from_trip.transp_category.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, category in enumerate(tc_lst):
    ax = axes[i]
    tmp = (
        values_from_trip[values_from_trip.transp_category == category]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(category, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_tc.png", bbox_inches="tight")

In [None]:
### BY TRANSPORT CATEGORY --- MALE
# Same chart as the per-category one, restricted to male users.
values_from_trip_M = values_from_trip[values_from_trip.gender == "Male"]
tc_lst = list(values_from_trip_M.transp_category.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, category in enumerate(tc_lst):
    ax = axes[i]
    tmp = (
        values_from_trip_M[values_from_trip_M.transp_category == category]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(category, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_tc_M.png", bbox_inches="tight")

In [None]:
### BY TRANSPORT CATEGORY --- FEMALE
# Same chart as the per-category one, restricted to female users.
values_from_trip_F = values_from_trip[values_from_trip.gender == "Female"]
tc_lst = list(values_from_trip_F.transp_category.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, category in enumerate(tc_lst):
    ax = axes[i]
    tmp = (
        values_from_trip_F[values_from_trip_F.transp_category == category]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(category, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_tc_F.png", bbox_inches="tight")

In [None]:
### BY PURPOSE
# One panel per trip purpose: mean element value per (rating, element) pair.
obj_lst = list(values_from_trip.objective_str.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, purpose in enumerate(obj_lst):
    ax = axes[i]
    tmp = (
        values_from_trip[values_from_trip.objective_str == purpose]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(purpose, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_purpose.png", bbox_inches="tight")

In [None]:
### BY PURPOSE -- MALE
# Same chart as the per-purpose one, restricted to male users.
values_from_trip_M = values_from_trip[values_from_trip.gender == "Male"]
obj_lst = list(values_from_trip_M.objective_str.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, purpose in enumerate(obj_lst):
    ax = axes[i]
    tmp = (
        values_from_trip_M[values_from_trip_M.objective_str == purpose]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(purpose, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_purpose_M.png", bbox_inches="tight")

In [None]:
### BY PURPOSE -- FEMALE
# Same chart as the per-purpose one, restricted to female users.
values_from_trip_F = values_from_trip[values_from_trip.gender == "Female"]
obj_lst = list(values_from_trip_F.objective_str.unique())

nrows = 2
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 8), sharey=True)
axes = axes.ravel()

for i, purpose in enumerate(obj_lst):
    ax = axes[i]
    tmp = (
        values_from_trip_F[values_from_trip_F.objective_str == purpose]
        .groupby(["wastedTime", "valueFromTrip"])["value"]
        .mean()
        .reset_index(name="average")
    )
    sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
    # Only the first panel shows the legend.
    if i == 0:
        ax.legend(fontsize="x-small")
    else:
        ax.legend("")
    ax.set_ylabel("average", fontsize=13)
    ax.set_xlabel("wastedTime", fontsize=13)
    ax.tick_params(labelsize=12)
    ax.set_title(purpose, fontsize=15)

fig.tight_layout()
# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q4_purpose_F.png", bbox_inches="tight")

<a id='Q5' ></a>
### Q5: Is there a correlation between the mood ratings at trip level and the worthwhile ratings at trip leg level?

This analysis can help us explain worthwhileness values, but probably, it will not have an effect on the Worthwhileness Index calculations

Find the correlation between `overallScore` (trip) and `wastedTime` (leg)

In [None]:
# Trips with a mood score strictly between 0 and 6.
mood_is_valid = (trips_df.overallScore > 0) & (trips_df.overallScore < 6)
overall_df = trips_df.loc[mood_is_valid, ["tripid", "overallScore"]]

# Legs with a rating strictly between 0 and 6, together with the user attributes.
rating_is_valid = (all_legs.wastedTime > 0) & (all_legs.wastedTime < 6)
wt_df = all_legs.loc[
    rating_is_valid,
    ["tripid", "legid", "userid", "wastedTime", "gender", "onCampaigns", "age"],
]

# One row per leg, carrying the mood of its trip; ratings rounded to whole values.
os_wt_df = overall_df.merge(wt_df, on="tripid")
os_wt_df["wastedTime"] = np.round(os_wt_df["wastedTime"])
os_wt_df.head()

In [None]:
# Joint relative frequencies of mood (rows) and rating (columns), normalised
# over all legs; the margins are computed but left out of the heatmap.
tb_all = pd.crosstab(
    os_wt_df.overallScore, os_wt_df.wastedTime, margins=True, normalize="all"
)

sns.set()
plt.figure(figsize=(10, 7))
sns.heatmap(tb_all.iloc[:-1, :-1], annot=True)
plt.xlabel("worthwhile ratings (wastedTime)")
plt.ylabel("mood (overallScore)")

# bbox_to_anchor is a legend option, not a savefig one, so it is not passed.
plt.savefig(img_path + "h10_q5_all.png", bbox_inches="tight")

In [None]:
from scipy.stats import chi2_contingency, pearsonr

# Pearson correlation between trip mood and leg worthwhileness rating.
# Only the coefficient (first element of the result) is kept.
pearson_result = pearsonr(os_wt_df["overallScore"], os_wt_df["wastedTime"])
pearson_corr = pearson_result[0]
print("Correlation: ", pearson_corr)


def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categorical series.

    The chi-squared statistic of the ``x``/``y`` contingency table is turned
    into phi-squared, and both phi-squared and the table dimensions are
    corrected with the ``1 / (n - 1)`` terms before taking the square root.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length, as accepted by ``pd.crosstab``.

    Returns
    -------
    float
        0 means no association, 1 means full association. ``nan`` is returned
        when the statistic is undefined: fewer than two observations, or a
        corrected table dimension of 1 (e.g. one of the variables is constant).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so SciPy applies Yates'
    continuity correction to 2x2 tables.
    """
    contingency = pd.crosstab(x, y)
    chi2 = chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    if n <= 1:
        # The (n - 1) corrections below would divide by zero.
        return np.nan

    r, k = contingency.shape
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # Degenerate table: return NaN explicitly instead of computing 0 / 0.
        return np.nan
    return np.sqrt(phi2corr / denominator)


# Cramér's V lies in [0, 1]: 0 means no association, 1 means full association.
# It is symmetrical, i.e. swapping x and y gives the same value.
cramerv_ass = cramers_v(os_wt_df["overallScore"], os_wt_df["wastedTime"])
print("Cramer's v: ", cramerv_ass)

In [None]:
## by gender
# Mood vs. worthwhileness rating heatmaps, one panel per gender, sharing a
# single colour scale (0 to 0.4) and one colour bar.
ncols = 2
nrows = 1

fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(15, 7))
plt.subplots_adjust(wspace=0.05, hspace=0.1)
axes = axes.ravel()

gender_lst = ["Male", "Female"]
for i, gender in enumerate(gender_lst):

    tmp = os_wt_df[os_wt_df.gender == gender]
    tb = pd.crosstab(tmp.overallScore, tmp.wastedTime, margins=True, normalize="all")

    # The last row/column hold the "All" margins and are not plotted.
    im = sns.heatmap(
        tb.iloc[:-1, :-1], annot=True, ax=axes[i], cbar=False, vmin=0, vmax=0.4
    )
    axes[i].set_xlabel("worthwhile ratings (wastedTime)")
    axes[i].set_title(gender)
    # Only the left-most panel carries the y-axis label.
    if i == 0:
        axes[i].set_ylabel("mood (overallScore)")
    else:
        axes[i].set_ylabel(None)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# The first child artist of the last heatmap axes feeds the shared colour bar.
mappable = im.get_children()[0]
plt.colorbar(mappable, orientation="vertical")
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q5_gender.png", bbox_inches="tight")

In [None]:
# by country
# Mood vs. worthwhileness rating heatmaps (in percent), one panel per country
# in `top10`, laid out on a 2 x 5 grid with a shared colour scale.
ncols = 5
nrows = 2
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 10))
plt.subplots_adjust(wspace=0.05, hspace=0.1)
axes = axes.ravel()

# Never index past the grid or past the end of `top10`.
for i, country in enumerate(top10[: nrows * ncols]):

    tmp = os_wt_df[os_wt_df.onCampaigns == country]
    tb = pd.crosstab(tmp.overallScore, tmp.wastedTime, margins=True, normalize="all")
    tb = np.round(tb * 100, 1)

    # The last row/column hold the "All" margins and are not plotted.
    im = sns.heatmap(
        tb.iloc[:-1, :-1], annot=True, ax=axes[i], cbar=False, vmin=0, vmax=50
    )
    axes[i].set_title(country)
    axes[i].set_xlabel("worthwhile ratings")
    # Only the first panel of each row carries the y-axis label.
    if i % ncols == 0:
        axes[i].set_ylabel("mood (overallScore)")
    else:
        axes[i].set_ylabel(None)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# The first child artist of the last heatmap axes feeds the shared colour bar.
mappable = im.get_children()[0]
plt.colorbar(mappable, orientation="vertical")
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q5_country.png", bbox_inches="tight")

In [None]:
# by AGE
# Mood vs. worthwhileness rating heatmaps (in percent), one panel per age
# range, with a shared colour scale.

ncols = 4
nrows = 1
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)
axes = axes.ravel()

# Only the first `ncols` age ranges fit on the grid.
for i, age in enumerate(age_range[:ncols]):

    tmp = os_wt_df[os_wt_df.age == age]
    tb = pd.crosstab(tmp.overallScore, tmp.wastedTime, margins=True, normalize="all")
    tb = np.round(tb * 100, 1)

    # The last row/column hold the "All" margins and are not plotted.
    im = sns.heatmap(
        tb.iloc[:-1, :-1], annot=True, ax=axes[i], cbar=False, vmin=0, vmax=50
    )
    axes[i].set_title(age)
    axes[i].set_xlabel("worthwhile ratings")
    # Single row of panels: only the left-most one carries the y-axis label.
    if i == 0:
        axes[i].set_ylabel("mood (overallScore)")
    else:
        axes[i].set_ylabel(None)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# The first child artist of the last heatmap axes feeds the shared colour bar.
mappable = im.get_children()[0]
plt.colorbar(mappable, orientation="vertical")
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q5_age.png", bbox_inches="tight")

<a id='Q6' ></a>
### Q6: How do worthwhileness ratings and worthwhileness elements change between weekdays and weekends?

Analysis of `wastedTime` and PEF for weekdays and weekends.
<br> Select legs whose start and end dates fall on the same day.
<br> Variables `startDate_formated` and `endDate_formated`

**WASTED TIME**

In [None]:
## legs with wastedTime
# Keep legs with a valid rating (strictly between 0 and 6). `.copy()` makes the
# selection an independent frame, so the column assignments below do not write
# to a slice of `all_legs`.
wt_df = all_legs[
    [
        "tripid",
        "legid",
        "userid",
        "wastedTime",
        "startDate_formated",
        "endDate_formated",
        "gender",
        "onCampaigns",
        "age",
    ]
][(all_legs.wastedTime > 0) & (all_legs.wastedTime < 6)].copy()
# get day of the week for start and end (0 = Monday ... 6 = Sunday)
wt_df["weekday_S"] = wt_df.startDate_formated.apply(lambda x: x.weekday())
wt_df["weekday_E"] = wt_df.endDate_formated.apply(lambda x: x.weekday())
# NOTE(review): equal weekdays are used as a proxy for "same day"; a leg that
# ends a whole number of weeks after it starts would also pass this filter.
wt_df = wt_df[wt_df.weekday_S == wt_df.weekday_E]

# Monday-Friday legs, with ratings rounded to whole numbers.
wt_df_working = wt_df[wt_df.weekday_S.isin([0, 1, 2, 3, 4])].copy()
wt_df_working["wastedTime"] = wt_df_working["wastedTime"].apply(np.round)
# Saturday-Sunday legs, with ratings rounded to whole numbers.
wt_df_weekend = wt_df[wt_df.weekday_S.isin([5, 6])].copy()
wt_df_weekend["wastedTime"] = wt_df_weekend["wastedTime"].apply(np.round)

In [None]:
## all and by gender
# Number of rated legs per worthwhileness rating, working days vs. weekends,
# for all users and then for each gender.
ncols = 3
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 6))
axes = axes.ravel()


def _working_vs_weekend_counts(working_df, weekend_df):
    """Return per-rating leg counts for working days and weekends side by side.

    An outer merge keeps a rating that occurs in only one of the two frames
    (its missing count becomes 0) instead of dropping it.
    """
    working = working_df.groupby("wastedTime").size().reset_index(name="working")
    weekend = weekend_df.groupby("wastedTime").size().reset_index(name="weekend")
    return (
        working.merge(weekend, on="wastedTime", how="outer")
        .fillna(0)
        .sort_values("wastedTime")
        .reset_index(drop=True)
    )


def _plot_working_vs_weekend(counts, ax, title):
    """Draw the working/weekend bars of `counts` on `ax`, without an axes legend."""
    counts[["working", "weekend"]].plot(kind="bar", ax=ax)
    ax.set_title(title)
    # Label each bar group with the rating it represents.
    ax.set_xticks(range(len(counts)))
    ax.set_xticklabels([int(rating) for rating in counts["wastedTime"]])
    ax.tick_params(axis="both", labelsize=12, rotation=0)
    ax.legend("")


# all
wd_vs_we = _working_vs_weekend_counts(wt_df_working, wt_df_weekend)
_plot_working_vs_weekend(wd_vs_we, axes[0], "All")
# One figure-level legend, created while only the first panel has bars.
fig.legend(loc="center right")

# male / female
gender_lst = ["Male", "Female"]

for i, gender in enumerate(gender_lst):
    wd_vs_we = _working_vs_weekend_counts(
        wt_df_working[wt_df_working.gender == gender],
        wt_df_weekend[wt_df_weekend.gender == gender],
    )
    _plot_working_vs_weekend(wd_vs_we, axes[i + 1], gender)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_all_gender.png", bbox_inches="tight")

In [None]:
# by country
# Distribution of worthwhileness ratings per country: working days (left)
# and weekend days (right).
ncols = 2
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 6))
axes = axes.ravel()

sns.boxplot(data=wt_df_working, x="onCampaigns", y="wastedTime", ax=axes[0])
axes[0].set_title("Boxplot of worthwhile ratings" + "\n" + "working days", fontsize=15)
axes[0].tick_params(axis="both", labelsize=12)

sns.boxplot(data=wt_df_weekend, x="onCampaigns", y="wastedTime", ax=axes[1])
axes[1].set_title("Boxplot of worthwhile ratings" + "\n" + "weekend days", fontsize=15)
axes[1].tick_params(axis="both", labelsize=12)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_country.png", bbox_inches="tight")

In [None]:
# by country
# Average worthwhileness rating per country: working days (left) and
# weekend days (right), on a shared y axis.
ncols = 2
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 6), sharey=True)
axes = axes.ravel()

# wd
tmp = wt_df_working.groupby("onCampaigns")["wastedTime"].mean().reset_index()
axes[0].scatter(tmp.onCampaigns, tmp.wastedTime, lw=6)
axes[0].set_title(
    "Scatterplot of average worthwhile ratings" + "\n" + "working days", fontsize=15
)
axes[0].tick_params(axis="both", labelsize=12)

# we
tmp = wt_df_weekend.groupby("onCampaigns")["wastedTime"].mean().reset_index()
axes[1].scatter(tmp.onCampaigns, tmp.wastedTime, lw=6)
# Both panels plot per-country means, so both titles say "average".
axes[1].set_title(
    "Scatterplot of average worthwhile ratings" + "\n" + "weekend days", fontsize=15
)
axes[1].tick_params(axis="both", labelsize=12)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_country_scatter.png", bbox_inches="tight")

In [None]:
# by AGE
# Distribution of worthwhileness ratings per age range: working days (left)
# and weekend days (right).
ncols = 2
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 6))
axes = axes.ravel()

# Distinct age ranges found in the legs; not used by this cell's plots but
# kept as a notebook-level variable.
age_range = list(all_legs.age.unique())

sns.boxplot(data=wt_df_working, x="age", y="wastedTime", ax=axes[0])
axes[0].set_title("Boxplot of worthwhile ratings" + "\n" + "working days", fontsize=15)
axes[0].tick_params(axis="both", labelsize=12)

sns.boxplot(data=wt_df_weekend, x="age", y="wastedTime", ax=axes[1])
axes[1].set_title("Boxplot of worthwhile ratings" + "\n" + "weekend days", fontsize=15)
axes[1].tick_params(axis="both", labelsize=12)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_age.png", bbox_inches="tight")

In [None]:
# by AGE
# Mean worthwhileness rating per age range: working days (left) and
# weekend days (right).
ncols = 2
nrows = 1
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 6))
axes = axes.ravel()

# Distinct age ranges found in the legs; not used by this cell's plots but
# kept as a notebook-level variable.
age_range = list(all_legs.age.unique())

# wd
tmp = wt_df_working.groupby("age")["wastedTime"].mean().reset_index()
axes[0].scatter(tmp.age, tmp.wastedTime, lw=6)
axes[0].set_title(
    "Scatterplot of worthwhile ratings" + "\n" + "working days", fontsize=15
)
axes[0].tick_params(axis="both", labelsize=12)

# we
tmp = wt_df_weekend.groupby("age")["wastedTime"].mean().reset_index()
axes[1].scatter(tmp.age, tmp.wastedTime, lw=6)
axes[1].set_title(
    "Scatterplot of worthwhile ratings" + "\n" + "weekend days", fontsize=15
)
axes[1].tick_params(axis="both", labelsize=12)

fig.tight_layout(rect=[0, 0, 0.9, 1])
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_age_scatter.png", bbox_inches="tight")

**worthwhileness elements: PEF**

In [None]:
### ALL + GENDER - WORKING DAYS
# Counts of each worthwhileness element value on working days, for all users
# and by gender.

# Day of the week of each leg (0 = Monday ... 6 = Sunday).
values_from_trip["weekday"] = values_from_trip["legStartDay"].apply(
    lambda x: x.weekday()
)
is_working_day = values_from_trip.weekday.isin([0, 1, 2, 3, 4])

# select legid of male, female and other users
legsM = list(all_legs_M.legid.unique())
legsF = list(all_legs_F.legid.unique())
legsO = list(all_legs_O.legid.unique())

values_from_trip_M = values_from_trip[
    (values_from_trip.legid.isin(legsM)) & is_working_day
]
values_from_trip_F = values_from_trip[
    (values_from_trip.legid.isin(legsF)) & is_working_day
]
values_from_trip_O = values_from_trip[
    (values_from_trip.legid.isin(legsO)) & is_working_day
]

# tables for plot
# The "All" table is restricted to working days too, so that it is comparable
# with the gender panels of this figure.
values_count = (
    values_from_trip[is_working_day]
    .groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
values_count_M = (
    values_from_trip_M.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
values_count_F = (
    values_from_trip_F.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
# Computed for completeness; there is no "Others" panel in this figure.
values_count_O = (
    values_from_trip_O.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)

ncols = 3
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(15, 5))
axes = axes.ravel()

plot_h10_q2(values_count, axid=0, title_str="All")
# Figure legend is created after the first panel only.
fig.legend(loc="best", fontsize="x-small")
plot_h10_q2(values_count_M, axid=1, title_str="Male")
plot_h10_q2(values_count_F, axid=2, title_str="Female")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_gender_wd.png", bbox_inches="tight")

In [None]:
### ALL + GENDER - WEEKENDS
# Counts of each worthwhileness element value on weekends, for all users and
# by gender. Relies on the `weekday` column already present in
# `values_from_trip` (0 = Monday ... 6 = Sunday).
is_weekend = values_from_trip.weekday.isin([5, 6])

# select legid of male, female and other users
legsM = list(all_legs_M.legid.unique())
legsF = list(all_legs_F.legid.unique())
legsO = list(all_legs_O.legid.unique())

values_from_trip_M = values_from_trip[
    (values_from_trip.legid.isin(legsM)) & is_weekend
]
values_from_trip_F = values_from_trip[
    (values_from_trip.legid.isin(legsF)) & is_weekend
]
values_from_trip_O = values_from_trip[
    (values_from_trip.legid.isin(legsO)) & is_weekend
]

# tables for plot
# The "All" table is restricted to weekends too, so that it is comparable
# with the gender panels of this figure.
values_count = (
    values_from_trip[is_weekend]
    .groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
values_count_M = (
    values_from_trip_M.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
values_count_F = (
    values_from_trip_F.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)
# Computed for completeness; there is no "Others" panel in this figure.
values_count_O = (
    values_from_trip_O.groupby(["valueFromTrip", "value"])
    .size()
    .reset_index(name="count")
)

ncols = 3
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(15, 5))
axes = axes.ravel()

plot_h10_q2(values_count, axid=0, title_str="All")
# Figure legend is created after the first panel only.
fig.legend(loc="best", fontsize="x-small")
plot_h10_q2(values_count_M, axid=1, title_str="Male")
plot_h10_q2(values_count_F, axid=2, title_str="Female")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_gender_we.png", bbox_inches="tight")

In [None]:
### BY COUNTRY - WORKING DAYS
# Counts of each worthwhileness element value on working days, one panel per
# country in `top10`.

values_from_trip_wd = values_from_trip[values_from_trip.weekday.isin([0, 1, 2, 3, 4])]

fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 7))
axes = axes.ravel()

for axid, c in enumerate(top10):

    tmp = values_from_trip_wd[values_from_trip_wd.onCampaigns == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    # Figure legend is created after the first panel only.
    if axid == 0:
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_country_wd.png", bbox_inches="tight")

In [None]:
### BY COUNTRY - WEEKENDS
# Counts of each worthwhileness element value on weekends, one panel per
# country in `top10`.

# NOTE: despite its `_wd` suffix, this variable holds the weekend rows here;
# the name is kept unchanged for the rest of the notebook.
values_from_trip_wd = values_from_trip[values_from_trip.weekday.isin([5, 6])]


fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 7))
axes = axes.ravel()

for axid, c in enumerate(top10):

    tmp = values_from_trip_wd[values_from_trip_wd.onCampaigns == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    # Figure legend is created after the first panel only.
    if axid == 0:
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_country_we.png", bbox_inches="tight")

In [None]:
### BY AGE - WORKING DAYS
# Counts of each worthwhileness element value on working days, one panel per
# age range.

values_from_trip_wd = values_from_trip[values_from_trip.weekday.isin([0, 1, 2, 3, 4])]

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5))
axes = axes.ravel()

for axid, c in enumerate(age_range):

    tmp = values_from_trip_wd[values_from_trip_wd.age == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    # Figure legend is created after the first panel only.
    if axid == 0:
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_age_wd.png", bbox_inches="tight")

In [None]:
### BY AGE - WEEKENDS
# Counts of each worthwhileness element value on weekends, one panel per age
# range.

# NOTE: despite its `_wd` suffix, this variable holds the weekend rows here;
# the name is kept unchanged for the rest of the notebook.
values_from_trip_wd = values_from_trip[values_from_trip.weekday.isin([5, 6])]

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5))
axes = axes.ravel()

for axid, c in enumerate(age_range):

    tmp = values_from_trip_wd[values_from_trip_wd.age == c]
    val_count = tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")

    plot_h10_q2(val_count, axid=axid, title_str=c)
    # Figure legend is created after the first panel only.
    if axid == 0:
        fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q6_pef_age_we.png", bbox_inches="tight")

<a id='Q8' ></a>
### Q8:  Assess if worthwhileness ratings and worthwhileness elements change if a trip is made in an urban, sub-urban or rural area

We will consider legs.
<br>A leg is considered only if its starting and ending points fall in the same type of area (both urban, both sub-urban or both rural).
<br>Variables to consider `start_class`, `end_class`

In [None]:
# read data
# Load the pickled legs table from `input_path`; the cells below read its
# `start_class` / `end_class` columns.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
all_legs_urban = pd.read_pickle(
    input_path + "all_legs_final_ds_user_info_urban_class.pkl"
)

In [None]:
# Legs with a valid worthwhileness rating (strictly between 0 and 6), with
# their area classification.
wt_df = all_legs_urban[
    [
        "tripid",
        "legid",
        "userid",
        "wastedTime",
        "startDate_formated",
        "endDate_formated",
        "gender",
        "onCampaigns",
        "age",
        "start_class",
        "end_class",
    ]
][(all_legs_urban.wastedTime > 0) & (all_legs_urban.wastedTime < 6)]
# take legs that start and end in the same area; `.copy()` makes the result an
# independent frame, so the assignment below does not write to a slice
wt_df = wt_df[wt_df.start_class == wt_df.end_class].copy()
# Round ratings to whole numbers.
wt_df["wastedTime"] = wt_df["wastedTime"].apply(np.round)
wt_df.head()

In [None]:
def group_by_area(wt_df):
    """Count legs per (rating, area class) and add each count's share of its class.

    Parameters
    ----------
    wt_df : pandas.DataFrame
        Must contain the columns ``wastedTime`` and ``start_class``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(wastedTime, start_class)`` combination with columns
        ``wastedTime``, ``start_class``, ``count`` and ``rel_count``, where
        ``rel_count`` is ``count`` divided by the number of rows of ``wt_df``
        having that ``start_class``.
    """
    # Number of legs per area class: the denominator of `rel_count`. Every
    # class is normalised by its own total, not only rural/urban/sub-urban.
    class_totals = wt_df["start_class"].value_counts()

    wt_df_group = (
        wt_df.groupby(["wastedTime", "start_class"]).size().reset_index(name="count")
    )
    # Cast to float so the division also works when `start_class` is categorical.
    totals_per_row = wt_df_group["start_class"].map(class_totals).astype(float)
    wt_df_group["rel_count"] = wt_df_group["count"] / totals_per_row

    return wt_df_group


# Relative frequency of each worthwhileness rating per area class, for all
# users and by gender.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
axes = axes.ravel()

wt_df_M = wt_df[wt_df.gender == "Male"]
wt_df_F = wt_df[wt_df.gender == "Female"]

panels = [("All", wt_df), ("Male", wt_df_M), ("Female", wt_df_F)]
for i, (title, subset) in enumerate(panels):
    wt_df_group = group_by_area(subset)
    sns.barplot(
        x="wastedTime", y="rel_count", hue="start_class", data=wt_df_group, ax=axes[i]
    )
    # Only the first panel keeps its legend; the others get an empty one.
    if i == 0:
        axes[i].legend(loc="best", fontsize="x-small")
    else:
        axes[i].legend("")
    axes[i].set_title(title)

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q8_wt_all_gender.png", bbox_inches="tight")

In [None]:
## country
# Relative frequency of each worthwhileness rating per area class, one panel
# per country in `top10` (2 x 5 grid).
nrows = 2
ncols = 5
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 10))
axes = axes.ravel()

# Never index past the grid or past the end of `top10`.
for i, country in enumerate(top10[: nrows * ncols]):
    tmp = wt_df[wt_df.onCampaigns == country]
    wt_df_group = group_by_area(tmp)
    sns.barplot(
        x="wastedTime", y="rel_count", hue="start_class", data=wt_df_group, ax=axes[i]
    )
    axes[i].set_title(country)
    # Only the first panel keeps its legend; the others get an empty one.
    if i == 0:
        axes[i].legend(fontsize="x-small")
    else:
        axes[i].legend("")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q8_wt_country.png", bbox_inches="tight")

In [None]:
## AGE
# Relative frequency of each worthwhileness rating per area class, one panel
# per age range.
ncols = 4
fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(18, 5))
axes = axes.ravel()

# Only the first `ncols` age ranges fit on the grid.
for i, age in enumerate(age_range[:ncols]):
    tmp = wt_df[wt_df.age == age]
    wt_df_group = group_by_area(tmp)
    sns.barplot(
        x="wastedTime", y="rel_count", hue="start_class", data=wt_df_group, ax=axes[i]
    )
    axes[i].set_title(age)
    # Only the first panel keeps its legend; the others get an empty one.
    if i == 0:
        axes[i].legend(fontsize="x-small")
    else:
        axes[i].legend("")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q8_wt_age.png", bbox_inches="tight")

**worthwhileness values**

In [None]:
# Reload the worthwhileness values and drop the "Unknown" entries.
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")
is_known_value = values_from_trip.valueFromTrip != "Unknown"
values_from_trip = values_from_trip[is_known_value]

# Attach user information and area classes of each leg, then remove
# duplicated rows.
leg_info_columns = [
    "legid",
    "userid",
    "gender",
    "onCampaigns",
    "age",
    "start_class",
    "end_class",
]
leg_info = all_legs_urban[leg_info_columns]
values_from_trip = values_from_trip.merge(leg_info, on="legid").drop_duplicates()

# Keep only legs that start and end in the same area class.
same_area = values_from_trip.start_class == values_from_trip.end_class
values_from_trip = values_from_trip[same_area]

values_from_trip.head()

In [None]:
### 3x3 per urban/suburban/rural
# One row of panels per area class, one column per user group
# (All / Male / Female).
nrows = 3
ncols = 3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 12))
axes = axes.ravel()

gender_lst = ["All", "Male", "Female"]
area_lst = ["urban", "sub-urban", "rural"]


for row, area in enumerate(area_lst):
    tmp_area = values_from_trip[values_from_trip.start_class == area]
    # Index of the first panel of this row in the flattened axes array.
    cnt = row * ncols

    for i, gender in enumerate(gender_lst):
        # "All" uses every leg of the area; the other entries filter by gender.
        if gender == "All":
            subset = tmp_area
        else:
            subset = tmp_area[tmp_area.gender == gender]

        values_count = (
            subset.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")
        )
        plot_h10_q2(values_count, axid=i + cnt, title_str=gender + " - " + area)
        # Figure legend is created after the very first panel only.
        if i + cnt == 0:
            fig.legend(loc="best", fontsize="x-small")

plt.tight_layout()
# `bbox_to_anchor` is a legend option, not a savefig one, so it is not passed here.
plt.savefig(img_path + "h10_q8_pef_all_gender.png", bbox_inches="tight")

In [None]:
## country
# One figure per area class, with one panel per country in `top10`.

for area in area_lst:

    tmp_area = values_from_trip[values_from_trip.start_class == area]

    # The sub-urban figure uses a smaller 2 x 3 grid than the other two.
    if area == "sub-urban":
        fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 7))
    else:
        fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 7))
    axes = axes.ravel()

    axid = 0
    for c in top10:

        try:
            tmp = tmp_area[tmp_area.onCampaigns == c]
            val_count = (
                tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")
            )

            plot_h10_q2(val_count, axid=axid, title_str=c)
            if axid == 0:
                fig.legend(loc="best", fontsize="x-small")

            # The panel index advances only when the country was plotted.
            axid += 1

        except Exception as err:
            # Best effort: a country that cannot be plotted is skipped, but the
            # reason is reported instead of being silently swallowed.
            print("Skipping", c, "for area", area, "-", repr(err))

    plt.tight_layout()
    # `bbox_to_anchor` is a legend option, not a savefig one, so it is not
    # passed here.
    plt.savefig(
        img_path + "h10_q8_pef_country_" + area + ".png",
        bbox_inches="tight",
    )

In [None]:
## age
# One figure per area class, with one panel per age range.

for area in area_lst:
    tmp_area = values_from_trip[values_from_trip.start_class == area]

    fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 5))
    axes = axes.ravel()

    for axid, c in enumerate(age_range):

        tmp = tmp_area[tmp_area.age == c]
        val_count = (
            tmp.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")
        )

        plot_h10_q2(val_count, axid=axid, title_str=c)
        # Figure legend is created after the first panel only.
        if axid == 0:
            fig.legend(loc="best", fontsize="x-small")

    plt.tight_layout()
    # `bbox_to_anchor` is a legend option, not a savefig one, so it is not
    # passed here.
    plt.savefig(
        img_path + "h10_q8_pef_age_" + area + ".png",
        bbox_inches="tight",
    )