# H13

**Obj:** Smartphone-based activities
<br> To explore how VTT is influenced by smartphone apps and the time spent on them.

## Questions

- [Q1](#Q1): What are the ICT activities that are most frequently associated with each kind and level of value?
- [Q2](#Q2): What is the correlation between worthwhileness and ICT activities?
- [Q3](#Q3): What is the role of ICT in shaping VTT: how are the two factors 'Internet connectivity' and 'charging opportunity' correlated with travel worthwhileness?

**ICT activities:** ReadingDevice, Listening, Watching, Browsing

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the page.
# The closing tag must be a complete "</style>" for the markup to be well-formed.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global variables
# Every input and output of this notebook lives under the same dated run folder.
meta_data_path = "../../data-campaigns/meta-data/"

input_path = "../../2019-12-16.out/"
legs = "all_legs_merged_no_outlier_0.01.pkl"
out_path = input_path + "hypothesis/H13/"
# Figures are written next to the other outputs of this hypothesis.
img_path = out_path

# Graphical parameters (applied before the seaborn style, as in the original order)
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Make sure the output folders exist. `exist_ok=True` makes the cell safe to
# re-run, while still raising if the path exists but is not a directory.
# `img_path` is created as well because the plotting cells write into it.
for _dir in (out_path, img_path):
    os.makedirs(os.path.abspath(_dir), exist_ok=True)

In [None]:
# Leg-level and trip-level tables produced by the preprocessing step.
all_legs = pd.read_pickle(input_path + legs)
trips_df = pd.read_pickle(input_path + "trips_df.pkl")
# Optional input / filter, currently disabled:
# trips_users_df = pd.read_pickle(input_path + 'trips_users_df.pkl')
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]  # only trips in all_legs

# transport categories: {category: [transport modes]}
with open(input_path + "category_transp_mode_dict.json", "r") as f:
    category_transp_mode_dict = json.load(f)

# Reverse lookup: transport mode -> category.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

# TODO: remove "unknown" as transport category (?)

print("Legs:", len(all_legs))
print("Trips: ", len(all_legs.tripid.unique()))
print("Users:", len(all_legs.userid.unique()))
print()

In [None]:
# read activities
all_gen_act = pd.read_pickle(input_path + "all_gen_act.pkl")

# add leg-level info (default inner join: activities without a matching leg are dropped)
all_gen_act = all_gen_act.merge(
    all_legs[["legid", "wastedTime", "gender", "onCampaigns"]], on="legid",
)

# keep only ratings strictly between 0 and 6 and round them;
# .copy() gives an independent frame so the assignment below does not write
# into a slice of the merged table (avoids SettingWithCopyWarning)
all_gen_act = all_gen_act[
    (all_gen_act.wastedTime > 0) & (all_gen_act.wastedTime < 6)
].copy()
all_gen_act["wastedTime"] = all_gen_act["wastedTime"].round()

# add values from trip
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")
values_from_trip = values_from_trip[values_from_trip.valueFromTrip != "Unknown"]

# one row per leg, one column per kind of value
tmp = values_from_trip[["legid", "value", "valueFromTrip"]]
values_from_trip_pivot = tmp.pivot(
    index="legid", columns="valueFromTrip", values="value"
).reset_index()

# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[
    ["Paid_work", "Personal_tasks"]
].max(axis=1)
# reassign instead of dropping in place, so the cell has no hidden side effects
values_from_trip_pivot = values_from_trip_pivot.drop(
    columns=["Paid_work", "Personal_tasks"]
)

all_gen_act = all_gen_act.merge(values_from_trip_pivot, on="legid").drop_duplicates()
print("shape", all_gen_act.shape)
print("unique legs", all_gen_act.legid.nunique())
all_gen_act.head()

In [None]:
### select ICT activities
# Activity codes treated as ICT activities in this notebook.
ict_activities_lst = ["ReadingDevice", "Listening", "Watching", "Browsing"]
# Keep only the activity rows whose code is one of the ICT codes.
ict_activities_df = all_gen_act.loc[all_gen_act["code"].isin(ict_activities_lst)]

In [None]:
# Helper lists used by the plotting cells below.

# distinct values of the age column
age_range = list(all_legs.age.unique())

# assign 'CHE' to the class Other (AAA)
# NOTE(review): all_gen_act copied onCampaigns from all_legs in an earlier cell,
# so this recoding does not reach ict_activities_df.
all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(
    lambda campaign: "AAA" if campaign == "CHE" else campaign
)
top10 = list(all_legs.onCampaigns.unique())

# transp_category list, without the last distinct value found
# NOTE(review): presumably the last entry is a missing/unknown category; the slice
# is positional, so confirm against the data.
tc_lst = ict_activities_df.transp_category.unique()[:-1]

# gender list
gender_lst = ["Male", "Female"]

<a id='Q1' ></a>
### Q1: What are the ICT activities that are most frequently associated with each kind and level of value?

In [None]:
# Trip id, activity code, worthwhileness rating and the three value columns.
act_wv = ict_activities_df.loc[
    :, ["tripid", "code", "wastedTime", "Enjoyment", "Fitness", "Productivity"]
]
act_wv.head(3)

In [None]:
# Worthwhileness elements and the levels each of them can take.
ww_el = ["Enjoyment", "Fitness", "Productivity"]
ww_val = [0.0, 1.0, 2.0]

# Short column label -> (element, level), e.g. "E_0" -> ("Enjoyment", 0.0).
# Labels are built from the element's first letter and the integer level.
worthwhile_elements = {
    "{element}_{level}".format(element=name[0], level=int(level)): (name, level)
    for name, level in itertools.product(ww_el, ww_val)
}


def get_top_activities(act_wv, elements=None, levels=None):
    """Count distinct trips per activity for each (element, level) pair.

    Parameters
    ----------
    act_wv : pandas.DataFrame
        Must contain the columns ``tripid`` and ``code`` plus one column per
        entry of ``elements``.
    elements : list of str, optional
        Worthwhileness columns to analyse. Defaults to the notebook-level
        ``ww_el``.
    levels : list, optional
        Values of those columns to analyse. Defaults to the notebook-level
        ``ww_val``.

    Returns
    -------
    dict
        ``{element: {level: {code: n_trips}}}`` where ``n_trips`` is the number
        of distinct ``tripid`` values; each inner dict is ordered by decreasing
        count. A level with no matching rows maps to an empty dict.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if elements is None:
        elements = ww_el
    if levels is None:
        levels = ww_val

    top_activities = {}
    for el in elements:
        top_activities[el] = {}
        for val in levels:
            trips_per_code = (
                act_wv.loc[act_wv[el] == val]
                .groupby("code", group_keys=False)["tripid"]
                .nunique()
                .sort_values(ascending=False)
            )
            top_activities[el][val] = trips_per_code.to_dict()

    return top_activities

In [None]:
# Trip counts per ICT activity for every (element, level) pair, over all legs.
top_activities = get_top_activities(act_wv)
# Row labels for the heatmaps: the activity codes found at Enjoyment == 0.0,
# in the order stored in that dict (a live dict_keys view).
activities = top_activities["Enjoyment"][0.0].keys()

In [None]:
def get_heatmap(top_activities, elements=None, levels=None):
    """Turn the nested counts of ``get_top_activities`` into a 2-D list.

    Parameters
    ----------
    top_activities : dict
        ``{element: {level: {code: count}}}``.
    elements : list of str, optional
        Elements that define the columns. Defaults to the notebook-level
        ``ww_el``. Must not be empty.
    levels : list, optional
        Levels that define the columns. Defaults to the notebook-level
        ``ww_val``. Must not be empty.

    Returns
    -------
    list of list
        One row per activity and one column per (element, level) pair, element
        by element. Rows follow the key order of
        ``top_activities[elements[0]][levels[0]]``; activities that do not
        appear in that dict get no row. Missing counts are reported as 0.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if elements is None:
        elements = ww_el
    if levels is None:
        levels = ww_val
    elements = list(elements)
    levels = list(levels)

    # With the default arguments this is top_activities["Enjoyment"][0.0].
    activities = top_activities[elements[0]][levels[0]].keys()

    # Column label -> (element, level), in the same order as the notebook's
    # `worthwhile_elements` mapping.
    columns = {}
    for el in elements:
        for val in levels:
            key = "{element}_{level}".format(element=el[0], level=int(val))
            columns[key] = (el, val)

    heatmap = []
    for act in activities:
        # Explicit defaults instead of a bare `except`: only a missing entry
        # becomes 0, any other error is no longer silently swallowed.
        heatmap.append(
            [
                top_activities.get(wwelement, {}).get(wwvalue, {}).get(act, 0)
                for wwelement, wwvalue in columns.values()
            ]
        )

    return heatmap

In [None]:
# Counts for all legs as a labelled table: rows = activities, columns = element_level.
heatmap = get_heatmap(top_activities)
heatmap_df = pd.DataFrame(
    data=heatmap,
    index=list(activities),
    columns=list(worthwhile_elements),
)
heatmap_df

In [None]:
# Matplotlib: How to remove white lines in the heatmap
# https://stackoverflow.com/a/45633288/2377454
# The grid is switched off only while the axes are created; rc_context puts the
# previous setting back afterwards, even if plotting raises.
with plt.rc_context({"axes.grid": False}):
    fig, ax = plt.subplots()
    sns.heatmap(heatmap_df, annot=True, fmt="d", ax=ax)
ax.set_title("Activities vs Worthwhileness factors")

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q1_heatmap_all.png", bbox_inches="tight")

#### Analysis by gender

In [None]:
# Columns kept in the per-gender tables.
_gender_cols = [
    "tripid",
    "gender",
    "code",
    "wastedTime",
    "Enjoyment",
    "Fitness",
    "Productivity",
]

# male
tmp_m = ict_activities_df.loc[ict_activities_df["gender"] == "Male"]
act_wv_male = tmp_m.loc[:, _gender_cols]

# female
tmp_f = ict_activities_df.loc[ict_activities_df["gender"] == "Female"]
act_wv_female = tmp_f.loc[:, _gender_cols]

In [None]:
# get_heatmap orders its rows by the subset's own Enjoyment == 0.0 counts, so
# each table must be labelled with that subset's activity order, not with the
# order computed on all legs. The rows are then re-ordered to the overall
# `activities` order (absent activities filled with 0) so that both panels
# list the activities in the same sequence.

# create heatmaps - male
top_activities_male = get_top_activities(act_wv_male)
heatmap_male = get_heatmap(top_activities_male)
heatmap_male_df = pd.DataFrame(
    heatmap_male,
    columns=list(worthwhile_elements),
    index=list(top_activities_male["Enjoyment"][0.0]),
).reindex(list(activities), fill_value=0)

# create heatmaps - female
top_activities_female = get_top_activities(act_wv_female)
heatmap_female = get_heatmap(top_activities_female)
heatmap_female_df = pd.DataFrame(
    heatmap_female,
    columns=list(worthwhile_elements),
    index=list(top_activities_female["Enjoyment"][0.0]),
).reindex(list(activities), fill_value=0)

In [None]:
# Side-by-side heatmaps for male and female respondents.
# The grid is disabled while the axes are created, as for the overall heatmap,
# so that no white grid lines are drawn across the cells.
with plt.rc_context({"axes.grid": False}):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
    axes = axes.ravel()

    sns.heatmap(heatmap_male_df, annot=True, fmt="d", ax=axes[0])
    sns.heatmap(heatmap_female_df, annot=True, fmt="d", ax=axes[1])

axes[0].set_title("Activities vs Worthwhileness factors - Male")
axes[1].set_title("Activities vs Worthwhileness factors - Female")

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q1_heatmap_gender.png", bbox_inches="tight")

#### Analysis by transport category

In [None]:
# Same table as act_wv, with the transport category of each record added.
act_wv_all_tc = ict_activities_df.loc[
    :,
    [
        "tripid",
        "code",
        "transp_category",
        "wastedTime",
        "Enjoyment",
        "Fitness",
        "Productivity",
    ],
]
act_wv_all_tc.head(3)

In [None]:
# One heatmap per transport category on a 3x2 grid.
# The grid is disabled while the axes are created, as for the overall heatmap,
# so that no white grid lines are drawn across the cells.
with plt.rc_context({"axes.grid": False}):
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))
    axes = axes.ravel()

    for i in range(len(tc_lst)):

        act_wv_tc_tmp = act_wv_all_tc.loc[
            act_wv_all_tc["transp_category"] == tc_lst[i]
        ]
        top_activities_tc = get_top_activities(act_wv_tc_tmp)
        # rows follow this category's own activity order
        activities_tc = top_activities_tc["Enjoyment"][0.0].keys()
        hm = pd.DataFrame(
            get_heatmap(top_activities_tc),
            columns=worthwhile_elements.keys(),
            index=activities_tc,
        )

        sns.heatmap(hm, annot=True, fmt="d", ax=axes[i])
        axes[i].set_title(tc_lst[i], fontsize=14)
        axes[i].tick_params(axis="y", rotation=0)

# Hide the panels that did not receive a transport category.
for unused_ax in axes[len(tc_lst):]:
    unused_ax.set_visible(False)

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q1_heatmap_tc.png", bbox_inches="tight")

<a id='Q2' ></a>
### Q2: What is the correlation between worthwhileness and ICT activities?

In [None]:
# Average worthwhileness rating per ICT activity: overall and split by gender.
fig, ax = plt.subplots(figsize=(10, 7))

# (legend label, rows to average, colour, marker)
series_specs = [
    ("All", ict_activities_df, "black", "x"),
    ("Male", ict_activities_df[ict_activities_df.gender == "Male"], "b", "s"),
    ("Female", ict_activities_df[ict_activities_df.gender == "Female"], "r", "d"),
]
for label, subset, color, marker in series_specs:
    tmp = subset.groupby("code")["wastedTime"].mean().reset_index(name="avg_wt")
    ax.scatter(tmp.code, tmp.avg_wt, c=color, marker=marker, label=label)

ax.set_xlabel("ICT activities")
ax.set_ylabel("Average worthwhileness rating")
ax.legend(loc="best")

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q2_all_gender.png", bbox_inches="tight")

In [None]:
### BY TC
# Average worthwhileness rating per ICT activity, one panel per transport category.
fig, axs = plt.subplots(2, 3, figsize=(15, 8))
# (row, column) of every panel; kept as a notebook-level name because other
# cells may read it.
axes_coords = [(i, j) for i in range(2) for j in range(3)]
colors = ["blue", "orange", "green", "red", "purple"]

# zip stops at the shortest input, so at most len(colors) categories are drawn.
for tc, ax_coords, color in zip(tc_lst, axes_coords, colors):
    tmp_tc = ict_activities_df[ict_activities_df.transp_category == tc]
    tmp = tmp_tc.groupby("code")["wastedTime"].mean().reset_index(name="avg_wt")

    ax = axs[ax_coords]
    ax.set_title(tc)
    ax.scatter(tmp.code, tmp.avg_wt, c=color)
    ax.tick_params(axis="x", rotation=25)

# Hide the panels that did not receive a transport category.
n_drawn = min(len(tc_lst), len(axes_coords), len(colors))
for unused_coords in axes_coords[n_drawn:]:
    axs[unused_coords].set_visible(False)

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q2_tc.png", bbox_inches="tight")

In [None]:
### BY COUNTRY
# Average worthwhileness rating per ICT activity, one panel per campaign.
fig, axes = plt.subplots(nrows=2, ncols=5, sharey=True, figsize=(18, 7))
axes = axes.ravel()

# Fixed activity order (alphabetical, the order groupby produces) and the
# matching one-letter tick labels: B, L, R, W.
act_order = sorted(ict_activities_lst)
act_lst = [code[0] for code in act_order]
x_pos = list(range(len(act_order)))

# ict_activities_df took its onCampaigns values from all_legs before 'CHE' was
# recoded to 'AAA' there, while top10 was built after the recoding. Apply the
# same recoding here so that the 'AAA' panel includes those rows.
campaign_of_row = ict_activities_df.onCampaigns.replace("CHE", "AAA")

for i in range(len(top10)):

    tmp_c = ict_activities_df[campaign_of_row == top10[i]]
    # reindex keeps one slot per activity (NaN when a campaign has no record
    # of it), so every point is drawn above its own tick label
    avg_wt = tmp_c.groupby("code")["wastedTime"].mean().reindex(act_order)

    axes[i].scatter(x_pos, avg_wt.values)
    axes[i].set_title(top10[i])
    axes[i].set_xticks(x_pos)
    axes[i].set_xticklabels(act_lst)

# Hide the panels that did not receive a campaign.
for unused_ax in axes[len(top10):]:
    unused_ax.set_visible(False)

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q2_country.png", bbox_inches="tight")

<a id='Q3' ></a>
### Q3: What is the role of ICT in shaping VTT: how are the two factors 'Internet connectivity' and 'charging opportunity' correlated with travel worthwhileness?

In [None]:
### read experience factors
all_factors = pd.read_pickle(input_path + "all_factors.pkl")

# delete legs with minus=F and plus=F
all_factors = all_factors[
    ~((all_factors["minus"] == False) & (all_factors["plus"] == False))
]

# delete legs with minus=T and plus=T (3% of obs)
all_factors = all_factors[
    ~((all_factors["minus"] == True) & (all_factors["plus"] == True))
]

# select only useful cols
all_factors = all_factors[
    [
        "correctedModeOfTransport_str",
        "legid",
        "minus",
        "plus",
        "tripid",
        "factor",
        "legStartDay",
    ]
]

# add leg-level info (default inner join: factors without a matching leg are dropped)
all_factors = all_factors.merge(
    all_legs[
        ["legid", "wastedTime", "gender", "age", "onCampaigns", "transp_category"]
    ],
    on="legid",
)

# select useful wastedTime (strictly between 0 and 6) and round it;
# .copy() gives an independent frame so the assignment below does not write
# into a slice (avoids SettingWithCopyWarning)
all_factors = all_factors[
    (all_factors.wastedTime > 0) & (all_factors.wastedTime < 6)
].copy()
all_factors["wastedTime"] = all_factors["wastedTime"].round(0)

# remove legs with "None" transport category
all_factors = all_factors[(all_factors.transp_category.notna())]

# select only "internet connectivity" and "charging opportunity";
# copied so that later cells can add columns to it safely
lst = ["Charging_Opportunity", "Internet_Connectivity"]
ict_factors = all_factors[all_factors.factor.isin(lst)].copy()


# checks
print("all records:", len(ict_factors))
xx = ict_factors[(ict_factors["minus"] == False) & (ict_factors["plus"] == True)]
print("only plus: ", len(xx))
xx = ict_factors[(ict_factors["minus"] == True) & (ict_factors["plus"] == False)]
print("only minus: ", len(xx))


def find_impact(plus, minus):
    """Label a factor rating as "plus" or "minus".

    Returns "plus" when only ``plus`` is set, "minus" when only ``minus`` is
    set, and None when the two flags are equal.
    """
    only_plus = (minus == False) & (plus == True)
    if only_plus:
        return "plus"

    only_minus = (minus == True) & (plus == False)
    if only_minus:
        return "minus"

    return None


# Label every record as "plus" or "minus". `assign` builds a new frame instead
# of writing into a filtered slice (avoids SettingWithCopyWarning), and the
# list comprehension also works when ict_factors has no rows.
ict_factors = ict_factors.assign(
    impact=[
        find_impact(plus, minus)
        for plus, minus in zip(ict_factors["plus"], ict_factors["minus"])
    ]
)

impact_lst = ["plus", "minus"]

# drop transport category 'walking' only 2 data points
ict_factors = ict_factors.loc[~(ict_factors["transp_category"] == "walking")]

ict_factors.head()

In [None]:
# Number of records (non-null legid) per transport category.
ict_factors.groupby("transp_category")["legid"].count().reset_index()

In [None]:
# Average worthwhileness rating per ICT factor.
# Left panel: overall and by gender. Right panel: by transport category.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
axes = axes.ravel()

# (legend label, rows to average, colour, marker)
gender_specs = [
    ("All", ict_factors, "black", "x"),
    ("Male", ict_factors[ict_factors.gender == "Male"], "b", "s"),
    ("Female", ict_factors[ict_factors.gender == "Female"], "r", "d"),
]
for label, subset, color, marker in gender_specs:
    tmp = subset.groupby("factor")["wastedTime"].mean().reset_index(name="avg_wt")
    axes[0].scatter(
        tmp.factor, tmp.avg_wt, c=color, marker=marker, label=label, lw=6
    )

axes[0].set_xlabel("ICT factors")
axes[0].set_ylabel("Average worthwhileness rating")
axes[0].legend(loc="best")
axes[0].set_title("All and by gender")

# tc
tc_lst2 = ict_factors.transp_category.unique()
colors = ["blue", "orange", "green", "red", "purple"]
# Pair categories with colours directly, so this cell no longer needs a
# variable defined by an earlier plotting cell. zip stops at the shortest
# input, so at most len(colors) categories are drawn.
for tc, color in zip(tc_lst2, colors):
    tmp_tc = ict_factors[ict_factors.transp_category == tc]
    tmp = tmp_tc.groupby("factor")["wastedTime"].mean().reset_index(name="avg_wt")

    axes[1].scatter(tmp.factor, tmp.avg_wt, c=color, lw=6, label=tc)

# Axis decorations are set once, after all categories are drawn.
axes[1].legend(loc="best")
axes[1].set_xlabel("ICT factors")
axes[1].set_ylabel("Average worthwhileness rating")
axes[1].set_title("By transport category")

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q3_gender_tc.png", bbox_inches="tight")

In [None]:
### BY COUNTRY
# Average worthwhileness rating per ICT factor, one panel per campaign.
fig, axes = plt.subplots(nrows=2, ncols=5, sharey=True, figsize=(18, 7))
axes = axes.ravel()

# Fixed factor order and the matching tick labels.
factor_order = ["Charging_Opportunity", "Internet_Connectivity"]
factor_labels = ["Charging", "Internet"]
x_pos = list(range(len(factor_order)))

for i in range(len(top10)):

    tmp_c = ict_factors[ict_factors.onCampaigns == top10[i]]
    # reindex keeps one slot per factor (NaN when a campaign has no record of
    # it), so every point is drawn above its own tick label
    avg_wt = tmp_c.groupby("factor")["wastedTime"].mean().reindex(factor_order)

    axes[i].scatter(x_pos, avg_wt.values, lw=6)
    axes[i].set_title(top10[i])
    axes[i].set_xticks(x_pos)
    axes[i].set_xticklabels(factor_labels)

# Hide the panels that did not receive a campaign.
for unused_ax in axes[len(top10):]:
    unused_ax.set_visible(False)

fig.tight_layout()
# `bbox_to_anchor` is not a savefig option (recent Matplotlib rejects unknown
# keywords), so only `bbox_inches` is passed.
fig.savefig(img_path + "h13_q3_country.png", bbox_inches="tight")