# WI analysis

Plots and tables to add to H10.

## [1.](#1)

With respect to **H10 - Q4** (Correlation between worthwhileness and worthwhileness elements rating):

For each of the following variables, find the correlation between ww and ww elements by **Transport Category**:

- purpose
- country
- age range
- weekday/weekend
- urban/suburban/rural


## [2.](#2)

For each of the following variables, find the correlation between ww and ww elements by **Purpose**:

- transport category
- country
- age range
- weekday/weekend
- urban/suburban/rural


## [3.](#3)

For each transport category, find the distribution of the worthwhileness rating and of the ww elements.
<br>Also filter by:

- gender
- purpose
- country
- age range

## [4.](#4)

- ww rating distribution
- transport categories distribution
- ww elements distribution
- transport category vs ww ratings
- ww elements vs ww ratings
- ww elements vs transport categories

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

%matplotlib inline

from IPython.core.display import display, HTML

# Inject a CSS rule setting the notebook ".container" width to 95%.
# The closing tag was previously written as "</style" (missing ">").
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global variables: input/output locations used by the cells below.
meta_data_path = "../../data-campaigns/meta-data/"

legs = "all_legs_merged_no_outlier_0.01.pkl"
input_path = "../../2019-12-16.out/"
out_path = "../../2019-12-16.out/WI_results/"

# Graphical parameters: applied in one call, then the seaborn grid style.
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

**READ DATA**

In [None]:
#### all_legs ####

# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# produced by this project's own pipeline.
all_legs = pd.read_pickle(input_path + legs)

# remove "Unknown" as transport category (?)
all_legs = all_legs[all_legs.transp_category != "Unknown"]

# keep only legs whose wastedTime is strictly between 0 and 6.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
all_legs = all_legs[(all_legs.wastedTime > 0) & (all_legs.wastedTime < 6)].copy()
# round to the nearest whole rating (the column dtype is left unchanged)
all_legs["wastedTime"] = all_legs["wastedTime"].round()

# country - assign 'CHE' to the class Other (AAA)
all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(
    lambda x: "AAA" if x == "CHE" else x
)
# despite its name, this holds every campaign code left after the filters above
top10 = list(all_legs.onCampaigns.unique())


#### all_legs_urban ####
all_legs_urban = pd.read_pickle(
    input_path + "all_legs_final_ds_user_info_urban_class.pkl"
)
# keep only the legs that start and end in the same urban class
all_legs_urban = all_legs_urban[all_legs_urban.start_class == all_legs_urban.end_class]


#### values_from_trip ####
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")

# add leg-level info; the default inner merge also drops the rows whose leg
# was filtered out of all_legs above
values_from_trip = values_from_trip.merge(
    all_legs[
        [
            "legid",
            "wastedTime",
            "userid",
            "gender",
            "onCampaigns",
            "age",
            "transp_category",
            "we_vs_wd",
        ]
    ],
    on="legid",
).drop_duplicates()

# add urban class (inner merge: legs without a matching urban row are dropped)
values_from_trip = values_from_trip.merge(
    all_legs_urban[["legid", "start_class"]], on="legid"
).drop_duplicates()

values_from_trip = values_from_trip[values_from_trip.valueFromTrip != "Unknown"]
# remove rows without a transport category
values_from_trip = values_from_trip[values_from_trip.transp_category.notna()]


#### purposes -> trip_objs_grouped.pkl ####
trip_objs = pd.read_pickle(input_path + "trip_objs_grouped.pkl")

# add purpose to values_from_trip
values_from_trip = values_from_trip.merge(
    trip_objs[["tripid", "objective_str"]], on="tripid"
)


# NOTE: "Legs" is the row count of values_from_trip, not a count of distinct legid
print("Legs:", values_from_trip.shape[0])
print("Trips: ", len(values_from_trip.tripid.unique()))
print("Users:", len(values_from_trip.userid.unique()))
print()

values_from_trip.head()

<a id='1' ></a>
## 1.

In [None]:
def create_plot(variable, modality, img_path, img_title, title):
    """Plot the mean ww-element value per ww rating, one panel per transport category.

    The rows of the global ``values_from_trip`` with ``variable == modality``
    are selected. For every transport category found in that selection a
    grouped bar chart is drawn: x = ``wastedTime``, y = mean of ``value``,
    hue = ``valueFromTrip``. The figure is saved to ``img_path + img_title``.

    NOTE: a later cell of this notebook defines another ``create_plot``
    (panels by purpose). Once that cell has run it replaces this function, so
    re-run this cell before re-running the section 1 cells.

    Input:
    - variable: the variable we want to filter (e.g. gender)
    - modality: the modality for which we want to filter (e.g. Male)
    - img_path: output directory prefix, concatenated as-is with img_title;
      the directory is created when it does not exist
    - img_title: file name of the image
      (e.g. 'tc_by_'+variable+'_'+modality+'.png' --> tc_by_gender_male.png)
    - title: text used as the figure suptitle
    """

    values_from_trip_filtered = values_from_trip[values_from_trip[variable] == modality]
    tc_lst = list(values_from_trip_filtered.transp_category.unique())

    ncols = 3
    # Keep the original 2x3 layout as a minimum, but add rows (ceil division)
    # so that more than six categories no longer index past the axes array.
    nrows = max(2, -(-len(tc_lst) // ncols))
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(18, 4 * nrows), sharey=True
    )
    axes = axes.ravel()

    for i, (ax, tc) in enumerate(zip(axes, tc_lst)):
        tmp = (
            values_from_trip_filtered[values_from_trip_filtered.transp_category == tc]
            .groupby(["wastedTime", "valueFromTrip"])["value"]
            .mean()
            .reset_index(name="average")
        )
        sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
        if i == 0:
            # a single legend, on the first panel, serves the whole figure
            ax.legend(fontsize="x-small")
        elif ax.get_legend() is not None:
            ax.get_legend().remove()
        ax.set_ylabel("average", fontsize=13)
        ax.set_xlabel("wastedTime", fontsize=13)
        ax.tick_params(labelsize=12)
        ax.set_title(tc, fontsize=15)

    # hide the grid cells that received no category
    for ax in axes[len(tc_lst):]:
        ax.set_visible(False)

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top=0.75)

    out_file = img_path + img_title
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # "bbox_to_anchor" is a legend option, not a savefig one, so it is dropped
    fig.savefig(out_file, bbox_inches="tight")

In [None]:
### GENDER
# One figure per gender, saved as tc_by_gender_<gender in lower case>.png.

img_path = "../../2019-12-16.out/WI_results/tc_gender/"

for gender in ("Male", "Female"):
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By transport category and by gender " + gender,
        ]
    )
    create_plot(
        "gender", gender, img_path, "tc_by_gender_" + gender.lower() + ".png", title
    )

In [None]:
### PURPOSE
# One figure per distinct value of objective_str.

img_path = "../../2019-12-16.out/WI_results/tc_purpose/"
purpose_lst = values_from_trip.objective_str.unique()

for purpose in purpose_lst:
    img_title = "tc_by_purpose_" + purpose + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By transport category and by purpose " + purpose,
        ]
    )
    create_plot("objective_str", purpose, img_path, img_title, title)

In [None]:
### COUNTRY
# One figure per distinct value of onCampaigns.

img_path = "../../2019-12-16.out/WI_results/tc_country/"
country_lst = values_from_trip.onCampaigns.unique()

for country in country_lst:
    img_title = "tc_by_country_" + country + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By transport category and by country " + country,
        ]
    )
    create_plot("onCampaigns", country, img_path, img_title, title)

In [None]:
### AGE RANGE
# One figure per distinct value of the age column.

img_path = "../../2019-12-16.out/WI_results/tc_age/"
age_lst = values_from_trip.age.unique()

for age_range in age_lst:
    img_title = "tc_by_age_" + age_range + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By transport category and by age range " + age_range,
        ]
    )
    create_plot("age", age_range, img_path, img_title, title)

In [None]:
### WEEKEND - WORKING DAYS
# One figure per distinct value of we_vs_wd; each value is echoed as it is processed.

img_path = "../../2019-12-16.out/WI_results/tc_work_week/"
work_week_lst = values_from_trip.we_vs_wd.unique()

for day_type in work_week_lst:
    img_title = "tc_by_we_wd_" + day_type + ".png"
    print(day_type)
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By transport category and by " + day_type,
        ]
    )
    create_plot("we_vs_wd", day_type, img_path, img_title, title)

In [None]:
### URBAN CLASS
# One figure per distinct value of start_class; each value is echoed as it is processed.

img_path = "../../2019-12-16.out/WI_results/tc_urban_class/"
urban_lst = values_from_trip.start_class.unique()

for i in urban_lst:
    # "_" added before the class name: the prefix used to run straight into it,
    # unlike every other file name built in this notebook.
    img_title = "tc_by_urban_class_" + i + ".png"
    print(i)
    title = (
        "Correlation between worthwhileness value and worthwhileness elements."
        + "\n"
        + "By transport category and by class "
        + i
    )
    create_plot("start_class", i, img_path, img_title, title)

<a id='2' ></a>
## 2.


In [None]:
def create_plot(variable, modality, img_path, img_title, title):
    """Plot the mean ww-element value per ww rating, one panel per trip purpose.

    The rows of the global ``values_from_trip`` with ``variable == modality``
    are selected. For every purpose (``objective_str``) found in that selection
    a grouped bar chart is drawn: x = ``wastedTime``, y = mean of ``value``,
    hue = ``valueFromTrip``. The figure is saved to ``img_path + img_title``.

    NOTE: this redefines the ``create_plot`` of section 1 (panels by transport
    category); after this cell has run, the section 1 cells call this version.

    Input:
    - variable: the variable we want to filter (e.g. gender)
    - modality: the modality for which we want to filter (e.g. Male)
    - img_path: output directory prefix, concatenated as-is with img_title;
      the directory is created when it does not exist
    - img_title: file name of the image
      (e.g. 'purpose_by_'+variable+'_'+modality+'.png' --> purpose_by_gender_male.png)
    - title: text used as the figure suptitle
    """

    values_from_trip_filtered = values_from_trip[values_from_trip[variable] == modality]
    purpose_lst = list(values_from_trip_filtered.objective_str.unique())

    ncols = 3
    # Keep the original 2x3 layout as a minimum, but add rows (ceil division)
    # so that more than six purposes no longer index past the axes array.
    nrows = max(2, -(-len(purpose_lst) // ncols))
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(18, 4 * nrows), sharey=True
    )
    axes = axes.ravel()

    for i, (ax, purpose) in enumerate(zip(axes, purpose_lst)):
        tmp = (
            values_from_trip_filtered[values_from_trip_filtered.objective_str == purpose]
            .groupby(["wastedTime", "valueFromTrip"])["value"]
            .mean()
            .reset_index(name="average")
        )
        sns.barplot(x="wastedTime", y="average", hue="valueFromTrip", data=tmp, ax=ax)
        if i == 0:
            # a single legend, on the first panel, serves the whole figure
            ax.legend(fontsize="x-small")
        elif ax.get_legend() is not None:
            ax.get_legend().remove()
        ax.set_ylabel("average", fontsize=13)
        ax.set_xlabel("wastedTime", fontsize=13)
        ax.tick_params(labelsize=12)
        ax.set_title(purpose, fontsize=15)

    # hide the grid cells that received no purpose
    for ax in axes[len(purpose_lst):]:
        ax.set_visible(False)

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top=0.75)

    out_file = img_path + img_title
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # "bbox_to_anchor" is a legend option, not a savefig one, so it is dropped
    fig.savefig(out_file, bbox_inches="tight")

In [None]:
### GENDER
# One figure per gender, saved as purpose_by_gender_<gender in lower case>.png.

img_path = "../../2019-12-16.out/WI_results/purpose_gender/"

for gender in ("Male", "Female"):
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By purpose and by gender " + gender,
        ]
    )
    create_plot(
        "gender",
        gender,
        img_path,
        "purpose_by_gender_" + gender.lower() + ".png",
        title,
    )

In [None]:
### TC
# One figure per distinct transport category.

img_path = "../../2019-12-16.out/WI_results/purpose_tc/"

tc_lst = values_from_trip.transp_category.unique()

for category in tc_lst:
    img_title = "purpose_by_tc_" + category + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By purpose and by transport category " + category,
        ]
    )
    create_plot("transp_category", category, img_path, img_title, title)

In [None]:
### AGE
# One figure per distinct value of the age column.

img_path = "../../2019-12-16.out/WI_results/purpose_age/"

age_lst = values_from_trip.age.unique()

for age_range in age_lst:
    img_title = "purpose_by_age_" + age_range + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By purpose and by age range " + age_range,
        ]
    )
    create_plot("age", age_range, img_path, img_title, title)

In [None]:
### COUNTRY
# One figure per distinct value of onCampaigns.

img_path = "../../2019-12-16.out/WI_results/purpose_country/"

country_lst = values_from_trip.onCampaigns.unique()

for country in country_lst:
    img_title = "purpose_by_country_" + country + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By purpose and by country " + country,
        ]
    )
    create_plot("onCampaigns", country, img_path, img_title, title)

In [None]:
### WEEKDAY - WEEKEND
# One figure per distinct value of we_vs_wd.

img_path = "../../2019-12-16.out/WI_results/purpose_work_week/"

work_week_lst = values_from_trip.we_vs_wd.unique()

for day_type in work_week_lst:
    img_title = "purpose_by_work_week_" + day_type + ".png"
    title = "\n".join(
        [
            "Correlation between worthwhileness value and worthwhileness elements.",
            "By purpose and by " + day_type,
        ]
    )
    create_plot("we_vs_wd", day_type, img_path, img_title, title)

In [None]:
### URBAN CLASS
# One figure per distinct value of start_class.

img_path = "../../2019-12-16.out/WI_results/purpose_urban_class/"

urban_lst = values_from_trip.start_class.unique()

for i in urban_lst:
    img_title = "purpose_by_class_" + i + ".png"
    # trailing space added after "class": the title used to read "by class<name>"
    title = (
        "Correlation between worthwhileness value and worthwhileness elements."
        + "\n"
        + "By purpose and by class "
        + i
    )
    create_plot("start_class", i, img_path, img_title, title)

<a id='3' ></a>
## 3.

**WORTHWHILENESS RATING**

In [None]:
# Drop the ww-element columns and de-duplicate the remaining rows.
vv = values_from_trip.drop(columns=["valueFromTrip", "value", "code"])
vv = vv.drop_duplicates()
vv.head()

In [None]:
def create_plot_ww_rate(variable, modality, img_path, img_title, title):
    """Plot the number of rows per ww rating and transport category, in two panels.

    The rows of the global ``vv`` with ``variable == modality`` are selected
    and counted per (``transp_category``, ``wastedTime``). The left panel shows
    "walking" and "cycling_emerging_micromobility", the right panel every other
    transport category. The figure is saved to ``img_path + img_title``.

    - variable: the variable we want to filter (e.g. gender)
    - modality: the modality for which we want to filter (e.g. Male)
    - img_path: output directory prefix, concatenated as-is with img_title;
      the directory is created when it does not exist
    - img_title: file name of the image
      (e.g. 'ww_rate_tc_by_'+variable+'_'+modality+'.png' --> ww_rate_tc_by_gender_male.png)
    - title: text used as the figure suptitle
    """
    vv_filtered = vv[vv[variable] == modality]

    # NOTE(review): presumably these two are split off because their counts
    # are on a different scale from the other categories - confirm.
    first_panel_modes = ["walking", "cycling_emerging_micromobility"]
    in_first_panel = vv_filtered.transp_category.isin(first_panel_modes)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
    axes = axes.ravel()

    panels = (vv_filtered[in_first_panel], vv_filtered[~in_first_panel])
    for ax, subset in zip(axes, panels):
        tmp = (
            subset.groupby(["transp_category", "wastedTime"])
            .size()
            .reset_index(name="count")
        )
        # A selection may contain no leg for one of the two groups; leave that
        # panel empty instead of handing an empty table to seaborn.
        if not tmp.empty:
            sns.barplot(
                x="wastedTime", y="count", hue="transp_category", data=tmp, ax=ax
            )
            ax.legend(loc="upper left", fontsize="small")
        ax.set_xlabel("worthwhileness ratings")

    fig.suptitle(title)
    fig.tight_layout()
    fig.subplots_adjust(top=0.90)

    out_file = img_path + img_title
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # "bbox_to_anchor" is a legend option, not a savefig one, so it is dropped
    fig.savefig(out_file, bbox_inches="tight")

In [None]:
### GENDER
# One ww-rating figure per gender.

img_path = "../../2019-12-16.out/WI_results/ww_rate_gender/"
gender_lst = ["Male", "Female"]

for gender in gender_lst:
    create_plot_ww_rate(
        "gender",
        gender,
        img_path,
        "ww_rate_tc_by_gender_" + gender + ".png",
        "Worthwhileness ratings for " + gender,
    )

In [None]:
### PURPOSE
# One ww-rating figure per distinct value of objective_str.

img_path = "../../2019-12-16.out/WI_results/ww_rate_purpose/"
purpose_lst = values_from_trip.objective_str.unique()

for purpose in purpose_lst:
    create_plot_ww_rate(
        "objective_str",
        purpose,
        img_path,
        "ww_rate_tc_by_purpose_" + purpose + ".png",
        "Worthwhileness ratings for purpose " + purpose,
    )

In [None]:
### COUNTRY
# One ww-rating figure per distinct value of onCampaigns.

img_path = "../../2019-12-16.out/WI_results/ww_rate_country/"
country_lst = values_from_trip.onCampaigns.unique()

for country in country_lst:
    create_plot_ww_rate(
        "onCampaigns",
        country,
        img_path,
        "ww_rate_tc_by_country_" + country + ".png",
        "Worthwhileness ratings for country " + country,
    )

In [None]:
### AGE
# One ww-rating figure per distinct value of the age column.

img_path = "../../2019-12-16.out/WI_results/ww_rate_age/"
age_lst = values_from_trip.age.unique()

for age_range in age_lst:
    create_plot_ww_rate(
        "age",
        age_range,
        img_path,
        "ww_rate_tc_by_age_" + age_range + ".png",
        "Worthwhileness ratings for age range " + age_range,
    )

**WORTHWHILENESS ELEMENTS**

In [None]:
def create_plot_ww_elem(variable, modality, img_path, img_title, title):
    """Plot the count of each ww-element value, one panel per transport category.

    The rows of the global ``values_from_trip`` with ``variable == modality``
    are selected. For every transport category found in that selection the
    rows are counted per (``valueFromTrip``, ``value``) and drawn as grouped
    bars: x = element, hue = value. The figure is saved to
    ``img_path + img_title``.

    - variable: the variable we want to filter (e.g. gender)
    - modality: the modality for which we want to filter (e.g. Male)
    - img_path: output directory prefix, concatenated as-is with img_title;
      the directory is created when it does not exist
    - img_title: file name of the image
      (e.g. 'ww_elem_tc_by_'+variable+'_'+modality+'.png' --> ww_elem_tc_by_gender_male.png)
    - title: text used as the figure suptitle
    """
    values_from_trip_filtered = values_from_trip[values_from_trip[variable] == modality]
    tc_lst = list(values_from_trip_filtered.transp_category.unique())

    # Short tick labels. The original hard-coded ["E", "F", "Pw", "Pt"] by
    # position, which mislabels the bars whenever a category lacks an element.
    # NOTE(review): the name-to-abbreviation pairing is inferred from the
    # alphabetical order of the element names - confirm.
    abbreviations = {
        "Enjoyment": "E",
        "Fitness": "F",
        "Paid_work": "Pw",
        "Personal_tasks": "Pt",
    }

    ncols = 3
    # Keep the original 2x3 layout as a minimum, but add rows (ceil division)
    # so that more than six categories no longer index past the axes array.
    nrows = max(2, -(-len(tc_lst) // ncols))
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 4.5 * nrows))
    axes = axes.ravel()

    for ax, tc in zip(axes, tc_lst):
        tmp0 = values_from_trip_filtered[values_from_trip_filtered.transp_category == tc]
        tmp = tmp0.groupby(["valueFromTrip", "value"]).size().reset_index(name="count")
        # fix the bar order explicitly so the tick labels below match the bars
        elements = sorted(tmp["valueFromTrip"].unique())
        sns.barplot(
            x="valueFromTrip",
            y="count",
            hue="value",
            data=tmp,
            order=elements,
            ax=ax,
        )
        ax.set_xlabel("")
        ax.set_title(tc, fontsize=14)
        ax.set_xticks(range(len(elements)))
        ax.set_xticklabels([abbreviations.get(name, name) for name in elements])
        ax.legend(fontsize="small")

    # hide the grid cells that received no category
    for ax in axes[len(tc_lst):]:
        ax.set_visible(False)

    fig.suptitle(title, fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)

    out_file = img_path + img_title
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # "bbox_to_anchor" is a legend option, not a savefig one, so it is dropped
    fig.savefig(out_file, bbox_inches="tight")

In [None]:
### GENDER
# One ww-elements figure per gender.

img_path = "../../2019-12-16.out/WI_results/ww_elem_gender/"
gender_lst = ["Male", "Female"]

for gender in gender_lst:
    create_plot_ww_elem(
        "gender",
        gender,
        img_path,
        "ww_elem_tc_by_gender_" + gender + ".png",
        "Worthwhileness elements for gender " + gender,
    )

In [None]:
### PURPOSE
# One ww-elements figure per distinct value of objective_str.

img_path = "../../2019-12-16.out/WI_results/ww_elem_purpose/"
purpose_lst = values_from_trip.objective_str.unique()

for purpose in purpose_lst:
    create_plot_ww_elem(
        "objective_str",
        purpose,
        img_path,
        "ww_elem_tc_by_purpose_" + purpose + ".png",
        "Worthwhileness elements for purpose " + purpose,
    )

In [None]:
### COUNTRY
# One ww-elements figure per distinct value of onCampaigns.

img_path = "../../2019-12-16.out/WI_results/ww_elem_country/"
country_lst = values_from_trip.onCampaigns.unique()

for country in country_lst:
    create_plot_ww_elem(
        "onCampaigns",
        country,
        img_path,
        "ww_elem_tc_by_country_" + country + ".png",
        "Worthwhileness elements for country " + country,
    )

In [None]:
### AGE
# One ww-elements figure per distinct value of the age column.

img_path = "../../2019-12-16.out/WI_results/ww_elem_age/"
age_lst = values_from_trip.age.unique()

for i in age_lst:
    img_title = "ww_elem_tc_by_age_" + i + ".png"
    # Fixed copy-paste from the COUNTRY cell: this used to filter "onCampaigns"
    # by an age value (matching no rows) and titled the figure "country".
    title = "Worthwhileness elements for age range " + i
    create_plot_ww_elem("age", i, img_path, img_title, title)

<a id='4' ></a>
## 4.

- ww rating distribution
- transport categories distribution
- ww elements distribution (v1/v2)
- transport category vs ww ratings
- ww elements vs ww ratings
- ww elements vs transport categories

In [None]:
table_path = "../../2019-12-16.out/WI_results/tables/"

# Restrict all_legs to the legs that also appear in values_from_trip, so that
# the tables built from either frame cover the same legs.
leg_lst = values_from_trip["legid"].unique()
all_legs2 = all_legs.loc[all_legs["legid"].isin(leg_lst)]

# shape of the restricted frame, then the number of distinct leg ids
print(all_legs2.shape, len(leg_lst), sep="\n")

In [None]:
### 1. worthwhileness rating distribution
# number of legs per wastedTime value
tmp = all_legs2.groupby("wastedTime").size().rename("nlegs").reset_index()
tmp.to_csv(table_path + "ww_ratings.csv", index=False)  # save
tmp

In [None]:
### 2. transport categories distribution
# number of legs per transport category
tmp = all_legs2.groupby("transp_category").size().rename("nlegs").reset_index()
tmp.to_csv(table_path + "transp_category.csv", index=False)  # save
tmp

In [None]:
### 3. ww elements - V1

# Wide table: one row per leg, one column per ww element.
tmp = values_from_trip[["legid", "value", "valueFromTrip"]].drop_duplicates()
values_from_trip_pivot = tmp.pivot(
    index="legid", columns="valueFromTrip", values="value"
).reset_index()

# add transport category, userid and ww rating
leg_info = all_legs[["legid", "userid", "transp_category", "wastedTime"]]
values_from_trip_pivot = values_from_trip_pivot.merge(
    leg_info, on="legid"
).drop_duplicates()

# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
work_cols = ["Paid_work", "Personal_tasks"]
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[work_cols].max(axis=1)

values_from_trip_pivot = values_from_trip_pivot.drop(columns=work_cols)

In [None]:
# Back to long format: one row per (leg, element), with the merged Productivity.
values_from_trip_melt = pd.melt(
    values_from_trip_pivot,
    id_vars=["legid", "transp_category", "userid", "wastedTime"],
    value_vars=["Enjoyment", "Productivity", "Fitness"],
)
# element x value counts, with row and column totals
tmp = pd.crosstab(
    index=values_from_trip_melt["variable"],
    columns=values_from_trip_melt["value"],
    margins=True,
)
tmp.to_csv(table_path + "ww_elements_table.csv")  # save
tmp

In [None]:
### 4. ww elements - V2

# number of legs for each (Enjoyment, Fitness, Productivity) combination
element_cols = ["Enjoyment", "Fitness", "Productivity"]
comb = values_from_trip_pivot.groupby(element_cols).size().rename("nlegs").reset_index()
comb.to_csv(table_path + "ww_elements_combinations.csv", index=False)  # save
comb

In [None]:
### 5. tc vs ww rate
# transport category x ww rating counts, with row and column totals
tmp = pd.crosstab(
    index=all_legs2["transp_category"],
    columns=all_legs2["wastedTime"],
    margins=True,
)
tmp.to_csv(table_path + "tc_vs_ww_rate.csv")  # save
tmp

In [None]:
### 6. ww elements vs transport category - V1

# rows: transport category; columns: (element, value) pairs
element_and_value = [values_from_trip_melt["variable"], values_from_trip_melt["value"]]
tmp = pd.crosstab(index=values_from_trip_melt["transp_category"], columns=element_and_value)
tmp.to_csv(table_path + "tc_vs_ww_elements_table.csv")
tmp

In [None]:
### 7. ww elements vs transport category - V2 combinations
# legs per (element combination, transport category) ...
combo_cols = ["Enjoyment", "Productivity", "Fitness"]
tmp0 = (
    values_from_trip_pivot.groupby(combo_cols + ["transp_category"])
    .size()
    .rename("nlegs")
    .reset_index()
)
# ... spread into one column per transport category; absent pairs become 0
tmp = tmp0.pivot_table(
    index=combo_cols, columns="transp_category", values="nlegs"
).fillna(0)
tmp.to_csv(table_path + "tc_vs_ww_elem_combinations.csv")  # save
tmp

In [None]:
### 8. ww elements vs ww rate - V1
# rows: ww rating; columns: (element, value) pairs
element_and_value = [values_from_trip_melt["variable"], values_from_trip_melt["value"]]
tmp = pd.crosstab(index=values_from_trip_melt["wastedTime"], columns=element_and_value)
tmp.to_csv(table_path + "ww_rate_vs_ww_elements_table.csv")
tmp

In [None]:
### 9. ww elements vs ww rate - V2 combinations
# legs per (element combination, ww rating) ...
combo_cols = ["Enjoyment", "Productivity", "Fitness"]
tmp0 = (
    values_from_trip_pivot.groupby(combo_cols + ["wastedTime"])
    .size()
    .rename("nlegs")
    .reset_index()
)
# ... spread into one column per ww rating; absent pairs become 0
tmp = tmp0.pivot_table(
    index=combo_cols, columns="wastedTime", values="nlegs"
).fillna(0)
tmp.to_csv(table_path + "ww_rate_vs_ww_elements_combinations.csv")  # save
tmp