# Stats


## Questions

- [Q1](#Q1): Total number of trip legs by gender and distribution of worthwhileness ratings
- [Q2](#Q2): Gender distribution by country (needed for data interpretation later on)
- [Q3](#Q3): Modal split for all transport modes in a pie chart (not categories) – count of all trip legs
- [Q4](#Q4): Modal split for all transport mode categories – count of all trip legs – by country
- [Q5](#Q5): Total travel time per mode
- [Q6](#Q6): Total distance per mode
- [Q7](#Q7): Age distribution by country: number of users and trips


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

%matplotlib inline

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the page so wide tables and figures fit.
display(HTML("<style>.container { width:95% !important; }</style>"))

**READ DATA**

In [None]:
# --- Input / output locations (relative to this notebook) ---
meta_data_path = "../../data-campaigns/meta-data/"

input_path = "../../2019-12-16.out/"
legs = "all_legs_merged_no_outlier_0.01.pkl"  # file name of the pickled legs table
out_path = "../../2019-12-16.out/D5.2/"
img_path = "../../2019-12-16.out/D5.2/"

# --- Plot defaults applied to the figures created after this cell ---
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directory; if it is already there, say so on stderr
# and carry on.
target_dir = os.path.abspath(out_path)
try:
    os.makedirs(target_dir)
except FileExistsError:
    message = "Directory '{}' already exists".format(out_path)
    print(message, file=sys.stderr)

In [None]:
# Load the pickled tables from input_path.
all_legs = pd.read_pickle(input_path + legs)
trips_users_df = pd.read_pickle(input_path + "trips_users_df.pkl")
trips_df = pd.read_pickle(input_path + "trips_df.pkl")
## select only trips in all_legs
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]

# Transport categories: each key of the JSON object maps to a collection of
# transport modes.
with open(input_path + "category_transp_mode_dict.json", "r") as f:
    category_transp_mode_dict = json.load(f)

# Reverse lookup: transport mode -> category key.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

#### remove "unknown" as transport category (?)

# Overall size of the dataset.
print("Legs:", len(all_legs))
print("Trips: ", len(all_legs["tripid"].unique()))
print("Users:", len(all_legs["userid"].unique()))
print()

## Split the legs by the value of the gender column and report the same
## three counts for each subset.
all_legs_M = all_legs.loc[all_legs["gender"] == "Male"]
print("Legs of male users:", len(all_legs_M))
print("Trips of male users:", all_legs_M["tripid"].nunique())
print("Male users:", len(all_legs_M["userid"].unique()))
print()

all_legs_F = all_legs.loc[all_legs["gender"] == "Female"]
print("Legs of female users:", len(all_legs_F))
print("Trips of female users:", all_legs_F["tripid"].nunique())
print("Female users:", len(all_legs_F["userid"].unique()))
print()

all_legs_O = all_legs.loc[all_legs["gender"] == "Other"]
print("Legs of other users:", len(all_legs_O))
print("Trips of other users:", all_legs_O["tripid"].nunique())
print("Other users:", len(all_legs_O["userid"].unique()))

In [None]:
# Distinct values of the age column (defined for the plots).
age_range = list(all_legs["age"].unique())


def _che_to_other(campaign):
    """Map the campaign code 'CHE' to the class Other ('AAA'); keep the rest."""
    return "AAA" if campaign == "CHE" else campaign


all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(_che_to_other)

# Distinct campaign codes after the recoding.
top10 = list(all_legs["onCampaigns"].unique())

<a id='Q1' ></a>
### Q1: Total number of trip legs by gender and distribution of worthwhileness ratings

In [None]:
# histogram of wastedTime
def wt_histo(data, color, users, filepath):
    """Plot a histogram of the ``wastedTime`` column of ``data`` and save it.

    Args:
        data: DataFrame with a numeric ``wastedTime`` column.
        color: Colour passed to the histogram bars.
        users: Name of the user group; inserted into the plot title.
        filepath: Destination passed to ``plt.savefig``.
    """
    # Unit-width bins centred on the integers 1..max: edges at 0.5, 1.5, ...
    bins = np.arange(1, data.wastedTime.max() + 1.5) - 0.5

    data.hist(column="wastedTime", bins=bins, color=color)
    plt.title("Distribution of Worthwhileness rating by {} users".format(users))
    plt.ylabel("Number of legs")
    plt.xlabel("Worthwhileness Rating")
    plt.tight_layout()

    plt.savefig(filepath)

In [None]:
# Keep only the legs whose rating is greater than 0 and at most 5.
all_legs_tmp = all_legs.loc[
    (all_legs["wastedTime"] > 0) & (all_legs["wastedTime"] <= 5)
].copy()
# Convert the ratings to integers (int() truncates the fractional part).
all_legs_tmp["wastedTime"] = all_legs_tmp["wastedTime"].apply(int)

# Histogram over all users.
filepath = img_path + "D5.2_start_dist_worthwhileness_rating_all.png"
wt_histo(all_legs_tmp, "green", "all", filepath)

In [None]:
# --- Male users ---
# Keep ratings greater than 0 and at most 5, then convert them to integers
# (int() truncates the fractional part).
all_legs_M_tmp = all_legs_M.loc[
    (all_legs_M["wastedTime"] > 0) & (all_legs_M["wastedTime"] <= 5)
].copy()
all_legs_M_tmp["wastedTime"] = all_legs_M_tmp["wastedTime"].apply(int)

filepath = img_path + "D5.2_start_dist_worthwhileness_rating_male.png"
wt_histo(all_legs_M_tmp, "blue", "male", filepath)

# --- Female users ---
# Same filter and conversion as above.
all_legs_F_tmp = all_legs_F.loc[
    (all_legs_F["wastedTime"] > 0) & (all_legs_F["wastedTime"] <= 5)
].copy()
all_legs_F_tmp["wastedTime"] = all_legs_F_tmp["wastedTime"].apply(int)

filepath = img_path + "D5.2_start_dist_worthwhileness_rating_female.png"
wt_histo(all_legs_F_tmp, "red", "female", filepath)

In [None]:
# Display the filtered legs table for inspection.
all_legs_tmp

In [None]:
# Grouped histogram of the worthwhileness ratings: all, male and female users
# side by side for every rating value.
fig, ax = plt.subplots(nrows=1, ncols=1)

# Unit-width bins centred on the integer ratings.
bins = np.arange(1, all_legs_tmp.wastedTime.max() + 1.5) - 0.5
colors = ["green", "blue", "red"]
labels = ["All", "Male", "Female"]

ax.hist(
    [all_legs_tmp.wastedTime, all_legs_M_tmp.wastedTime, all_legs_F_tmp.wastedTime],
    bins,
    histtype="bar",
    color=colors,
    label=labels,
)
ax.legend(prop={"size": 10})
ax.set_title("Distribution of worthwhileness ratings by gender")
# Each row of the plotted tables is a leg, so the bars count legs.
plt.ylabel("Number of legs")
plt.xlabel("Worthwhileness Rating")

plt.tight_layout()
filepath = img_path + "D5.2_start_dist_worthwhileness_rating_multibar.png"
plt.savefig(filepath)

In [None]:
# List the columns of the trips/users table.
trips_users_df.columns

In [None]:
# Keep only the trips whose overallScore is greater than 0 and at most 5.
all_trips_tmp = trips_df[
    (trips_df["overallScore"] > 0) & (trips_df["overallScore"] <= 5)
].copy()
# Convert the scores to integers (int() truncates the fractional part).
all_trips_tmp["overallScore"] = all_trips_tmp["overallScore"].apply(int)

In [None]:
# Attach the user of every rated trip, keeping only the columns needed below.
all_trips_users_tmp = pd.merge(all_trips_tmp, trips_users_df, on="tripid")[
    ["tripid", "userid", "overallScore"]
]

# Distinct (userid, gender) pairs taken from the legs table.
all_legs_gender = all_legs.loc[:, ["userid", "gender"]].drop_duplicates().reset_index()

# Add the gender to every rated trip.
all_trips_users_tmp = pd.merge(all_trips_users_tmp, all_legs_gender, on="userid")

# Per-gender subsets used by the grouped histogram.
all_trips_users_M = all_trips_users_tmp[all_trips_users_tmp["gender"] == "Male"]
all_trips_users_F = all_trips_users_tmp[all_trips_users_tmp["gender"] == "Female"]

In [None]:
# Grouped histogram of the trip-level mood ratings: all, male and female users.
fig, ax = plt.subplots(nrows=1, ncols=1)

# Unit-width bins centred on the integer scores.
bins = np.arange(1, all_trips_users_tmp["overallScore"].max() + 1.5) - 0.5
colors = ["green", "blue", "red"]
labels = ["All", "Male", "Female"]

ax.hist(
    [
        all_trips_users_tmp["overallScore"],
        all_trips_users_M["overallScore"],
        all_trips_users_F["overallScore"],
    ],
    bins,
    histtype="bar",
    color=colors,
    label=labels,
)
ax.legend(prop={"size": 10})
ax.set_title("Distribution of mood ratings by gender")
ax.set_ylabel("Number of trips")
ax.set_xlabel("Mood Rating")

fig.tight_layout()
filepath = img_path + "D5.2_start_dist_mood_rating_multibar.png"
fig.savefig(filepath)

<a id='Q2' ></a>
### Q2: Gender distribution by country (needed for data interpretation later on)

In [None]:
# Number of legs per campaign country for male and female users.
# NOTE: the count column is named "nusers", but here it holds the number of
# rows (legs) per country, as produced by groupby(...).size().
all_legs_M_country = (
    all_legs_M[["userid", "onCampaigns"]].groupby("onCampaigns").size().reset_index()
)
all_legs_M_country.columns = ["campaign_country", "nusers"]

all_legs_F_country = (
    all_legs_F[["userid", "onCampaigns"]].groupby("onCampaigns").size().reset_index()
)
all_legs_F_country.columns = ["campaign_country", "nusers"]

# Combine the two tables by country rather than by row position, so that a
# country present for only one gender cannot shift the other gender's counts
# onto the wrong label. Countries missing for a gender get a count of 0.
df = (
    pd.DataFrame(
        {
            "male": all_legs_M_country.set_index("campaign_country")["nusers"],
            "female": all_legs_F_country.set_index("campaign_country")["nusers"],
        }
    )
    .fillna(0)
    .rename_axis("countries")
    .reset_index()
)
ax = df.plot.bar(x="countries", rot=45)

plt.tight_layout()
filepath = img_path + "D5.2_start_dist_legs_by_country_gender.png"
plt.savefig(filepath)

In [None]:
# Display the per-country counts of the male subset, indexed by country.
all_legs_M_country.set_index("campaign_country")

In [None]:
# Display the per-country counts of the female subset, indexed by country.
all_legs_F_country.set_index("campaign_country")

In [None]:
# Number of rows (legs) per campaign country over the whole dataset.
# The count column keeps the name "nusers" although it counts legs.
all_legs_country = (
    all_legs.groupby("onCampaigns")
    .size()
    .reset_index(name="nusers")
    .rename(columns={"onCampaigns": "campaign_country"})
)
# Display the table indexed by country.
all_legs_country.set_index("campaign_country")

In [None]:
# Number of distinct users per campaign country for male and female users.
all_legs_M_country = (
    all_legs_M[["userid", "onCampaigns"]]
    .groupby("onCampaigns")["userid"]
    .nunique()
    .reset_index()
)
all_legs_M_country.columns = ["campaign_country", "nusers"]

all_legs_F_country = (
    all_legs_F[["userid", "onCampaigns"]]
    .groupby("onCampaigns")["userid"]
    .nunique()
    .reset_index()
)
all_legs_F_country.columns = ["campaign_country", "nusers"]

# Combine the two tables by country rather than by row position, so that a
# country present for only one gender cannot shift the other gender's counts
# onto the wrong label. Countries missing for a gender get a count of 0.
df = (
    pd.DataFrame(
        {
            "male": all_legs_M_country.set_index("campaign_country")["nusers"],
            "female": all_legs_F_country.set_index("campaign_country")["nusers"],
        }
    )
    .fillna(0)
    .rename_axis("countries")
    .reset_index()
)
ax = df.plot.bar(x="countries", rot=45)

plt.tight_layout()
filepath = img_path + "D5.2_start_dist_users_by_country_gender.png"
plt.savefig(filepath)

<a id='Q3' ></a>
### Q3: Modal split for all transport modes in a pie chart (not categories) – count of all trip legs

In [None]:
# Number of legs per (corrected) transport mode.
tm_legs = (
    all_legs.groupby("correctedModeOfTransport_str")
    .size()
    .reset_index(name="nlegs")
    .rename(columns={"correctedModeOfTransport_str": "transport_mode"})
)

# Leave out the "unknown" mode, then keep a mode -> count lookup.
tm_legs = tm_legs[tm_legs["transport_mode"] != "unknown"]
tm_dict = dict(zip(tm_legs["transport_mode"], tm_legs["nlegs"]))

# The ten modes with the most legs.
tm_legs_top = tm_legs.sort_values(by="nlegs", ascending=False).head(10)

In [None]:
from collections import defaultdict

# Names of the top transport modes.
tm_top = tm_legs_top["transport_mode"].tolist()

# Keep the top modes as they are and accumulate every other mode under "other".
new_tm_dict = defaultdict(int)
for tm, nlegs in tm_dict.items():
    bucket = tm if tm in tm_top else "other"
    new_tm_dict[bucket] += nlegs

# Same entries, ordered by ascending number of legs.
sorted_tm_dict = dict(sorted(new_tm_dict.items(), key=lambda item: item[1]))

In [None]:
# Transport-mode identifiers as they appear in the data ...
old_keys = [
    "motorcycle",
    "tram",
    "electricBike",
    "subway",
    "other",
    "train",
    "carPassenger",
    "bus",
    "bicycle",
    "carDriver",
    "walking",
]

# ... and the labels shown in the pie chart, paired with old_keys by position.
new_keys = [
    "motorcycle",
    "tram",
    "electric bike",
    "subway",
    "other",
    "train",
    "car passenger",
    "bus",
    "bicycle",
    "car driver",
    "walking",
]

# Relabel the entries while keeping the ascending-by-count order of
# sorted_tm_dict. Modes without an entry in the mapping keep their name, and
# a listed mode that is absent from the data is simply ignored.
display_names = dict(zip(old_keys, new_keys))
sorted_tm_dict = {
    display_names.get(mode, mode): count for mode, count in sorted_tm_dict.items()
}

In [None]:
# Pie chart of the modal split (count of legs per transport mode).
# Palette from https://mycolor.space/?hex=%23845EC2&sub=1
colors = [
    "#2c73d2",
    "#845ec2",
    "#d83121",
    "#d65db1",
    "#ff6f91",
    "#c493ff",
    "#f3c5ff",
    "#ff9671",
    "#ffc75f",
    "#f9f871",
    "#4b4453",
][::-1]

# Pull the first five slices outwards by a decreasing offset (0.5 down to 0.1)
# and leave the remaining ones in place. The list is padded / cut to the
# actual number of slices, since pie() needs one offset per slice.
n_slices = len(sorted_tm_dict)
explode = (np.arange(0.1, 0.6, 0.1).tolist()[::-1] + [0] * n_slices)[:n_slices]

# Hand matplotlib plain lists rather than dict views.
plt.pie(
    list(sorted_tm_dict.values()),
    explode=explode,
    labels=list(sorted_tm_dict.keys()),
    colors=colors,
    autopct="%.1f%%",
    startangle=90,
)

plt.axis("equal")

filepath = img_path + "D5.2_mode_split_pie.png"
plt.savefig(filepath)

<a id='Q4' ></a>
### Q4: Modal split for all transport mode categories – count of all trip legs – by country

In [None]:
# Number of legs for every (campaign country, transport category) pair.
all_legs_country_tc = (
    all_legs.groupby(["onCampaigns", "transp_category"])
    .size()
    .reset_index(name="nlegs")
    .rename(columns={"onCampaigns": "campaign_country"})
)

all_legs_country_tc.head()

In [None]:
# Campaign countries and transport categories present in the legs table,
# in the order produced by groupby.
top10countries = all_legs.groupby("onCampaigns").size().index.tolist()
transport_categories = all_legs.groupby("transp_category").size().index.tolist()

# Short labels used on the x axis of the small per-country panels.
transport_categories_short = {
    "cycling_emerging_micromobility": "C",
    "private_motorized": "Pm",
    "public_transp_long_dist": "PTl",
    "public_transp_short_dist": "PTs",
    "walking": "W",
}

# Short label of every category, in the order of transport_categories.
tcs = list(map(transport_categories_short.__getitem__, transport_categories))

In [None]:
def plot_country(values_count, axid, title_str):
    """Draw the per-category counts of one country on ``axes[axid]``.

    Relies on the notebook-level names ``axes`` (flat collection of subplot
    axes), ``transport_categories`` and ``tcs`` (the matching short labels).

    Args:
        values_count: DataFrame with a ``transp_category`` and a ``count``
            column.
        axid: Index into ``axes`` of the subplot to draw on.
        title_str: Title of the subplot.
    """
    ax = axes[axid]

    # Fix the bar order to the full category list so that every panel has the
    # same slots; a category missing from values_count leaves an empty slot
    # instead of shifting the remaining bars under the wrong labels.
    sns.barplot(
        data=values_count,
        x="transp_category",
        y="count",
        order=transport_categories,
        ax=ax,
    )
    ax.set_xticks(range(len(tcs)))
    ax.set_xticklabels(tcs)
    for item in ax.get_xticklabels():
        item.set_rotation(45)

    ax.tick_params(labelsize=10)
    ax.set_title(title_str, fontsize=14)
    ax.set_xlabel("")
    ax.set_ylabel("")

In [None]:
# Spot check: number of legs per transport category for the "ITA" campaign.
tmp = all_legs.loc[all_legs.onCampaigns == "ITA"]
tmp.groupby("transp_category").size().reset_index(name="count")

In [None]:
### BY COUNTRY
# One panel per campaign country on a 2 x 5 grid, flattened so that the
# panels can be addressed by a single index.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 7))
axes = axes.ravel()

for axid, c in enumerate(top10countries):
    # Number of legs per transport category for this country.
    tmp = all_legs.loc[all_legs.onCampaigns == c]
    val_count = tmp.groupby("transp_category").size().reset_index(name="count")

    plot_country(val_count, axid=axid, title_str=c)

# No figure legend is drawn: the un-hued bar plots are not expected to carry
# legend labels, and loc="best" is not available for figure-level legends.

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig argument, so only
# bbox_inches is passed.
plt.savefig(img_path + "D5.2_stat_ct_country.png", bbox_inches="tight")

<a id='Q5' ></a>
### Q5: Total travel time per mode

In [None]:
# 1. wastedTime should be an integer between 1 and 5 (stars).
# Remove values outside this range and convert the rest to integers;
# non-integer values are a by-product of merging the legs.

# take only values in 1-5
all_legs_tmp = all_legs[
    (all_legs["wastedTime"] > 0) & (all_legs["wastedTime"] <= 5)
].copy()
# Convert the filtered column itself (int() truncates the fractional part).
# Values rejected by the filter, including missing ratings, which fail both
# comparisons, must not reach int().
all_legs_tmp["wastedTime"] = all_legs_tmp["wastedTime"].apply(lambda x: int(x))

print("useful legs:", len(all_legs_tmp))
# all_legs_tmp.groupby("wastedTime").size().reset_index(name="count")

In [None]:
# Per trip: sum of inferred_leg_duration_min over its legs (total_tt) and
# mean wastedTime over its legs (avg_wt).
trips_tt_wt = (
    all_legs_tmp.groupby("tripid", as_index=False)["inferred_leg_duration_min"]
    .sum()
    .rename(columns={"inferred_leg_duration_min": "total_tt"})
)  # total travel time
tmp_wt = (
    all_legs_tmp.groupby("tripid", as_index=False)["wastedTime"]
    .mean()
    .rename(columns={"wastedTime": "avg_wt"})
)  # average wasted time

# Join the two per-trip tables and keep trips with a positive total time.
trips_tt_wt = pd.merge(trips_tt_wt, tmp_wt)
trips_tt_wt = trips_tt_wt.loc[trips_tt_wt["total_tt"] > 0]
print("useful trips:", trips_tt_wt.shape)
trips_tt_wt.head()

In [None]:
# Histogram of the total travel time per trip.
#### TODO: plot the mean
# Thresholds separating short / medium / long trips: the 0.33 and 0.66
# quantiles of the total travel time.
dist_segs = trips_tt_wt["total_tt"].quantile([0.33, 0.66]).values
medium_threshold, long_threshold = dist_segs[0], dist_segs[1]
print("medium_threshold:", medium_threshold)
print("long_threshold:", long_threshold)
print()

# fig = plt.figure(figsize=(12,12))
# Bin edges every 2 units, starting at 0; their number follows from the
# long-trip threshold.
hist = trips_tt_wt.hist(
    column="total_tt", bins=list(range(0, 2 * math.ceil(long_threshold * 4), 2))
)
plt.title("Histogram of total travel time")
plt.ylabel("Number of trips")
plt.xlabel("Minutes")
plt.tight_layout()

plt.savefig(img_path + "D5.2_total_trip_travel_time.png")

<a id='Q6' ></a>
### Q6: Total distance per mode

In [None]:
# Per trip: sum of trueDistance over its legs (total_td) and mean wastedTime
# over its legs (avg_wt).
trips_td_wt = (
    all_legs_tmp.groupby("tripid", as_index=False)["trueDistance"]
    .sum()
    .rename(columns={"trueDistance": "total_td"})
)  # total travel distance
tmp_wt = (
    all_legs_tmp.groupby("tripid", as_index=False)["wastedTime"]
    .mean()
    .rename(columns={"wastedTime": "avg_wt"})
)  # average wasted time

# Join the two per-trip tables and keep trips with a positive total distance.
trips_td_wt = pd.merge(trips_td_wt, tmp_wt)
trips_td_wt = trips_td_wt.loc[trips_td_wt["total_td"] > 0]
print("useful trips:", trips_td_wt.shape)
trips_td_wt.head()

In [None]:
# Histogram of the total travel distance per trip.
#### TODO: plot the mean
# Thresholds separating short / medium / long trips: the 0.33 and 0.66
# quantiles of the total distance.
dist_segs = trips_td_wt["total_td"].quantile([0.33, 0.66]).values
medium_threshold, long_threshold = dist_segs[0], dist_segs[1]
print("medium_threshold:", medium_threshold)
print("long_threshold:", long_threshold)
print()

# fig = plt.figure(figsize=(12,12))
# Fixed bin edges: every 500 units from 0 up to (but excluding) 25000.
hist = trips_td_wt.hist(column="total_td", bins=range(0, 25000, 500))

plt.title("Histogram of total travel distance")
plt.ylabel("Number of trips")
plt.xlabel("Distance")
plt.tight_layout()

plt.savefig(img_path + "D5.2_total_trip_travel_distance.png")

In [None]:
# Mean of the total distance per trip.
trips_td_wt["total_td"].mean()

In [None]:
# Median of the total distance per trip.
trips_td_wt["total_td"].median()

<a id='Q7' ></a>
### Q7: Age distribution by country: number of users and trips

In [None]:
# Leg-level table restricted to the identifiers, the campaign country and the
# age class, with the last two columns renamed for the analysis below.
all_legs_age_country = all_legs[
    ["legid", "tripid", "userid", "onCampaigns", "age"]
].rename(columns={"onCampaigns": "campaign_country", "age": "age_range"})

all_legs_age_country.head()

In [None]:
# Number of trips per (campaign country, age class): keep one row per trip
# (the first one), then count the rows of every group.
age_country_ntrips = (
    all_legs_age_country.drop_duplicates("tripid", keep="first")
    .groupby(["campaign_country", "age_range"])
    .size()
    .reset_index(name="ntrips")
)

age_country_ntrips.head(3)

In [None]:
# Number of users per (campaign country, age class): keep one row per user
# (the first one), then count the rows of every group.
age_country_nusers = (
    all_legs_age_country.drop_duplicates("userid", keep="first")
    .groupby(["campaign_country", "age_range"])
    .size()
    .reset_index(name="nusers")
)

age_country_nusers.head(3)

In [None]:
# Trips and users side by side for every (campaign country, age class) pair
# that appears in both tables.
age_country_ntrips_nusers = pd.merge(
    age_country_ntrips,
    age_country_nusers,
    on=["campaign_country", "age_range"],
)
age_country_ntrips_nusers.head()

In [None]:
# Campaign countries present in the legs table, e.g.
# ['AAA', 'BEL', 'ESP', 'FIN', 'FRA', 'HRV', 'ITA', 'NOR', 'PRT', 'SVK']
top10countries = all_legs.groupby("onCampaigns").size().index.tolist()

# Sorted age classes present in the legs table, e.g.
# ['16-24', '25-49', '50-64', '65+']
age_ranges = all_legs["age"].unique().tolist()
age_ranges = sorted(age_ranges)

In [None]:
def plot_var_age_country(var, values_count, axid, title_str):
    """Draw ``var`` per age class for one country on ``axes[axid]``.

    Relies on the notebook-level names ``axes`` (flat collection of subplot
    axes) and ``age_ranges`` (ordered list of age classes).

    Args:
        var: Name of the column of ``values_count`` plotted on the y axis.
        values_count: DataFrame with an ``age_range`` column and a ``var``
            column.
        axid: Index into ``axes`` of the subplot to draw on.
        title_str: Title of the subplot.
    """
    ax = axes[axid]

    # Fix the bar order to the full list of age classes so that every panel
    # has the same slots; a class missing from values_count leaves an empty
    # slot instead of shifting the remaining bars under the wrong labels.
    sns.barplot(data=values_count, x="age_range", y=var, order=age_ranges, ax=ax)

    # One tick per age class, so ticks and labels always match in number.
    ax.set_xticks(range(len(age_ranges)))
    ax.set_xticklabels(age_ranges)
    for item in ax.get_xticklabels():
        item.set_rotation(45)

    ax.tick_params(labelsize=10)
    ax.set_title(title_str, fontsize=14)
    ax.set_xlabel("")
    ax.set_ylabel("")

In [None]:
# Spot check: trips and users per age class for the "ITA" campaign.
tmp = age_country_ntrips_nusers.loc[age_country_ntrips_nusers.campaign_country == "ITA"]
tmp

In [None]:
# plot ntrips
# One panel per campaign country on a 2 x 5 grid, flattened so that the
# panels can be addressed by a single index.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 7))
axes = axes.ravel()

for axid, c in enumerate(top10countries):
    # Rows of the trips-per-age-class table that belong to this country.
    tmp = age_country_ntrips.loc[age_country_ntrips.campaign_country == c]
    val_count = tmp

    plot_var_age_country("ntrips", val_count, axid=axid, title_str=c)

# No figure legend is drawn: the un-hued bar plots are not expected to carry
# legend labels, and loc="best" is not available for figure-level legends.

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig argument, so only
# bbox_inches is passed.
plt.savefig(
    img_path + "D5.2_stat_ntrips_age_country.png",
    bbox_inches="tight",
)

In [None]:
# plot nusers
# One panel per campaign country on a 2 x 5 grid, flattened so that the
# panels can be addressed by a single index.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 7))
axes = axes.ravel()

for axid, c in enumerate(top10countries):
    # Rows of the users-per-age-class table that belong to this country.
    tmp = age_country_nusers.loc[age_country_nusers.campaign_country == c]
    val_count = tmp

    plot_var_age_country("nusers", val_count, axid=axid, title_str=c)

# No figure legend is drawn: the un-hued bar plots are not expected to carry
# legend labels, and loc="best" is not available for figure-level legends.

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig argument, so only
# bbox_inches is passed.
plt.savefig(
    img_path + "D5.2_stat_nusers_age_country.png",
    bbox_inches="tight",
)

In [None]:
def plot_grouped_age_country(values_count, axid, title_str):
    """Draw a grouped bar chart per age class for one country on ``axes[axid]``.

    Relies on the notebook-level names ``axes`` and ``age_ranges`` (ordered
    list of age classes).

    Args:
        values_count: DataFrame whose numeric columns are plotted as grouped
            bars. The age class is taken from an ``age_range`` column when
            there is one, otherwise from the index.
        axid: Index into ``axes`` of the subplot to draw on.
        title_str: Title of the subplot.
    """
    ax = axes[axid]

    # Put the age class on the index and align the rows on the full list of
    # age classes, so that the bars and their tick labels agree even when a
    # class is missing for this country.
    grouped = values_count
    if "age_range" in grouped.columns:
        grouped = grouped.set_index("age_range")
    grouped = grouped.reindex(age_ranges)

    # Draw on the requested subplot (without ax=, pandas opens a new figure).
    grouped.plot.bar(ax=ax, legend=False)

    for item in ax.get_xticklabels():
        item.set_rotation(45)

    ax.tick_params(labelsize=10)
    ax.set_title(title_str, fontsize=14)
    ax.set_xlabel("")
    ax.set_ylabel("")

In [None]:
# Assign every country a (row, column) position on a 2 x 5 grid, in
# row-major order, and display the mapping.
grid_positions = itertools.product(range(2), range(5))
country_axes = dict(zip(top10countries, grid_positions))
country_axes

In [None]:
# plot grouped
# One panel per campaign country on a 2 x 5 grid; the axes array stays
# two-dimensional here and is indexed with (row, column) pairs.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 7))

# Country -> (row, column) of its panel, in row-major order.
country_axes = dict(zip(top10countries, itertools.product(range(2), range(5))))

for c in top10countries:
    # Trips and users per age class for this country, drawn as grouped bars.
    tmp = age_country_ntrips_nusers.loc[age_country_ntrips_nusers.campaign_country == c]
    tmp.plot.bar(x="age_range", ax=axes[country_axes[c]], legend=False, title=c)

# One shared legend built from the first panel. The position is given
# explicitly because loc="best" is not available for figure-level legends.
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper right", fontsize="x-small")

plt.tight_layout()
# bbox_to_anchor is a legend option, not a savefig argument, so only
# bbox_inches is passed.
plt.savefig(
    img_path + "D5.2_stat_grouped_age_country.png",
    bbox_inches="tight",
)