# E-bikes

For bike, ebike, bikesharing, microscooter and walking:

1. to compare stated onboarding preferences to actual trips (to confirm the conclusion by Holger that the experience is generally below that which is expected)
2. to compare top positive and negative factors (you have produced something similar per mode category, if you could do it for these 5 modes it would be really great)
3. to compare preferred activities while on each of the five modes based on the highest value given (e.g. the difference in the importance of fitness between bike and ebike)
4. basic gender analysis (all the above disaggregated by gender)
5. distance analysis: to show that most urban trips are lower than X kilometres (this is mostly to show the potential for modal shift, particularly from car to active modes, I think I can do this myself with the current dataset)

In [None]:
import os
import sys
import json
import math
import datetime
import importlib
import pathlib
import itertools
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

from pprint import pprint
from pandas.io.json import json_normalize
from matplotlib import rcParams

%matplotlib inline

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 95% of the browser window.
# The closing tag must be a complete `</style>`, otherwise the markup is malformed.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# ---------------------------------------------------------------------------
# Locations of the inputs and outputs
# ---------------------------------------------------------------------------
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the pickled legs table (joined with `input_path` when loading).
legs = "all_legs_merged_no_outlier_0.01.pkl"
input_path = pathlib.Path("../../2019-12-16.out/")
# Tables and images are both written to the same sub-directory of the input folder.
out_path = input_path / "yannick"
img_path = input_path / "yannick"

# ---------------------------------------------------------------------------
# Look of the figures
# ---------------------------------------------------------------------------
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directory (including missing parents). If the target
# already exists, report it on stderr and carry on.
out_dir_abs = pathlib.Path(os.path.abspath(out_path))
try:
    out_dir_abs.mkdir(parents=True)
except FileExistsError:
    print("Directory '{}' already exists".format(out_path), file=sys.stderr)

In [None]:
# Load the legs and trips tables.
# `input_path` is a pathlib.Path, so file names are joined with "/";
# `Path + str` is not supported and raises TypeError.
all_legs = pd.read_pickle(input_path / legs)
# trips_users_df = pd.read_pickle(input_path / 'trips_users_df.pkl')
trips_df = pd.read_pickle(input_path / "trips_df.pkl")
## select only trips in all_legs
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]

# transport categories: each key maps to an iterable of transport modes
with open(input_path / "category_transp_mode_dict.json", "r", encoding="utf-8") as f:
    category_transp_mode_dict = json.load(f)

# Reverse lookup: transport mode -> category it is listed under.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

#### remove "unknown" as transport category (?)

# Size of the loaded dataset.
print("Legs:", all_legs.shape[0])
print("Trips: ", len(all_legs.tripid.unique()))
print("Users:", len(all_legs.userid.unique()))
print()

In [None]:
### read experience factors
# `input_path` is a pathlib.Path, so the file name is joined with "/";
# `Path + str` raises TypeError.
all_factors = pd.read_pickle(input_path / "all_factors.pkl")

# delete legs with minus=F and plus=F
# (the explicit `== False` / `== True` comparisons are kept as in the original
# filters so the selection does not depend on the dtype of the flag columns)
all_factors = all_factors[
    ~((all_factors["minus"] == False) & (all_factors["plus"] == False))
]

# delete legs with minus=T and plus=T (3% of obs)
all_factors = all_factors[
    ~((all_factors["minus"] == True) & (all_factors["plus"] == True))
]

# select only useful cols
all_factors = all_factors[
    [
        "correctedModeOfTransport_str",
        "legid",
        "minus",
        "plus",
        "tripid",
        "factor",
        "legStartDay",
    ]
]

# add info (leg-level attributes from the legs table, matched on legid)
all_factors = all_factors.merge(
    all_legs[
        ["legid", "wastedTime", "gender", "age", "onCampaigns", "transp_category"]
    ],
    on="legid",
)

## add purpose
# read purposes -> trip_obj_grouped.pkl
# trip_objs = pd.read_pickle(input_path / 'trip_objs_grouped.pkl')
# add purpose to values_from_trip
# all_factors = all_factors.merge(trip_objs[['tripid', 'objective_str']], on='tripid').drop_duplicates()

# select useful wastedTime (strictly between 0 and 6) and round to whole numbers.
# `.copy()` detaches the filtered rows so the assignment below writes to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
valid_wasted_time = (all_factors["wastedTime"] > 0) & (all_factors["wastedTime"] < 6)
all_factors = all_factors[valid_wasted_time].copy()
all_factors["wastedTime"] = all_factors["wastedTime"].round(0)

# remove legs with "None" transport category
all_factors = all_factors[all_factors["transp_category"].notna()]

# checks: how many records carry only a plus flag / only a minus flag
only_plus = (all_factors["minus"] == False) & (all_factors["plus"] == True)
only_minus = (all_factors["minus"] == True) & (all_factors["plus"] == False)
print("all records:", len(all_factors))
print("only plus: ", int(only_plus.sum()))
print("only minus: ", int(only_minus.sum()))

# The two impact labels; no 'impact' column is materialised in this cell.
impact_lst = ["plus", "minus"]

all_factors.head()

In [None]:
# Value lists reused by the plots.

# distinct age values present in the legs table
age_range = list(all_legs["age"].unique())

# campaign 'CHE' is folded into the class Other, labelled 'AAA'
all_legs["onCampaigns"] = all_legs["onCampaigns"].apply(
    lambda campaign: campaign if campaign != "CHE" else "AAA"
)
top10 = list(all_legs["onCampaigns"].unique())

# distinct transport categories present in the factor records
tc_lst = all_factors["transp_category"].unique()

# genders used when splitting results
gender_lst = ["Male", "Female"]

In [None]:
# Transport modes analysed below: bike, walking, running, ebike, bikesharing
# and microscooter. The integer keys are presumably the dataset's mode codes
# (not verifiable from this notebook).
selected_tranp_modes_dict = {
    1: "bicycle",
    7: "walking",
    8: "running",
    16: "electricBike",
    17: "bikeSharing",
    18: "microScooter",
}

# Mode names only, in the insertion order of the dictionary above.
selected_tranp_modes = list(selected_tranp_modes_dict.values())

<a id='Q2' ></a>
### Q2: Compare top positive and negative factors for the 5 modes: bike, ebike, bikesharing, microscooter, walking

In [None]:
# Split the factor records by impact: a record is "plus" when only the plus
# flag is set and "minus" when only the minus flag is set.
only_plus_mask = (all_factors["plus"] == True) & (all_factors["minus"] == False)
only_minus_mask = (all_factors["plus"] == False) & (all_factors["minus"] == True)

plus_factors = all_factors[only_plus_mask]
minus_factors = all_factors[only_minus_mask]

In [None]:
# Preview the first three factor records that belong to one of the selected modes.
in_selected_modes = all_factors["correctedModeOfTransport_str"].isin(
    selected_tranp_modes
)
all_factors[in_selected_modes].head(3)

In [None]:
# Preview the first three rows of the positive-factor records.
plus_factors.head(3)

In [None]:
# Column labels for the exported factor tables. The empty-string key is the
# label the factor column carries once the pivot's column MultiIndex has been
# flattened; the gender labels are simply lower-cased.
rename_cols_factors = {"": "factors"}
rename_cols_factors.update(
    (gender, gender.lower()) for gender in ("Female", "Male", "Other")
)

In [None]:
def _factor_counts_by_gender(records):
    """Count how often each factor occurs in *records*, split by gender.

    *records* must have a "factor" and a "gender" column. The result has the
    columns ``factors, female, male, other, total`` and is sorted by ``total``
    in descending order. A gender that does not occur in *records* is reported
    as 0 rather than raising; gender labels that are not keys of
    ``rename_cols_factors`` are left out of the table.
    """
    counts = (
        pd.crosstab(records["factor"], records["gender"])
        .rename(columns=rename_cols_factors)
        # Guarantee the three gender columns exist, in a fixed order.
        .reindex(columns=["female", "male", "other"], fill_value=0)
        .astype(int)
    )
    counts.columns.name = None
    counts["total"] = counts.sum(axis=1)
    return (
        counts.sort_values(by="total", ascending=False)
        .rename_axis(rename_cols_factors[""])
        .reset_index()
    )


# Raw plus/minus factor records per transport mode (kept for later inspection).
factors_tm_plus = {}
factors_tm_minus = {}
for tm in selected_tranp_modes:
    is_tm_plus = plus_factors["correctedModeOfTransport_str"] == tm
    factors_tm_plus[tm] = plus_factors.loc[
        is_tm_plus, ["factor", "plus", "gender"]
    ].copy()

    is_tm_minus = minus_factors["correctedModeOfTransport_str"] == tm
    factors_tm_minus[tm] = minus_factors.loc[
        is_tm_minus, ["factor", "minus", "gender"]
    ].copy()

    # plus factors
    tmp_plus = _factor_counts_by_gender(factors_tm_plus[tm])
    ## save data
    out_plus_filename = "plus_factors_{}.csv".format(tm)
    out_plus_path = out_path / out_plus_filename
    tmp_plus.to_csv(out_plus_path, index=False)

    # minus factors
    tmp_minus = _factor_counts_by_gender(factors_tm_minus[tm])
    ## save data
    out_minus_filename = "minus_factors_{}.csv".format(tm)
    out_minus_path = out_path / out_minus_filename
    tmp_minus.to_csv(out_minus_path, index=False)

<a id='Q3' ></a>
### Q3: Compare preferred activities while on each of the five modes based on the highest value given

(e.g. difference of importance of fitness between bike and ebike)

In [None]:
# Names of the pickled input tables.
legs, trips_users, trips, users_with_trips = (
    "all_legs_merged_no_outlier_0.01.pkl",
    "trips_users_df.pkl",
    "trips_df.pkl",
    "users_df_with_trips.pkl",
)

# Load every table from the input directory, in the order listed above.
legs_df, trips_users_df, trips_df, users_df_with_trips = (
    pd.read_pickle(input_path / pickle_name)
    for pickle_name in (legs, trips_users, trips, users_with_trips)
)

In [None]:
### Read activity data

# read data
all_gen_act = pd.read_pickle(input_path / "all_gen_act.pkl")

# add info (mode, gender and wastedTime of the leg, matched on legid)
all_gen_act = all_gen_act.merge(
    legs_df[["legid", "correctedModeOfTransport_str", "gender", "wastedTime"]],
    on="legid",
)

# filter useful values of wt (strictly between 0 and 6) and round to whole numbers.
# `.copy()` detaches the filtered rows so the column assignment below writes to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
valid_wasted_time = (all_gen_act["wastedTime"] > 0) & (all_gen_act["wastedTime"] < 6)
all_gen_act = all_gen_act[valid_wasted_time].copy()
all_gen_act["wastedTime"] = all_gen_act["wastedTime"].round()

# add values from trip
values_from_trip = pd.read_pickle(input_path / "values_from_trip.pkl")
values_from_trip = values_from_trip[values_from_trip["valueFromTrip"] != "Unknown"]

# One row per leg, one column per valueFromTrip label.
# NOTE: pivot raises ValueError if a (legid, valueFromTrip) pair occurs twice.
values_from_trip_pivot = values_from_trip.pivot(
    index="legid", columns="valueFromTrip", values="value"
).reset_index()

# Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[
    ["Paid_work", "Personal_tasks"]
].max(axis=1)
values_from_trip_pivot = values_from_trip_pivot.drop(
    columns=["Paid_work", "Personal_tasks"]
)

all_gen_act = all_gen_act.merge(values_from_trip_pivot, on="legid").drop_duplicates()
print("shape", all_gen_act.shape)
print("unique legs", all_gen_act.legid.nunique())
all_gen_act.head()

In [None]:
# Column labels for the exported activity tables. The empty-string key is the
# label the activity column carries once the pivot's column MultiIndex has been
# flattened; the gender labels are simply lower-cased.
rename_cols_act = {"": "activities"}
rename_cols_act.update(
    (gender, gender.lower()) for gender in ("Female", "Male", "Other")
)

In [None]:
# For every selected transport mode: count the recorded activities ("code")
# per gender and export the table as CSV.
activities_tm = {}
for tm in selected_tranp_modes:
    is_tm = all_gen_act["correctedModeOfTransport_str"] == tm
    activities_tm[tm] = all_gen_act.loc[
        is_tm, ["code", "correctedModeOfTransport_str", "gender"]
    ].copy()

    act_count_tm = (
        pd.crosstab(activities_tm[tm]["code"], activities_tm[tm]["gender"])
        .rename(columns=rename_cols_act)
        # Guarantee the three gender columns exist, in a fixed order; a gender
        # absent for this mode is reported as 0. The previous check
        # (`"other" is not act_count_tm`) was an identity test that was always
        # True, so the "other" counts were always overwritten with 0.
        .reindex(columns=["female", "male", "other"], fill_value=0)
        .astype(int)
    )
    act_count_tm.columns.name = None
    act_count_tm["total"] = act_count_tm.sum(axis=1)
    act_count_tm = (
        act_count_tm.sort_values(by="total", ascending=False)
        .rename_axis(rename_cols_act[""])
        .reset_index()
    )

    ## save data
    act_filename = "activities_{}.csv".format(tm)
    act_path = out_path / act_filename
    act_count_tm.to_csv(act_path, index=False)

<a id='Q5' ></a>
### Q5: Distance analysis: to show that most urban trips are lower than X kilometres

(this is mostly to show the potential for modal shift, particularly from car to active modes, I think I can do this myself with the current dataset)

In [None]:
# read data
# Legs table that carries the start_class / end_class columns used below.
urban_legs_pickle = input_path / "all_legs_final_ds_user_info_urban_class.pkl"
all_legs_urban = pd.read_pickle(urban_legs_pickle)

In [None]:
# List the columns available in the urban-classified legs table.
all_legs_urban.columns

In [None]:
# Keep only the columns needed for the distance analysis and give them shorter
# names (source column -> name used from here on).
leg_column_names = {
    "legid": "legid",
    "correctedModeOfTransport_str": "transport_mode",
    "legDistance": "distance",
    "legDuration": "duration",
    "start_class": "start_class",
    "end_class": "end_class",
}
legs_select = all_legs_urban[list(leg_column_names)].rename(columns=leg_column_names)

In [None]:
# Legs that both start and end in a location classified as "urban".
starts_urban = legs_select["start_class"] == "urban"
ends_urban = legs_select["end_class"] == "urban"
urban_legs = legs_select[starts_urban & ends_urban].copy()

In [None]:
# Preview the first three urban-to-urban legs.
urban_legs.head(3)

In [None]:
# One distance histogram per selected transport mode, urban legs only.
# The grid grows with the number of modes (3 panels per row) instead of being
# fixed at 2x3, so adding a mode no longer runs past the available axes.
n_cols = 3
n_rows = max(1, math.ceil(len(selected_tranp_modes) / n_cols))
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows), squeeze=False
)
axes = axes.ravel()

# Bin edges double from 100 up to 25600 (same unit as the distance column).
distance_bins = [(2 ** i) * 100 for i in range(0, 9)]

for ax, tm in zip(axes, selected_tranp_modes):
    mode_legs = urban_legs.loc[urban_legs["transport_mode"] == tm]

    mode_legs.hist(column="distance", bins=distance_bins, ax=ax)
    ax.set_xlabel("distance (m)")
    ax.set_ylabel("count")
    ax.set_title(tm, fontsize=14)
    # Log scale so the doubling bins are drawn with equal width.
    ax.set_xscale("log")

# Hide panels that have no transport mode assigned.
for unused_ax in axes[len(selected_tranp_modes):]:
    unused_ax.set_visible(False)

plt.tight_layout()
fig.savefig(img_path / "leg_distances.png")