# Create user profiles

## Steps

- Read data and preprocessing
    - `all_legs`
    - `category_transp_mode_dict`
    - `specific_worthwhile`
    - `gen_worthwhile`
    - `values_from_trip`

- Final set of users
- Create users profiles for onBoarding data
- Create users profiles for trips data
- Take users' demographic information
- Create transport category tables


### Saved outputs

- `final_users_sample.txt`
- `users_profile_spec_gen.csv`
- `users_profile_trips.csv`

In [None]:
import pandas as pd
import numpy as np
import clustering_functions
import importlib
import itertools
from pandas.io.json import json_normalize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import json
import umap.umap_ as umap

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the window. The closing tag must be
# a complete `</style>`; the previous string left it unterminated.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global variables
# All paths are relative to the directory the notebook is run from.
# Not referenced in the cells visible here.
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the pickled legs table, read from `input_path`.
legs = "all_legs_merged_no_outlier_0.01.pkl"
# Directory the input pickles and the transport-category JSON are read from.
input_path = "../../2019-12-16.out/"
# Directory every CSV/TXT/PNG produced by this notebook is written to.
out_path = "../../2019-12-16.out/clustering/"
# Directory that holds `user_details.csv` (the users' demographic details).
anon_df_path = "../../anon-dataset/2019-12-16.anon/"

### Read data and preprocessing

In [None]:
# Load the merged legs table.
legs_pickle = input_path + legs
all_legs = pd.read_pickle(legs_pickle)

# Transport categories: the JSON maps each category to the transport modes
# that belong to it.
with open(input_path + "category_transp_mode_dict.json", "r") as json_file:
    category_transp_mode_dict = json.load(json_file)

# Reverse lookup: transport mode -> category.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

# final sample of users for onBoarding clusters
# with open(out_path + 'final_users_sample.txt', 'r')as f:
#    final_users_onBoarding = f.read().splitlines()
# print(len(final_users_onBoarding))

# Size of the table, followed by the number of distinct users, legs and trips.
print(all_legs.shape)
for label, id_column in (
    ("users in all_legs:", "userid"),
    ("legs:", "legid"),
    ("trips in all_legs:", "tripid"),
):
    print(label, len(all_legs[id_column].unique()))


all_legs.head()
### GENERAL results: these cover the full dataset, not the user sample created later in this notebook

**`specific_worthwhile`**

- Take users with at least a leg

In [None]:
# Mode-specific worthwhileness ratings.
spec_worthwhile = pd.read_pickle(input_path + "specific_worthwhile.pkl")  # 3309 users
print("initial users:", len(spec_worthwhile.userid.unique()))

# Keep only ratings from users that appear in all_legs and whose transport-mode
# label is not empty. `.copy()` detaches the result from the original frame so
# the column assignments below do not trigger SettingWithCopyWarning.
users_with_legs = all_legs["userid"].unique()
spec_worthwhile = spec_worthwhile[
    spec_worthwhile["userid"].isin(users_with_legs)
    & (spec_worthwhile["MotText"] != "")
].copy()  # 3164

# Rename the "intercity" label to "intercityTrain".
spec_worthwhile["MotText"] = spec_worthwhile["MotText"].replace(
    "intercity", "intercityTrain"
)
print("only users with at least a leg:", len(spec_worthwhile.userid.unique()))

# Add the transport category. Modes missing from the lookup get None and are
# therefore left out by the groupby below.
spec_worthwhile["transp_category"] = spec_worthwhile["Mot"].apply(
    lambda x: inverted_category_transp_mode_dict.get(int(x))
)

# If a user rated more than one mode of transport belonging to the same
# transport category, take the mean of each rating.
# Example: user 1sN4Fx4vabPZcUu9tFQdcbaYXq73 has walking and running, both in
# transp_category = Walking, so the new motsFit is (50 + 100) / 2 = 75.
# The rating columns are selected with a list: indexing a groupby with a bare
# tuple of column names is not accepted by current pandas.
spec_worthwhile = (
    spec_worthwhile.groupby(["userid", "transp_category"])[
        ["motsFit", "motsProd", "motsRelax"]
    ]
    .mean()
    .reset_index()
)

**`gen_worthwhile`**

- Take users with at least a leg

In [None]:
# Generic worthwhileness ratings, with the three rating columns renamed to the
# gen* names used in the rest of the notebook.
gwv_file = "gen_worthwhile.pkl"
generic_column_names = {
    "fitness": "genFit",
    "productivity": "genProd",
    "enjoyment": "genEnj",
}
gwv_df = pd.read_pickle(input_path + gwv_file).rename(columns=generic_column_names)
print("initial users:", len(gwv_df.userid.unique()))

# Keep only users that have at least one leg in all_legs.
has_leg = gwv_df["userid"].isin(all_legs["userid"].unique())
gwv_df = gwv_df[has_leg]
print("users with at least a leg: ", len(gwv_df.userid.unique()))

gwv_df.head()

**`values_from_trip`**

- Remove legs with valueFromTrip = 'Unknown'
- Remove legs with transport category = None
- Remove trips with more than a leg
- Merge Paid_work and Personal_tasks into Productivity taking the **maximum** value


In [None]:
## Load the "value from trip" answers (long format: one row per leg and value type).
values_from_trip = pd.read_pickle(input_path + "values_from_trip.pkl")
print("initial legs: ", values_from_trip.legid.nunique())

# Drop answers whose value type is 'Unknown'.
is_known = values_from_trip["valueFromTrip"] != "Unknown"
values_from_trip = values_from_trip[is_known]
print("without unknown: ", values_from_trip.legid.nunique())

# Drop legs that have no transport category in all_legs.
legs_without_transp_cat = all_legs.loc[all_legs["transp_category"].isna(), "legid"]
has_category = ~values_from_trip["legid"].isin(legs_without_transp_cat)
values_from_trip = values_from_trip[has_category]
print("with valid transport category: ", values_from_trip.legid.nunique())
print()

# Drop trips that have more than 4 answer rows.
# NOTE(review): the message below says "more than a leg"; the threshold of 4
# presumably equals the number of value types recorded per leg - confirm.
trips_count = values_from_trip.groupby("tripid").size().reset_index(name="count")
trips_to_remove = trips_count.loc[trips_count["count"] > 4, "tripid"]
print("trips with more than a leg: ", len(trips_to_remove))
keep_trip = ~values_from_trip["tripid"].isin(trips_to_remove)
values_from_trip = values_from_trip[keep_trip]
print("legs: ", values_from_trip.legid.nunique())
print("trips: ", values_from_trip.tripid.nunique())


# Wide format: one row per leg, one column per value type.
tmp = values_from_trip[["legid", "value", "valueFromTrip"]]
values_from_trip_pivot = tmp.pivot(
    index="legid", columns="valueFromTrip", values="value"
).reset_index()

# Attach the user and transport category of every leg.
leg_info = all_legs[["legid", "userid", "transp_category"]]
values_from_trip_pivot = values_from_trip_pivot.merge(
    leg_info, on="legid"
).drop_duplicates()

# Productivity is the larger of Paid_work and Personal_tasks; the two source
# columns are dropped afterwards.
work_columns = ["Paid_work", "Personal_tasks"]
values_from_trip_pivot["Productivity"] = values_from_trip_pivot[work_columns].max(
    axis=1
)
values_from_trip_pivot = values_from_trip_pivot.drop(columns=work_columns)
print("users: ", values_from_trip_pivot.userid.nunique())

values_from_trip_pivot.head()

### Final set of users

Users that satisfy **all** of these conditions:

- Specific ww values
- Generic ww values
- Values from trip

In [None]:
# Users available in each of the three data sources.
spec_users = set(spec_worthwhile.userid.unique())
gen_users = set(gwv_df.userid.unique())
values_users = set(values_from_trip_pivot.userid.unique())

print("spec:", len(spec_users))
print("gen:", len(gen_users))
print("values:", len(values_users))
print()
#### INTERSECTION

# The final sample is made of the users present in all three sources.
final_sample_users = spec_users & gen_users & values_users
print("--- Final users: ", len(final_sample_users))


# SAVE USERS LIST
# One user id per line. The ids are sorted because set iteration order is not
# stable between runs, which would make the saved file differ from run to run.
with open(out_path + "final_users_sample.txt", "w") as f:
    f.writelines(f"{usn}\n" for usn in sorted(final_sample_users))

### Create users profiles for onBoarding data

In [None]:
# Restrict both onboarding tables to the final user sample.
spec_in_sample = spec_worthwhile["userid"].isin(final_sample_users)
spec_worthwhile = spec_worthwhile[spec_in_sample]
gen_in_sample = gwv_df["userid"].isin(final_sample_users)
gwv_df = gwv_df[gen_in_sample]

# Check: number of distinct users left in each table.
for table in (spec_worthwhile, gwv_df):
    print(table.userid.nunique())

In [None]:
# One row per user: spread the per-category ratings into columns and use 0
# where a user has no rating for a category.
users_profile_spec_ww = (
    spec_worthwhile.pivot(
        index="userid",
        columns="transp_category",
        values=["motsFit", "motsProd", "motsRelax"],
    )
    .reset_index()
    .fillna(0)
)

# Flatten the two-level column index into plain names.
# NOTE(review): the list assumes the pivot yields exactly these five
# categories, in this order, for each of the three ratings - confirm against
# category_transp_mode_dict.json.
users_profile_spec_ww.columns = [
    "userid",
    "fit_cycling_emerging_micromobility",
    "fit_private_motorized",
    "fit_public_transp_long_dist",
    "fit_public_transp_short_dist",
    "fit_walking",
    "prod_cycling_emerging_micromobility",
    "prod_private_motorized",
    "prod_public_transp_long_dist",
    "prod_public_transp_short_dist",
    "prod_walking",
    "enj_cycling_emerging_micromobility",
    "enj_private_motorized",
    "enj_public_transp_long_dist",
    "enj_public_transp_short_dist",
    "enj_walking",
]

## Append the GENERIC ww columns of each user.
users_profile_spec_ww = pd.merge(users_profile_spec_ww, gwv_df, on="userid")
print(users_profile_spec_ww.shape)
users_profile_spec_ww.head()

In [None]:
### SAVE the onboarding profiles (specific + generic ww) without the index.
spec_gen_csv = out_path + "users_profile_spec_gen.csv"
users_profile_spec_ww.to_csv(spec_gen_csv, index=False)

### Create users profiles for trip data

In [None]:
# Restrict the per-leg values to users in final_sample_users.
in_sample = values_from_trip_pivot["userid"].isin(final_sample_users)
values_from_trip_pivot = values_from_trip_pivot[in_sample]

print(values_from_trip_pivot.userid.nunique())

# Rows per transport category, as a count and as a share of all rows
# (rounded to two decimals).
tranps_cat_count_trips = (
    values_from_trip_pivot.groupby("transp_category").size().reset_index(name="count")
)
total_rows = tranps_cat_count_trips["count"].sum()
tranps_cat_count_trips["rel_count"] = (
    tranps_cat_count_trips["count"] / total_rows
).round(2)
tranps_cat_count_trips

In [None]:
# Mean of each value type per user and transport category.
value_columns = ["Enjoyment", "Fitness", "Productivity"]
values_from_trip_pivot2 = (
    values_from_trip_pivot.groupby(["userid", "transp_category"])[value_columns]
    .mean()
    .reset_index()
)

# One row per user: spread the per-category means into columns and use 0
# where a user has no legs in a category.
users_profile_trips = (
    values_from_trip_pivot2.pivot(
        index="userid",
        columns="transp_category",
        values=["Enjoyment", "Fitness", "Productivity"],
    )
    .reset_index()
    .fillna(0)
)

# Flatten the two-level column index into plain names.
# NOTE(review): the list assumes the pivot yields exactly these five
# categories, in this order, for each of the three value types - confirm.
users_profile_trips.columns = [
    "userid",
    "enj_cycling_emerging_micromobility",
    "enj_private_motorized",
    "enj_public_transp_long_dist",
    "enj_public_transp_short_dist",
    "enj_walking",
    "fit_cycling_emerging_micromobility",
    "fit_private_motorized",
    "fit_public_transp_long_dist",
    "fit_public_transp_short_dist",
    "fit_walking",
    "prod_cycling_emerging_micromobility",
    "prod_private_motorized",
    "prod_public_transp_long_dist",
    "prod_public_transp_short_dist",
    "prod_walking",
]

## Append the GENERIC ww columns of each user.
users_profile_trips = pd.merge(users_profile_trips, gwv_df, on="userid")
print(users_profile_trips.shape)
users_profile_trips.head()

In [None]:
### SAVE the trip-based profiles without the index.
trips_csv = out_path + "users_profile_trips.csv"
users_profile_trips.to_csv(trips_csv, index=False)

### Legs - duration and distance

In [None]:
# Legs that belong to users of the final sample.
belongs_to_sample = all_legs["userid"].isin(final_sample_users)
final_legs = all_legs[belongs_to_sample]
print("total legs: ", final_legs.legid.nunique())

# Legs that are still present in values_from_trip_pivot.
reviewed_leg_ids = values_from_trip_pivot["legid"].unique()
reviewed_legs = all_legs[all_legs["legid"].isin(reviewed_leg_ids)]
print("reviewed legs: ", len(reviewed_legs))

# Keep the leg id together with the duration and distance columns
# (legDistance exists alongside trueDistance).
leg_metric_columns = [
    "legid",
    "inferred_leg_duration_min",
    "trueDistance",
    "legDistance",
]
reviewed_legs = reviewed_legs[leg_metric_columns]
reviewed_legs.head()

In [None]:
# Scatter of duration against distance, restricted to legs below both cut-offs
# (duration < 300 and trueDistance < 400000).
short_duration = reviewed_legs["inferred_leg_duration_min"] < 300
short_distance = reviewed_legs["trueDistance"] < 400000
reviewed_legs2 = reviewed_legs[short_duration & short_distance]

plt.scatter(reviewed_legs2["inferred_leg_duration_min"], reviewed_legs2["trueDistance"])

In [None]:
# Histogram (50 bins) of leg durations below 175.
leg_durations = reviewed_legs["inferred_leg_duration_min"]
plt.hist(leg_durations[leg_durations < 175], bins=50)
plt.show()

In [None]:
# Histogram of the leg distances, using matplotlib's default binning.
leg_distances = reviewed_legs["trueDistance"]
plt.hist(leg_distances)
plt.show()

### Users' demographics

In [None]:
# Demographic details of the users in the final sample, limited to the columns
# used downstream. `.copy()` detaches the selection from the frame that was
# read, so the column assignments below do not trigger SettingWithCopyWarning.
demographic_columns = [
    "userid",
    "gender",
    "age_range",
    "lang",
    "education_level",
    "marital_status_household",
    "labour_status_household",
]
users_demographics = pd.read_csv(anon_df_path + "user_details.csv")
users_demographics = users_demographics.loc[
    users_demographics["userid"].isin(final_sample_users), demographic_columns
].copy()

# Education: shorten the three known levels; every other value becomes None.
education_labels = {
    "High school (12th grade)": "High school",
    "Basic (up to 10th grade)": "Basic",
    "University": "University",
}
users_demographics["education_level"] = users_demographics["education_level"].map(
    education_labels.get
)

# Marital status: map the listed answers onto Partner / Married / Single;
# values not listed here are kept as they are.
marital_labels = {
    "En pareja": "Partner",
    "Naimisissa": "Married",
    "Slobodný/á": "Single",
    "Ženatý/vydatá": "Married",
    "Registered partnership": "Partner",
}
users_demographics["marital_status_household"] = users_demographics[
    "marital_status_household"
].map(lambda status: marital_labels.get(status, status))

# Labour status: abbreviate the two employment labels; other values are kept.
labour_labels = {
    "Employed full Time": "Employed FT",
    "Employed part-time": "Employed PT",
}
users_demographics["labour_status_household"] = users_demographics[
    "labour_status_household"
].map(lambda status: labour_labels.get(status, status))

# Language: "esp" is recoded to "spa"; other codes are kept.
language_codes = {"esp": "spa"}
users_demographics["lang"] = users_demographics["lang"].map(
    lambda code: language_codes.get(code, code)
)

print(users_demographics.shape)
users_demographics.head()

In [None]:
## Write the cleaned demographics (used for clustering) without the index.
demographics_csv = out_path + "users_demographics.csv"
users_demographics.to_csv(demographics_csv, index=False)

In [None]:
### PLOTS
# 2 x 3 grid of bar charts, one panel per demographic attribute.
ncols = 3
nrows = 2
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(17, 10))

# One entry per panel, in row-major order:
# (column, panel title, tick-label rotation, sort bars by descending count).
panels = [
    ("gender", "Gender", 0, True),
    ("age_range", "Age", 45, False),  # bars stay in groupby (age range) order
    ("lang", "Language", 45, True),
    ("education_level", "Education Level", 0, True),
    ("marital_status_household", "Marital Status", 45, True),
    ("labour_status_household", "Labour Status", 45, True),
]

for ax, (column, title, rotation, by_count) in zip(axes.flat, panels):
    # Number of users per value of the attribute.
    counts = users_demographics.groupby(column).size().reset_index(name="count")
    if by_count:
        counts = counts.sort_values(by="count", ascending=False)
    positions = range(len(counts))
    ax.bar(positions, counts["count"])
    ax.set_xticks(positions)
    ax.set_xticklabels(list(counts[column]), rotation=rotation)
    ax.set_title(title)

# `bbox_to_anchor` is a legend option, not a savefig parameter, so it was
# dropped; `bbox_inches="tight"` keeps the rotated tick labels inside the image.
# NOTE: the 1 x 6 version of this figure further down is saved to the same
# path and replaces this file.
plt.savefig(out_path + "plot_demographics.png", bbox_inches="tight")

In [None]:
# Users per education level, most frequent level first.
education_sizes = users_demographics.groupby("education_level").size()
tmp = education_sizes.reset_index(name="count").sort_values(
    by="count", ascending=False
)
tmp

In [None]:
### PLOTS
# Single row of six bar charts, one panel per demographic attribute.
ncols = 6
nrows = 1
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 4))

# One entry per panel, left to right:
# (column, panel title, tick-label rotation, sort bars by descending count).
panels = [
    ("gender", "Gender", 0, True),
    ("age_range", "Age", 45, False),  # bars stay in groupby (age range) order
    ("lang", "Language", 55, True),
    ("education_level", "Education Level", 45, True),
    ("marital_status_household", "Marital Status", 45, True),
    ("labour_status_household", "Labour Status", 45, True),
]

for ax, (column, title, rotation, by_count) in zip(axes, panels):
    # Number of users per value of the attribute.
    counts = users_demographics.groupby(column).size().reset_index(name="count")
    if by_count:
        counts = counts.sort_values(by="count", ascending=False)
    positions = range(len(counts))
    ax.bar(positions, counts["count"])
    ax.set_xticks(positions)
    ax.set_xticklabels(list(counts[column]), rotation=rotation)
    ax.set_title(title)

# `bbox_to_anchor` is a legend option, not a savefig parameter, so it was
# dropped; `bbox_inches="tight"` is kept.
# NOTE: this overwrites the file written by the 2 x 3 figure cell above.
plt.savefig(out_path + "plot_demographics.png", bbox_inches="tight")

### Transport categories

In [None]:
## Onboarding: rows of spec_worthwhile per transport category.
onboarding_sizes = spec_worthwhile.groupby("transp_category").size()
tc = onboarding_sizes.reset_index(name="count")
tc.to_csv(out_path + "transport_category_count_ob.csv", index=False)

## Trips: rows of values_from_trip_pivot per transport category.
trip_sizes = values_from_trip_pivot.groupby("transp_category").size()
tc = trip_sizes.reset_index(name="count")
tc.to_csv(out_path + "transport_category_count_tr.csv", index=False)

In [None]:
# Denominator of the user percentages below.
# NOTE(review): hard-coded; presumably the size of the final user sample -
# confirm, and consider deriving it from the data so it cannot go stale.
nusers = 3011

# Legs per transport category, as a count and as a percentage of all legs.
tc_df = (
    values_from_trip_pivot.groupby("transp_category")
    .size()
    .reset_index(name="leg_count")
)
tc_df["leg_rel_count"] = (
    tc_df["leg_count"] / tc_df["leg_count"].sum() * 100
).round(2)

# Distinct users with at least one leg in the category.
users_per_category = (
    values_from_trip_pivot.groupby("transp_category")["userid"]
    .nunique()
    .reset_index(name="user_count_leg")
)
tc_df = tc_df.merge(users_per_category, on="transp_category")
tc_df["user_rel_count_leg"] = (tc_df["user_count_leg"] / nusers * 100).round(2)

# Onboarding: rows of spec_worthwhile per category.
onboarding_per_category = (
    spec_worthwhile.groupby("transp_category").size().reset_index(name="user_count_ob")
)
tc_df = tc_df.merge(onboarding_per_category, on="transp_category")
tc_df["user_rel_count_ob"] = (tc_df["user_count_ob"] / nusers * 100).round(2)

# Save with the `.csv` extension and without the index column, in line with
# the male-only table (`tc_table_male_for_paper.csv`).
tc_df.to_csv(out_path + "tc_table_all_for_paper.csv", index=False)

tc_df

In [None]:
##### MALE
# Same table as for all users, restricted to users whose gender is "Male".
male_df = users_demographics[users_demographics.gender == "Male"]
nusers = len(male_df)
print(nusers)

# The male subsets get their own names instead of overwriting
# `values_from_trip_pivot` and `spec_worthwhile`, so cells run after this one
# still see the data of all users.
male_user_ids = male_df["userid"].unique()
male_values = values_from_trip_pivot[
    values_from_trip_pivot["userid"].isin(male_user_ids)
]
male_spec = spec_worthwhile[spec_worthwhile["userid"].isin(male_user_ids)]

# Legs per transport category, as a count and as a percentage of all legs.
tc_df = male_values.groupby("transp_category").size().reset_index(name="leg_count")
tc_df["leg_rel_count"] = (
    tc_df["leg_count"] / tc_df["leg_count"].sum() * 100
).round(2)

# Distinct users with at least one leg in the category.
users_per_category = (
    male_values.groupby("transp_category")["userid"]
    .nunique()
    .reset_index(name="user_count_leg")
)
tc_df = tc_df.merge(users_per_category, on="transp_category")
tc_df["user_rel_count_leg"] = (tc_df["user_count_leg"] / nusers * 100).round(2)

# Onboarding: rows of the male spec_worthwhile subset per category.
onboarding_per_category = (
    male_spec.groupby("transp_category").size().reset_index(name="user_count_ob")
)
tc_df = tc_df.merge(onboarding_per_category, on="transp_category")
tc_df["user_rel_count_ob"] = (tc_df["user_count_ob"] / nusers * 100).round(2)

# save
tc_df.to_csv(out_path + "tc_table_male_for_paper.csv", index=False)

tc_df

In [None]:
# Show the current value of `nusers` (when the cells run in order, this is the
# number of rows of `male_df` set in the male-only cell).
nusers