# Task 5.1 - 5.2

### SPRINT 1 - Individual Value Proposition:
Identification of clusters of users based on specific worthwhileness values (self-declared value perception) and transport mode categories

### T5.2: Clustering based on specific worthwhileness values (self-declared)

### Steps

- [Read users' profiles](#read_users)

- [Dimensionality reduction - UMAP](#umap)

    - [kmeans](#umap_kmeans)
    - [hierarchical](#umap_hier)

- [Final setting](#final)

In [None]:
import pandas as pd
import numpy as np
import clustering_functions
import importlib
import itertools
from pandas.io.json import json_normalize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import json
import umap.umap_ as umap

# `IPython.display` is the public home of the display helpers; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 95% of the page. The <style> element must be
# properly closed for the snippet to be well-formed HTML.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Global variables: every location is relative to the notebook's folder.

# --- inputs ---
meta_data_path = "../../data-campaigns/meta-data/"
anon_df_path = "../../anon-dataset/2019-12-16.anon/"
input_path = "../../2019-12-16.out/clustering/"
legs = "all_legs_merged_no_outlier_0.01.pkl"

# --- outputs (tables and figures produced by this notebook) ---
out_path = "../../2019-12-16.out/clustering/results_onBoarding/"

<a id='read_users' ></a>
### Read users' profiles

Each user is represented with a vector of values, corresponding to PEF values for each category of transport, so it is a vector of **15 positions**.
<br> **0 if** the user has not selected that category as regularly used **OR** if the user selected 0 importance for that variable.

<br> We also added the generic PEF values given by the user, so the final length of the vector is **18 elements**

In [None]:
# Load the three CSV inputs of the onboarding clustering from `input_path`.

# 1) demographics of the users
users_demographics = pd.read_csv(input_path + "users_demographics.csv")
print(users_demographics.shape)

# 2) transport categories; only the `transp_category` column is turned into a list
tc = pd.read_csv(input_path + "transport_category_count_ob.csv")
transp_cat = tc["transp_category"].tolist()

# 3) profile vector of every user
users_profile = pd.read_csv(input_path + "users_profile_spec_gen.csv")
print(users_profile.shape)

# preview of the first profile rows
users_profile.head()

**CONVERT INTO CLASSES**

- 0: values from 0 to 32
- 1: values from 33 to 65
- 2: values from 66 to 100

In [None]:
# Bin every column after the first one into three ordinal classes.
# With right=True the intervals are (-2, 32], (32, 65], (65, 100],
# labelled 0, 1 and 2 respectively.
class_edges = [-2, 32, 65, 100]
class_labels = [0, 1, 2]

users_profile_classes = users_profile.copy()
value_columns = users_profile_classes.columns[1:]
for column in value_columns:
    binned = pd.cut(
        users_profile_classes[column],
        class_edges,
        labels=class_labels,
        right=True,
    )
    users_profile_classes[column] = binned

print("users: ", users_profile_classes.userid.nunique())
users_profile_classes.head()

In [None]:
# Feature matrix for the embedding: all class columns, without the user id.
# The column is dropped through the `columns=` keyword because passing `axis`
# positionally (`drop([...], 1)`) is no longer accepted by pandas 2.x.
X_transformed = users_profile_classes.drop(columns=["userid"])
print(X_transformed.shape)

# True if at least one cell of the feature matrix is missing
print("na in the dataset: ", X_transformed.isnull().values.any())

<a id='umap' ></a>
## Dimensionality reduction - UMAP

In [None]:
import umap.umap_ as umap

# main parameters: n_neighbors, min_dist, n_epochs, n_components
# min_dist is important for the visualization. We can set min_dist = 0

n_components = 2
n_neighbors = 10

# fixed random_state so that the embedding is the same on every run
umap_settings = dict(
    n_neighbors=n_neighbors,
    min_dist=0.0,
    n_components=n_components,
    random_state=42,
)
umap_dr = umap.UMAP(**umap_settings)

# fit on the class matrix and keep the low-dimensional coordinates as a frame
embedding = umap_dr.fit_transform(X_transformed)
reduced_data_umap = pd.DataFrame(embedding)

In [None]:
# Compare the 2-D UMAP embedding obtained with different neighbourhood sizes,
# one scatter plot per value of `n_neighbors`.
n_neighbors_lst = [10, 20, 60]
ncols = len(n_neighbors_lst)  # one panel per tested value
nrows = 1

# squeeze=False keeps `axes` a 2-D array even for a single panel, so that
# ravel() always works.
fig, axes = plt.subplots(ncols=ncols, nrows=nrows, figsize=(15, 4), squeeze=False)
axes = axes.ravel()

for ax, neigh in zip(axes, n_neighbors_lst):
    # NOTE(review): n_components is left at the UMAP default here, while the
    # file name below uses the notebook-level `n_components` — confirm they agree.
    standard_embedding = umap.UMAP(
        random_state=42, n_neighbors=neigh, min_dist=0.0
    ).fit_transform(X_transformed)

    ax.scatter(standard_embedding[:, 0], standard_embedding[:, 1], s=0.9)
    ax.set_title("Neighbors: " + str(neigh))

# `bbox_to_anchor` is a legend option, not a savefig option, so it is not
# passed; bbox_inches="tight" already trims the surrounding white space.
plt.savefig(
    out_path + "umap_neigh_dim" + str(n_components) + ".png",
    bbox_inches="tight",
)

<a id='umap_kmeans' ></a>
### Kmeans

In [None]:
# For each elbow metric, ask `clustering_functions.kmeans_elbow` for the
# suggested number of clusters and, when one is returned, score the resulting
# k-means partition on the UMAP embedding.
kmeans_dict_red_umap = {}

# A tuple (not a set) so that the metrics are always processed, and reported,
# in the same order.
metrics_dict = ("distortion", "silhouette", "calinski_harabasz")

for metric in metrics_dict:
    kopt = clustering_functions.kmeans_elbow(reduced_data_umap, metric=metric)
    kmeans_dict_red_umap[metric] = {"kopt": kopt}

    # no k was suggested for this metric: only "kopt" (None) is recorded
    if kopt is None:
        continue

    labels, centroids = clustering_functions.kmeans(kopt, reduced_data_umap)
    kmeans_dict_red_umap[metric]["silhouette"] = metrics.silhouette_score(
        reduced_data_umap, labels
    )
    kmeans_dict_red_umap[metric][
        "calinski_harabasz"
    ] = metrics.calinski_harabasz_score(reduced_data_umap, labels)

kmeans_dict_red_umap

In [None]:
#### initialization
k = 5
method = "kmeans"

# cluster the UMAP embedding with k-means
labels, centroids = clustering_functions.kmeans(k, reduced_data_umap)

# embedding coordinates together with the assigned cluster
cluster_df_umap = reduced_data_umap.copy()
cluster_df_umap.columns = ["comp_1", "comp_2"]
cluster_df_umap["cluster"] = labels

# original (un-binned) user profiles together with the assigned cluster
all_data_clustered = users_profile.copy()
all_data_clustered["cluster"] = labels

# quality of the partition, measured on the embedding
silhouette = metrics.silhouette_score(reduced_data_umap, labels)
print("Silhouette score:", np.round(silhouette, 2))
calinski_harabasz = metrics.calinski_harabasz_score(reduced_data_umap, labels)
print("Calinski Harabasz score:", np.round(calinski_harabasz, 2))

## Save all data
csv_name = (
    "data_for_matching_trips_umap_"
    + method
    + "_dim"
    + str(n_components)
    + "_neigh"
    + str(n_neighbors)
    + ".csv"
)
all_data_clustered.to_csv(out_path + csv_name, index=False)

In [None]:
####### VISUALIZATION
# Scatter plots of the embedding coloured by cluster: marker style encodes
# gender on the left panel and age range on the right panel.

# Pair every user with its embedding coordinates. Passing
# `cluster_df_umap.index` as a merge key matches the rows of the two frames by
# position; the merge result carries that key as its first column, which
# `.iloc[:, 1:]` removes again.
data_for_plot = all_data_clustered[["userid", "cluster"]]
data_for_plot = data_for_plot.merge(
    cluster_df_umap, on=[cluster_df_umap.index, "cluster"]
).iloc[:, 1:]

# left join: users without demographics are kept
data_for_plot = data_for_plot.merge(
    users_demographics[["userid", "gender", "age_range"]], on="userid", how="left"
)

f, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=True)
f.subplots_adjust(top=0.9)
sns.despine(left=True)

# one panel per demographic attribute used as marker style
for ax, style_col in zip(axes, ["gender", "age_range"]):
    sns.scatterplot(
        x="comp_1",
        y="comp_2",
        hue="cluster",
        style=style_col,
        data=data_for_plot,
        legend="full",
        alpha=0.8,
        ax=ax,
    )

plt.tight_layout()

# `bbox_to_anchor` is a legend option, not a savefig option, so it is not passed.
plt.savefig(out_path + f"plot_{method}_cl{k}_neigh_{n_neighbors}.png")

In [None]:
# Single scatter plot of the embedding, coloured by cluster only.

# One palette colour per cluster actually present in the data, instead of a
# hard-coded 5.
n_clusters_plotted = len(set(data_for_plot["cluster"]))

plt.figure(figsize=(7, 5))
sns.despine(left=True)
sns.scatterplot(
    x="comp_1",
    y="comp_2",
    hue="cluster",
    palette=sns.color_palette("hls", n_clusters_plotted),
    data=data_for_plot,
    legend="full",
    alpha=0.8,
)
plt.xlabel("Component 1")
plt.ylabel("Component 2")

# `bbox_to_anchor` is a legend option, not a savefig option, so it is not passed.
plt.savefig(out_path + f"plot_{method}_cl{k}_neigh_{n_neighbors}_ALL.png")

<a id='umap_hier' ></a>
### hierarchical

In [None]:
# preview: first five rows of the UMAP embedding
reduced_data_umap.iloc[:5]

In [None]:
# Grid search for hierarchical clustering on the UMAP embedding: for every
# number of clusters from 2 to 15 and every linkage/affinity pair, record the
# silhouette and Calinski-Harabasz scores (rounded to 2 decimals).
linkage_lst = ["ward", "complete", "average", "single"]
affinity_lst = ["euclidean", "manhattan", "cosine"]

# all linkage/affinity pairs, except ward with manhattan and ward with cosine
excluded_pairs = {("ward", "manhattan"), ("ward", "cosine")}
all_comb = [
    pair
    for pair in itertools.product(linkage_lst, affinity_lst)
    if pair not in excluded_pairs
]

# header: number of clusters, then two score columns per pair
cols = ["n_clusters"]
for link_name, aff_name in all_comb:
    comb_name = link_name + "_" + aff_name
    cols.extend([comb_name + "_silhouette", comb_name + "_calinski_harabasz"])

rows_lst = []
# NOTE: this loop rebinds the notebook-level `k`.
for k in range(2, 16):
    row = [k]
    for link_name, aff_name in all_comb:
        labels = clustering_functions.hierarchical(
            reduced_data_umap, k, affinity=aff_name, linkage=link_name
        )
        silhouette = metrics.silhouette_score(reduced_data_umap, labels)
        row.append(np.round(silhouette, 2))
        calinski_harabasz = metrics.calinski_harabasz_score(reduced_data_umap, labels)
        row.append(np.round(calinski_harabasz, 2))

    rows_lst.append(row)

hier_df_red_umap = pd.DataFrame(rows_lst, columns=cols)
hier_df_red_umap

In [None]:
# hier_df_red_umap.iloc[:, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]].max()
# hier_df_red_umap.iloc[:, [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]].max()

<a id='final' ></a>
## FINAL hierarchical clustering with UMAP

In [None]:
### hierarchical clustering with ward_euclidean and 5 clusters
method = "hierarchical"
linkage = "ward"
affinity = "euclidean"
k = 5

labels = clustering_functions.hierarchical(
    reduced_data_umap, k, affinity=affinity, linkage=linkage
)

# add labels to the complete (binned) dataset
all_data_clustered = users_profile_classes.copy()
all_data_clustered["cluster"] = labels

# embedding coordinates together with the assigned cluster
cluster_df_umap = reduced_data_umap.copy()
cluster_df_umap.columns = ["comp_1", "comp_2"]
cluster_df_umap["cluster"] = labels

# quality of the partition, measured on the embedding
silhouette = metrics.silhouette_score(reduced_data_umap, labels)
print("Silhouette score:", np.round(silhouette, 2))
calinski_harabasz = metrics.calinski_harabasz_score(reduced_data_umap, labels)
print("Calinski Harabasz score:", np.round(calinski_harabasz, 2))

# store the labelled dataset
csv_name = (
    "data_for_matching_trips_umap_"
    + method
    + "_dim"
    + str(n_components)
    + "_neigh"
    + str(n_neighbors)
    + ".csv"
)
all_data_clustered.to_csv(out_path + csv_name, index=False)

In [None]:
### VISUALIZATION
# Scatter plots of the embedding coloured by cluster: marker style encodes
# gender on the left panel and age range on the right panel.

# Pair every user with its embedding coordinates. Passing
# `cluster_df_umap.index` as a merge key matches the rows of the two frames by
# position; the merge result carries that key as its first column, which
# `.iloc[:, 1:]` removes again.
data_for_plot = all_data_clustered[["userid", "cluster"]]
data_for_plot = data_for_plot.merge(
    cluster_df_umap, on=[cluster_df_umap.index, "cluster"]
).iloc[:, 1:]

# left join: users without demographics are kept
data_for_plot = data_for_plot.merge(
    users_demographics[["userid", "gender", "age_range"]], on="userid", how="left"
)

f, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=True)
f.subplots_adjust(top=0.9)
sns.despine(left=True)

# one panel per demographic attribute used as marker style
for ax, style_col in zip(axes, ["gender", "age_range"]):
    sns.scatterplot(
        x="comp_1",
        y="comp_2",
        hue="cluster",
        style=style_col,
        data=data_for_plot,
        legend="full",
        alpha=0.8,
        ax=ax,
    )

plt.tight_layout()

# `bbox_to_anchor` is a legend option, not a savefig option, so it is not passed.
plt.savefig(out_path + f"plot_{method}_cl{k}_neigh_{n_neighbors}.png")

In [None]:
### SUMMARY TABLES
# One table per cluster, with one row per transport category:
#   count       users of the cluster with a non-zero fit/prod/enj class for it
#   relfreq_tot count divided by the total users in the cluster
#   relfreq_tc  count divided by the total of selections of that category
#   meanP/E/F   mean prod/enj/fit class among those users
#   meanGenP/E/F mean generic prod/enj/fit class among those users
summary_cols = [
    "count",
    "relfreq_tot",
    "relfreq_tc",
    "meanP",
    "meanE",
    "meanF",
    "meanGenP",
    "meanGenE",
    "meanGenF",
]

# Iterate over the labels that actually occur, so a missing or non-contiguous
# label can never produce an empty cluster (and a division by zero).
for c in sorted(int(label) for label in all_data_clustered["cluster"].unique()):

    cl_df = all_data_clustered[all_data_clustered["cluster"] == c]
    print("Users in cluster", c, " : ", cl_df.shape[0])
    print()
    tmp = pd.DataFrame(index=transp_cat, columns=summary_cols)

    for i in transp_cat:  # for each transport category
        users_with_tc = cl_df[
            (cl_df["fit_" + i] != 0)
            | (cl_df["prod_" + i] != 0)
            | (cl_df["enj_" + i] != 0)
        ]
        n_users_tc = len(users_with_tc)

        # Take the scalar explicitly: calling int() on a one-element Series
        # is deprecated in recent pandas.
        tc_total = int(tc.loc[tc.transp_category == i, "count"].iloc[0])

        # class columns are cast to float before averaging
        mean_cols = ["prod_" + i, "enj_" + i, "fit_" + i, "genProd", "genEnj", "genFit"]
        means = [
            np.round(users_with_tc[mean_col].astype("float").mean(), 2)
            for mean_col in mean_cols
        ]

        tmp.loc[i] = [
            n_users_tc,
            np.round(n_users_tc / cl_df.shape[0], 2),
            np.round(n_users_tc / tc_total, 2),
            *means,
        ]

    tmp.to_csv(
        out_path + "summary_table_onboarding_" + method + "_cl" + str(c) + "_mean.csv"
    )

# example of summary table (the one of the last cluster processed)
tmp

In [None]:
# Read back the stored summary table of cluster 4 for the current `method`.
cluster_to_show = 4
summary_file = (
    out_path
    + "summary_table_onboarding_"
    + method
    + "_cl"
    + str(cluster_to_show)
    + "_mean.csv"
)
pd.read_csv(summary_file)

In [None]:
# Display whatever table is currently bound to `tmp`.
# NOTE(review): when the cells are run top to bottom this is presumably the
# summary table of the last cluster — confirm this cell is still needed.
tmp

### Matching with user trips

In [None]:
#### read dataset of users for trip clustering
users_profile_trips = pd.read_csv(input_path + "users_profile_trips.csv")
print(users_profile_trips.shape)

# round every column after the first one to the nearest integer
for col in users_profile_trips.columns[1:]:
    as_int = users_profile_trips[col].apply(lambda value: int(np.round(value)))
    users_profile_trips[col] = as_int

# convert the generic values into classes 0-1-2
# with right=True the classes are (-2, 32], (32, 65], (65, 100]
for generic_col in ["genFit", "genProd", "genEnj"]:
    users_profile_trips[generic_col] = pd.cut(
        users_profile_trips[generic_col],
        [-2, 32, 65, 100],
        labels=[0, 1, 2],
        right=True,
    )

### add cluster from onBoarding data
cluster_by_user = all_data_clustered[["userid", "cluster"]]
users_profile_trips = users_profile_trips.merge(cluster_by_user, on="userid")

# save for heatmaps on matching
users_profile_trips.to_csv(out_path + "users_profile_trips_matching.csv", index=False)

users_profile_trips.head()

In [None]:
# Heatmaps, one column of panels per cluster:
#   top row    - stored onboarding summary table of the cluster (last 6 columns)
#   bottom row - per-category means computed from `users_profile_trips`
ncols = 5
nrows = 2
names = [
    "Cycling and micro",
    "Private motorized",
    "Public long dist",
    "Public short dist",
    "Walking",
]
namesx = ["P", "E", "F", "GenP", "GenE", "GenF"]
n_tc = len(transp_cat)  # number of transport categories (rows of each heatmap)

fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows, figsize=(20, 10), sharex=True, sharey=True
)
axes = axes.ravel()
plt.subplots_adjust(wspace=0.05, hspace=0.1)
# dedicated axes for the single colour bar shared by all panels
cbar_ax = fig.add_axes([0.91, 0.3, 0.03, 0.4])

# --- top row: onboarding summary tables ---
for i in range(ncols):

    tmp = pd.read_csv(
        out_path + "summary_table_onboarding_hierarchical_cl" + str(i) + "_mean.csv"
    )
    df_hm = tmp.iloc[:, -6:].astype("float")

    # All panels share vmin/vmax, so one colour bar is enough: it is drawn
    # once, from the last panel of this row, into `cbar_ax` (which would
    # otherwise stay on the figure as an empty frame).
    draw_cbar = i == ncols - 1
    sns.heatmap(
        df_hm,
        ax=axes[i],
        vmin=0,
        vmax=1.9,
        yticklabels=names,
        xticklabels="",
        cbar=draw_cbar,
        cbar_ax=cbar_ax if draw_cbar else None,
    )

# --- bottom row: means observed on the trips data ---
for i in range(ncols):

    # numeric_only=True: non-numeric columns (e.g. the categorical gen*
    # classes) cannot be averaged and make mean() raise on pandas 2.x.
    tmp = (
        users_profile_trips[users_profile_trips.cluster == i]
        .mean(numeric_only=True)
        .reset_index(name="tc_mean")
    )

    # NOTE(review): assumes the averaged columns come in three consecutive
    # groups of `n_tc` values, read here as P, E, F — confirm against the CSV.
    row_lst = [
        list(tmp["tc_mean"].iloc[[c, c + n_tc, c + 2 * n_tc]]) for c in range(n_tc)
    ]

    df_hm = pd.DataFrame(row_lst, columns=["meanP", "meanE", "meanF"])
    # constant placeholder (2, i.e. above vmax) for the three generic columns
    for generic_col in ["genP", "genE", "genF"]:
        df_hm[generic_col] = [2] * n_tc

    sns.heatmap(
        df_hm,
        ax=axes[i + ncols],
        vmin=0,
        vmax=1.9,
        yticklabels=names,
        xticklabels=namesx,
        cbar=False,
    )

# plt.savefig(input_path + "heatmaps_users.png", bbox_inches="tight")

In [None]:
# Scratch expression: a list holding the value 1.9 five times.
[1.9] * 5

### Number of TC

In [None]:
### Number of Transport Categories (TC) selected by each users
# ex. cluster 0: users selected 4 or 5 TC as preferred
#
# One bar chart per cluster: x = number of transport categories selected by a
# user, y = number of users of the cluster with that many categories.
#
# NOTE(review): `spec_worthwhile` is not created in the visible part of this
# notebook; it must already hold `userid` and `transp_category` columns when
# this cell runs — confirm where it is loaded.

nrows = 2
ncols = 3
fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows, figsize=(17, 10), sharex=True, sharey=True
)

# A single loop over the k clusters; the grid position (row r, column) is
# derived from the cluster id, filling the panels row by row.
for c in range(k):
    r, col_pos = divmod(c, ncols)
    ax = axes[r][col_pos]

    cl_df = all_data_clustered[all_data_clustered["cluster"] == c]
    transp_df_clust = spec_worthwhile[["userid", "transp_category"]][
        spec_worthwhile.userid.isin(list(cl_df.userid.unique()))
    ]

    # rows per user, then number of users for each row count
    tmp = pd.DataFrame(transp_df_clust.groupby("userid").count()["transp_category"])
    tmp2 = tmp.groupby("transp_category").size().reset_index(name="count")

    ax.bar(tmp2["transp_category"], tmp2["count"])
    ax.set_title("Number of TC - cluster " + str(c), fontsize=14)
    ax.set_xticks(range(1, len(transp_cat) + 1))
    ax.tick_params(labelsize=13)

# `bbox_to_anchor` is a legend option, not a savefig option, so it is not passed.
plt.savefig(out_path + "tc_per_user_classes.png")
# plt.show()

In [None]:
### Association Table
# for each combination of transport mode, find how many users selected that combination
#
# One symmetric table per cluster is written to
# `transport_modes_associations_cl<c>.csv`; the diagonal holds the pairing of a
# category with itself.
#
# NOTE(review): `spec_worthwhile` is not created in the visible part of this
# notebook — confirm where it is loaded.

# unordered pairs of categories, including each category with itself
all_comb_tr = [
    (transp_cat[i], transp_cat[j])
    for i in range(len(transp_cat))
    for j in range(i, len(transp_cat))
]

for c in range(k):

    # the cluster labels are stored in the `cluster` column
    cl_df = all_data_clustered[all_data_clustered["cluster"] == c]
    transp_df_clust = spec_worthwhile[["userid", "transp_category"]][
        spec_worthwhile.userid.isin(list(cl_df.userid.unique()))
    ]

    # user x category table holding the number of rows of each pair
    basket = (
        transp_df_clust.groupby(["userid", "transp_category"])
        .size()
        .unstack(fill_value=0)
    )
    # a category nobody in the cluster selected still needs a (zero) column
    for tc_name in transp_cat:
        if tc_name not in basket.columns:
            basket[tc_name] = 0

    cont_table = pd.DataFrame(columns=transp_cat, index=transp_cat)
    for first_tc, second_tc in all_comb_tr:
        # users having exactly one row for both categories
        n_users = int(((basket[first_tc] == 1) & (basket[second_tc] == 1)).sum())
        cont_table.loc[first_tc, second_tc] = n_users
        cont_table.loc[second_tc, first_tc] = n_users

    # save once per cluster, after the table is complete
    cont_table.to_csv(out_path + "transport_modes_associations_cl" + str(c) + ".csv")