# Cluster Matching

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from scipy import optimize
from scipy import spatial
import copy
import plotly.graph_objects as go
import scipy
import seaborn as sns

from IPython.core.display import display, HTML

# Widen the notebook container to 95% of the page. The closing tag is written
# in full ("</style>") so that the injected markup is well-formed HTML.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
## input locations
# input_path = "../../umap-data/"

# CSV files with the cluster assignments; they are joined to `input_path`
# when the data is loaded
cluster_onboard_file = "results_onBoarding/data_for_matching_onBoarding_umap_hierarchical_dim2_neigh10.csv"
cluster_trip_file = "results_trips/data_for_matching_trips_umap_hierarchical_dim2_neigh60.csv"

# root folder of the clustering run and its two result sub-folders; every path
# ends with "/" because file names are appended by plain string concatenation
input_path = "../../2019-12-16.out/clustering/"
input_path_tr = "../../2019-12-16.out/clustering/results_trips/"
input_path_ob = "../../2019-12-16.out/clustering/results_onBoarding/"

In [None]:
# Columns read from both cluster files: the user identifier, 18 feature
# columns and the cluster label.
columns = [
    "userid",
    "enj_cycling_emerging_micromobility",
    "enj_private_motorized",
    "enj_public_transp_long_dist",
    "enj_public_transp_short_dist",
    "enj_walking",
    "fit_cycling_emerging_micromobility",
    "fit_private_motorized",
    "fit_public_transp_long_dist",
    "fit_public_transp_short_dist",
    "fit_walking",
    "prod_cycling_emerging_micromobility",
    "prod_private_motorized",
    "prod_public_transp_long_dist",
    "prod_public_transp_short_dist",
    "prod_walking",
    "genFit",
    "genProd",
    "genEnj",
    "cluster",
]

# Target dtypes: the user identifier is kept as text, every other column is
# cast to integer.
coltypes = {name: ("str" if name == "userid" else "int") for name in columns}

# onboarding clusters
cluster_onboard_path = os.path.join(input_path, cluster_onboard_file)
onboard_df = pd.read_csv(cluster_onboard_path, usecols=columns).astype(coltypes)

# trip clusters
cluster_trip_path = os.path.join(input_path, cluster_trip_file)
trip_df = pd.read_csv(cluster_trip_path, usecols=columns).astype(coltypes)

In [None]:
# preview the first three rows of the onboarding cluster table
onboard_df.head(3)

In [None]:
# preview the first three rows of the trip cluster table
trip_df.head(3)

In [None]:
# Join the two tables on the user identifier (pandas' default inner join);
# overlapping column names get the "_ob" / "_tr" suffix.
merge_df = onboard_df.merge(trip_df, on="userid", suffixes=("_ob", "_tr"))
print(merge_df.shape)
merge_df.head(5)

In [None]:
### check data
# number of merged rows per onboarding cluster
merge_df.groupby("cluster_ob").size()

In [None]:
# number of merged rows per trip cluster
merge_df.groupby("cluster_tr").size()

In [None]:
# Keep only the two cluster labels of every user. The explicit copy makes
# common_df an independent frame, so assigning columns to it later neither
# raises pandas' SettingWithCopyWarning nor depends on merge_df.
common_df = merge_df[["userid", "cluster_ob", "cluster_tr"]].copy()
# take the number of clusters (labels are assumed to be the integers 0..k-1)
ncluster_tr = common_df["cluster_tr"].max() + 1
ncluster_ob = common_df["cluster_ob"].max() + 1

common_df.head()

### Matching with number of users

In [None]:
# Negated user counts for every (onboarding cluster, trip cluster) pair.
# The counts are stored with a minus sign because the matrices are used as
# cost matrices for linear_sum_assignment, which minimises the total cost.
intersection = np.zeros((ncluster_ob, ncluster_tr))
union = np.zeros((ncluster_ob, ncluster_tr))

for clob in range(ncluster_ob):
    # Membership masks are evaluated once per cluster instead of running a
    # row-wise apply for every pair; common_df itself is left untouched.
    in_ob = common_df["cluster_ob"] == clob
    for cltr in range(ncluster_tr):
        in_tr = common_df["cluster_tr"] == cltr

        # intersection: users that belong to both clusters
        intersection[clob, cltr] = -(in_ob & in_tr).sum()

        # union: users that belong to at least one of the two clusters
        union[clob, cltr] = -(in_ob | in_tr).sum()

In [None]:
# negated number of users shared by each (onboarding, trip) cluster pair
intersection

In [None]:
# negated number of users in either cluster of each (onboarding, trip) pair
union

In [None]:
# One-to-one assignment on the negated overlap counts: minimising the cost
# maximises the number of users shared by the matched clusters.
row_ind, col_ind = optimize.linear_sum_assignment(intersection)
# onboarding cluster -> matched trip cluster
ob_match_tr_nusers = {ob: tr for ob, tr in zip(row_ind, col_ind)}

print(row_ind, col_ind)
print(ob_match_tr_nusers)

### Matching with symmetric difference

In [None]:
# union and intersection both hold negated counts, so this evaluates to
# (size of union) - (size of intersection), i.e. the number of users in the
# symmetric difference of the two clusters: a non-negative cost to minimise.
symmdiff = -(union - intersection)

In [None]:
# symmetric-difference cost matrix (rows: onboarding clusters, columns: trip clusters)
symmdiff

In [None]:
# One-to-one assignment that minimises the total symmetric difference.
row_ind, col_ind = optimize.linear_sum_assignment(symmdiff)
# onboarding cluster -> matched trip cluster
ob_match_tr_symmdiff = {ob: tr for ob, tr in zip(row_ind, col_ind)}

print(row_ind, col_ind)
print(ob_match_tr_symmdiff)

### Matching with Jaccard

In [None]:
# Negated Jaccard index for every cluster pair. intersection and union both
# hold negated counts, so -intersection / union evaluates to
# -(shared users) / (users in either cluster); minimising it maximises the
# Jaccard similarity.
# Pairs whose union is empty get a cost of 0 instead of the NaN produced by
# 0 / 0, because linear_sum_assignment rejects cost matrices containing NaN.
jaccard = np.divide(
    -intersection,
    union,
    out=np.zeros_like(intersection),
    where=union != 0,
)
jaccard

In [None]:
# One-to-one assignment that minimises the negated Jaccard index.
row_ind, col_ind = scipy.optimize.linear_sum_assignment(jaccard)
# onboarding cluster -> matched trip cluster
ob_match_tr_jaccard = dict(zip(row_ind, col_ind))

print(row_ind, col_ind)
# print the Jaccard-based matching computed in this cell (the matching by
# number of users used to be printed here by mistake)
print(ob_match_tr_jaccard)

### Matching with distances

In [None]:
# find centroids for each cluster in ob and tr datasets
# for each cluster the centroid is a vector of 18 elements
# in which each element is the mean of the values of users in that cluster
# numeric_only=True leaves out the string column "userid", which cannot be
# averaged (recent pandas versions raise instead of silently dropping it)
group_ob = onboard_df.groupby("cluster", as_index=False).mean(numeric_only=True)
group_tr = trip_df.groupby("cluster", as_index=False).mean(numeric_only=True)
group_tr

In [None]:
## initialize matrix of distances
# distances[clob][cltr] is the Euclidean distance between the centroid of
# onboarding cluster clob and the centroid of trip cluster cltr
distances = np.zeros((ncluster_ob, ncluster_tr))

# take useful cols: every loaded column except the identifier and the label
finalcols = [col for col in columns if col not in ("userid", "cluster")]

for clob in range(ncluster_ob):
    # centroid of the onboarding cluster, as a plain list of coordinates
    cluster_ob_coord = group_ob.loc[
        group_ob["cluster"] == clob, finalcols
    ].values.tolist()[0]

    for cltr in range(ncluster_tr):
        # The mask must be built from group_tr itself: filtering group_tr with
        # a mask taken from group_ob only works while both frames happen to
        # list the same clusters in the same row order.
        cluster_tr_coord = group_tr.loc[
            group_tr["cluster"] == cltr, finalcols
        ].values.tolist()[0]

        dist = scipy.spatial.distance.euclidean(cluster_ob_coord, cluster_tr_coord)
        distances[clob][cltr] = dist
        print("({}, {}): {}".format(clob, cltr, dist))

In [None]:
# centroid distance matrix (rows: onboarding clusters, columns: trip clusters)
distances

In [None]:
# NOTE(review): the next cell repeats exactly this call before building the
# matching dictionary, so this cell looks redundant.
row_ind, col_ind = scipy.optimize.linear_sum_assignment(distances)

In [None]:
# One-to-one assignment that minimises the total centroid distance.
row_ind, col_ind = optimize.linear_sum_assignment(distances)
# onboarding cluster -> matched trip cluster
ob_match_tr_distances = {ob: tr for ob, tr in zip(row_ind, col_ind)}

print(row_ind, col_ind)
print(ob_match_tr_distances)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# https://medium.com/plotly/4-interactive-sankey-diagram-made-in-python-3057b9ee8616

# number of users for every (onboarding cluster, trip cluster) pair that
# actually occurs in the data
tmp = common_df.groupby(["cluster_ob", "cluster_tr"]).size().reset_index(name="values")

# node labels and colours: onboarding clusters first, trip clusters after
ob_labels = ["B1", "B2", "B3", "B4", "B5"]
tr_labels = ["T1", "T2", "T3", "T4", "T5"]
ob_colors = ["dodgerblue", "darkorange", "forestgreen", "purple", "firebrick"]
tr_colors = ["forestgreen", "purple", "darkorange", "dodgerblue", "firebrick"]

# light shade of every onboarding colour, used for the non-highlighted links
light_colors = ["paleturquoise", "navajowhite", "lightgreen", "thistle", "mistyrose"]
# trip cluster whose link is drawn in the full colour, per onboarding cluster
# NOTE(review): hard-coded pairs (B1-T4, B2-T3, B3-T1, B4-T2, B5-T5);
# presumably they mirror the matching computed above -- confirm.
highlight_tr = {0: 3, 1: 2, 2: 0, 3: 1, 4: 4}

# Colour each link from its own (source, target) pair rather than from a
# positional list of 25 colours, which only fits when every pair is present.
tmp["color"] = [
    ob_colors[ob] if highlight_tr[ob] == tr else light_colors[ob]
    for ob, tr in zip(tmp["cluster_ob"], tmp["cluster_tr"])
]

# trip nodes are listed after the onboarding nodes, so shift their indices
tmp["cluster_tr"] = tmp["cluster_tr"] + len(ob_labels)


fig = go.Figure(
    data=[
        go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=ob_labels + tr_labels,
                color=ob_colors + tr_colors,
            ),
            link=dict(
                source=tmp.cluster_ob,  # indices correspond to labels, eg A1, A2, A2, B1, ...
                target=tmp.cluster_tr,
                value=tmp["values"],
                color=tmp["color"],
            ),
        )
    ]
)

fig.update_layout(height=500, width=500)

fig.show()
# Save the plotly figure itself: plt.savefig would write the current (empty)
# matplotlib figure, not this diagram. Static export needs plotly's image
# export engine (e.g. the kaleido package) to be installed.
fig.write_image(input_path + "alluvial.png")

In [None]:
# folder the figures are written to
input_path

In [None]:
# data_trace = dict(
#     type="sankey",
#     domain=dict(x=[0, 1], y=[0, 1]),
#     orientation="h",
#     valueformat=".0f",
#     node=dict(
#         pad=10,
#         thickness=30,
#         line=dict(color="black", width=0),
#         label=scottish_df["Node, Label"].dropna(axis=0, how="any"),
#         color=scottish_df["Color"],
#     ),
#     link=dict(
#         source=scottish_df["Source"].dropna(axis=0, how="any"),
#         target=scottish_df["Target"].dropna(axis=0, how="any"),
#         value=scottish_df["Value"].dropna(axis=0, how="any"),
#         color=scottish_df["Link Color"].dropna(axis=0, how="any"),
#     ),
# )

# layout = dict(
#     title="Scottish Referendum Voters who now want Independence",
#     height=772,
#     font=dict(size=10),
# )

# fig = dict(data=[data_trace], layout=layout)
# py.iplot(fig, validate=False)

### Visualization with centroids and UMAP

In [None]:
#### visualization with centroids
# Stack the onboarding centroids on top of the trip centroids; ignore_index
# gives the rows a fresh 0..n-1 index (the former keys="cluster" argument only
# built a throw-away outer index level from the characters of the string).
centroids_df = pd.concat([group_ob, group_tr], ignore_index=True)
centroids_df["cluster"] = centroids_df.index
print(centroids_df.shape)

# coordinates only: the row identifier is not a feature
X_transformed = centroids_df.drop("cluster", axis=1)

# UMAP
import umap.umap_ as umap

n_components = 2
# value actually passed to UMAP; only a handful of centroids is embedded
n_neighbors = 3
umap_dr = umap.UMAP(
    min_dist=0.0,
    n_components=n_components,
    random_state=42,
    n_neighbors=n_neighbors,
)

# apply to data
reduced_data_umap = pd.DataFrame(
    umap_dr.fit_transform(X_transformed), columns=["comp_1", "comp_2"]
)
# reduced_data_umap['cluster'] = reduced_data_umap.index
# NOTE(review): hard-coded labels; the first five rows are the onboarding
# clusters, the last five presumably give each trip cluster the label of its
# matched onboarding cluster -- confirm against the matching above.
reduced_data_umap["cluster"] = [0, 1, 2, 3, 4, 2, 3, 1, 0, 4]


sns.scatterplot(
    x="comp_1",
    y="comp_2",
    hue="cluster",
    palette=sns.color_palette("hls", 5),
    data=reduced_data_umap,
    legend="full",
)

### Heatmaps

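Three rows of heatmaps, one column per matched cluster pair:

- Trip profiles of the matched trip clusters
- Onboarding profiles of the matching onboarding clusters
- Trip information averaged over the users of each onboarding profile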

In [None]:
# show the folders the summary tables are read from / figures are written to
print("input_path_ob:", input_path_ob)
print("input_path:", input_path)

In [None]:
## read trips info of onboarding profiles
users_profile_trips = pd.read_csv(input_path_ob + "users_profile_trips_matching.csv")

ncols = 5
nrows = 3
names = [
    "Cycling and micro",
    "Private motorized",
    "Public long dist",
    "Public short dist",
    "Walking",
]
namesx = ["P", "E", "F", "GenP", "GenE", "GenF"]


def _draw_profile_heatmap(df_hm, ax, yticklabels, xticklabels):
    """Draw ``df_hm`` on ``ax`` with the colour scale shared by all panels.

    Returns whatever ``sns.heatmap`` returns (the axes it drew on).
    """
    return sns.heatmap(
        df_hm,
        ax=ax,
        vmin=0,
        vmax=1.9,
        yticklabels=yticklabels,
        xticklabels=xticklabels,
        cmap="coolwarm",
        cbar=False,
    )


fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows, figsize=(22, 10), sharex=True, sharey=True
)
plt.subplots_adjust(wspace=0.05, hspace=0.1)
axes = axes.ravel()
# cbar_ax = fig.add_axes([.91, .3, .03, .4])

###### TRIP PROFILES (first row)
for i in range(ncols):

    # trip cluster matched to onboarding cluster i by centroid distance
    cl = ob_match_tr_distances[i]
    tmp = pd.read_csv(
        input_path_tr + "summary_table_trips_hierarchical_cl" + str(cl) + "_mean.csv"
    )
    # the last six columns of the summary table are the ones plotted
    df_hm = tmp.iloc[:, -6:].astype("float")

    _draw_profile_heatmap(df_hm, axes[i], names, namesx)

    # Column header built from the pair that is actually drawn, so the title
    # cannot disagree with the matching (labels are 1-based).
    # See: https://stackoverflow.com/a/25814386/2377454
    axes[i].set_title("(B{}, T{})".format(i + 1, cl + 1), size="xx-large")


###### ONBOARDING PROFILES (second row)
for i in range(ncols):

    tmp = pd.read_csv(
        input_path_ob
        + "summary_table_onboarding_hierarchical_cl"
        + str(i)
        + "_mean.csv"
    )
    df_hm = tmp.iloc[:, -6:].astype("float")

    _draw_profile_heatmap(df_hm, axes[i + ncols], names, "")


###### USER AVERAGES (third row): trips info of the users of each onboarding profile
for i in range(ncols):

    tmp = (
        users_profile_trips[users_profile_trips.cluster == i]
        .mean()
        .reset_index(name="tc_mean")
    )
    # NOTE(review): positions c, c + 5 and c + 10 are read as the three values
    # of transport mode c -- this assumes the column order of
    # users_profile_trips_matching.csv; confirm.
    row_lst = []
    for c in range(len(names)):

        row = list(tmp["tc_mean"].iloc[[c, c + 5, c + 10]])
        row_lst.append(row)

    df_hm = pd.DataFrame(row_lst)
    df_hm.columns = ["meanP", "meanE", "meanF"]
    # the three general columns are filled with the constant 1
    df_hm["genP"] = [1] * len(names)
    df_hm["genE"] = [1] * len(names)
    df_hm["genF"] = [1] * len(names)

    im = _draw_profile_heatmap(df_hm, axes[i + 2 * ncols], names, namesx)

# row headers on the first panel of every row
axes[0].set_ylabel("Evaluation", rotation=90, size="xx-large")
axes[ncols].set_ylabel("Onboarding", rotation=90, size="xx-large")
axes[2 * ncols].set_ylabel("User average", rotation=90, size="xx-large")

# leave room on the right for the single colour bar shared by all panels
fig.tight_layout(rect=[0, 0, 0.9, 1])
mappable = im.get_children()[0]
plt.colorbar(mappable, orientation="vertical")
# bbox_to_anchor is a legend option, not a savefig one (recent matplotlib
# versions reject it), so only bbox_inches is passed
plt.savefig(
    input_path + "heatmaps_trips_onboarding_users.png",
    bbox_inches="tight",
)

In [None]:
ncols = 5
nrows = 2
names = [
    "Cycling and micro",
    "Private motorized",
    "Public long dist",
    "Public short dist",
    "Walking",
]
namesx = ["P", "E", "F", "GenP", "GenE", "GenF"]

fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows, figsize=(20, 10), sharex=True, sharey=True
)
axes = axes.ravel()

# first row: onboarding profile of every onboarding cluster
for i in range(ncols):

    tmp = pd.read_csv(
        input_path_ob
        + "summary_table_onboarding_hierarchical_cl"
        + str(i)
        + "_mean.csv"
    )
    # the last six columns of the summary table are the ones plotted
    df_hm = tmp.iloc[:, -6:].astype("float")

    sns.heatmap(
        df_hm,
        ax=axes[i],
        vmin=0,
        vmax=1.9,
        yticklabels=names,
        xticklabels="",
        # colour bar only next to the last panel of the row
        cbar=(i == ncols - 1),
    )


# second row: profile of the trip cluster matched to each onboarding cluster
for i in range(ncols):

    cl = ob_match_tr_distances[i]
    tmp = pd.read_csv(
        input_path_tr + "summary_table_trips_hierarchical_cl" + str(cl) + "_mean.csv"
    )
    df_hm = tmp.iloc[:, -6:].astype("float")

    sns.heatmap(
        df_hm,
        ax=axes[i + ncols],
        vmin=0,
        vmax=1.9,
        yticklabels=names,
        xticklabels=namesx,
        # colour bar only next to the last panel of the row
        cbar=(i == ncols - 1),
    )

    # sns.heatmap(df_hm, ax=axes[i+5], vmin=0, vmax=1.9)

    # Column title built from the pair that is actually drawn, so it cannot
    # disagree with the matching (labels are 1-based).
    axes[i].set_title("B{} - T{}".format(i + 1, cl + 1))


# bbox_to_anchor is a legend option, not a savefig one (recent matplotlib
# versions reject it), so only bbox_inches is passed
plt.savefig(input_path + "heatmaps.png", bbox_inches="tight")

### TC plot

In [None]:
i = 0

# summary table of onboarding cluster i
tmp_ob = pd.read_csv(
    input_path_ob + "summary_table_onboarding_hierarchical_cl" + str(i) + "_mean.csv"
)

# summary table of the trip cluster matched to it by centroid distance
tmp_tr = pd.read_csv(
    input_path_tr
    + "summary_table_trips_hierarchical_cl"
    + str(ob_match_tr_distances[i])
    + "_mean.csv"
)

# tmp ends up bound to the trips table, exactly as the chained assignments
# in the previous form of this cell left it
tmp = tmp_tr