In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as sst
import os
import scipy.stats as sst
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import spatial

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression


from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import rand_score
from sklearn.metrics import davies_bouldin_score

from scipy.stats import pearsonr
from tqdm.notebook import tqdm
import time

import pickle

%matplotlib inline

In [None]:
# Directory one level above the current working directory; all data paths are
# resolved relative to it.
parent_dir = os.path.dirname(os.getcwd())

In [None]:
# LMS Zone data.  The path is assembled with os.path.join so it resolves on
# any operating system, not only with Windows '\\' separators.
zones = gpd.read_file(os.path.join(parent_dir, 'Data', 'New', 'lms_zone_du_new.shp'))

In [None]:
def _read_zone_table(filename):
    """Read one CSV from <parent_dir>/Data/New, using its first column as the index.

    The path is built with os.path.join so it works on every operating
    system instead of relying on Windows '\\' separators.
    """
    return pd.read_csv(os.path.join(parent_dir, 'Data', 'New', filename), index_col=0)


# Density
dens = _read_zone_table('lms_zone_density.csv')

# Diversity
landuse = _read_zone_table('lms_diversity_lu.csv')
hist = _read_zone_table('lms_zone_historical.csv')

# Design
design = _read_zone_table('lms_zone_design.csv')

# Destination accessibility
dest = _read_zone_table('lms_zone_dest_access.csv')

# Distance to transit
transit = _read_zone_table('lms_zone_transit.csv')

# Demography
demo = _read_zone_table('zone_demographics.csv')

In [None]:
# OViN table; path built with os.path.join so it is not tied to Windows separators.
ovin = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'Ovin_final.csv'), index_col=0)

In [None]:
## Modal split travel behaviour
# Paths are built with os.path.join so they resolve on any operating system.
ovin_tb = pd.read_csv(
    os.path.join(parent_dir, 'Data', 'New', 'lms_zone_ovin_travel_behaviour_newF.csv'),
    index_col=0)
lms_tb = pd.read_csv(
    os.path.join(parent_dir, 'Data', 'New', 'lms_zone_lms_modal_split.csv'),
    index_col=0)

In [None]:
# Work on a copy of positional columns 1..7 of the LMS modal-split table.
lms_tb2 = lms_tb.iloc[:, 1:8].copy()
# Overwrite positional column 3 with the row-wise sum of columns 3 and 4 ...
lms_tb2.iloc[:, 3] = lms_tb2.iloc[:, 3:5].sum(axis=1)
# ... then drop 'Tram/Metro_o', leaving six columns.
# NOTE(review): the merge is positional while the drop is by name; presumably
# 'Tram/Metro_o' is positional column 4 — confirm against the CSV header.
lms_tb2 = lms_tb2.drop(columns='Tram/Metro_o')

In [None]:
# Absolute LMS modal split; path built with os.path.join so it is not tied to
# Windows separators.
lms_orig = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'lms_modal_split_orig_abs.csv'), index_col=0)

In [None]:
# Row-wise sum over positional columns 1..7 of the absolute LMS table; used
# as the per-zone weight in modal_split_lms.
lms_tot = lms_orig.iloc[:, 1:8].sum(axis=1) # Total trips for each zone

In [None]:
# Copy of the LMS shares with the per-zone trip total attached as 'Factor'
# (aligned on the index).
lms_tb3 = lms_tb2.assign(Factor=lms_tot)

## Clustering

The goal is to create a set of clusters for which the variance in OViN travel behaviour between the clusters is as large as possible.

In [None]:
def scale_data(features, method='scale'):
    """
    Rescale every feature column with a scikit-learn scaler.

    Parameters:
    features: 2-D array-like (samples x features) containing all the features
    method: the type of data-scaling, string; 'scale' uses MinMaxScaler,
        'standard' uses StandardScaler

    Returns:
    The transformed data as a numpy array

    Raises:
    ValueError: if method is neither 'scale' nor 'standard'
    """
    # Validate up front: returning an error string (as before) let a typo
    # propagate silently until the "array" was indexed much later.
    if method not in ('scale', 'standard'):
        raise ValueError(f"Not a valid scaler: {method!r} (expected 'scale' or 'standard')")

    scaler = MinMaxScaler() if method == 'scale' else StandardScaler()

    return scaler.fit_transform(features)

In [None]:
# NOTE(review): `ids` is not referenced by any other cell visible in this
# notebook; the helper functions hard-code the same 1406 in their `idx`
# defaults instead — confirm whether they should share this value.
ids = np.arange(1406) # Select which zones to use

Create list with travel behaviour OViN

In [None]:
# One Series per mode (positional columns 1..6 of ovin_tb) with missing
# values replaced by 0.  fillna returns a new Series, so ovin_tb itself is
# left untouched.
ovin_list = [ovin_tb.iloc[:, i + 1].fillna(0) for i in range(6)]

In [None]:
# Stack the six mode Series as columns (zones x modes) and min-max scale each column.
data_ovin = scale_data(np.column_stack(ovin_list))

Create list with all variables

In [None]:
# Collect every variable (all columns except the first) from the six
# D-variable tables, in this fixed order, together with its column name.
# Later cells address the variables by their position in these two lists.
d_var = []
dvar_labels = []

for frame in (dens, landuse, hist, design, dest, transit):
    for col in range(1, len(frame.columns)):
        d_var.append(frame.iloc[:, col])
        dvar_labels.append(frame.columns[col])

In [None]:
# Derived variable: row-wise minimum of the metro and tram distances.
d_var.append(transit[['Distance_metro', 'Distance_tram']].min(axis=1))
dvar_labels.append('Distance_TM')

In [None]:
# Derived variable: row-wise sum of the 'Nature' and 'Agricultural' land-use columns.
d_var.append(landuse[['Nature', 'Agricultural']].sum(axis=1))
dvar_labels.append('Nature_Agri')

In [None]:
# Derived variable: row-wise sum of the tram and metro stop counts.
d_var.append(transit[['Tram_stops', 'Metro_stops']].sum(axis=1))
dvar_labels.append('TM_stops')

In [None]:
len(d_var)

Deal with missing values

In [None]:
# Report the number of missing values per variable and fill sparse gaps
# (fewer than 50) with the variable's mean.  Iterating over len(d_var)
# instead of a hard-coded 48 keeps the last variable from being skipped.
for i in range(len(d_var)):
    missing = d_var[i] != d_var[i]  # NaN is the only value that differs from itself
    n_missing = int(missing.sum())

    if n_missing > 0:
        print(f'There are {n_missing} missing values in {i}:{dvar_labels[i]}')

    # Variables with 50 or more gaps are left as they are here.
    if 0 < n_missing < 50:
        d_var[i][missing] = d_var[i].mean()

In [None]:
# Explicit fill values for four variables addressed by position in d_var:
# NaN -> 5 for 36 and 37, NaN -> 0 for 40 and 41.
# presumably 36/37 are metro/tram distances and 40/41 stop counts — TODO
# confirm against dvar_labels.
# NOTE(review): the markdown note after these cells speaks of a fill distance
# of 10 km, but the value written here is 5 — confirm the intended value/unit.
d_var[36][d_var[36] != d_var[36]] = 5
d_var[37][d_var[37] != d_var[37]] = 5

d_var[40][d_var[40] != d_var[40]] = 0
d_var[41][d_var[41] != d_var[41]] = 0

In [None]:
# Third-from-last entry of d_var: 'Distance_TM', given that the three derived
# variables were appended last.  Its NaN values are set to 5.
d_var[-3][d_var[-3] != d_var[-3]] = 5

For now, use the mean value for all categories, except for the metro and tram related variables. Give those a distance of 10 km. This is double the max 'real' distance of 5 km.

In [None]:
# Turn the list of variables into rows of per-zone values and min-max scale
# every column (scale_data's default method).  zip pairs values by position,
# not by index label, and stops at the shortest Series.
data = scale_data(list(zip(*d_var)))

In [None]:
data[0]

## Create functions

In [None]:
def cluster(n, data):
    """Group the rows of data into n clusters with Ward-linkage agglomerative clustering.

    Parameters:
    n: number of clusters to form
    data: array-like of shape (samples, features)

    Returns:
    Array with one integer cluster label per row of data
    """
    model = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='ward')
    return model.fit_predict(data)

In [None]:
def reindex_df(df, weight_col):
    """Repeat every row of df as many times as its value in weight_col.

    The result has one row per count per sample and a fresh 0..N-1 index.
    The frame passed in by the caller is not modified.
    """
    repeated_labels = df.index.repeat(df[weight_col])
    expanded = df.reindex(repeated_labels)
    return expanded.reset_index(drop=True)

In [None]:
def modal_split_ovin(n, labels_cluster, idx=np.arange(1406)):
    """Calculate modal split for different clusters based on OViN data.

    Reads the module-level ``ovin_tb`` table: positional columns 1..6 are
    used as the six mode values and the third-to-last column as the per-zone
    weight (presumably the number of observed trips — TODO confirm).

    Parameters:
    n: number of clusters; cluster labels 0..n-1 are evaluated
    labels_cluster: array with one cluster label per zone
    idx: positional indices of the zones to include

    Returns:
    tb_list: (n, 6) array, weighted mean per cluster and mode
        (0 where it could not be computed)
    std_list: (n, 6) array, square root of the weighted variance per cluster
        and mode (left at 0 when the cluster's total weight is 0)
    """
    tb_list = np.zeros((n, 6))
    std_list = np.zeros((n, 6))

    for i in range(n):
        members = ovin_tb.iloc[idx][labels_cluster[idx] == i].copy()
        tot = members.iloc[:, -3]

        # Weighted mean per mode; computed before the NaN fill below.
        tb_list[i] = np.array(members.iloc[:, 1:7].multiply(tot, axis='index').sum() / tot.sum())

        # np.cov cannot handle NaN observations, so blank them out first.
        members[members.isnull()] = 0

        # The weight check does not depend on the mode, so it is made once
        # per cluster rather than once per mode.
        if tot.sum() != 0:
            for m in range(6):
                std_list[i, m] = np.sqrt(np.cov(members.iloc[:, 1 + m], aweights=tot))

    # A cluster without weight yields NaN means (0 / 0); report those as 0.
    tb_list[tb_list != tb_list] = 0

    return tb_list, std_list

In [None]:
def modal_split_lms(n, labels_cluster, idx=np.arange(1406)):
    """Per-cluster modal split from the LMS data.

    Uses the module-level ``lms_tb2`` (six mode columns) weighted by the
    module-level ``lms_tot`` (per-zone total).

    Parameters:
    n: number of clusters; cluster labels 0..n-1 are evaluated
    labels_cluster: array with one cluster label per zone
    idx: indices of the zones to include

    Returns:
    (n, 6) array of weighted means and (n, 6) array with the square root of
    the weighted variance, both per cluster and mode
    """
    shares = np.zeros((n, 6))
    spreads = np.zeros((n, 6))

    for cluster_id in range(n):
        in_cluster = labels_cluster[idx] == cluster_id
        zone_shares = lms_tb2.iloc[idx][in_cluster].copy()
        pop = lms_tot[idx][in_cluster]

        # Weighted mean per mode, taken before missing values are zeroed.
        weighted_sum = zone_shares.multiply(pop, axis='index').sum()
        shares[cluster_id] = np.array(weighted_sum / pop.sum())

        zone_shares[zone_shares.isnull()] = 0

        if pop.sum() != 0:
            for mode in range(6):
                spreads[cluster_id, mode] = np.sqrt(np.cov(zone_shares.iloc[:, mode], aweights=pop))

    # NaN means (zero total weight) are reported as 0.
    shares[shares != shares] = 0

    return shares, spreads

In [None]:
def average_variance(tb_list):
    """Mean over the first six columns of tb_list of each column's variance across rows.

    With tb_list holding one row per cluster and one column per mode, this is
    the average between-cluster variance of the travel behaviour.
    """
    per_mode_var = np.array([tb_list[:, mode].var() for mode in range(6)])
    return per_mode_var.mean()

In [None]:
def corr_matrix(idx, size=(10, 8), method='pearson'):
    """Plot the correlation matrix of the selected columns of the module-level ``data``.

    Cells whose Pearson p-value exceeds 0.05 are blanked in the coloured
    matrix and drawn in grey instead.

    Parameters:
    idx: array of column positions in data / dvar_labels to include
    size: figure size passed to plt.figure
    method: correlation method handed to DataFrame.corr for the coefficients
        (the p-values always come from pearsonr)
    """
    selected = pd.DataFrame(data[:, idx])
    tick_pos = np.arange(len(idx))
    tick_labels = np.array(dvar_labels)[idx]

    corr = selected.corr(method=method)
    # Subtracting the identity lowers the diagonal entries by 1 (pandas
    # reports 1 there for a callable method), so self-correlations are never
    # flagged as non-significant.
    pval = selected.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*corr.shape)

    not_significant = pval > 0.05
    corr[not_significant] = np.nan

    # Keep a layer that is 1 on the non-significant cells and NaN elsewhere.
    pval[not_significant] = 1
    pval = pval[pval > 0.05]

    plt.figure(figsize=size)
    plt.imshow(pval, origin='lower', cmap='Greys', vmax=1, vmin=-1)
    plt.imshow(corr, origin='lower', cmap='RdBu', vmax=1, vmin=-1)
    cb = plt.colorbar()
    cb.set_label('Pearson correlation coefficient [-]')
    plt.title('Correlation matrix based on pearson correlation coefficient')
    plt.xticks(tick_pos, labels=tick_labels, rotation=90)
    plt.yticks(tick_pos, labels=tick_labels)

In [None]:
def scatterplots(idx, size=(7, 7)):
    """Draw the upper-triangle grid of pairwise scatter plots.

    Parameters:
    idx: pair [start, stop]; the variables at positions start .. stop-1 of
        the module-level ``data`` / ``dvar_labels`` are plotted against each other
    size: figure size passed to plt.figure
    """
    plt.figure(figsize=size)

    first = idx[0]
    n_vars = len(np.arange(first, idx[1]))
    grid = (n_vars - 1, n_vars - 1)

    for row in range(n_vars - 1):
        # Only pairs above the diagonal are drawn.
        for col in range(row + 1, n_vars):
            ax = plt.subplot2grid(grid, (row, col - 1))

            if row == 0:
                ax.set_title(f'{dvar_labels[col + first]}')

            # Only the first panel of each row keeps its tick labels and
            # carries the row's variable name.
            if col == row + 1:
                ax.set_ylabel(f'{dvar_labels[row + first]}')
            else:
                ax.xaxis.set_ticklabels([])
                ax.yaxis.set_ticklabels([])

            plt.scatter(data[:, row + first], data[:, col + first], s=0.1)

In [None]:
def sort_data(n, labels_cluster, tb_list, lms_list, std_tb_list, std_lms_list, idx):
    """Order all per-cluster results by descending value of the first column of tb_list.

    Parameters:
    n: number of clusters
    labels_cluster: array with one cluster label (0..n-1) per zone
    tb_list, lms_list: (n, 6) per-cluster arrays (OViN and LMS)
    std_tb_list, std_lms_list: matching per-cluster spread arrays
    idx: positions of the variables in the module-level ``data`` to summarise

    Returns (all in the new cluster order):
    sorted tb_list, sorted lms_list, the original cluster labels in sorted
    order, per-variable cluster means of data (variables x clusters), the
    cluster sizes, sorted std_tb_list, sorted std_lms_list
    """
    order = np.argsort(-tb_list[:, 0])

    # Mean of every selected variable within every cluster.
    n_vars = len(data[0, idx])
    mean_be = np.zeros((n_vars, n))
    for v in range(n_vars):
        column = data[:, idx[v]]
        for c in range(n):
            mean_be[v, c] = column[labels_cluster == c].mean()

    # Number of zones per cluster.
    cluster_size = np.zeros(n)
    for c in range(n):
        cluster_size[c] = len(labels_cluster[labels_cluster == c])

    return (tb_list[order], lms_list[order], np.arange(n)[order], mean_be.T[order].T,
            cluster_size[order], std_tb_list[order], std_lms_list[order])

In [None]:
# Scratch check: np.nanmax ignores NaN entries (plot_modal_split uses it to
# derive the tick range of the relative differences).
np.nanmax([np.nan, 2, 3])

In [None]:
def plot_modal_split(n, tb_list_n, lms_tblist, std_list, lms_std_list, x_sort, plot_std=True,
                     labels_cluster=None):
    """Plot the OViN and LMS modal split per cluster together with their differences.

    Draws a 3 x 6 grid, one column per mode: the top row shows both modal
    splits side by side, the middle row LMS minus OViN, and the bottom row
    that difference relative to the OViN value in percent.

    Parameters:
    n: number of clusters
    tb_list_n: (n, 6) array with the OViN value per cluster and mode
    lms_tblist: (n, 6) array with the LMS value per cluster and mode
    std_list, lms_std_list, labels_cluster: accepted for backward
        compatibility; not used by the current plots
    x_sort: cluster labels shown on the x-axis, in plotting order
    plot_std: only selects the extra legend entries and the figure title;
        no standard-deviation panel is drawn
    """
    # All three rows are always drawn, so the layout is the same for both
    # values of plot_std (the former 2-row setting was immediately overwritten).
    n_rows, fig_height = 3, 15

    f, ax = plt.subplots(n_rows, 6)
    f.set_figwidth(17)
    f.set_figheight(fig_height)

    x = np.arange(n)
    mode_labels = ['Car driver', 'Car passenger', 'Train',
            'BTM', '(e)-Bike', 'Walking']

    # Empty lines that exist only to provide handles for the shared legend.
    ax[0, 0].plot([], [], color="#D55E00", label='OViN', linewidth=5)
    ax[0, 0].plot([], [], color="#0072B2", label='LMS', linewidth=5)
    ax[0, 0].plot([], [], color="#009E73", label='Absolute difference OViN LMS', linewidth=5)
    ax[0, 0].plot([], [], color="#E69F00", label='Relative difference OViN LMS', linewidth=5)

    if plot_std is True:
        ax[0, 0].plot([], [], color='salmon', label='OViN - std', linewidth=5)
        ax[0, 0].plot([], [], color='lightgreen', label='LMS - std', linewidth=5)

    handles, labels = ax[0, 0].get_legend_handles_labels()

    abs_diff = lms_tblist - tb_list_n

    # A zero OViN value makes the relative difference inf or NaN.  Those
    # entries are blanked so they neither break the tick range below nor the
    # bar plot; finite values are unaffected.
    with np.errstate(divide='ignore', invalid='ignore'):
        diff = abs_diff / tb_list_n * 100
    diff = np.where(np.isfinite(diff), diff, np.nan)

    if np.isnan(diff).all():
        # Nothing finite to derive a range from; fall back to a small default.
        min_diff, max_diff = -10, 10
    else:
        max_diff = (np.nanmax(diff) // 10 + 2) * 10
        min_diff = (np.nanmin(diff) // 10 - 1) * 10

    for i in range(6):
        # Top row: OViN and LMS side by side.
        ax[0, i].bar(x - 0.18, tb_list_n[:, i], width=0.3, color="#D55E00")
        ax[0, i].bar(x + 0.18, lms_tblist[:, i], width=0.3, color="#0072B2")

        ax[0, i].set_title(mode_labels[i])
        ax[0, i].set_yticks(np.arange(0, tb_list_n.max() + 7, 5))
        ax[0, i].set_xticks(np.arange(n), labels=x_sort)
        ax[0, i].set_axisbelow(True)
        ax[0, i].grid(axis='y')
        ax[0, i].set_xlabel('Cluster number')

        # Middle row: LMS minus OViN.
        ax[1, i].bar(x, abs_diff[:, i],
                    width=0.8, color="#009E73")

        # Bottom row: the same difference relative to the OViN value.
        ax[2, i].bar(x, diff[:, i],
                    width=0.8, color="#E69F00")

        ax[1, i].set_title(mode_labels[i])
        ax[1, i].set_yticks(np.arange(-14, 15, 2))
        ax[1, i].set_xticks(np.arange(n), labels=x_sort)
        ax[1, i].set_axisbelow(True)
        ax[1, i].grid(axis='y')

        ax[1, i].set_xlabel('Cluster number')

        ax[2, i].set_title(mode_labels[i])
        ax[2, i].set_yticks(np.arange(min_diff, max_diff + 5, 10))
        ax[2, i].set_xticks(np.arange(n), labels=x_sort)
        ax[2, i].set_axisbelow(True)
        ax[2, i].grid(axis='y')

        ax[2, i].set_xlabel('Cluster number')

    # NOTE(review): the plotted differences are LMS minus OViN, while the
    # axis labels read "OViN - LMS" — confirm the intended sign convention.
    ax[0, 0].set_ylabel('Share of trips made by mode [%]')
    ax[1, 0].set_ylabel('Absolute difference OViN - LMS [%-point]')
    ax[2, 0].set_ylabel('Relative difference OViN - LMS [%]')

    # NOTE(review): the plot_std title announces a standard-deviation row,
    # but the bottom row always shows the relative difference.
    if plot_std is True:
        supt = 'Top: Modal split LMS and OViN for different clusters \nMiddle: Difference OViN LMS \nBottom: Standard deviation modal split within cluster'
    else:
        supt = 'Top: Modal split LMS and OViN for different clusters \nMiddle and bottom: Difference OViN LMS'

    f.legend(handles, labels, loc='upper right', bbox_to_anchor=(0.9, 1.0))
    f.suptitle(supt,
            fontsize='xx-large')

In [None]:
def plot_nl(n, labels_cluster, idx=np.arange(1406), colormap='Set1'):
    """Draw the selected zones of the module-level ``zones`` coloured by cluster label.

    Parameters:
    n: number of clusters (sets the number of colours and the legend ticks)
    labels_cluster: array with one cluster label per zone
    idx: positional indices of the zones to draw
    colormap: name of the matplotlib colormap to sample n colours from
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))

    cluster_cmap = plt.get_cmap(colormap, lut=n)
    legend_options = {"label": "Cluster number", "ticks": np.arange(n)}

    zones.iloc[idx].plot(
        ax=ax,
        column=labels_cluster[idx],
        cmap=cluster_cmap,
        vmin=0,
        vmax=n - 1,
        legend=True,
        legend_kwds=legend_options,
    )

    # No axis ticks; the light background fills everything outside the zones.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_facecolor('lightcyan')
    ax.set_title('Clustering')

In [None]:
def heatmap(n, labels_cluster, x_sort):
    """Plot, per cluster, the share of its zones in each degree of urbanisation (1-6).

    Parameters:
    n: number of clusters
    labels_cluster: array with one cluster label per row of the module-level ``zones``
    x_sort: cluster labels in the order in which they are shown on the x-axis
    """
    heatmap_relative = np.zeros((6, n))

    for j in range(n):
        cluster_zones = zones[labels_cluster == x_sort[j]]
        cluster_total = len(cluster_zones)

        for du in range(1, 7):
            in_du = len(cluster_zones[cluster_zones.deg_urba == du])
            # Normalised by cluster size, so every column adds up to 1
            # (not by the number of zones in the urbanisation class).
            heatmap_relative[du - 1, j] = in_du / cluster_total

    plt.imshow(heatmap_relative, cmap='YlOrRd', origin='lower')
    cb = plt.colorbar()
    cb.set_label('Relative share of zones from each new cluster in each DU')
    plt.xticks(np.arange(n), labels=x_sort)
    plt.yticks(np.arange(6), labels=np.arange(1, 7))
    plt.xlabel('New clusters')
    plt.ylabel('Degree of urbanisation')
    plt.title('Overlap degree of urbanisation and new clusters')

## Correlation

When plotting all variables, there seems to be correlation within the D-variables. So first, take a look at the individual D-variables and see how they are correlated, and then decide whether or not to keep all of them.

In [None]:
# Correlation matrix over every variable in `data`; the count is taken from
# the array itself instead of a hard-coded 49.
corr_matrix(np.arange(data.shape[1]))

#### Density

In [None]:
dvar_labels[:5]

In [None]:
corr_matrix(np.arange(5), size=(5, 5))

In [None]:
scatterplots([0, 5])

#### Diversity

In [None]:
dvar_labels[5:13]

In [None]:
corr_matrix(np.arange(5, 13), size=(5, 5))

In [None]:
scatterplots([5, 13], size=(11, 11))

In [None]:
dvar_labels[13:18]

In [None]:
corr_matrix(np.arange(13, 18), size=(5, 5))

In [None]:
scatterplots([13, 18])

#### Design

In [None]:
dvar_labels[18:22]

In [None]:
corr_matrix(np.arange(18, 22), size=(5, 5))

In [None]:
scatterplots([18, 22])

#### Dest accessibility

In [None]:
dvar_labels[22:29]

In [None]:
corr_matrix(np.arange(22, 29), size=(5, 5))

In [None]:
scatterplots([22, 29])

#### Dist transit

In [None]:
dvar_labels[30:]

In [None]:
corr_matrix(np.arange(30, 42), size=(5, 5))

In [None]:
scatterplots([30, 42], size=(10, 10))

### Correlation with travel behaviour

In [None]:
# Cross-correlation between every built-environment variable (rows) and the
# six OViN travel-behaviour columns (columns).
n_vars = data.shape[1]
combined = pd.DataFrame(np.hstack([data, data_ovin]))

corr = combined.corr().iloc[:n_vars, n_vars:]

# The sliced block holds variable-vs-mode pairs only, so it has no
# self-correlation diagonal to correct.  Subtracting an identity matrix here
# (as is done for the square matrix in corr_matrix) lowered the p-values at
# positions (0,0) .. (5,5) by 1, which kept those pairs from ever being
# masked as non-significant.
pval = combined.corr(method=lambda x, y: pearsonr(x, y)[1]).iloc[:n_vars, n_vars:]

corr[pval > 0.05] = np.nan

# Layer that is 1 on the non-significant cells and NaN elsewhere (drawn in grey).
pval[pval > 0.05] = 1
pval = pval[pval > 0.05]

plt.figure(figsize=(19, 15))
plt.imshow(pval, origin='lower', cmap='Greys', vmax=1, vmin=-1)


plt.imshow(corr, origin='lower', cmap='RdBu', vmax=1, vmin=-1)
cb = plt.colorbar()
cb.set_label('Pearson correlation coefficient [-]')
plt.title('Correlation matrix based on pearson correlation coefficient')
plt.xticks(np.arange(6), labels=['Car driver', 'Car pas', 'Train', 'BTM', 'Bike', 'Walk'], rotation=90)
plt.yticks(np.arange(len(corr)), labels=np.array(dvar_labels));

## First set of variables

Optie 1: d_idx = np.array([4, 0, 1, 29, 18]) met 5 clusters. 6 kan evt. ook

Optie 2: d_idx = np.array([4, 0, 1, 29, 29, 29, 29, 18, 18, 18, 18, 3, 42, 42, 42, 42]) (dus met gewichten, 6 clusters of 7 voor beter landelijk)

In [None]:
# Variable-vs-mode correlation block; the split point is the number of
# columns in `data` rather than a hard-coded 49.
corr = pd.DataFrame(np.hstack([data, data_ovin])).corr().iloc[:data.shape[1], data.shape[1]:]
## Sorted variables based on average absolute correlation with travel behaviour
corr.abs().mean(axis=1).sort_values(ascending=False).index

In [None]:
corr.abs().iloc[:, 4].sort_values(ascending=False).index

In [None]:
corr.abs().iloc[:, 2].sort_values(ascending=False).index

In [None]:
# Candidate variable sets, given as positions into d_var / dvar_labels.
# The second assignment overrides the first, so the weighted set is the one
# actually used below.  An index that appears k times contributes k identical
# columns to the clustering input.
d_idx = np.array([4, 0, 1, 29, 18, 48, 6]) ## Unweighted cluster set
d_idx = np.array([0, 1, 3, 4, 29, 29, 29, 29, 18, 18, 18, 18, 48, 48, 43, 43, 6, 6, 13, 13, 28, 28, 28, 28]) # Weighted cluster set

In [None]:
corr_matrix(d_idx)

In [None]:
# Full correlation and p-value matrices over all variables plus the OViN columns.
# NOTE(review): `corr` and `pval` are reassigned by the next cell before being
# used, so this cell currently only leaves `df` behind.
df = pd.DataFrame(np.hstack([data, data_ovin]))

corr = df.corr()
# Square matrix: subtracting the identity lowers the diagonal entries by 1
# (pandas reports 1 there for a callable method).
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*corr.shape)

In [None]:
# Cross-correlation between the selected variables (rows) and the six OViN
# travel-behaviour columns (columns).
n_sel = len(d_idx)
selected = pd.DataFrame(np.hstack([data[:, d_idx], data_ovin]))

corr = selected.corr().iloc[:n_sel, n_sel:]

# No identity matrix is subtracted: this block contains variable-vs-mode
# pairs only, so there is no self-correlation diagonal.  Subtracting np.eye
# lowered the p-values at positions (0,0) .. (5,5) by 1 and kept those pairs
# from ever being masked as non-significant.
pval = selected.corr(method=lambda x, y: pearsonr(x, y)[1]).iloc[:n_sel, n_sel:]

corr[pval > 0.05] = np.nan

# Layer that is 1 on the non-significant cells and NaN elsewhere (drawn in grey).
pval[pval > 0.05] = 1
pval = pval[pval > 0.05]

plt.figure(figsize=(8, 8))
plt.imshow(pval, origin='lower', cmap='Greys', vmax=1, vmin=-1)
plt.imshow(corr, origin='lower', cmap='RdBu', vmax=1, vmin=-1)

cb = plt.colorbar()
cb.set_label('Pearson correlation coefficient [-]')
plt.title('Correlation matrix based on pearson correlation coefficient')
plt.xticks(np.arange(6), labels=['Car driver', 'Car pas', 'Train', 'BTM', 'Bike', 'Walk'], rotation=90)
plt.yticks(np.arange(len(corr)), labels=np.array(dvar_labels)[d_idx]);

## Clustering

In [None]:
# Ward-linkage hierarchy over the selected variables, drawn as a dendrogram.
# Branches merging below a distance of 5.4 get their own colour.
# NOTE(review): 5.4 is a hand-picked threshold — presumably chosen to match
# the number of clusters used later; confirm when d_idx changes.
linkage_data = linkage(data[:, d_idx], method='ward', metric='euclidean')
dendrogram(linkage_data, color_threshold=5.4)

# plt.axhline(2.61, color='k', linestyle='--', label='Cut off for 7 clusters')
plt.title('Dendrogram weighted cluster set')
# One leaf per zone: the tick labels would be unreadable, so they are hidden.
plt.xticks([])
# plt.legend()

In [None]:
# Internal validation scores for k = 2 .. 15 clusters.
# *_list: scores computed on the clustering input (selected variables).
# *_tb_list / var_list: the same labels scored against the scaled OViN
# travel behaviour.
sil_list = []
cal_list = []
dav_list = []
var_list = []

sil_tb_list = []
cal_tb_list = []
dav_tb_list = []

for k in range(2, 16):
    # Same settings as the cluster() helper: Ward linkage, Euclidean metric.
    hier_cluster = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
    
    labels_cluster = hier_cluster.fit_predict(data[:, d_idx])

    sil = silhouette_score(data[:, d_idx], labels_cluster, metric = 'euclidean') 
    sil_list.append(sil)
    cal = calinski_harabasz_score(data[:, d_idx], labels_cluster)
    cal_list.append(cal)
    dav = davies_bouldin_score(data[:, d_idx], labels_cluster)
    dav_list.append(dav)

    # Calculate indicators for clusters based on travel behaviour
    tb_list, std_list_s = modal_split_ovin(k, labels_cluster)

    var_list.append(average_variance(tb_list))
    sil_tb_list.append(silhouette_score(data_ovin, labels_cluster, metric = 'euclidean'))
    cal_tb_list.append(calinski_harabasz_score(data_ovin, labels_cluster))
    dav_tb_list.append(davies_bouldin_score(data_ovin, labels_cluster))

In [None]:
# One panel per validation index, all over the same range of cluster counts
# (2 .. 15, matching the loop that filled the score lists).
k_values = np.arange(2, 16)
score_panels = [
    ('Silhouette score', sil_list),
    ('Calinski-Harabasz score', cal_list),
    ('Davies-Bouldin score', dav_list),
]

f, ax = plt.subplots(1, 3)
f.set_figwidth(15)

for panel, (score_name, scores) in zip(ax, score_panels):
    panel.plot(k_values, scores)
    panel.set_xticks(k_values)
    panel.set_ylabel(score_name)
    panel.set_xlabel('Number of clusters')
    panel.set_title(score_name)
    panel.grid()


In [None]:
n = 7 ## Number of clusters

In [None]:
# Final clustering for the chosen n, followed by the OViN- and LMS-based
# modal split (mean and spread) of every cluster.
labels_cluster = cluster(n, data[:, d_idx])
tb_list, std_list = modal_split_ovin(n, labels_cluster)
lms_list, lms_std_list = modal_split_lms(n, labels_cluster)
# Displayed value: average over the modes of the between-cluster variance of
# the OViN modal split.
average_variance(tb_list)

In [None]:
# Reorder every per-cluster result by descending value of the first OViN
# column; x_sort holds the original cluster labels in that order.
tb_list_n, lms_tblist, x_sort, mean_be, cluster_size, std_list_n, lms_std_list_n = sort_data(n, labels_cluster, tb_list, lms_list, std_list, lms_std_list, d_idx)

In [None]:
std_list_n.mean()

In [None]:
cluster_size, np.round(cluster_size / cluster_size.sum() * 100, 1)

In [None]:
lms_tblist

In [None]:
plot_modal_split(n, tb_list_n, lms_tblist, std_list_n, lms_std_list_n, x_sort, plot_std=False, labels_cluster=labels_cluster)

In [None]:
plot_nl(n, labels_cluster, colormap='tab10')

In [None]:
def heatmap(n, labels_cluster, x_sort):
    """Plot, per weighted cluster, the share of its zones in each degree of urbanisation (1-6).

    This redefinition replaces the `heatmap` defined earlier in the notebook;
    it differs in the fixed colour scale (vmax=1) and in the label texts.

    Parameters:
    n: number of clusters
    labels_cluster: array with one cluster label per row of the module-level ``zones``
    x_sort: cluster labels in the order in which they are shown on the x-axis
    """
    heatmap_relative = np.zeros((6, n))

    for j in range(n):
        members = zones[labels_cluster == x_sort[j]]
        for i in range(1, 7):
            # Share of the cluster's zones with degree of urbanisation i.
            heatmap_relative[i - 1, j] = len(members[members.deg_urba == i]) / len(members)

    plt.imshow(heatmap_relative, cmap='YlOrRd', origin='lower', vmax=1)
    cb = plt.colorbar()
    cb.set_label('Relative share of zones from each \nweighted cluster in each DU')
    plt.xticks(np.arange(n), labels=x_sort)
    plt.yticks(np.arange(6), labels=np.arange(1, 7))
    plt.xlabel('Weighted clusters')
    plt.ylabel('Degree of urbanisation')
    plt.title('Overlap degree of urbanisation \nand weighted cluster set')

In [None]:
heatmap(n, labels_cluster, x_sort)

In [None]:
# For every selected variable, the values of the zones in each cluster,
# with the clusters in x_sort order (input for the box plots).
data_cluster = [
    [data[:, var_idx][labels_cluster == c] for c in x_sort]
    for var_idx in d_idx
]

In [None]:
# One panel per selected variable: box plot of the values in every cluster
# (clusters in x_sort order) with the cluster means drawn as a line on top.
f, ax = plt.subplots(len(data[0, d_idx]), 1)
f.set_figheight(40)

cluster_pos = np.arange(n)

for panel, var_idx, var_means, var_groups in zip(ax, d_idx, mean_be, data_cluster):
    panel.plot(cluster_pos, var_means, label=dvar_labels[var_idx])
    panel.boxplot(var_groups, positions=cluster_pos)
    panel.set_xticks(cluster_pos, labels=x_sort)
    panel.legend()