In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as sst
import os
import scipy.stats as sst

from scipy import spatial

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression


from tqdm.notebook import tqdm
import random

import pickle

%matplotlib inline

In [None]:
# Directory one level above the current working directory.
parent_dir = os.path.dirname(os.getcwd())

In [None]:
# LMS zone data. The path is built with os.path.join so it does not depend on
# Windows-style backslash separators.
zones = gpd.read_file(os.path.join(parent_dir, 'Data', 'New', 'lms_zone_du_new.shp'))

In [None]:
# Demography (first CSV column is used as the index).
# The path is built with os.path.join so it also resolves outside Windows.
demo = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'zone_demographics.csv'), index_col=0)

In [None]:
# OViN trip records (first CSV column is used as the index).
# The path is built with os.path.join so it also resolves outside Windows.
ovin = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'Ovin_final.csv'), index_col=0)

In [None]:
## Modal split travel behaviour
# Paths are built with os.path.join so they also resolve outside Windows.
ovin_tb = pd.read_csv(
    os.path.join(parent_dir, 'Data', 'New', 'lms_zone_ovin_travel_behaviour_newF.csv'), index_col=0)
lms_tb = pd.read_csv(
    os.path.join(parent_dir, 'Data', 'New', 'lms_zone_lms_modal_split.csv'), index_col=0)

In [None]:
# Take columns 1-7 of the LMS modal split, fold column 4 into column 3 and
# then drop 'Tram/Metro_o' (presumably the column that was folded in -- TODO confirm).
lms_tb2 = lms_tb.iloc[:, 1:8].copy()
lms_tb2.iloc[:, 3] = lms_tb2.iloc[:, 3:5].sum(axis=1)
lms_tb2 = lms_tb2.drop(columns='Tram/Metro_o')

# The path is built with os.path.join so it also resolves outside Windows.
lms_orig = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'lms_modal_split_orig_abs.csv'), index_col=0)
lms_tot = lms_orig.iloc[:, 1:8].sum(axis=1)  # Total trips for each zone

In [None]:
# Cluster labels per zone for the three cluster sets that can be tested.
# Paths are built with os.path.join so they also resolve outside Windows.
labels_advanced = np.load(os.path.join(parent_dir, 'Data', 'New', 'cluster_labels_advanced.npy'))
labels_simple = np.load(os.path.join(parent_dir, 'Data', 'New', 'cluster_labels_simple.npy'))
# Shift deg_urba down by one so the labels start one lower than the stored values.
labels_du = np.array(zones.deg_urba - 1)

In [None]:
labels = labels_du # Select which cluster set to test

In [None]:
n = 6 # And corresponding no. of clusters

In [None]:
ovin.FactorV_final.min()

## Heatmaps

In [None]:
x_sort_simple = np.array([5, 6, 3, 0, 2, 4, 1])
x_sort_advanced = np.array([0, 4, 3, 6, 5, 2, 1])

In [None]:
# Overlap matrix between the two cluster sets: rows follow x_sort_advanced,
# columns follow x_sort_simple.
heatmap = np.zeros((7, 7))

for k, i in enumerate(x_sort_advanced):
    for l, j in enumerate(x_sort_simple):
        in_simple = labels_simple == j
        in_both = in_simple & (labels_advanced == i)

        # Share of the zones in unweighted cluster j that also sit in weighted cluster i
        heatmap[k, l] = np.count_nonzero(in_both) / np.count_nonzero(in_simple)


In [None]:
# Draw the 7x7 overlap matrix; tick labels show the cluster ids in the sorted
# order that was used to fill the rows (x_sort_advanced) and columns (x_sort_simple).
plt.imshow(heatmap, cmap='YlOrRd', origin='lower')
cb = plt.colorbar()
cb.set_label('Relative share of zones from each unweighted cluster \nbelonging to a weighted cluster')
plt.xticks(np.arange(7), labels=x_sort_simple)
plt.yticks(np.arange(7), labels=x_sort_advanced)
plt.xlabel('Unweighted clusters')
plt.ylabel('Weighted clusters')
plt.title('Overlap weighted and unweighted cluster sets');

In [None]:
def heatmap(n, labels_cluster, x_sort):
    """
    Plot how the zones of each degree of urbanisation (1-6) spread over clusters.

    Parameters:
    n: number of clusters (columns of the heatmap)
    labels_cluster: cluster label per row of the global `zones` frame
    x_sort: cluster ids in the order they should appear on the x-axis

    Cell [du - 1, col] holds the share of all zones with `deg_urba == du`
    that belong to cluster `x_sort[col]`. Nothing is returned; the figure is
    drawn on the current matplotlib axes.
    """
    share = np.zeros((6, n))

    for col in range(n):
        members = zones[labels_cluster == x_sort[col]]

        for du in range(1, 7):
            in_both = len(members[members.deg_urba == du])
            share[du - 1, col] = in_both / len(zones[zones.deg_urba == du])

    plt.imshow(share, cmap='YlOrRd', origin='lower')
    colorbar = plt.colorbar()
    colorbar.set_label('Relative share of zones from each DU in each new cluster')
    plt.xticks(np.arange(n), labels=x_sort)
    plt.yticks(np.arange(6), labels=np.arange(1, 7))
    plt.xlabel('New clusters')
    plt.ylabel('Degree of urbanisation')
    plt.title('Overlap degree of urbanisation and new clusters')

# PSM

In [None]:
def scale_data(features, method='scale'):
    """
    Scale the data to 0-1 or standardise the data

    Parameters:
    features: numpy array containing all the features
    method: the type of data-scaling, string; 'scale' uses MinMaxScaler,
        'standard' uses StandardScaler

    Returns:
    The transformed data as a numpy array

    Raises:
    ValueError: if `method` is not one of the supported names. (An error
        string used to be returned instead, which callers could mistake for
        scaled data.)
    """
    if method == 'scale':
        scaler = MinMaxScaler()
    elif method == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError(f"Not a valid scaler: {method!r} (expected 'scale' or 'standard')")

    return scaler.fit_transform(features)

In [None]:
def reindex_df(df, weight_col):
    """Expand a weighted frame so every row appears `weight_col` times.

    The result has one row per count per sample and a fresh 0..N-1 index;
    rows with a weight of zero disappear.
    """
    repeated_index = df.index.repeat(df[weight_col])
    return df.reindex(repeated_index).reset_index(drop=True)

Select OViN trips based on departure zone and prepare variables

In [None]:
def get_ovin(n, labels_cluster):
    """
    Split the OViN trips by the cluster of their departure zone.

    Parameters:
    n: number of clusters
    labels_cluster: cluster label per row of the global `ovin_tb` frame

    Returns:
    A list with one trip DataFrame per cluster (rows of the global `ovin`
    whose `VertZone` is among the cluster's `ZONE_ID`s) and a float array
    with the number of trips in each of them.
    """
    ovin_trips = [
        ovin[ovin.VertZone.isin(ovin_tb[labels_cluster == c].ZONE_ID)]
        for c in range(n)
    ]
    sizes = np.array([len(cluster_trips) for cluster_trips in ovin_trips], dtype=float)

    return ovin_trips, sizes


In [None]:
def get_ovin_cat(n, ovin_trips):
    """
    Turn the raw OViN trip records of each cluster into model-ready variables.

    Parameters:
    n: number of clusters to process
    ovin_trips: list with one OViN trip DataFrame per cluster

    Returns:
    new_trips: list with one DataFrame per cluster holding, in this order,
        18 demographic variables, 6 travel-mode dummies, the rounded weight
        'FactorV' and the cluster number 'Cluster'. Later cells address these
        columns by position, so the order matters.
    sizes: array with the number of rows left per cluster after rows
        containing NaN have been dropped

    The numeric codes that are mapped to NaN below are treated as
    unknown/other categories; their exact meaning comes from the OViN
    codebook and is not visible here -- TODO confirm.
    """

    # Get demographic data from OViN
    new_trips = []
    sizes = np.zeros(n)

   
    for i in range(n):
        # Get useful columns
        trips = ovin_trips[i][['Leeftijd', 'Geslacht', 'HHGestInkG', 'HHPers', 'HHSam', 
                       'MaatsPart', 'Opleiding', 'HHAuto', 'Rijbewijs', 'OVStKaart', 'KHvm', 'FactorV_final']]
        
        # Make variables dummies or continuous
        new_trips.append(pd.DataFrame(trips['Leeftijd']))

        # Shift the gender code down by one
        new_trips[i]['Gender'] = trips['Geslacht'] - 1

        new_trips[i]['Income'] = trips['HHGestInkG']
        new_trips[i].loc[new_trips[i]['Income'] == 11, 'Income'] = np.nan

        new_trips[i]['HH_size'] = trips['HHPers']

        # Household composition dummies (HHSam codes 3 and 4 are the reference category)
        new_trips[i]['1person_hh'] = np.zeros(len(trips))
        new_trips[i]['2+person_hh'] = np.zeros(len(trips))
        new_trips[i]['1parent_hh'] = np.zeros(len(trips))
        # new_trips[i]['2parent_hh'] = np.zeros(len(trips)) ## Reference

        new_trips[i].loc[trips.HHSam.isin([1]), '1person_hh'] = 1
        new_trips[i].loc[trips.HHSam.isin([2, 5, 8]), '2+person_hh'] = 1
        new_trips[i].loc[trips.HHSam.isin([6, 7]), '1parent_hh'] = 1
        # new_trips[i].loc[trips.HHSam.isin([3, 4]), '2parent_hh'] = 1 ## reference

        # Social participation dummies (remaining MaatsPart codes are the reference category)
        new_trips[i]['Part_time'] = np.zeros(len(trips))
        new_trips[i]['Full_time'] = np.zeros(len(trips))
        new_trips[i]['Student'] = np.zeros(len(trips))
        # new_trips[i]['Other_part'] = np.zeros(len(trips)) ## Reference

        new_trips[i].loc[trips.MaatsPart.isin([1]), 'Part_time'] = 1
        new_trips[i].loc[trips.MaatsPart.isin([2]), 'Full_time'] = 1
        new_trips[i].loc[trips.MaatsPart.isin([4]), 'Student'] = 1
        # new_trips[i].loc[trips.MaatsPart.isin([3, 5, 6, 7, 8, 10]), 'Other_part'] = 1 ## reference
        new_trips[i].loc[trips.MaatsPart.isin([9]), 'Part_time'] = np.nan # To filter unknown values

        # Education dummies (Opleiding code 4 is the reference category)
        new_trips[i]['Primary_or_less'] = np.zeros(len(trips))
        new_trips[i]['lbo_vmbo'] = np.zeros(len(trips))
        new_trips[i]['mbo_havo_vwo'] = np.zeros(len(trips))
        # new_trips[i]['hbo_wo'] = np.zeros(len(trips)) ## Use as reference
        new_trips[i]['Other_edu'] = np.zeros(len(trips))
        new_trips[i]['Younger_than_15'] = np.zeros(len(trips))


        new_trips[i].loc[trips.Opleiding.isin([0, 1]), 'Primary_or_less'] = 1
        new_trips[i].loc[trips.Opleiding.isin([2]), 'lbo_vmbo'] = 1
        new_trips[i].loc[trips.Opleiding.isin([3]), 'mbo_havo_vwo'] = 1
        # new_trips[i].loc[trips.Opleiding.isin([4]), 'hbo_wo'] = 1 ## reference
        new_trips[i].loc[trips.Opleiding.isin([5]), 'Other_edu'] = 1 
        new_trips[i].loc[trips.Opleiding.isin([7]), 'Younger_than_15'] = 1 
        new_trips[i].loc[trips.Opleiding.isin([6]), 'Other_edu'] = np.nan # To filter unknown values

        new_trips[i]['HH_cars'] = trips['HHAuto']
        new_trips[i].loc[trips['HHAuto'] == 10, 'HH_cars'] = np.nan

        new_trips[i]['Drivers_licence'] = np.zeros(len(trips))
        new_trips[i].loc[trips.Rijbewijs.isin([1]), 'Drivers_licence'] = 1
        new_trips[i].loc[trips.Rijbewijs.isin([2]), 'Drivers_licence'] = np.nan

        new_trips[i]['Student_OV'] = np.zeros(len(trips))
        new_trips[i].loc[trips.OVStKaart.isin([1, 2]), 'Student_OV'] = 1
        new_trips[i].loc[trips.OVStKaart.isin([3]), 'Student_OV'] = np.nan


        # Add variables for modal split
        new_trips[i]['Car_driver'] = np.zeros(len(trips))
        new_trips[i]['Car_passenger'] = np.zeros(len(trips))
        new_trips[i]['Train'] = np.zeros(len(trips))
        new_trips[i]['BTM'] = np.zeros(len(trips))
        new_trips[i]['Bike'] = np.zeros(len(trips))
        new_trips[i]['Walking'] = np.zeros(len(trips))

        new_trips[i].loc[trips.KHvm.isin([1]), 'Car_driver'] = 1
        new_trips[i].loc[trips.KHvm.isin([2]), 'Car_passenger'] = 1
        new_trips[i].loc[trips.KHvm.isin([3]), 'Train'] = 1
        new_trips[i].loc[trips.KHvm.isin([4]), 'BTM'] = 1
        new_trips[i].loc[trips.KHvm.isin([6]), 'Bike'] = 1
        new_trips[i].loc[trips.KHvm.isin([7]), 'Walking'] = 1

        # The NaN is only a marker: the dropna() below removes the whole trip
        new_trips[i].loc[trips.KHvm.isin([5, 8]), 'Walking'] = np.nan # Filter other modes
        new_trips[i].loc[trips.KHvm.isnull(), 'Walking'] = np.nan # Filter other modes

        # Integer weights are needed later to repeat rows; this raises if FactorV_final contains NaN
        new_trips[i]['FactorV'] = trips['FactorV_final']
        new_trips[i]['FactorV'] = new_trips[i].FactorV.round().astype(int)
        
        # Add cluster number
        new_trips[i]['Cluster'] = i


        # Drop every trip that received a NaN marker in any column above
        new_trips[i] = new_trips[i].dropna()

        
        sizes[i] = len(new_trips[i])

    return new_trips, sizes

In [None]:
trips, sizes = get_ovin(n, labels)

In [None]:
n_trips, n_sizes = get_ovin_cat(n, trips)

In [None]:
np.round(n_sizes / sizes, 2) # See how much data was deleted

In [None]:
# FactorV-weighted mean of every prepared column, one row per cluster.
mean_arr = np.zeros((n, len(n_trips[0].columns)))

for i in range(n):
    cluster_trips = n_trips[i]
    mean_arr[i] = np.average(cluster_trips, axis=0, weights=cluster_trips.FactorV)

# Transpose so that variables are rows and clusters are columns.
mean_df = pd.DataFrame(mean_arr.round(2), columns=n_trips[0].columns).T
mean_df

# pd.DataFrame(mean_arr, columns=n_trips[0].columns).T
# mean_df

### Calculate p-values

First, calculate p-values to check whether travel behaviour differs significantly between the clusters. For now this is based on unweighted values, to save memory.

In [None]:
def calc_p(trips):
    """
    Pairwise two-sample t-tests on the six travel-mode dummies.

    Parameters:
    trips: list with one DataFrame per cluster; the mode dummies are expected
        at column positions 18-23 (the layout produced by get_ovin_cat)

    Returns:
    p_arr: array of shape (clusters, clusters, 6) with the p-value for each
        cluster pair and mode
    df_list: list of six DataFrames (one per mode) holding the cluster x
        cluster p-values rounded to 6 decimals

    The number of clusters is taken from `trips` itself rather than from the
    notebook-global `n`, so the result cannot silently depend on which cells
    were run before.
    """
    n_clusters = len(trips)
    n_modes = 6
    first_mode_col = 18

    p_arr = np.zeros((n_clusters, n_clusters, n_modes))

    for i in range(n_clusters):
        modes_i = trips[i].iloc[:, first_mode_col:first_mode_col + n_modes]

        for j in range(n_clusters):
            modes_j = trips[j].iloc[:, first_mode_col:first_mode_col + n_modes]

            # One call tests all six mode columns at once (column-wise t-tests)
            _, p = sst.ttest_ind(modes_i, modes_j)
            p_arr[i, j] = np.asarray(p)

    df_list = [np.round(pd.DataFrame(p_arr[:, :, m]), 6) for m in range(n_modes)]

    return p_arr, df_list


In [None]:
p_arr, df_list = calc_p(n_trips)

In [None]:
df_list[4]

Travel behaviour in most clusters is statistically different.

### Calculate propensity scores

Again, this is based on unweighted values to save memory; check later whether this is sufficient. The propensity score is only a means to an end, not an end result.

In [None]:
def prepare_clusters(trips):
    """
    Build one stacked data set for every unordered pair of clusters.

    Parameters:
    trips: list with one prepared trip DataFrame per cluster

    Returns:
    df_list: for every pair (i, j) with i > j, the rows of cluster i
        (n_cluster = 0) followed by the rows of cluster j (n_cluster = 1);
        the original row labels are kept in the 'index' column
    X_list: columns 1-18 of each stacked frame (the features)
    y_list: the 'n_cluster' column of each stacked frame (the target)

    The input frames are left untouched: 'n_cluster' is added to copies, and
    the number of clusters is taken from `trips` rather than the
    notebook-global `n`.
    """
    df_list = []
    X_list = []
    y_list = []

    for i in range(len(trips)):
        for j in range(i):  # every pair once, with i > j

            new_df = pd.concat([trips[i].assign(n_cluster=0),
                                trips[j].assign(n_cluster=1)])
            new_df = new_df.reset_index()

            df_list.append(new_df)
            X_list.append(new_df.iloc[:, 1:19])
            y_list.append(new_df['n_cluster'])

    return df_list, X_list, y_list

In [None]:
def logistic_regr(df_list, X_list, y_list):
    """
    Fit one logistic regression per cluster pair and store the propensity scores.

    Parameters:
    df_list: stacked frames, one per cluster pair; a 'ps' column is added to
        each of them in place
    X_list: feature frames matching df_list
    y_list: targets matching df_list

    Returns:
    coeff_df: one row per feature and one coefficient column per cluster pair
    df_list: the input frames, now with the probability of class 1 in 'ps'
    prob_list: the full predict_proba output for each pair
    """
    coeff_df = pd.DataFrame({'Variable': X_list[0].columns})
    prob_list = []

    for idx, pair_df in enumerate(df_list):
        x_scaled = scale_data(X_list[idx])

        model = LogisticRegression()
        model.fit(x_scaled, y_list[idx])

        pair_label = set(pair_df["Cluster"])
        coeff_df[f'Cluster:{pair_label}'] = model.coef_.ravel()

        prob = model.predict_proba(x_scaled)
        pair_df['ps'] = prob[:, 1]
        prob_list.append(prob)

    return coeff_df, df_list, prob_list

In [None]:
def plot_ps(n, df_list, plot_size=(3, 7)):
    """
    Plot the propensity-score histograms of both clusters for every pair.

    Parameters:
    n: number of clusters; not used for drawing, kept so existing calls keep working
    df_list: stacked frames per cluster pair with 'n_cluster', 'Cluster' and 'ps'
    plot_size: (rows, columns) of the subplot grid

    Panels are filled row by row. Panels without a matching cluster pair are
    hidden instead of raising an IndexError, and grids with a single row or
    column are supported.
    """
    n_rows, n_cols = plot_size

    # squeeze=False keeps `ax` two-dimensional even for 1xN or Nx1 grids
    f, ax = plt.subplots(n_rows, n_cols, squeeze=False)
    f.set_figwidth(30)
    f.set_figheight(12)

    for k, axis in enumerate(ax.ravel()):

        if k >= len(df_list):
            axis.set_visible(False)
            continue

        i, j = divmod(k, n_cols)
        pair_df = df_list[k]
        rows0 = pair_df[pair_df['n_cluster'] == 0]
        rows1 = pair_df[pair_df['n_cluster'] == 1]

        cluster0 = rows0.iloc[0].Cluster
        cluster1 = rows1.iloc[0].Cluster

        axis.hist(rows0['ps'], bins=20, label=f'Cluster {cluster0:.0f}', alpha=1)
        axis.hist(rows1['ps'], bins=20, label=f'Cluster {cluster1:.0f}', alpha=0.7)
        axis.set_title(f'PS Cluster {cluster0:.0f} and {cluster1:.0f}')
        axis.legend()

        if i == n_rows - 1:
            axis.set_xlabel('Propensity score')

        if j == 0:
            axis.set_ylabel('Number of data points')

In [None]:
df_list, X_list, y_list = prepare_clusters(n_trips)

In [None]:
coeff_df, df_list, prob_list = logistic_regr(df_list, X_list, y_list)

In [None]:
coeff_df

In [None]:
plot_ps(n, df_list, plot_size=(3, 5))

There seems to be sufficient overlap to find matches

### Matching

The following function performs the propensity score matching (PSM).

In [None]:
def matching(df_list, n_neigh=20, caliper=0.01):
    """
    Match the trips of the two clusters in every pair on their propensity score.

    Parameters:
    df_list: stacked frames per cluster pair with the columns 'index',
        'FactorV', 'ps' and 'n_cluster'
    n_neigh: number of nearest neighbours (in propensity score) considered
        for every row of the larger cluster
    caliper: maximum propensity-score distance for a neighbour to count as a match

    Returns (one entry per cluster pair in each list):
    A_max_list: array [index, FactorV, ps] of the larger cluster
    A_min_list: array [index, FactorV, ps] of the smaller cluster
    match_count_max_list: per row of the larger cluster, the weight that is
        still unmatched
    match_count_list: per row of the smaller cluster, the weight that has
        been matched
    match_id_list: dict mapping the row position in the larger cluster to the
        list of matched row positions in the smaller cluster (one entry per
        matched unit of weight)

    Rows are processed greedily in order; a row's weight (FactorV) is matched
    against its neighbours until it is used up or no neighbour has remaining
    capacity.
    """

    # Create lists to store results from all iterations
    A_max_list = []
    A_min_list = []

    match_count_max_list = []
    match_count_list = []
    match_id_list = []

    for i in range(len(df_list)):

        # Get cluster id's for largest and smallest cluster.
        # The smaller one is defined as "the other one": with argmin a tie in
        # size would select cluster 0 for both roles.
        cluster_id_max = np.argmax([len(df_list[i][df_list[i].n_cluster == 0]), len(df_list[i][df_list[i].n_cluster == 1])])
        cluster_id_min = 1 - cluster_id_max

        # Get array with essential information
        A_max = np.array(df_list[i][df_list[i].n_cluster == cluster_id_max][['index', 'FactorV', 'ps']])
        A_min = np.array(df_list[i][df_list[i].n_cluster == cluster_id_min][['index', 'FactorV', 'ps']])
        
        # Get neighbours for each value in largest cluster
        dist, index = spatial.KDTree(A_min[:, 2].reshape(-1, 1)).query(A_max[:, 2].reshape(-1, 1), k=n_neigh, distance_upper_bound=caliper)

        # With k=1 the query result is one-dimensional; restore the neighbour axis
        if index.ndim == 1:
            dist, index = dist[:, np.newaxis], index[:, np.newaxis]

        # Neighbours outside the caliper come back with an infinite distance; flag them
        index[dist == np.inf] = -99999

        # Get variables to store results
        match_count_max = A_max[:, 1].astype(int).copy()
        match_count = np.zeros(len(A_min))
        match_id = dict()

        # Loop over all values in largest cluster
        for idx in range(len(A_max)):

            match_id[idx] = [] # empty list to store matches from this index

            # Loop over neighbour indices
            for m_idx in index[idx]:

                if m_idx >= 0: # Check if index other cluster corresponds to real distance
                    
                    # Check if not all occurences of m_idx have already been matched
                    if match_count[m_idx] < A_min[m_idx, 1]:
                        
                        # Check if the remaining datapoints from match_count_max can be matched at the same time
                        if match_count_max[idx] <= (A_min[m_idx, 1] - match_count[m_idx]):
                            
                            # Add remaining value of match_count_max to match_count
                            match_count[m_idx] += match_count_max[idx]
                            
                            # Add trip ids to match_id dict
                            match_id[idx].extend([m_idx] * match_count_max[idx].astype(int))
                            
                            match_count_max[idx] = 0

                            break
                        
                        # Else match all remaining values from m_idx and the loop continues
                        else:
                            match_count_max[idx] -= (A_min[m_idx, 1] - match_count[m_idx])

                            match_id[idx].extend([m_idx] * (A_min[m_idx, 1] - match_count[m_idx]).astype(int))

                            match_count[m_idx] = A_min[m_idx, 1]
                
                # Extra check if all idx have already been matched
                if match_count_max[idx] == 0:
                    break

        
        A_max_list.append(A_max)
        A_min_list.append(A_min)

        match_count_max_list.append(match_count_max)
        match_count_list.append(match_count)
        match_id_list.append(match_id)


    return A_max_list, A_min_list, match_count_max_list, match_count_list, match_id_list

In [None]:
def merge_matching_results(df_list, A_max_list, A_min_list, match_count_max_list, match_count_list):
    """
    Write the number of matches per row into a 'Matched' column.

    For the larger cluster this is the original weight minus the weight left
    unmatched; for the smaller cluster it is the matched weight. Rows are
    located through the 'index' column and values are assigned in row order.
    The frames in df_list are modified in place and returned. 'Matched' is
    used afterwards as the new weight (in place of FactorV).
    """
    for i, pair_df in enumerate(df_list):
        large, small = A_max_list[i], A_min_list[i]

        # No. of matches for the large cluster: original weight minus what is left over
        matched_large = large[:, 1] - match_count_max_list[i]

        in_large = pair_df['index'].isin(large[:, 0])
        pair_df.loc[in_large, 'Matched'] = matched_large

        in_small = pair_df['index'].isin(small[:, 0])
        pair_df.loc[in_small, 'Matched'] = match_count_list[i]

    return df_list

In [None]:
def check_match_size(df_list, A_max_list, A_min_list, match_count_list, match_count_max_list):
    """
    Summarise per cluster pair how much of each cluster was matched.

    Returns a DataFrame indexed by 'Cluster <large> & <small>' with the
    matched weight, that weight as a percentage of the total weight of the
    large and of the small cluster, and the number of individual rows with at
    least one match in the small and in the large cluster.
    """
    column_names = ['Cluster size', '% of matches large cluster',
                    '% of matches small cluster', 'Individual data points small cluster',
                    'Individual data points large cluster']

    size_array = np.zeros((len(df_list), 5))
    row_names = []

    for i, pair_df in enumerate(df_list):
        large, small = A_max_list[i], A_min_list[i]
        matched_small = match_count_list[i]
        matched_large = large[:, 1] - match_count_max_list[i]

        matched_total = matched_small.sum()

        size_array[i, 0] = matched_total
        size_array[i, 1] = np.round(matched_total / large[:, 1].sum() * 100, 2)
        size_array[i, 2] = np.round(matched_total / small[:, 1].sum() * 100, 2)
        size_array[i, 3] = np.count_nonzero(matched_small)
        size_array[i, 4] = np.count_nonzero(matched_large)

        # Look up the cluster numbers through the first row of each array
        min_cluster = pair_df[pair_df['index'] == int(small[0, 0])].Cluster.iloc[0]
        max_cluster = pair_df[pair_df['index'] == int(large[0, 0])].Cluster.iloc[0]
        row_names.append(f'Cluster {max_cluster} & {min_cluster}')

    summary = pd.DataFrame(size_array, columns=column_names)
    summary.index = row_names
    return summary

In [None]:
# ## WARNING! TAKES VERY LONG TO RUN!
# for i in tqdm(range(100)):
#     results = matching(df_list, n_neigh=20, caliper=0.01)

In [None]:
# pickle.dump(results, open(parent_dir + '\\Data\\New\\results_clustering_DU.pickle', 'wb'))

In [None]:
# Load the cached matching results. The file is opened in a `with` block so
# the handle is closed again, and the path is built with os.path.join.
# NOTE: unpickling can execute arbitrary code -- only load files written by this notebook.
with open(os.path.join(parent_dir, 'Data', 'New', 'results_clustering_du.pickle'), 'rb') as results_file:
    results = pickle.load(results_file)

In [None]:
# Unpack the first five entries of the cached results, in the order matching() returns them.
(A_max_list, A_min_list, match_count_max_list,
 match_count_list, match_id_list) = (results[k] for k in range(5))

In [None]:
df_list = merge_matching_results(df_list, A_max_list, A_min_list, match_count_max_list, match_count_list)

In [None]:
match_size_df = check_match_size(df_list, A_max_list, A_min_list, match_count_list, match_count_max_list)
match_size_df

The next step is to compare the demography to see if the matching worked.

In [None]:
def SMD(d1, d2):
    """Standardised mean difference between d1 and d2, in percent.

    The difference in means is divided by the root of the average of both
    variances and multiplied by 100.
    """
    sd_1 = d1.std()
    sd_2 = d2.std()
    mean_gap = d1.mean() - d2.mean()
    pooled_sd = np.sqrt((sd_1 * sd_1 + sd_2 * sd_2) / 2)

    return 100 * mean_gap / pooled_sd

In [None]:
def get_demo_differences(df_list):
    """
    Standardised mean differences of the demographic variables per cluster pair.

    Parameters:
    df_list: stacked frames per cluster pair with 'n_cluster', 'FactorV' and 'Matched'

    Returns:
    A list with one DataFrame per pair; rows are the 18 demographic variables
    (columns 1-18 of the stacked frame), columns are 'SMD before' (rows
    weighted by FactorV) and 'SMD after' (rows weighted by Matched).

    Only the demographic columns and the weight column are expanded with
    reindex_df; expanding the full frame gives the same numbers but needs far
    more memory.
    """

    def _expanded_demo(cluster, demo_cols, weight_col):
        # Repeat every row `weight_col` times, keeping only the demographic columns
        return reindex_df(cluster[demo_cols + [weight_col]], weight_col)[demo_cols]

    df_smd_list = []

    for pair_df in df_list:

        demo_cols = list(pair_df.columns[1:19])

        cluster0 = pair_df[pair_df.n_cluster == 0]
        cluster1 = pair_df[pair_df.n_cluster == 1]

        smd_before = SMD(_expanded_demo(cluster0, demo_cols, 'FactorV'),
                         _expanded_demo(cluster1, demo_cols, 'FactorV'))
        smd_after = SMD(_expanded_demo(cluster0, demo_cols, 'Matched'),
                        _expanded_demo(cluster1, demo_cols, 'Matched'))

        df_smd = pd.DataFrame([smd_before, smd_after]).T
        df_smd = df_smd.rename(columns={0: 'SMD before', 1: 'SMD after'})

        df_smd_list.append(df_smd)

    return df_smd_list

In [None]:
def plot_effect_sizes(df_effect_size_list, df_list, plot_size=(7, 3)):
    """
    Bar charts of the standardised mean difference before and after matching.

    Parameters:
    df_effect_size_list: per cluster pair a frame whose first column is the
        SMD before and second column the SMD after matching
    df_list: stacked frames per cluster pair (used for the panel titles)
    plot_size: (rows, columns) of the subplot grid

    Panels are filled row by row; panels without a matching cluster pair are
    hidden instead of raising an IndexError. Panel titles show the cluster
    numbers plus one as degree of urbanisation.

    Letters on the x-axis:
    A: Age; B: Gender; C: Income; D: Household size; E: 1 person household;
    F: 2+ person household; G: 1 parent household; H: Part time worker;
    I: Full time worker; J: Student; K: Primary education or less;
    L: lbo, vmbo; M: mbo, havo, vwo; N: Other education; O: Younger than 15;
    P: Number of household cars; Q: Driver's licence; R: Student OV
    """
    n_rows, n_cols = plot_size

    # squeeze=False keeps `ax` two-dimensional even for 1xN or Nx1 grids
    f, ax = plt.subplots(n_rows, n_cols, squeeze=False)
    f.set_figwidth(15)
    f.set_figheight(20)

    demo_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R']

    x = np.arange(len(df_effect_size_list[0]))
    n_pairs = min(len(df_effect_size_list), len(df_list))

    for k, axis in enumerate(ax.ravel()):

        if k >= n_pairs:
            axis.set_visible(False)
            continue

        j = k % n_cols
        pair_df = df_list[k]

        cluster0 = pair_df[pair_df['n_cluster'] == 0].iloc[0].Cluster
        cluster1 = pair_df[pair_df['n_cluster'] == 1].iloc[0].Cluster

        axis.bar(x - 0.15, df_effect_size_list[k].iloc[:, 0], width=0.25, label='before', color="#0072B2")
        axis.bar(x + 0.15, df_effect_size_list[k].iloc[:, 1], width=0.25, label='after', color="#E69F00")
        axis.axhline(0, color='black')
        axis.axhline(10, color="#009E73", linestyle='--', linewidth=2, label='|SMD| = 10%')
        axis.axhline(-10, color="#009E73", linestyle='--', linewidth=2)

        axis.set_title(f'Degree of urbanisation {cluster0 + 1:.0f} and {cluster1 + 1:.0f}')

        axis.set_ylim(-30, 30)

        if j == 0:
            axis.set_ylabel('Standard Mean difference [%]')
        if j == n_cols - 1:
            axis.legend(loc='upper right')

        axis.set_xticks(x, demo_letters, fontsize=8)
        axis.grid(axis='x', alpha=0.5)

In [None]:
df_smd_list = get_demo_differences(df_list) ## Takes ~4 minutes

In [None]:
plot_effect_sizes(df_smd_list, df_list, plot_size=(5, 3))

### ATE and OBE

In [None]:
def ate_obe(df_list):
    """
    Mode shares and their differences before and after matching.

    Returns an array of shape (pairs, 7, 6); the last axis runs over the six
    travel modes (columns 19-24 of the stacked frame) and the middle axis holds
    0: ATE * 100 (difference of the Matched-weighted shares, cluster 1 - cluster 0)
    1: OBE * 100 (difference of the FactorV-weighted shares, cluster 1 - cluster 0)
    2: ATE / OBE
    3, 4: Matched-weighted shares of cluster 0 and cluster 1
    5, 6: FactorV-weighted shares of cluster 0 and cluster 1
    """

    def mode_shares(cluster, weight_col):
        return reindex_df(cluster, weight_col).iloc[:, 19:25].mean()

    ate_obe_ratio = np.zeros((len(df_list), 7, 6))

    for i, pair_df in enumerate(df_list):

        cluster0 = pair_df[pair_df.n_cluster == 0]
        cluster1 = pair_df[pair_df.n_cluster == 1]

        matched0 = mode_shares(cluster0, 'Matched')
        matched1 = mode_shares(cluster1, 'Matched')
        orig0 = mode_shares(cluster0, 'FactorV')
        orig1 = mode_shares(cluster1, 'FactorV')

        ate = matched1 - matched0
        obe = orig1 - orig0

        rows = (ate * 100, obe * 100, ate / obe, matched0, matched1, orig0, orig1)
        for row, values in enumerate(rows):
            ate_obe_ratio[i, row] = values

    return ate_obe_ratio

In [None]:
ate_obe_ratio = ate_obe(df_list)

In [None]:
# One table row per cluster pair: the pair label followed by ATE, OBE and ratio per mode.
ate_obe_results = []

for i, pair_values in enumerate(ate_obe_ratio):

    pair_df = df_list[i]
    cluster0 = pair_df.loc[pair_df.n_cluster == 0, 'Cluster'].iloc[0]
    cluster1 = pair_df.loc[pair_df.n_cluster == 1, 'Cluster'].iloc[0]

    row = [f'Cluster {cluster0} & {cluster1}']
    for measure in range(3):
        row.extend(pair_values[measure])

    ate_obe_results.append(row)

In [None]:
# Column names for the ATE/OBE table: all modes for ATE, then for OBE, then for the ratio.
col1 = ['Car driver', 'Car passenger', 'Train', 'BTM', 'Bike', 'Walking']
col2 = ['ATE', 'OBE', 'ratio']

col = ['Clusters'] + [f'{mode} - {measure}' for measure in col2 for mode in col1]

In [None]:
df_ate_obe = np.round(pd.DataFrame(ate_obe_results, columns=col), 2)
df_ate_obe

In [None]:
df_ate_obe.iloc[:, 1:].mean()

Look at how each cluster differs from the others

In [None]:
l = 2
df_ate_obe[df_ate_obe.Clusters.str.contains(f'{l}')]

#### Tb results

In [None]:
# One table row per cluster pair: the pair label followed by the mode shares
# after matching (cluster 0, cluster 1) and before matching (cluster 0, cluster 1).
tb_results = []

for i, pair_values in enumerate(ate_obe_ratio):

    pair_df = df_list[i]
    cluster0 = pair_df.loc[pair_df.n_cluster == 0, 'Cluster'].iloc[0]
    cluster1 = pair_df.loc[pair_df.n_cluster == 1, 'Cluster'].iloc[0]

    row = [f'Cluster {cluster0} & {cluster1}']
    for share_kind in range(3, 7):
        row.extend(pair_values[share_kind])

    tb_results.append(row)

In [None]:
# Column names for the travel-behaviour table: M = matched, O = original, 0/1 = cluster in the pair.
col1 = ['Car driver', 'Car passenger', 'Train', 'BTM', 'Bike', 'Walking']
col2 = ['M0', 'M1', 'O0', 'O1']

col = ['Clusters'] + [f'{mode} - {share_kind}' for share_kind in col2 for mode in col1]

In [None]:
df_tb = pd.DataFrame(tb_results, columns=col)
df_tb.iloc[:, 1:] = np.round(df_tb.iloc[:, 1:] * 100, 2)
df_tb.head()

1, 2, 5, 8, 12, 17

In [None]:
# Position in df_list / df_tb of the cluster pair to plot
c = 0

cluster0 = df_list[c][df_list[c].n_cluster == 0].Cluster.iloc[0]
cluster1 = df_list[c][df_list[c].n_cluster == 1].Cluster.iloc[0]

f, ax = plt.subplots(1, 3)
f.set_figwidth(12)

x = np.arange(6)

# Left panel: mode shares before matching (df_tb columns 13-24: O0 and O1)
ax[0].bar(x - 0.18, df_tb.iloc[c, 13:19], width=0.25, label=f'Cluster {cluster0}')
ax[0].bar(x + 0.18, df_tb.iloc[c, 19:25], width=0.25, label=f'Cluster {cluster1}')
ax[0].set_title(f'Travel behaviour before matching')

# Middle panel: mode shares after matching (df_tb columns 1-12: M0 and M1)
ax[1].bar(x - 0.18, df_tb.iloc[c, 1:7], width=0.25, label=f'Cluster {cluster0}')
ax[1].bar(x + 0.18, df_tb.iloc[c, 7:13], width=0.25, label=f'Cluster {cluster1}')
ax[1].set_title(f'Travel behaviour after matching')

# Right panel: difference between the clusters before (OBE) and after (ATE) matching
ax[2].bar(x - 0.18, df_ate_obe.iloc[c, 7:13], width=0.25, label='OBE', color='firebrick')
ax[2].bar(x + 0.18, df_ate_obe.iloc[c, 1:7], width=0.25, label='ATE', color='forestgreen')
ax[2].set_title('Difference before (OBE) and after (ATE)')

for i in range(3):
    ax[i].set_xticks(x, col1, rotation=45, ha='right')
    # The two share panels get the same y-range so they can be compared directly
    if i < 2:
        ax[i].set_ylim(0, df_tb.iloc[c, 1:25].max() + 5)
        ax[i].set_yticks(np.arange(0, df_tb.iloc[c, 1:25].max() + 5, 5))
    ax[i].grid(axis='y')
    ax[i].set_axisbelow(True)
    ax[i].legend()

f.suptitle(f'Travel behaviour cluster {cluster0} and cluster {cluster1}', fontsize='xx-large');

Get mean values for demography

In [None]:
def get_demo_mean(df_list):
    """
    Mean demographic values of both clusters in every pair, before and after matching.

    The 'before' values are read from the first 18 rows of the global
    `mean_df`; the 'after' values are the means of columns 1-18 with every
    row repeated 'Matched' times, rounded to 2 decimals.

    Returns a list with one DataFrame per cluster pair (rows: variables,
    columns: before/after per cluster).
    """
    mean_list = []

    for pair_df in df_list:

        cluster0 = pair_df[pair_df.n_cluster == 0]
        cluster1 = pair_df[pair_df.n_cluster == 1]

        c0_label = cluster0.Cluster.iloc[0]
        c1_label = cluster1.Cluster.iloc[0]

        after0 = np.round(reindex_df(cluster0, 'Matched').iloc[:, 1:19].mean(), 2)
        after1 = np.round(reindex_df(cluster1, 'Matched').iloc[:, 1:19].mean(), 2)

        before0 = mean_df.iloc[:18, c0_label]
        before1 = mean_df.iloc[:18, c1_label]

        # The two unnamed 'after' series show up as 'Unnamed 0' and 'Unnamed 1'
        column_names = {c0_label: f'Before: Cluster {c0_label}', c1_label: f'Before: Cluster {c1_label}',
                        'Unnamed 0': f'After: Cluster {c0_label}', 'Unnamed 1': f'After: Cluster {c1_label}'}

        df_mean = pd.DataFrame([before0, before1, after0, after1]).T
        mean_list.append(df_mean.rename(columns=column_names))

    return mean_list

In [None]:
mean_list = get_demo_mean(df_list)

In [None]:
mean_list[0]

Calculate p-values

In [None]:
def p_vals(df_list):
    """
    One random resample of t-test p-values for the six travel modes.

    Parameters:
    df_list: stacked frames per cluster pair with 'n_cluster', 'FactorV',
        'Matched' and the mode dummies at column positions 19-24

    Returns:
    p_before_arr, p_after_arr: arrays of shape (pairs, 6) with the p-values
        of cluster 0 versus cluster 1 when rows are drawn with FactorV
        (before matching) or Matched (after matching) as multiplicity

    For each cluster the sample size is its number of rows with at least one
    match, used for both the 'before' and the 'after' sample so the two tests
    are comparable. Previously the sizes for cluster 1 were computed from the
    rows of cluster 0, which could also exceed the matched population of
    cluster 1 and make random.sample raise.

    Sampling uses the global `random` state; seed it for reproducible results.
    """
    p_before_arr = np.zeros((len(df_list), 6))
    p_after_arr = np.zeros((len(df_list), 6))

    for i in range(len(df_list)):

        cluster0 = df_list[i][df_list[i].n_cluster == 0]
        cluster1 = df_list[i][df_list[i].n_cluster == 1]

        # Sample size per cluster: number of individual rows that were matched
        size_cluster0 = int((cluster0.Matched > 0).sum())
        size_cluster1 = int((cluster1.Matched > 0).sum())

        # Sample row labels without replacement from the weight-expanded population
        sample0_before = random.sample(list(cluster0.index), k=size_cluster0,
                                       counts=cluster0.FactorV.astype(int).tolist())
        sample0_after = random.sample(list(cluster0.index), k=size_cluster0,
                                      counts=cluster0.Matched.astype(int).tolist())

        sample1_before = random.sample(list(cluster1.index), k=size_cluster1,
                                       counts=cluster1.FactorV.astype(int).tolist())
        sample1_after = random.sample(list(cluster1.index), k=size_cluster1,
                                      counts=cluster1.Matched.astype(int).tolist())

        # Calculate p-values (column-wise over the six mode dummies)
        _, p_before = sst.ttest_ind(cluster0.loc[sample0_before].iloc[:, 19:25], cluster1.loc[sample1_before].iloc[:, 19:25])
        _, p_after = sst.ttest_ind(cluster0.loc[sample0_after].iloc[:, 19:25], cluster1.loc[sample1_after].iloc[:, 19:25])

        p_before_arr[i] = p_before
        p_after_arr[i] = p_after

    return p_before_arr, p_after_arr

In [None]:
# Repeat the resampled t-tests. The random generator is seeded so that the
# success counts derived from these runs can be reproduced.
N_RESAMPLES = 1000
random.seed(42)

p_before_tot_list = []
p_after_tot_list = []

for _ in range(N_RESAMPLES):
    p_before_arr, p_after_arr = p_vals(df_list)

    p_before_tot_list.append(p_before_arr)
    p_after_tot_list.append(p_after_arr)

In [None]:
# Count, per cluster pair and mode, in how many of the 1000 resamples the
# p-value was above 0.05.
p_before_succeed = np.zeros(p_after_arr.shape)
p_after_succeed = np.zeros(p_after_arr.shape)

for i in range(1000):
    p_before_succeed += (p_before_tot_list[i] > 0.05)
    p_after_succeed += (p_after_tot_list[i] > 0.05)



In [None]:
# Build a table with, per cluster pair and mode, the share of resamples with p > 0.05
p_results = []

for i in range(len(ate_obe_ratio)):

    cluster0 = df_list[i][df_list[i].n_cluster == 0].Cluster.iloc[0]
    cluster1 = df_list[i][df_list[i].n_cluster == 1].Cluster.iloc[0]

    # The 'after' counts come first, then the 'before' counts (same order as col2 below)
    p_results.append([f'Cluster {cluster0} & {cluster1}', *p_after_succeed[i], *p_before_succeed[i]])


col1 = ['Car driver', 'Car passenger', 'Train', 'BTM', 'Bike', 'Walking']
col2 = ['p-after', 'p_before']

col = ['Clusters']

for i in range(2):
    for j in range(6):
        col.append(f'{col1[j]} - {col2[i]}')

df_p = np.round(pd.DataFrame(p_results, columns=col), 0)
# Counts out of 1000 resamples divided by 10 give percentages
df_p.iloc[:, 1:] = df_p.iloc[:, 1:] / 10

In [None]:
df_p

In [None]:
# ATE (columns 1-6) and OBE (columns 7-12) values per cluster pair
ate_obe_array = np.array(df_ate_obe.iloc[:, 1:13])
# Blank out every value whose difference was not significant (p > 0.05) in at
# least 5% of the resamples. df_p holds the 'after' shares first and the
# 'before' shares second, so they line up with the ATE and OBE columns.
ate_obe_array[np.array(df_p.iloc[:, 1:]) >= 5] = np.nan

In [None]:
df_ao = df_ate_obe.copy()
df_ao.iloc[:, 1:13] = ate_obe_array

In [None]:
ratio = ate_obe_array[:, :6] / ate_obe_array[:, 6:]
df_ao.iloc[:, 13:] = np.round(ratio, 2)

In [None]:
df_ao

### Save to csv

In [None]:
# The path is built with os.path.join so the export also works outside Windows.
df_ate_obe.to_csv(os.path.join(parent_dir, 'Data', 'New', 'ate_obe_DU.csv'))

In [None]:
# The path is built with os.path.join so the export also works outside Windows.
df_p.to_csv(os.path.join(parent_dir, 'Data', 'New', 'p_DU.csv'))

In [None]:
# The path is built with os.path.join so the export also works outside Windows.
df_ao.to_csv(os.path.join(parent_dir, 'Data', 'New', 'ate_obe_DU_corrected.csv'))

In [None]:
# The path is built with os.path.join so the export also works outside Windows.
df_tb.to_csv(os.path.join(parent_dir, 'Data', 'New', 'modalsplit_matched_DU.csv'))

In [None]:
df_ao