# Modeling the disparity between experienced and residential nativity segregation

In [1]:
# Enable the autoreload extension in mode 2 for this kernel session.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded absolute Windows path. Every relative 'results/...'
# path used in later cells resolves against this directory, so the notebook
# only runs unchanged on a machine with this exact layout.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import pandas as pd
import os
from scipy.stats import distributions
import numpy as np
from statsmodels.stats.weightstats import DescrStatsW
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm import tqdm
import seaborn as sns

In [5]:
# Reset rcParams to matplotlib's defaults before applying the style below.
mpl.rcParams.update(mpl.rcParamsDefault)
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names. Use whichever name this installation
# ships; if neither is listed, fall back to the original name so the failure
# mode is unchanged.
_colorblind_style = next(
    (name for name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
     if name in mpl.style.available),
    'seaborn-colorblind',
)
mpl.style.use(_colorblind_style)
# Global font size for all figures.
font = {'size': 14}
mpl.rc('font', **font)

In [6]:
# Load the individual-level modelling table and keep only the rows flagged
# weekday == 1 and holiday == 0.
df_exp = pd.read_parquet('results/data4model_individual.parquet')
is_weekday_non_holiday = (df_exp['weekday'] == 1) & (df_exp['holiday'] == 0)
df_exp = df_exp.loc[is_weekday_non_holiday, :]

## 1. Select variables

In [7]:
# Columns retained for the disparity analysis.
cols = [
    'uid',                  # individual identifier
    'region',               # home-region code (parsed in section 1.1)
    'wt_p',                 # weight used for all weighted statistics below
    'Lowest income group',
    'car_ownership',
    'cum_jobs',
    'cum_stops',
    'ice_birth_resi',       # presumably the residential nativity ICE - TODO confirm
    'ice_birth',            # presumably the experienced nativity ICE - TODO confirm
]
df_exp = df_exp[cols]

### 1.1 Add labels of home regions (A, B, and C)
A DeSO in category A is mostly outside major population concentrations or urban areas. A DeSO in category B is mostly located in a population concentration or agglomeration, but not in the municipality's central location. Category C comprises DeSOs that are mostly located in the municipality's central location. In total, 18 percent of DeSOs are found in category A, 10 percent in category B, and 72 percent in category C.

Source: [SCB.se](https://www.scb.se/hitta-statistik/regional-statistik-och-kartor/regionala-indelningar/deso---demografiska-statistikomraden/)

In [8]:
# Split the region code into its parts, e.g. '1284C1040' -> deso_2 '12'
# (first two characters) and region_cat 'C' (fifth character, the A/B/C label).
df_exp['deso_2'] = df_exp['region'].map(lambda code: code[:2])
df_exp['region_cat'] = df_exp['region'].map(lambda code: code[4])

## 2. Grouping segregation patterns

In [9]:
def seg_direction(row):
    """Return a two-letter code for the signs of the two ICE values in `row`.

    The first letter is derived from row['ice_birth_resi'] and the second from
    row['ice_birth']: 'F' when the value is strictly negative, 'D' otherwise
    (zero and NaN therefore map to 'D').
    NOTE(review): 'F'/'D' presumably stand for foreign-born / domestic-born
    dominated - confirm against the project documentation.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'ice_birth_resi' and 'ice_birth'
        (e.g. a DataFrame row).

    Returns
    -------
    str
        One of 'FF', 'FD', 'DF', 'DD'.
    """
    resi_code = 'F' if row['ice_birth_resi'] < 0 else 'D'
    exp_code = 'F' if row['ice_birth'] < 0 else 'D'
    return resi_code + exp_code

def delta_ice(ice_r, ice_e):
    """Return the gap between two ICE values, sign-flipped when both are negative.

    The raw gap is ``ice_e - ice_r``. When both inputs are strictly negative
    the gap is negated, which makes the result equal to the change in
    magnitude (``|ice_e| - |ice_r|``), as it already is when both inputs are
    non-negative. When the signs differ the raw signed gap is returned.

    Parameters
    ----------
    ice_r : float
        First ICE value (passed from 'ice_birth_resi' by the caller).
    ice_e : float
        Second ICE value (passed from 'ice_birth' by the caller).

    Returns
    -------
    float
    """
    gap = ice_e - ice_r
    both_negative = (ice_r < 0) & (ice_e < 0)
    return -gap if both_negative else gap

def seg_abs_direction(x):
    """Label a gap value as 'inc' (x >= 0) or 'dec' (anything else, incl. NaN)."""
    return 'inc' if x >= 0 else 'dec'

In [10]:
# seg_gap: sign-adjusted difference between the two ICE columns (see delta_ice).
df_exp['seg_gap'] = df_exp.apply(
    lambda rec: delta_ice(rec['ice_birth_resi'], rec['ice_birth']), axis=1)
# seg_cross: two-letter sign pattern ('FF', 'FD', 'DF', 'DD').
df_exp['seg_cross'] = df_exp.apply(seg_direction, axis=1)
# seg_change: 'inc'/'dec' from the sign of seg_gap, overridden with 'cross'
# wherever the two ICE values have different sign patterns.
df_exp['seg_change'] = df_exp['seg_gap'].apply(seg_abs_direction)
df_exp['seg_change'] = df_exp['seg_change'].mask(
    df_exp['seg_cross'].isin(['DF', 'FD']), 'cross')
# Show one record as a sanity check of the derived columns.
df_exp.iloc[0]

uid                    00008608-f79e-414d-bf1c-25632d6bc059
region                                            1284C1040
wt_p                                              84.428571
Lowest income group                                0.088063
car_ownership                                      0.541455
cum_jobs                                        4813.309325
cum_stops                                               3.0
ice_birth_resi                                     0.324146
ice_birth                                          0.041918
deso_2                                                   12
region_cat                                                C
seg_gap                                           -0.282228
seg_cross                                                DD
seg_change                                              dec
Name: 570844, dtype: object

In [16]:
# Persist the individual-level table with the derived seg_gap / seg_cross /
# seg_change columns.
# NOTE(review): assumes the directory results/transport_association already
# exists relative to the working directory - confirm.
df_exp.to_parquet("results/transport_association/seg_disparity_patterns.parquet")

### 2.1 Statistics of these groups

In [11]:
# Weighted share (%) of each seg_cross x region_cat cell in the whole sample.
wt_by_cell = df_exp.groupby(['seg_cross', 'region_cat'])['wt_p'].sum()
wt_by_cell / df_exp.wt_p.sum() * 100

seg_cross  region_cat
DD         A              8.127966
           B              4.470961
           C             19.642708
DF         A              6.061887
           B              2.961368
           C             24.473024
FD         A              0.011884
           B              0.071990
           C              0.984606
FF         A              0.083013
           B              1.199437
           C             31.911157
Name: wt_p, dtype: float64

## 3. Quantify segregation disparity

### 3.1 Calculate boxplot stats

In [12]:
def seg_grp_stats_com(data, var=None, total_weight=None):
    """Weighted summary statistics of one variable for one group of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must contain the column named by `var` and
        the weight column 'wt_p'.
    var : str
        Name of the column to summarise.
    total_weight : float, optional
        Denominator for the 'share' percentage. When omitted, the summed
        'wt_p' of the module-level ``df_exp`` is used, which is the original
        behaviour. Passing it explicitly removes the dependency on that
        global and lets the caller choose the reference population.

    Returns
    -------
    pandas.Series
        Keys, in order: 'share' (group weight as % of `total_weight`),
        'mean' (weighted mean), 'q25', 'q50', 'q75' (weighted quartiles)
        and 'var' (the name of the summarised column).
    """
    if total_weight is None:
        # Backward-compatible default: share relative to the global sample.
        total_weight = df_exp.wt_p.sum()
    weighted = DescrStatsW(data[var], weights=data['wt_p'], ddof=1)
    q25, q50, q75 = weighted.quantile([0.25, 0.50, 0.75]).values
    return pd.Series({
        'share': data.wt_p.sum() / total_weight * 100,
        'mean': weighted.mean,
        'q25': q25,
        'q50': q50,
        'q75': q75,
        'var': var,
    })

def stats_calculation(df=None, focus_thr=False):
    """Per-group weighted statistics for the four explanatory variables.

    Groups `df` by seg_cross x seg_change x region_cat and applies
    ``seg_grp_stats_com`` once per variable, stacking the four result tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Individual-level table with the columns 'deso_2', 'seg_cross',
        'seg_change', 'region_cat', 'wt_p' and the four summarised variables.
    focus_thr : bool
        When True, only rows whose 'deso_2' is '01', '12' or '14' are used.
        NOTE(review): presumably the three metropolitan counties - confirm.

    Returns
    -------
    pandas.DataFrame
        One row per group and variable, with human-readable 'region_cat' and
        'seg_change' labels and a 'focus' column ('yes'/'no'). The index is
        not reset after concatenation, so index values repeat.
        NOTE(review): the subset selected here is not handed to
        seg_grp_stats_com as the 'share' denominator, so 'share' stays
        relative to that helper's default total - confirm this is intended.
    """
    if focus_thr:
        subset = df.loc[df['deso_2'].isin(['01', '12', '14']), :]
    else:
        subset = df.copy()

    group_keys = ['seg_cross', 'seg_change', 'region_cat']
    variables = ['Lowest income group', 'car_ownership', 'cum_jobs', 'cum_stops']
    df_seg = pd.concat([
        subset.groupby(group_keys)
              .apply(lambda grp: seg_grp_stats_com(grp, var))
              .reset_index()
        for var in variables
    ])

    region_labels = {'A': 'Rural residents',
                     'B': 'Suburban residents',
                     'C': 'Urban residents'}
    change_labels = {'inc': 'Increase',
                     'dec': 'Decrease',
                     'cross': 'Cross'}
    df_seg.loc[:, 'region_cat'] = df_seg['region_cat'].map(region_labels)
    df_seg.loc[:, 'seg_change'] = df_seg['seg_change'].map(change_labels)
    df_seg.loc[:, 'focus'] = 'yes' if focus_thr else 'no'
    return df_seg

In [13]:
# Stack the statistics for the full sample (focus 'no') and for the focus
# subset (focus 'yes'), in that order.
df_seg = pd.concat([stats_calculation(df=df_exp, focus_thr=use_focus)
                    for use_focus in (False, True)])

In [18]:
# Persist the group statistics; section 3.2 says the plotting is done in R.
# NOTE(review): the file name spells 'disarity' (not 'disparity'). It is left
# untouched because other scripts may read this exact path - confirm before
# renaming.
df_seg.to_parquet('results/transport_association/seg_disarity_patterns_stats.parquet')
# Display the table as the cell output.
df_seg

Unnamed: 0,seg_cross,seg_change,region_cat,share,mean,q25,q50,q75,var,focus
0,DD,Decrease,Rural residents,8.056738,0.210380,0.000000,0.187500,0.321429,Lowest income group,no
1,DD,Decrease,Suburban residents,4.231991,0.194271,0.075000,0.168919,0.292683,Lowest income group,no
2,DD,Decrease,Urban residents,18.317019,0.169147,0.078947,0.148855,0.238095,Lowest income group,no
3,DD,Increase,Rural residents,0.071227,0.302991,0.125000,0.299242,0.500000,Lowest income group,no
4,DD,Increase,Suburban residents,0.238969,0.261078,0.121212,0.229508,0.363636,Lowest income group,no
...,...,...,...,...,...,...,...,...,...,...
13,FF,Decrease,Suburban residents,0.090218,20.570029,6.000000,17.000000,23.000000,cum_stops,yes
14,FF,Decrease,Urban residents,5.237716,119.392993,18.000000,29.000000,94.000000,cum_stops,yes
15,FF,Increase,Rural residents,0.043086,18.492562,10.000000,17.000000,26.000000,cum_stops,yes
16,FF,Increase,Suburban residents,0.426397,13.587079,6.000000,11.000000,20.000000,cum_stops,yes


### 3.2 Visualization
This part is done in `30-1-seg-disp-patterns-cat.R`.

## 4. Weighted Mann–Whitney U test

In [16]:
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri as rpyn
# Import the R 'survey' package through rpy2. The returned handle is not
# referenced again in the visible cells.
# NOTE(review): presumably this import matters for its side effect of making
# svydesign / svyranktest available to the ro.r(...) calls in wmu_test - confirm.
r_weights = importr('survey')

In [38]:
def wmu_test(data=None, region=None, grps=None, var=None, weight=None):
    """Weighted two-sample rank test (R ``survey::svyranktest``, "wilcoxon").

    Two samples of `var` are drawn from rows with ``region_cat == region``:

    * one element in `grps`: sample 1 is that seg_cross group with
      seg_change 'dec', sample 2 the same group with seg_change 'inc';
    * otherwise: sample 1 is seg_cross ``grps[1]`` and sample 2 is
      seg_cross ``grps[0]`` (note the reversed order).

    The samples and their concatenated weights are assigned into the R
    global environment under the names group1, group2, weights (overwriting
    any previous values), followed by data, design, result, est and pvalue
    created by the R statements.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'region_cat', 'seg_cross', 'seg_change', `var`, `weight`.
    region : str
        Value of 'region_cat' to restrict to.
    grps : list of str
        One or two 'seg_cross' labels, see above.
    var : str
        Column holding the tested variable.
    weight : str
        Column holding the sampling weights.

    Returns
    -------
    tuple
        ``(est, pvalue)``: first element of R's ``result$estimate`` and of
        ``result$p.value``.
    """
    in_region = data.region_cat == region
    wanted_cols = [var, weight]
    if len(grps) == 1:
        same_cross = in_region & (data.seg_cross == grps[0])
        data1 = data.loc[same_cross & (data.seg_change == 'dec'), wanted_cols]
        data2 = data.loc[same_cross & (data.seg_change == 'inc'), wanted_cols]
    else:
        data1 = data.loc[in_region & (data.seg_cross == grps[1]), wanted_cols]
        data2 = data.loc[in_region & (data.seg_cross == grps[0]), wanted_cols]

    # Weights follow the same order as c(group1, group2) on the R side.
    r_inputs = (
        ('group1', data1[var].values),
        ('group2', data2[var].values),
        ('weights', np.concatenate([data1[weight].values, data2[weight].values])),
    )
    for r_name, values in r_inputs:
        ro.r.assign(r_name, rpyn.numpy2rpy(values))

    # The R statements depend on each other and must run in this order.
    r_statements = (
        '''data <- data.frame(group = c(group1, group2),
                    group_indicator = rep(c(1, 2), c(length(group1), length(group2))))''',
        '''design <- svydesign(ids = ~0, data = data, weights = ~weights)''',
        '''result <- svyranktest(formula = group ~ group_indicator, design=design, test = "wilcoxon")''',
        '''est <- unname(result$estimate)''',
        '''pvalue <- unname(result$p.value)''',
    )
    for statement in r_statements:
        ro.r(statement)

    return ro.globalenv['est'][0], ro.globalenv['pvalue'][0]

In [46]:
# Run the weighted rank test for every variable x region x group comparison.
weight = 'wt_p'
test_vars = ['Lowest income group', 'car_ownership', 'cum_jobs', 'cum_stops']
test_regions = ['A', 'B', 'C']
# One label: 'dec' vs 'inc' within that group; two labels: compare the groups.
test_groups = [['DD'], ['DF', 'DD'], ['FD', 'FF'], ['FF']]

res_list = []
for var in test_vars:
    for rg in test_regions:
        for grp in test_groups:
            est, pvalue = wmu_test(data=df_exp, region=rg, grps=grp, var=var, weight=weight)
            # Each result row is labelled by the first group of the comparison.
            grp_type = grp[0]
            # '*' marks p-values that are not above 0.001.
            sig = '-' if pvalue > 0.001 else '*'
            res_list.append((var, rg, grp_type, pvalue, sig, est))

result_columns = ['var', 'region_cat', 'seg_cross', 'p', 'sig', 'difference in mean rank score']
df_res = pd.DataFrame(res_list, columns=result_columns)
region_names = {'A': 'Rural residents',
                'B': 'Suburban residents',
                'C': 'Urban residents'}
df_res.loc[:, 'region_cat'] = df_res['region_cat'].map(region_names)
df_res

Unnamed: 0,var,region_cat,seg_cross,p,sig,difference in mean rank score
0,Lowest income group,Rural residents,DD,0.0001988216,*,0.116499
1,Lowest income group,Rural residents,DF,0.1835285,-,0.004824
2,Lowest income group,Rural residents,FD,0.006308765,-,-0.177649
3,Lowest income group,Rural residents,FF,0.8085682,-,-0.015457
4,Lowest income group,Suburban residents,DD,6.309561e-10,*,0.091217
5,Lowest income group,Suburban residents,DF,6.159356e-46,*,0.076626
6,Lowest income group,Suburban residents,FD,0.007452969,-,-0.069051
7,Lowest income group,Suburban residents,FF,0.5533845,-,-0.008221
8,Lowest income group,Urban residents,DD,4.1394939999999994e-50,*,0.111313
9,Lowest income group,Urban residents,DF,1.676484e-95,*,0.04897


In [40]:
# Copy the test results (without the index) to the clipboard for pasting.
# NOTE(review): this presumably needs a system clipboard, so it may fail on a
# headless Run All - confirm.
df_res.to_clipboard(index=False)

In [47]:
# Persist the test results (variable, region, group, p-value, significance
# flag and estimate) for later use.
df_res.to_parquet('results/transport_association/stats_test.parquet')