# Group exposure analysis
"Group" refers to the F, D (N), and N (M) groups, which are defined by each individual's residential segregation level.


In [1]:
# Enable IPython's autoreload extension (mode 2) for this session.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded, machine-specific working directory; the relative
# 'results/...' paths used later in this notebook are resolved against it.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import sqlalchemy
from lib import preprocess
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm import tqdm
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.ticker import FormatStrFormatter
import helpers as hp
# Reset matplotlib's rcParams to the library defaults, then set the font size to 14.
mpl.rcParams.update(mpl.rcParamsDefault)
font = {'size': 14}
mpl.rc('font', **font)

In [3]:
# Data location: PostgreSQL on localhost; connection settings are read from
# preprocess.keys_manager['database'].
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
# NOTE(review): user and password are interpolated into the URL unescaped;
# characters such as '@', ':' or '/' would presumably need URL-encoding - confirm.
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Load data
### 1.1 Individual attributes

In [4]:
df = pd.read_parquet('results/data4model_individual_hex_w1h0.parquet')
# Derive two columns from the region code:
#   deso_2     - its first two characters
#   region_cat - 'Rural/Suburban' when the fifth character is 'A' or 'B', otherwise 'Urban'
df = df.assign(
    deso_2=df['region'].apply(lambda code: code[:2]),
    region_cat=df['region'].apply(
        lambda code: 'Urban' if code[4] not in ('A', 'B') else 'Rural/Suburban'
    ),
)

In [5]:
# Weighted share (%) of each residential group, using wt_p as the weight
grp_weight = df.groupby('grp_r')['wt_p'].sum()
grp_weight / df['wt_p'].sum() * 100

grp_r
D    42.292131
F    17.605313
N    40.102556
Name: wt_p, dtype: float64

In [6]:
# Group shares (%) rounded from the output of the previous cell;
# data_prep() subtracts them from the exposure shares.
D = 42.3
F = 17.6
N = 40.1

### 1.2 Individual interactions

In [17]:
def inter_proccess(data):
    """Turn summed f/d/n interaction counts into percentage shares.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows carrying the columns 'f', 'd' and 'n'.

    Returns
    -------
    pandas.Series
        Index 'f', 'd', 'n'. Each entry is the column sum divided by the sum
        of all three columns, times 100. When that grand total is not
        positive, all three entries are 0.
    """
    sums = {key: data[key].sum() for key in ('f', 'd', 'n')}
    total = sums['f'] + sums['d'] + sums['n']
    # Guard first: nothing to normalise when no interaction was recorded.
    if not total > 0:
        return pd.Series({'f': 0, 'd': 0, 'n': 0})
    return pd.Series({key: value / total * 100 for key, value in sums.items()})

In [18]:
# Register .progress_apply on pandas objects before it is used below
tqdm.pandas()
# Read the individual-level interaction table and reduce it to one row of
# f/d/n percentage shares per uid
df_inter = (
    pd.read_sql(sql='''SELECT * FROM bipartite_graph.hex_interactions_indi;''', con=engine)
    .groupby('uid')
    .progress_apply(inter_proccess)
    .reset_index()
)
df_inter.to_parquet('results/plot/group_exposure.parquet', index=False)

  0%|          | 0/321594 [00:00<?, ?it/s]

### 1.3 Simulations - interactions

In [26]:
def inter_proccess_sim(data):
    """Summarise the 'f', 'd' and 'n' columns by median and quartiles.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows carrying the columns 'f', 'd' and 'n' (one group per individual
        when used with groupby('uid')).

    Returns
    -------
    pandas.Series
        Entries f_50, d_50, n_50 (medians), f_25, d_25, n_25 (0.25 quantiles)
        and f_75, d_75, n_75 (0.75 quantiles), in that order.
    """
    reducers = (
        ('50', lambda col: col.median()),
        ('25', lambda col: col.quantile(0.25)),
        ('75', lambda col: col.quantile(0.75)),
    )
    summary = {}
    for suffix, reduce_col in reducers:
        for name in ('f', 'd', 'n'):
            summary[f'{name}_{suffix}'] = reduce_col(data[name])
    return pd.Series(summary)

In [27]:
# Register .progress_apply on pandas objects before it is used below
tqdm.pandas()
# Simulation 1: read the interaction table and summarise it per uid
# (median and quartiles of f/d/n)
df_inter_s1 = (
    pd.read_sql(sql='''SELECT * FROM bipartite_graph.hex_interactions_indi_sim1;''', con=engine)
    .groupby('uid')
    .progress_apply(inter_proccess_sim)
    .reset_index()
)
df_inter_s1.to_parquet('results/plot/group_exposure_sim1.parquet', index=False)

  0%|          | 0/278005 [00:00<?, ?it/s]

In [28]:
# Register .progress_apply on pandas objects before it is used below
tqdm.pandas()
# Simulation 2: read the interaction table and summarise it per uid
# (median and quartiles of f/d/n)
df_inter_s2 = (
    pd.read_sql(sql='''SELECT * FROM bipartite_graph.hex_interactions_indi_sim2;''', con=engine)
    .groupby('uid')
    .progress_apply(inter_proccess_sim)
    .reset_index()
)
df_inter_s2.to_parquet('results/plot/group_exposure_sim2.parquet', index=False)

  0%|          | 0/283659 [00:00<?, ?it/s]

### 1.4 Combine individual level exposure together
For the simulated results, take the median value across each individual's 100 simulations.

In [12]:
df = pd.read_parquet('results/plot/group_exposure.parquet')
df.loc[:, 'src'] = 'Empirical'
df1 = pd.read_parquet('results/plot/group_exposure_sim1.parquet')[['uid', 'f_50', 'd_50', 'n_50']]
df1.columns = ['uid', 'f', 'd', 'n']
df1.loc[:, 'src'] = 'Sim1'
df2 = pd.read_parquet('results/plot/group_exposure_sim2.parquet')[['uid', 'f_50', 'd_50', 'n_50']]
df2.columns = ['uid', 'f', 'd', 'n']
df2.loc[:, 'src'] = 'Sim2'
df = pd.concat([df, df1, df2])

In [None]:
df.head()

In [14]:
df.to_parquet('results/statistical_tests/exposure.parquet', index=False)

## 2. Combine data and analyze exposure

In [21]:
# Prepare data for visualization in R
def data_prep(data=None, data_inter=None, file2save=None, sim=False):
    """Merge exposure shares onto individuals and save their deviations in long form.

    Parameters
    ----------
    data : pandas.DataFrame
        Individual table; must contain 'uid', 'wt_p' and 'grp_r'.
    data_inter : pandas.DataFrame
        Exposure table keyed by 'uid'. Its 'f', 'n', 'd' columns are used, or
        'f_50', 'n_50', 'd_50' when ``sim`` is truthy.
    file2save : str
        Path of the parquet file to write.
    sim : bool
        Select the '_50' columns instead of the plain ones.

    The notebook-level constants F, N and D are subtracted from the matching
    exposure column to give 'f_diff', 'n_diff' and 'd_diff'. Nothing is returned.
    """
    merged = pd.merge(data, data_inter, on='uid', how='left')
    suffix = '_50' if sim else ''
    for letter, share in (('f', F), ('n', N), ('d', D)):
        merged.loc[:, f'{letter}_diff'] = merged.loc[:, f'{letter}{suffix}'] - share
    # Long format: one row per individual and deviation variable
    long_form = pd.melt(merged, id_vars=['wt_p', 'grp_r'],
                        value_vars=['f_diff', 'n_diff', 'd_diff'])
    long_form.to_parquet(file2save, index=False)

In [22]:
# Empirical exposure -> deviations in long format.
# `df` must be the individual attributes table (uid, wt_p, grp_r) here.
file_input, file2save = ('results/plot/group_exposure.parquet',
                         'results/plot/group_exposure_plot.parquet')
df_inter_emp = pd.read_parquet(file_input)
data_prep(data=df, data_inter=df_inter_emp, file2save=file2save)

In [23]:
# Simulated exposure (sim1, then sim2) -> deviations in long format,
# based on the per-individual medians (sim=True).
sim_files = (
    ('results/plot/group_exposure_sim1.parquet', 'results/plot/group_exposure_sim1_plot.parquet'),
    ('results/plot/group_exposure_sim2.parquet', 'results/plot/group_exposure_sim2_plot.parquet'),
)
for file_input, file2save in sim_files:
    data_prep(data=df, data_inter=pd.read_parquet(file_input), file2save=file2save, sim=True)

In [5]:
# Combine data
# Maps (individual's group in grp_r, exposure variable) to
# (interaction label, group number).
inter_type_dict = {
    ('F', 'f_diff'): ('FF', 1),
    ('D', 'd_diff'): ('DD', 1),
    ('N', 'n_diff'): ('NN', 1),
    ('F', 'd_diff'): ('FD', 2),
    ('D', 'f_diff'): ('DF', 2),
    ('N', 'f_diff'): ('NF', 3),
    ('N', 'd_diff'): ('ND', 3),
    ('F', 'n_diff'): ('FN', 3),
    ('D', 'n_diff'): ('DN', 3)
}
df_inter_combined = []
for i, sr in zip(('', '_sim1', '_sim2'), ('Empirical', 'No-homophily', 'Equalized mobility & no-homophily')):
    print(f'Process - {sr}')
    df_p = pd.read_parquet(f'results/plot/group_exposure{i}_plot.parquet')
    # Look each row up once instead of running two row-wise DataFrame.apply
    # passes over the same keys; an unmapped (grp_r, variable) pair still
    # raises KeyError.
    labels = [inter_type_dict[key] for key in zip(df_p['grp_r'], df_p['variable'])]
    df_p = df_p.assign(
        inter_type=[label for label, _ in labels],
        Group=[group for _, group in labels],
        Source=sr,
    ).drop(columns=['variable'])
    df_inter_combined.append(df_p)
df_inter_combined = pd.concat(df_inter_combined)
df_inter_combined.to_parquet('results/plot/group_exposure_plot_combined.parquet', index=False)
df_inter_combined.head()

Process - Empirical
Process - No-homophily
Process - Equalized mobility & no-homophily


Unnamed: 0,wt_p,grp_r,value,inter_type,Group,Source
0,84.428571,D,-1.583836,DF,2,Empirical
1,26.753623,F,4.416642,FF,1,Empirical
2,40.516129,D,-8.960143,DF,2,Empirical
3,13.445455,N,-14.116879,NF,3,Empirical
4,54.882353,N,-7.456915,NF,3,Empirical


## 3. Statistical test of exposure results

In [4]:
# Reload the combined long-format table written in section 2 and show its first row
df_i = pd.read_parquet('results/plot/group_exposure_plot_combined.parquet')
df_i.iloc[0]

wt_p          84.428571
grp_r                 D
value         -1.583836
inter_type           DF
Group                 2
Source        Empirical
Name: 0, dtype: object

### 3.1 Median bootstrap

In [33]:
def inter_stats(x=None):
    """Return the bootstrapped median and its error for one group of rows.

    Delegates to ``hp.bootstrap_median_and_error`` with target column 'value',
    weight column 'wt_p' and n_bootstrap=1000. The two values it returns are
    labelled 'median_estimate' and 'se_median' in the resulting pandas.Series.
    """
    estimate, std_err = hp.bootstrap_median_and_error(
        x, target_col='value', weight_col='wt_p', n_bootstrap=1000
    )
    return pd.Series({'median_estimate': estimate, 'se_median': std_err})

In [34]:
tqdm.pandas()
# One bootstrap summary per (Source, inter_type) combination
df_iv = (
    df_i
    .groupby(['Source', 'inter_type'])
    .progress_apply(inter_stats)
    .reset_index()
)
df_iv

100%|██████████| 27/27 [08:52<00:00, 19.72s/it]


Unnamed: 0,Source,inter_type,median_estimate,se_median
0,Empirical,DD,17.745048,0.142632
1,Empirical,DF,-8.681245,0.045175
2,Empirical,DN,-10.550956,0.111868
3,Empirical,FD,-23.319673,0.078587
4,Empirical,FF,29.97628,0.171519
5,Empirical,FN,-8.438452,0.102733
6,Empirical,ND,-16.923267,0.055122
7,Empirical,NF,-3.989602,0.02545
8,Empirical,NN,18.36098,0.062204
9,Equalized mobility & no-homophily,DD,-1.794916,0.018945


In [37]:
# Show only the FF, DD, NN, FD and DF interaction types, ordered by type
selected_types = ['FF', 'DD', 'NN', 'FD', 'DF']
df_iv[df_iv['inter_type'].isin(selected_types)].sort_values(by='inter_type')

Unnamed: 0,Source,inter_type,median_estimate,se_median
0,Empirical,DD,17.745048,0.142632
9,Equalized mobility & no-homophily,DD,-1.794916,0.018945
18,No-homophily,DD,0.599048,0.0999
1,Empirical,DF,-8.681245,0.045175
10,Equalized mobility & no-homophily,DF,0.595594,0.006929
19,No-homophily,DF,-3.059763,0.022428
3,Empirical,FD,-23.319673,0.078587
12,Equalized mobility & no-homophily,FD,-7.252026,0.011227
21,No-homophily,FD,-14.342763,0.083179
4,Empirical,FF,29.97628,0.171519


### 3.2 Weighted Wilcoxon test comparing with 0

In [68]:
def comp_zero(x):
    """Run the weighted Wilcoxon helper against a baseline of 0 for one group.

    Calls ``hp.weighted_wilcoxon`` with value column 'value', weight column
    'wt_p', baseline=0 and n_bootstrap=1000, and labels the first four
    elements of its result as 'p', 'p_std', 'stats' and 'stats_std'.
    """
    res = hp.weighted_wilcoxon(x, value_col='value', weight_col='wt_p', baseline=0, n_bootstrap=1000)
    labels = ('p', 'p_std', 'stats', 'stats_std')
    return pd.Series({label: res[pos] for pos, label in enumerate(labels)})

In [69]:
tqdm.pandas()
# One test against zero per (Source, inter_type) combination
df_iv0 = (
    df_i
    .groupby(['Source', 'inter_type'])
    .progress_apply(comp_zero)
    .reset_index()
)
df_iv0

100%|██████████| 27/27 [13:19<00:00, 29.63s/it]


Unnamed: 0,Source,inter_type,p,p_std,stats,stats_std
0,Empirical,DD,0.0,0.0,726287100.0,5738036.0
1,Empirical,DF,0.0,0.0,685094400.0,6762584.0
2,Empirical,DN,0.0,0.0,1070726000.0,7635589.0
3,Empirical,FD,0.0,0.0,21723510.0,668784.2
4,Empirical,FF,0.0,0.0,8893003.0,262577.0
5,Empirical,FN,0.0,0.0,252018800.0,2518931.0
6,Empirical,ND,0.0,0.0,591596500.0,6682515.0
7,Empirical,NF,0.0,0.0,2634422000.0,14222820.0
8,Empirical,NN,0.0,0.0,263718700.0,3615165.0
9,Equalized mobility & no-homophily,DD,0.0,0.0,1600944000.0,9486316.0


In [71]:
# Attach the test results to the bootstrap medians and save them together
stat_keys = ['Source', 'inter_type']
df_i_s = pd.merge(df_iv, df_iv0, how='left', on=stat_keys)
df_i_s.to_parquet('results/statistical_tests/group_exposure.parquet', index=False)

### 3.3 Significant deviation from random mix

In [5]:
# Deviation thresholds keyed by group letter; inter_deviation_test() looks one
# up with the last letter of inter_type.
# NOTE(review): the origin of the values 16 / 11 / 16 is not shown in this notebook.
thre_dict = dict(D=16, F=11, N=16)

In [6]:
# Test if at the aggregate level segregated
def inter_deviation_test(data=None):
    """Flag whether one interaction type deviates beyond its threshold.

    The threshold is ``thre_dict`` looked up with the last letter of the
    group's 'inter_type'. ``hp.weighted_wilcoxon`` is first called with
    baseline=+threshold and m='greater'. Only if its first result is above
    0.001 is it called again with baseline=-threshold and m='less'.

    Returns
    -------
    pandas.Series
        Single entry 'deviation': 0 when both first results are above 0.001,
        otherwise 1.
    """
    pair = data['inter_type'].values[0]
    bound = thre_dict[pair[-1]]
    shared = dict(data=data, value_col='value', weight_col='wt_p', n_bootstrap=1000)

    p_above = hp.weighted_wilcoxon(baseline=bound, m='greater', **shared)[0]
    if not p_above > 0.001:
        return pd.Series({'deviation': 1})

    # The lower bound is only tested when the upper-bound test did not flag a deviation
    p_below = hp.weighted_wilcoxon(baseline=-bound, m='less', **shared)[0]
    return pd.Series({'deviation': 0 if p_below > 0.001 else 1})

tqdm.pandas()
# One deviation flag per (Source, inter_type) combination
df_iv_dev = (
    df_i
    .groupby(['Source', 'inter_type'])
    .progress_apply(inter_deviation_test)
    .reset_index()
)
df_iv_dev

100%|██████████| 27/27 [23:56<00:00, 53.20s/it]


Unnamed: 0,Source,inter_type,deviation
0,Empirical,DD,1
1,Empirical,DF,0
2,Empirical,DN,0
3,Empirical,FD,1
4,Empirical,FF,1
5,Empirical,FN,0
6,Empirical,ND,0
7,Empirical,NF,0
8,Empirical,NN,1
9,Equalized mobility & no-homophily,DD,0
