# Grid level aggregation of individuals' segregation and other features

In [1]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded absolute Windows path; the relative 'dbs/...' paths used
# later in this notebook presumably resolve against this directory - confirm before
# running on another machine.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [41]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import weighted
from tqdm.notebook import tqdm
from lib import preprocess

In [4]:
# Data location: build the PostgreSQL engine from the credentials kept in the
# project's key manager (nothing is hard-coded in the notebook).
db_keys = preprocess.keys_manager['database']
user, password = db_keys['user'], db_keys['password']
port, db_name = db_keys['port'], db_keys['name']
# NOTE(review): the password is interpolated into the URL verbatim; characters
# such as '@' or '/' would need URL-escaping - confirm the stored value is safe.
conn_url = f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
engine = sqlalchemy.create_engine(conn_url)

## 1. Load individual data

In [5]:
# One row per uid, restricted to non-holiday weekdays. The time-varying and
# segregation columns are discarded; the remaining columns are treated as
# per-individual attributes.
ind_query = """SELECT DISTINCT ON (uid)
                            *
                            FROM segregation.mobi_seg_deso_individual
                            WHERE weekday=1 AND holiday=0;"""
unused_cols = ['weekday', 'holiday', 'time_seq',
               'evenness_income', 'ice_birth', 'num_coexistence']
df_ind = pd.read_sql(ind_query, con=engine).drop(columns=unused_cols)

In [9]:
def ind_features(data, wt=True):
    """Summarise the individuals of one group (one zone) into a single row.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the individuals in the group. Must contain ``wt_p`` and
        every feature column listed below.
    wt : bool, default True
        If True, each feature is summarised with a ``wt_p``-weighted median
        (``weighted.median``); otherwise with the plain ``np.median``.

    Returns
    -------
    pandas.Series
        ``ind_count`` (number of rows), ``ind_count_wt`` (sum of ``wt_p``)
        and one median per feature column, in that order.
    """
    feature_cols = (
        'number_of_locations', 'number_of_visits',
        'median_distance_from_home', 'average_displacement', 'radius_of_gyration',
        'Other', 'Lowest income group', 'car_ownership', 'Not Sweden',
        'cum_jobs_pt', 'cum_jobs_car',
    )
    summary = {'ind_count': len(data), 'ind_count_wt': data.wt_p.sum()}
    for col in feature_cols:
        summary[col] = weighted.median(data[col], data['wt_p']) if wt else np.median(data[col])
    return pd.Series(summary)

In [10]:
# Register `progress_apply` on pandas objects, then summarise individuals per zone.
tqdm.pandas()
ind_by_zone = df_ind.groupby('zone')
df_ind_stats = ind_by_zone.progress_apply(lambda grp: ind_features(data=grp, wt=True))
df_ind_stats = df_ind_stats.reset_index()

100%|██████████| 56539/56539 [01:01<00:00, 913.18it/s] 


In [12]:
# List the columns of the zone-level individual summary.
df_ind_stats.columns

Index(['zone', 'ind_count', 'ind_count_wt', 'number_of_locations',
       'number_of_visits', 'median_distance_from_home', 'average_displacement',
       'radius_of_gyration', 'Other', 'Lowest income group', 'car_ownership',
       'Not Sweden', 'cum_jobs_pt', 'cum_jobs_car'],
      dtype='object')

In [28]:
# uid -> zone and uid -> weight lookups, used later to attach zone and weight
# to tables that are loaded without those columns.
uid_zone_dict = {uid: zone for uid, zone in zip(df_ind['uid'], df_ind['zone'])}
uid_wt_dict = {uid: weight for uid, weight in zip(df_ind['uid'], df_ind['wt_p'])}

## 2. Load individual segregation data
### 2.1 DeSO level

In [15]:
# Load every individual/time-slot record of the DeSO-level table.
deso_query = """SELECT uid, zone, weekday, holiday, time_seq, ice_birth, num_coexistence, wt_p
                         FROM segregation.mobi_seg_deso_individual;"""
df_deso = pd.read_sql(deso_query, con=engine)

In [31]:
def seg_features(data, reso='deso', wt=True):
    """Summarise the ``ice_birth`` values of one group into a single row.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the group; must contain ``ice_birth`` and ``wt_p``.
    reso : str, default 'deso'
        Suffix appended to every output name.
    wt : bool, default True
        If True, use the ``wt_p``-weighted median (``weighted.median``);
        otherwise the plain ``np.median``.

    Returns
    -------
    pandas.Series
        ``ind_count_<reso>`` (number of rows), ``ind_count_wt_<reso>``
        (sum of ``wt_p``) and ``ice_birth_<reso>`` (the median).
    """
    summary = {
        f'ind_count_{reso}': len(data),
        f'ind_count_wt_{reso}': data.wt_p.sum(),
    }
    metric = 'ice_birth'
    summary[f'{metric}_{reso}'] = (
        weighted.median(data[metric], data['wt_p']) if wt else np.median(data[metric])
    )
    return pd.Series(summary)

def seg_features_day(data, reso='deso'):
    """Collapse the per-time-slot rows of one group into a single daily row.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-time-slot summaries of the group; must contain
        ``ind_count_<reso>``, ``ind_count_wt_<reso>`` and ``ice_birth_<reso>``.
    reso : str, default 'deso'
        Suffix identifying which set of columns to read and write.

    Returns
    -------
    pandas.Series
        Medians of the two count columns, the median of the non-missing
        ``ice_birth_<reso>`` values (NaN only if every value is missing),
        and ``time_seq_count_<reso>``, the number of rows in ``data``
        (rows with a missing ``ice_birth`` are still counted).
    """
    res_dict = dict()
    res_dict[f'ind_count_{reso}'] = np.median(data[f'ind_count_{reso}'])
    res_dict[f'ind_count_wt_{reso}'] = np.median(data[f'ind_count_wt_{reso}'])
    # np.median returns NaN as soon as one input is NaN, which would discard
    # the whole day because of a single undefined time slot. Take the median
    # over the defined values only.
    ice_valid = data[f'ice_birth_{reso}'].dropna().to_numpy()
    res_dict[f'ice_birth_{reso}'] = np.median(ice_valid) if len(ice_valid) > 0 else np.nan
    res_dict[f'time_seq_count_{reso}'] = len(data)
    return pd.Series(res_dict)

In [23]:
# Per zone / day type / time slot: weighted summary of the DeSO-level records.
tqdm.pandas()
df_deso_stats_t = (
    df_deso
    .groupby(['zone', 'weekday', 'holiday', 'time_seq'])
    .progress_apply(lambda grp: seg_features(data=grp, wt=True))
    .reset_index()
)
df_deso_stats_t.iloc[0]

  0%|          | 0/9107570 [00:00<?, ?it/s]

zone                 2700006534000
weekday                          0
holiday                          0
time_seq                       3.0
ind_count_deso                 1.0
ind_count_wt_deso        58.387097
ice_birth_deso           -0.065279
Name: 0, dtype: object

In [26]:
# Collapse the time-slot rows into one row per zone / day type.
tqdm.pandas()
df_deso_stats = (
    df_deso_stats_t
    .groupby(['zone', 'weekday', 'holiday'])
    .progress_apply(seg_features_day)
    .reset_index()
)
df_deso_stats.iloc[0]

  0%|          | 0/217153 [00:00<?, ?it/s]

zone                   2700006534000
weekday                            0
holiday                            0
ind_count_deso                   1.0
ind_count_wt_deso          58.387097
ice_birth_deso             -0.046149
time_seq_count_deso             16.0
Name: 0, dtype: object

In [27]:
# Release the raw DeSO-level records; only the aggregated frames are used below.
del df_deso

### 2.2 Mixed-hexagon level

In [29]:
# Load the hexagon-level records; this query does not select zone or weight.
hex_query = """SELECT uid, weekday, holiday, time_seq, ice_birth
                         FROM segregation.mobi_seg_hex_individual;"""
df_hex = pd.read_sql(hex_query, con=engine)

In [30]:
# Attach zone and weight through the uid lookups.
# NOTE(review): uids missing from the lookups get NaN here - confirm that
# dropping them from the zone-level grouping below is intended.
for col, lookup in (('zone', uid_zone_dict), ('wt_p', uid_wt_dict)):
    df_hex.loc[:, col] = df_hex.loc[:, 'uid'].map(lookup)

In [32]:
# Per zone / day type / time slot: weighted summary of the hexagon-level records.
tqdm.pandas()
df_hex_stats_t = (
    df_hex
    .groupby(['zone', 'weekday', 'holiday', 'time_seq'])
    .progress_apply(lambda grp: seg_features(data=grp, reso='hex', wt=True))
    .reset_index()
)
df_hex_stats_t.iloc[0]

  0%|          | 0/9046961 [00:00<?, ?it/s]

zone                2700006534000
weekday                         0
holiday                         0
time_seq                      3.0
ind_count_hex                 1.0
ind_count_wt_hex        58.387097
ice_birth_hex            0.059898
Name: 0, dtype: object

In [33]:
# Collapse the time-slot rows into one row per zone / day type.
tqdm.pandas()
df_hex_stats = (
    df_hex_stats_t
    .groupby(['zone', 'weekday', 'holiday'])
    .progress_apply(lambda grp: seg_features_day(data=grp, reso='hex'))
    .reset_index()
)
df_hex_stats.iloc[0]

  0%|          | 0/216628 [00:00<?, ?it/s]

zone                  2700006534000
weekday                           0
holiday                           0
ind_count_hex                   1.0
ind_count_wt_hex          58.387097
ice_birth_hex             -0.002382
time_seq_count_hex             16.0
Name: 0, dtype: object

In [34]:
# Release the raw hexagon-level records; only the aggregated frames are used below.
del df_hex

### 2.3 POI level

In [35]:
# Load the POI-level records, then attach zone and weight through the uid lookups.
poi_query = """SELECT uid, weekday, holiday, time_seq, ice_birth
                         FROM segregation.mobi_seg_poi_individual;"""
df_poi = pd.read_sql(poi_query, con=engine)
# NOTE(review): uids missing from the lookups get NaN here - confirm that
# dropping them from the zone-level grouping below is intended.
for col, lookup in (('zone', uid_zone_dict), ('wt_p', uid_wt_dict)):
    df_poi.loc[:, col] = df_poi.loc[:, 'uid'].map(lookup)

In [36]:
# Per zone / day type / time slot: weighted summary of the POI-level records.
tqdm.pandas()
df_poi_stats_t = (
    df_poi
    .groupby(['zone', 'weekday', 'holiday', 'time_seq'])
    .progress_apply(lambda grp: seg_features(data=grp, reso='poi', wt=True))
    .reset_index()
)
df_poi_stats_t.iloc[0]

  0%|          | 0/8655786 [00:00<?, ?it/s]

zone                2700006534000
weekday                         0
holiday                         0
time_seq                      3.0
ind_count_poi                 1.0
ind_count_wt_poi        58.387097
ice_birth_poi            0.468343
Name: 0, dtype: object

In [37]:
# Collapse the time-slot rows into one row per zone / day type.
tqdm.pandas()
df_poi_stats = (
    df_poi_stats_t
    .groupby(['zone', 'weekday', 'holiday'])
    .progress_apply(lambda grp: seg_features_day(data=grp, reso='poi'))
    .reset_index()
)
df_poi_stats.iloc[0]

  0%|          | 0/214180 [00:00<?, ?it/s]

zone                  2700006534000
weekday                           0
holiday                           0
ind_count_poi                   1.0
ind_count_wt_poi          58.387097
ice_birth_poi              0.241174
time_seq_count_poi             16.0
Name: 0, dtype: object

In [38]:
# Release the raw POI-level records; only the aggregated frames are used below.
del df_poi

## 3. Merge segregation metrics
### 3.1 Time sequence

In [39]:
# Left-join the hexagon and POI time-slot summaries onto the DeSO ones, so
# only the keys present in the DeSO table are kept.
slot_keys = ['zone', 'weekday', 'holiday', 'time_seq']
df_t = (
    df_deso_stats_t
    .merge(df_hex_stats_t, on=slot_keys, how='left')
    .merge(df_poi_stats_t, on=slot_keys, how='left')
)
df_t.iloc[0]

zone                 2700006534000
weekday                          0
holiday                          0
time_seq                       3.0
ind_count_deso                 1.0
ind_count_wt_deso        58.387097
ice_birth_deso           -0.065279
ind_count_hex                  1.0
ind_count_wt_hex         58.387097
ice_birth_hex             0.059898
ind_count_poi                  1.0
ind_count_wt_poi         58.387097
ice_birth_poi             0.468343
Name: 0, dtype: object

### 3.2 Daily aggregate (median over time sequences)

In [40]:
# Left-join the hexagon and POI daily summaries onto the DeSO ones, so only
# the keys present in the DeSO table are kept.
day_keys = ['zone', 'weekday', 'holiday']
df = (
    df_deso_stats
    .merge(df_hex_stats, on=day_keys, how='left')
    .merge(df_poi_stats, on=day_keys, how='left')
)
df.iloc[0]

zone                   2700006534000
weekday                            0
holiday                            0
ind_count_deso                   1.0
ind_count_wt_deso          58.387097
ice_birth_deso             -0.046149
time_seq_count_deso             16.0
ind_count_hex                    1.0
ind_count_wt_hex           58.387097
ice_birth_hex              -0.002382
time_seq_count_hex              16.0
ind_count_poi                    1.0
ind_count_wt_poi           58.387097
ice_birth_poi               0.241174
time_seq_count_poi              16.0
Name: 0, dtype: object

## 4. Save data
### 4.1 Spatial units - grids

In [45]:
# Export the grid geometries from PostGIS to a shapefile.
# 'birth_other' is shortened beforehand - presumably to fit the 10-character
# limit on shapefile field names.
grids_raw = gpd.GeoDataFrame.from_postgis(sql="""SELECT * FROM grids;""", con=engine)
gdf_grids = grids_raw.rename(columns={'birth_other': 'birth_othr'})
gdf_grids.to_file('dbs/grid_stats/grids.shp')

In [46]:
# Read the shapefile back to check what was written.
gdf_grids = gpd.read_file('dbs/grid_stats/grids.shp')
gdf_grids.head()

Unnamed: 0,zone,grid_size,income_q1,income_q2,income_q3,income_q4,income_med,pop_income,birth_se,birth_nord,birth_eu,birth_othr,pop,job,geometry
0,3235006407750,250,278.0,119.0,48.0,12.0,155114.0,457.0,544.0,21.0,41.0,832.0,1438.0,40.0,"POLYGON ((323500.003 6407750.001, 323500.003 6..."
1,5490006234000,1000,3.0,0.0,0.0,0.0,139173.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,"POLYGON ((548999.998 6233999.997, 548999.998 6..."
2,5815006284750,250,3.0,6.0,6.0,5.0,274565.0,20.0,55.0,0.0,3.0,3.0,61.0,3.0,"POLYGON ((581500.004 6284749.997, 581500.004 6..."
3,3840006674000,1000,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,6.0,0.0,"POLYGON ((384000.002 6673999.997, 384000.002 6..."
4,4812506758250,250,3.0,0.0,3.0,0.0,272661.0,6.0,9.0,0.0,0.0,0.0,9.0,3.0,"POLYGON ((481249.997 6758249.998, 481249.997 6..."


In [51]:
# Copy the first grid record to the system clipboard (manual inspection aid only).
gdf_grids.iloc[0].to_clipboard()

### 4.2 Aggregated individual mobility statistics

In [47]:
# Write the zone-level individual summary to parquet, without the index.
df_ind_stats.to_parquet('dbs/grid_stats/zonal_individual_stats.parquet', index=False)

In [48]:
# Read the file back to check what was written.
pd.read_parquet('dbs/grid_stats/zonal_individual_stats.parquet').head()

Unnamed: 0,zone,ind_count,ind_count_wt,number_of_locations,number_of_visits,median_distance_from_home,average_displacement,radius_of_gyration,Other,Lowest income group,car_ownership,Not Sweden,cum_jobs_pt,cum_jobs_car
0,2700006534000,1.0,58.387097,17.0,45.0,397.375712,16.230608,200.146109,0.06383,0.0,0.712155,0.12766,66.0,0.0
1,2700006535000,1.0,58.387097,30.0,108.0,52.589189,42.977145,502.925843,0.036585,0.27907,0.712155,0.036585,31.0,0.0
2,2700006536000,1.0,58.387097,19.0,43.0,1.284088,11.230065,76.513283,0.0,0.0,0.712155,0.0,28.0,0.0
3,2710006533000,1.0,58.387097,12.0,17.0,2.227025,20.71318,73.772804,0.0,0.222222,0.712155,0.1,66.0,0.0
4,2720006534000,1.0,58.387097,60.0,292.0,172.552038,7.805596,60.578801,0.2,0.5,0.712155,0.2,22.0,0.0


In [52]:
# Copy the first summary row to the system clipboard (manual inspection aid only).
df_ind_stats.iloc[0].to_clipboard()

### 4.3 Aggregated segregation statistics

In [49]:
# Write both segregation tables to parquet, without the index:
# df_t holds one row per zone / day type / time slot, df one row per zone / day type.
df_t.to_parquet('dbs/grid_stats/zonal_seg_stats_half_hour.parquet', index=False)
df.to_parquet('dbs/grid_stats/zonal_seg_stats.parquet', index=False)

In [50]:
# Read the daily file back to check what was written.
pd.read_parquet('dbs/grid_stats/zonal_seg_stats.parquet').head()

Unnamed: 0,zone,weekday,holiday,ind_count_deso,ind_count_wt_deso,ice_birth_deso,time_seq_count_deso,ind_count_hex,ind_count_wt_hex,ice_birth_hex,time_seq_count_hex,ind_count_poi,ind_count_wt_poi,ice_birth_poi,time_seq_count_poi
0,2700006534000,0,0,1.0,58.387097,-0.046149,16.0,1.0,58.387097,-0.002382,16.0,1.0,58.387097,0.241174,16.0
1,2700006534000,0,1,1.0,58.387097,0.111467,44.0,1.0,58.387097,0.112896,44.0,1.0,58.387097,-0.081965,44.0
2,2700006534000,1,0,1.0,58.387097,-0.064258,36.0,1.0,58.387097,-0.0794,36.0,1.0,58.387097,0.22197,36.0
3,2700006534000,1,1,1.0,58.387097,-0.05817,39.0,1.0,58.387097,0.055367,39.0,1.0,58.387097,-0.168385,39.0
4,2700006535000,0,0,1.0,58.387097,,48.0,1.0,58.387097,0.09282,36.0,1.0,58.387097,0.22197,20.0


In [54]:
# Copy the first time-slot row to the system clipboard (manual inspection aid only).
df_t.iloc[0].to_clipboard()

In [55]:
# Copy the first daily row to the system clipboard (manual inspection aid only).
df.iloc[0].to_clipboard()