# Construct features for predicting individual-level experienced nativity segregation
Mixed-hexagon zones

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
from tqdm import tqdm
from lib import preprocess
from lib import metrics

In [3]:
# Data location
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Load data
### 1.1 Individual features

In [4]:
df_ind = pd.read_sql("""SELECT DISTINCT ON (uid)
                            *
                            FROM segregation.mobi_seg_deso_individual
                            WHERE weekday=1 AND holiday=0;""", con=engine)
df_ind.drop(columns=['weekday', 'holiday', 'time_seq', 
                     'evenness_income', 'ice_birth', 'num_coexistence'],
            inplace=True)

## 1.2 Nativity segregation results

In [5]:
def group_data(data):
    if len(data) > 3:
        grp = metrics.ice_group(data['ice_birth'].values, threshold=0.2)
    else:
        grp = 'NN'
    wt = data['wt_p'].values[0]
    ice_r = data['ice_birth_resi'].values[0]
    if ice_r > 0.2:
        grp_r = 'D'
    elif ice_r < -0.2:
        grp_r = 'F'
    else:
        grp_r = 'N'
    return pd.Series(dict(grp=grp, wt_p=wt, ice_r=ice_r, grp_r=grp_r))

In [6]:
df = pd.read_sql("""SELECT * FROM segregation.mobi_seg_hex_individual_w1h0;""", con=engine)
df = pd.merge(df, df_ind[['uid', 'wt_p', 'ice_birth_resi']], how='left')

In [7]:
tqdm.pandas()
df_grp = df.groupby('uid').progress_apply(group_data).reset_index()

  res = hypotest_fun_out(*samples, **kwds)
100%|██████████| 322904/322904 [09:34<00:00, 562.23it/s]


In [10]:
df_grp = df_grp.loc[df_grp.grp != 'NN', :]
len(df_grp)

322880

In [11]:
df_grp.groupby(['grp_r', 'grp'])['wt_p'].sum() / df_grp.wt_p.sum() * 100

grp_r  grp
D      D      12.501663
       F       2.258645
       N      28.339690
F      D       0.118234
       F      14.027115
       N       3.116443
N      D       1.575910
       F       9.748560
       N      28.313740
Name: wt_p, dtype: float64

In [9]:
df_grp.groupby('grp')['wt_p'].sum() / df_grp.wt_p.sum() * 100

grp
D     14.195441
F     26.033649
N     59.768330
NN     0.002580
Name: wt_p, dtype: float64

### 1.3 Merge data

In [13]:
df_ind = pd.merge(df_ind, df_grp, on='uid', how='left')

In [14]:
df_stats = df.groupby('uid')['ice_birth'].progress_apply(np.mean).reset_index()
df_stats.head()

100%|██████████| 322904/322904 [00:25<00:00, 12523.43it/s]


Unnamed: 0,uid,ice_birth
0,00008608-f79e-414d-bf1c-25632d6bc059,0.201666
1,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,-0.207058
2,0000cd68-c931-4e3c-96f6-7c5837f59b08,0.009751
3,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,0.231972
4,000115f0-937a-4716-8d8b-09b1ed54c5ce,-0.017157


In [16]:
df_ind = pd.merge(df_ind, df_stats, on='uid', how='left')
df_ind.drop(columns=['ice_birth_resi', 'wt_p_x'], inplace=True)
df_ind.rename(columns={'ice_birth': 'ice_e', 'wt_p_y': 'wt_p'}, inplace=True)
df_ind.iloc[0]

uid                          00008608-f79e-414d-bf1c-25632d6bc059
number_of_locations                                            43
number_of_visits                                               96
average_displacement                                    34.301877
radius_of_gyration                                     114.406844
median_distance_from_home                                 0.21554
zone                                                3490006228000
region                                                  1284C1040
car_ownership                                            0.541455
evenness_income_resi                                         0.12
Lowest income group                                      0.088063
Not Sweden                                               0.082517
Other                                                    0.042657
cum_jobs_car                                              65662.0
cum_jobs_pt                                                   296
grp       

In [17]:
print(f"Data length is {len(df_ind)}")
df_ind.dropna(how='any', inplace=True)
print(f"Data length is {len(df_ind)} after dropping any NaN columns.")

Data length is 322477
Data length is 312813 after dropping any NaN columns.


In [18]:
df_ind.columns

Index(['uid', 'number_of_locations', 'number_of_visits',
       'average_displacement', 'radius_of_gyration',
       'median_distance_from_home', 'zone', 'region', 'car_ownership',
       'evenness_income_resi', 'Lowest income group', 'Not Sweden', 'Other',
       'cum_jobs_car', 'cum_jobs_pt', 'grp', 'wt_p', 'ice_r', 'grp_r',
       'ice_e'],
      dtype='object')

## 2. Save data

In [19]:
extras = ['uid', 'zone', 'region', 'wt_p']
features = ['ice_r', 'grp_r', 'ice_e', 'grp',
            'number_of_locations', 'number_of_visits',  
            'median_distance_from_home', 'average_displacement', 'radius_of_gyration',
            'Other', 'Lowest income group', 'car_ownership', 'Not Sweden',
            'cum_jobs_pt', 'cum_jobs_car', 'evenness_income_resi'
            ]

In [20]:
df_ind[extras + list(features)].to_parquet('results/data4model_individual_hex_w1h0.parquet', index=False)