# Construct features for predicting individual-level experienced nativity segregation
DeSO zones

In [1]:
# Reload edited modules automatically before each cell runs, so changes to the
# project's `lib` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path. The `lib` imports and the relative
# `results/` output path used later presumably rely on this being the repo root — confirm.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [6]:
# Load libs
from urllib.parse import quote

import numpy as np
import pandas as pd
import scipy.stats as stats
import sqlalchemy
from tqdm import tqdm

from lib import metrics
from lib import preprocess

In [3]:
# Data location
# Connection settings are read from the project's key store (`preprocess.keys_manager`).
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
# Percent-encode the credentials before embedding them in the URL: a raw '@', ':',
# '/' or '%' in the user name or password would otherwise be parsed as URL syntax.
# `safe=''` also encodes '/', and spaces become '%20' rather than '+'.
engine = sqlalchemy.create_engine(
    f"postgresql://{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@localhost:{port}/{db_name}?gssencmode=disable"
)

## 1. Load prediction target and features

In [4]:
# Load every row of the individual-level DeSO segregation table into memory.
df = pd.read_sql(
    sql="""SELECT * FROM segregation.mobi_seg_deso_individual;""",
    con=engine,
)

In [5]:
# Show the first record to see which columns the table provides.
df.iloc[0]

weekday                                                         0
holiday                                                         0
uid                          0004870d-5b84-4d1f-8604-5793ff026c32
time_seq                                                      1.0
evenness_income                                          0.070595
ice_birth                                               -0.154335
num_coexistence                                            8329.0
number_of_locations                                             5
number_of_visits                                               11
average_displacement                                    75.047898
radius_of_gyration                                     107.280782
median_distance_from_home                                     0.0
zone                                                6467506531500
region                                                  0488C1010
car_ownership                                            0.552294
evenness_i

In [15]:
# Distinct user identifiers, in order of first appearance in `df`.
uids = df['uid'].unique()

In [16]:
# Example slice: records of one user (position 89 in `uids`) with weekday == 1 and holiday == 0.
df_eg = df.loc[
    df['uid'].eq(uids[89]) & df['weekday'].eq(1) & df['holiday'].eq(0),
    :,
]

In [52]:
def group_data(data, threshold=0.2, min_records=3):
    """Label one user's experienced and residential segregation group.

    Parameters
    ----------
    data : pandas.DataFrame
        Records belonging to a single user. Must contain the columns
        ``ice_birth``, ``wt_p`` and ``ice_birth_resi``; the latter two are
        read from the first row only.
    threshold : float, default 0.2
        Cut-off used for both classifications: it is forwarded to
        ``metrics.ice_group`` for the experienced ICE values and applied
        symmetrically (``> threshold`` / ``< -threshold``) to the residential ICE.
    min_records : int, default 3
        The experienced group is only computed when the user has more than
        this many records; otherwise it is set to ``'NN'``.

    Returns
    -------
    pandas.Series
        Entries ``grp`` (experienced group), ``wt_p`` (weight), ``ice_r``
        (residential ICE) and ``grp_r`` (residential group: ``'D'`` above
        ``threshold``, ``'F'`` below ``-threshold``, ``'N'`` otherwise).
    """
    # Too few records to classify the experienced ICE values -> placeholder label.
    if len(data) > min_records:
        grp = metrics.ice_group(data['ice_birth'].values, threshold=threshold)
    else:
        grp = 'NN'
    # Taken from the first row only (presumably constant within a user — confirm).
    wt = data['wt_p'].values[0]
    ice_r = data['ice_birth_resi'].values[0]
    if ice_r > threshold:
        grp_r = 'D'
    elif ice_r < -threshold:
        grp_r = 'F'
    else:
        grp_r = 'N'
    return pd.Series(dict(grp=grp, wt_p=wt, ice_r=ice_r, grp_r=grp_r))
# Register `progress_apply` on pandas objects so the per-user pass shows a progress bar.
tqdm.pandas()
# Records with weekday == 1 and holiday == 0 for the first 1000 entries of `uids`,
# reduced to one labelled row per user by `group_data`.
df_grp = (
    df.loc[
        df['weekday'].eq(1) & df['holiday'].eq(0) & df['uid'].isin(uids[:1000]),
        ['uid', 'wt_p', 'ice_birth', 'ice_birth_resi'],
    ]
    .groupby('uid')
    .progress_apply(group_data)
    .reset_index()
)

100%|██████████| 997/997 [00:01<00:00, 539.97it/s]


In [53]:
# Share (%) of the total weight `wt_p` falling in each (residential group, experienced group) pair.
df_grp.groupby(['grp_r', 'grp'])['wt_p'].sum().div(df_grp['wt_p'].sum()).mul(100)

grp_r  grp
D      D       6.118452
       F       1.772839
       N      33.802430
F      F      15.731669
       N       1.794741
N      D       0.247026
       F      12.093630
       N      28.439212
Name: wt_p, dtype: float64

### 1.1 Aggregate (drop time sequence)

In [6]:
# One row per user: remove the columns tied to a time sequence, then keep the first
# remaining record of each uid (the other columns are presumably constant per user — confirm).
df_ind = (
    df.drop(columns=['weekday', 'holiday', 'time_seq', 'evenness_income', 'ice_birth'])
      .drop_duplicates(subset=['uid'])
)

In [7]:
# Kept so `progress_apply` remains registered on pandas objects for any other cell.
tqdm.pandas()
# Mean experienced ICE per (weekday, holiday, uid), which collapses the time sequence.
# The built-in group mean replaces `progress_apply(np.mean)`, which invoked Python once
# per group; both skip NaN values, so the result is the same up to floating-point rounding.
df_stats = df.groupby(['weekday', 'holiday', 'uid'])['ice_birth'].mean().reset_index()
df_stats.head()

100%|██████████| 1204518/1204518 [01:46<00:00, 11320.37it/s]


Unnamed: 0,weekday,holiday,uid,ice_birth
0,0,0,00008608-f79e-414d-bf1c-25632d6bc059,0.171538
1,0,0,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,-0.290253
2,0,0,0000cd68-c931-4e3c-96f6-7c5837f59b08,-0.030291
3,0,0,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,-0.116429
4,0,0,000115f0-937a-4716-8d8b-09b1ed54c5ce,-0.141422


In [8]:
# Attach the per-user attributes to every (weekday, holiday, uid) row.
# Note: this rebinds `df`, so the cells that build `df_ind` / `df_stats` from the raw
# table (they read `time_seq`, which no longer exists afterwards) cannot be re-run
# without reloading the data first.
df = df_stats.merge(df_ind, on='uid', how='left')

In [9]:
# Add built environment features of residential regions (DeSO zones)
# Left-join the walk-network density onto the per-DeSO feature table, then attach both
# to each row via its `region` code (the `deso` key is renamed to match).
df_built_env = pd.read_sql(
    """SELECT deso, num_jobs, num_stops, gsi FROM built_env.features_deso;""",
    con=engine,
).merge(
    pd.read_sql("""SELECT deso, length_density FROM built_env.walk_density_deso;""", con=engine),
    on='deso',
    how='left',
)
df = df.merge(df_built_env.rename(columns={'deso': 'region'}), on='region', how='left')
df.columns

Index(['weekday', 'holiday', 'uid', 'ice_birth', 'num_coexistence',
       'number_of_locations', 'number_of_visits', 'average_displacement',
       'radius_of_gyration', 'median_distance_from_home', 'zone', 'region',
       'car_ownership', 'evenness_income_resi', 'ice_birth_resi',
       'Lowest income group', 'Not Sweden', 'Other', 'wt_p', 'cum_jobs_car',
       'cum_jobs_pt', 'num_jobs', 'num_stops', 'gsi', 'length_density'],
      dtype='object')

### 1.2 Missing values

In [10]:
# Keep only complete cases: any row with at least one NaN is removed.
print(f"Data length is {len(df)}")
# Rebind instead of mutating in place so the operation is explicit.
df = df.dropna(how='any')
# The message now says rows, since `dropna(how='any')` drops rows, not columns.
print(f"Data length is {len(df)} after dropping rows with any NaN.")

Data length is 1204518
Data length is 1164610 after dropping any NaN columns.


### 1.3 Define features and target

In [11]:
# Columns carried through to the exported file but not used as predictors.
extras = ['uid', 'zone', 'region', 'wt_p']

# Candidate predictors, one per line.
features = [
    # calendar flags
    'weekday',
    'holiday',
    # mobility descriptors
    'number_of_locations',
    'number_of_visits',
    'average_displacement',
    'radius_of_gyration',
    'median_distance_from_home',
    # attributes of the residential zone
    'Other',
    'Lowest income group',
    'car_ownership',
    'cum_jobs_pt',
    'cum_jobs_car',
    'evenness_income_resi',
    'ice_birth_resi',
    # built environment of the residential region
    'num_jobs',
    'num_stops',
    'gsi',
    'length_density',
]

# Prediction target: the aggregated `ice_birth` value.
target = 'ice_birth'

In [12]:
# Design matrix (all candidate predictors) and response vector.
X = df.loc[:, features]
y = df.loc[:, target]

## 2. Feature analysis
### 2.1 VIF test

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of X, collected into a two-column table.
# NOTE(review): X is passed without an added constant column — check whether the VIFs
# should be computed on a design matrix that includes an intercept.
design = X.values  # materialise the array once rather than once per feature
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ],
})

In [14]:
# Display the VIF of each candidate feature.
vif_data

Unnamed: 0,feature,VIF
0,weekday,1.996237
1,holiday,1.854641
2,number_of_locations,3.36478
3,number_of_visits,3.679187
4,average_displacement,2.897741
5,radius_of_gyration,3.024795
6,median_distance_from_home,1.289507
7,Other,4.119417
8,Lowest income group,3.807369
9,car_ownership,7.157534


### 2.2 Feature importance
After removing features with VIF ≥ 10 (only features with VIF < 10 are kept)

In [15]:
# Keep only the features whose VIF is strictly below 10, then rebuild the design matrix.
features = vif_data.loc[vif_data['VIF'].lt(10), 'feature'].values
X = df.loc[:, features]

In [16]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate F-test of every retained feature against the target.
# k equals the number of features, so SelectKBest keeps all of them and is used here
# only to obtain the per-feature scores.
num_feature = len(features)
fit = SelectKBest(f_regression, k=num_feature).fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Feature names and their scores side by side.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
# Rows of the intermediate frame are (names, F statistics, p-values); transpose to get one row per feature.
df_F = pd.DataFrame([X.columns, *f_regression(X, y)]).transpose()
df_F.columns = ["f", "F", "p-value"]
# Join the scores onto the F-test table and rank features from highest to lowest score.
df_F = (
    df_F.merge(featureScores.nlargest(num_feature, 'Score'), left_on="f", right_on="Specs")
        .drop(columns=["Specs"])
        .sort_values(by=["Score"], ascending=False)
)
df_F

Unnamed: 0,f,F,p-value,Score
13,ice_birth_resi,853979.043859,0.0,853979.043859
7,Other,479719.797605,0.0,479719.797605
9,car_ownership,224218.577325,0.0,224218.577325
8,Lowest income group,86235.862661,0.0,86235.862661
12,evenness_income_resi,55580.886279,0.0,55580.886279
11,cum_jobs_car,50034.670435,0.0,50034.670435
16,length_density,34760.914738,0.0,34760.914738
15,num_stops,12014.883501,0.0,12014.883501
14,num_jobs,5951.462602,0.0,5951.462602
10,cum_jobs_pt,5731.566366,0.0,5731.566366


In [17]:
# Write the identifier/weight columns, the retained features and the target to parquet.
df[[*extras, *features, target]].to_parquet('results/data4model_individual.parquet', index=False)