# Construct features for predicting individual-level experienced income segregation

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
from tqdm import tqdm
import seaborn as sns
from lib import preprocess

  shapely_geos_version, geos_capi_version_string


In [3]:
# Data location
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}')

## 1. Load prediction target and features

In [14]:
df = pd.read_sql("""SELECT * FROM segregation.mobi_seg_deso_individual;""", con=engine)

### 1.0 Replace grid-based residential segregation with DeSO-level results

In [5]:
df_seg_resi = pd.read_sql(sql='''SELECT region, evenness AS evenness_income_resi
                                 FROM segregation.resi_seg_deso WHERE var='income';''', con=engine)

In [6]:
df = pd.merge(df.drop(columns=['evenness_income_resi']), df_seg_resi, on='region', how='left')
df.iloc[0]

weekday                                                         0
holiday                                                         0
uid                          0004870d-5b84-4d1f-8604-5793ff026c32
time_seq                                                      1.0
evenness_income                                          0.070595
ice_birth                                               -0.076981
num_coexistence                                            8329.0
number_of_locations                                             5
number_of_visits                                               11
average_displacement                                    75.047898
radius_of_gyration                                     107.280782
median_distance_from_home                                     0.0
zone                                                6467506531500
region                                                  0488C1010
car_ownership                                            0.552294
Lowest inc

### 1.1 Aggregate (drop time sequence)

In [15]:
df_ind = df.drop(columns=['weekday', 'holiday', 'time_seq', 'evenness_income', 'ice_birth']).drop_duplicates(subset=['uid'])

In [16]:
tqdm.pandas()
df_stats = df.groupby(['weekday', 'holiday', 'uid'])['ice_birth'].progress_apply(np.mean).reset_index()
df_stats.head()

100%|██████████| 1204518/1204518 [01:02<00:00, 19257.64it/s]


Unnamed: 0,weekday,holiday,uid,ice_birth
0,0,0,00008608-f79e-414d-bf1c-25632d6bc059,0.240499
1,0,0,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,-0.270656
2,0,0,0000cd68-c931-4e3c-96f6-7c5837f59b08,0.006959
3,0,0,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,0.062034
4,0,0,000115f0-937a-4716-8d8b-09b1ed54c5ce,-0.144524


In [17]:
df = pd.merge(df_stats, df_ind, on='uid', how='left')

In [18]:
# Add built environment features of residential regions (DeSO zones)
df_built_env = pd.merge(pd.read_sql("""SELECT deso, num_jobs, num_stops, gsi FROM built_env.features_deso;""", con=engine),
                        pd.read_sql("""SELECT deso, length_density FROM built_env.walk_density_deso;""", con=engine),
                        on='deso', how='left')
df = pd.merge(df, df_built_env.rename(columns={'deso': 'region'}), on='region', how='left')
df.columns

Index(['weekday', 'holiday', 'uid', 'ice_birth', 'num_coexistence',
       'number_of_locations', 'number_of_visits', 'average_displacement',
       'radius_of_gyration', 'median_distance_from_home', 'zone', 'region',
       'car_ownership', 'evenness_income_resi', 'Lowest income group',
       'Not Sweden', 'cum_jobs', 'cum_stops', 'num_jobs', 'num_stops', 'gsi',
       'length_density'],
      dtype='object')

### 1.2 Deal with missing values

In [19]:
# evenness_income_resi, Not Sweden, Lowest income group
#df_fillna = pd.merge(pd.read_sql("""SELECT region, "Not Sweden", "Lowest income group" FROM zone_stats;""", con=engine),
#                     pd.read_sql("""SELECT region, evenness AS evenness_income_resi FROM segregation.resi_seg_deso
#                                    WHERE var='income';""", con=engine),
#                     on='region', how='left')
df_fillna = pd.read_sql("""SELECT region, "Not Sweden", "Lowest income group" FROM zone_stats;""", con=engine)
df_fillna.head()

Unnamed: 0,region,Not Sweden,Lowest income group
0,0114A0010,0.15443,0.21
1,0114C1010,0.195896,0.15
2,0114C1020,0.180124,0.15
3,0114C1030,0.173784,0.17
4,0114C1040,0.375959,0.25


In [20]:
df2proc = df.loc[df['Lowest income group'].isna(), :]

In [21]:
df2proc = pd.merge(df2proc.drop(columns=['Not Sweden', 'Lowest income group']),
                   df_fillna,
                   on='region',
                   how='left')
df = pd.concat([df.loc[~df['Lowest income group'].isna(), :], df2proc])

In [22]:
df.dropna(how='any', inplace=True)

### 1.3 Define features and target

In [23]:
features = ['weekday', 'holiday',
            'number_of_locations', 'number_of_visits', 'average_displacement', 'radius_of_gyration', 'median_distance_from_home',
            'Not Sweden', 'Lowest income group', 'car_ownership',
            'cum_jobs', 'cum_stops', 'evenness_income_resi',
            'num_jobs', 'num_stops', 'gsi', 'length_density']
target = 'ice_birth'

In [24]:
X = df[features]
y = df[target]

## 2. Feature analysis
### 2.1 VIF test

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

  import pandas.util.testing as tm


In [26]:
vif_data

Unnamed: 0,feature,VIF
0,weekday,1.987033
1,holiday,1.848574
2,number_of_locations,3.361874
3,number_of_visits,3.676536
4,average_displacement,2.918641
5,radius_of_gyration,3.056024
6,median_distance_from_home,1.282983
7,Not Sweden,2.924421
8,Lowest income group,3.557527
9,car_ownership,4.104032


### 2.2 Feature importance
After removing features with VIF > 10

In [27]:
features = vif_data.loc[vif_data.VIF < 10, 'feature'].values
X = df[features]

In [28]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
num_feature = len(features)
fit = SelectKBest(f_regression, k=num_feature).fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
df_F = pd.DataFrame( [X.columns] + [x for x in f_regression(X, y)]).transpose()
df_F.columns = ["f", "F", "p-value"]
df_F = df_F.merge(featureScores.nlargest(num_feature,'Score'), left_on = "f", right_on = "Specs").drop(columns=["Specs"]).sort_values(by=["Score"], ascending=False)
df_F

Unnamed: 0,f,F,p-value,Score
7,Not Sweden,512107.699613,0.0,512107.699613
9,car_ownership,245648.256722,0.0,245648.256722
8,Lowest income group,67147.646453,0.0,67147.646453
15,length_density,44189.10826,0.0,44189.10826
10,cum_jobs,25514.511282,0.0,25514.511282
14,num_stops,13417.946725,0.0,13417.946725
13,num_jobs,8934.674975,0.0,8934.674975
11,cum_stops,4531.957306,0.0,4531.957306
12,evenness_income_resi,2936.226934,0.0,2936.226934
2,number_of_locations,2640.407322,0.0,2640.407322


In [29]:
df[list(features) + [target]].to_parquet('results/data4model_individual.parquet', index=False)