# Construct features for predicting individual-level experienced income segregation

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [9]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
from tqdm import tqdm
import seaborn as sns
from lib import preprocess

In [3]:
# Data location
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}')

## 1. Load prediction target and features

In [None]:
df = pd.read_sql("""SELECT * FROM segregation.mobi_seg_deso_individual;""", con=engine)

In [6]:
df.iloc[0]

weekday                                                         0
holiday                                                         0
uid                          0000c837-ef82-4dfd-b2a5-00bdc8680b0b
time_seq                                                      1.0
evenness_income                                          0.146859
num_coexistence                                             165.0
number_of_locations                                            11
number_of_visits                                              114
average_displacement                                     2.591139
radius_of_gyration                                       8.969815
median_distance_from_home                               10.043577
zone                                                6712506575000
region                                                  0180C1870
car_ownership                                            0.203142
evenness_income_resi                                     0.105595
Lowest inc

### 1.1 Aggregate (drop time sequence)

In [10]:
df_ind = df.drop(columns=['weekday', 'holiday', 'time_seq', 'evenness_income']).drop_duplicates(subset=['uid'])

In [11]:
tqdm.pandas()
df_stats = df.groupby(['weekday', 'holiday', 'uid'])['evenness_income'].progress_apply(np.mean).reset_index()
df_stats.head()

100%|██████████| 1204518/1204518 [01:07<00:00, 17737.92it/s]


Unnamed: 0,weekday,holiday,uid,evenness_income
0,0,0,00008608-f79e-414d-bf1c-25632d6bc059,0.163031
1,0,0,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,0.183732
2,0,0,0000cd68-c931-4e3c-96f6-7c5837f59b08,0.092451
3,0,0,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,0.225289
4,0,0,000115f0-937a-4716-8d8b-09b1ed54c5ce,0.11798


In [13]:
df = pd.merge(df_stats, df_ind, on='uid', how='left')

In [14]:
# Add built environment features of residential regions (DeSO zones)
df_built_env = pd.merge(pd.read_sql("""SELECT deso, num_jobs, num_stops, gsi FROM built_env.features_deso;""", con=engine),
                        pd.read_sql("""SELECT deso, length_density FROM built_env.walk_density_deso;""", con=engine),
                        on='deso', how='left')
df = pd.merge(df, df_built_env.rename(columns={'deso': 'region'}), on='region', how='left')
df.columns

Index(['weekday', 'holiday', 'uid', 'evenness_income', 'num_coexistence',
       'number_of_locations', 'number_of_visits', 'average_displacement',
       'radius_of_gyration', 'median_distance_from_home', 'zone', 'region',
       'car_ownership', 'evenness_income_resi', 'Lowest income group',
       'Not Sweden', 'cum_jobs', 'cum_stops', 'num_jobs', 'num_stops', 'gsi',
       'length_density'],
      dtype='object')

### 1.2 Deal with missing values

In [19]:
# evenness_income_resi, Not Sweden, Lowest income group
df_fillna = pd.merge(pd.read_sql("""SELECT region, "Not Sweden", "Lowest income group" FROM zone_stats;""", con=engine),
                     pd.read_sql("""SELECT region, evenness AS evenness_income_resi FROM segregation.resi_seg_deso
                                    WHERE var='income';""", con=engine),
                     on='region', how='left')
df_fillna.head()

Unnamed: 0,region,Not Sweden,Lowest income group,evenness_income_resi
0,0114A0010,0.15443,0.21,0.16
1,0114C1010,0.195896,0.15,0.246667
2,0114C1020,0.180124,0.15,0.24
3,0114C1030,0.173784,0.17,0.186667
4,0114C1040,0.375959,0.25,0.093333


In [None]:
df2proc = df.loc[df.evenness_income_resi.isna(), :]
df2proc = pd.merge(df2proc.drop(columns=['evenness_income_resi', 'Not Sweden', 'Lowest income group']),
                   df_fillna,
                   on='region',
                   how='left')
df = pd.concat([df.loc[~df.evenness_income_resi.isna(), :], df2proc])

In [22]:
df.dropna(how='any', inplace=True)

### 1.3 Define features and target

In [23]:
features = ['weekday', 'holiday',
            'number_of_locations', 'number_of_visits', 'average_displacement', 'radius_of_gyration', 'median_distance_from_home',
            'Not Sweden', 'Lowest income group', 'car_ownership',
            'cum_jobs', 'cum_stops', 'evenness_income_resi',
            'num_jobs', 'num_stops', 'gsi', 'length_density']
target = 'evenness_income'

In [24]:
X = df[features]
y = df[target]

## 2. Feature analysis
### 2.1 VIF test

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

In [26]:
vif_data

Unnamed: 0,feature,VIF
0,weekday,1.986784
1,holiday,1.848317
2,number_of_locations,3.356594
3,number_of_visits,3.673928
4,average_displacement,2.847582
5,radius_of_gyration,3.021622
6,median_distance_from_home,1.276586
7,Not Sweden,2.931753
8,Lowest income group,3.58194
9,car_ownership,4.015029


### 2.2 Feature importance

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
num_feature = len(features)
fit = SelectKBest(f_regression, k=num_feature).fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
df_F = pd.DataFrame( [X.columns] + [x for x in f_regression(X, y)]).transpose()
df_F.columns = ["f", "F", "p-value"]
df_F = df_F.merge(featureScores.nlargest(num_feature,'Score'), left_on = "f", right_on = "Specs").drop(columns=["Specs"]).sort_values(by=["Score"], ascending=False)
df_F

Unnamed: 0,f,F,p-value,Score
7,Not Sweden,119019.0794,0.0,119019.0794
8,Lowest income group,45231.191273,0.0,45231.191273
2,number_of_locations,30499.117369,0.0,30499.117369
4,average_displacement,30308.058812,0.0,30308.058812
12,evenness_income_resi,22140.599217,0.0,22140.599217
3,number_of_visits,22128.595692,0.0,22128.595692
11,cum_stops,13040.304694,0.0,13040.304694
13,num_jobs,12779.474983,0.0,12779.474983
10,cum_jobs,11539.24217,0.0,11539.24217
5,radius_of_gyration,11190.095309,0.0,11190.095309


In [30]:
df[features + [target]].to_parquet('results/data4model_individual.parquet', index=False)