# Construct features for predicting individual-level experienced income segregation

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
from tqdm import tqdm
from lib import preprocess

In [4]:
# Data location
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Load prediction target and features

In [5]:
df = pd.read_sql("""SELECT * FROM segregation.mobi_seg_deso_individual;""", con=engine)

In [6]:
df.iloc[0]

weekday                                                         0
holiday                                                         0
uid                          0002291c-bf6f-4451-a6e6-bf3ef54bef6b
time_seq                                                      1.0
evenness_income                                          0.027401
ice_birth                                               -0.160005
num_coexistence                                           16419.0
number_of_locations                                             5
number_of_visits                                               14
average_displacement                                   252.521531
radius_of_gyration                                     287.339987
median_distance_from_home                              367.282803
zone                                                6735006579500
region                                                  0180C3800
car_ownership                                            0.247067
evenness_i

### 1.1 Aggregate (drop time sequence)

In [7]:
df_ind = df.drop(columns=['weekday', 'holiday', 'time_seq', 'evenness_income', 'ice_birth']).drop_duplicates(subset=['uid'])

In [8]:
tqdm.pandas()
df_stats = df.groupby(['weekday', 'holiday', 'uid'])['ice_birth'].progress_apply(np.mean).reset_index()
df_stats.head()

100%|██████████| 1204518/1204518 [01:08<00:00, 17611.37it/s]


Unnamed: 0,weekday,holiday,uid,ice_birth
0,0,0,00008608-f79e-414d-bf1c-25632d6bc059,0.171538
1,0,0,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,-0.290253
2,0,0,0000cd68-c931-4e3c-96f6-7c5837f59b08,-0.030291
3,0,0,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,-0.116429
4,0,0,000115f0-937a-4716-8d8b-09b1ed54c5ce,-0.141422


In [9]:
df = pd.merge(df_stats, df_ind, on='uid', how='left')

In [10]:
# Add built environment features of residential regions (DeSO zones)
df_built_env = pd.merge(pd.read_sql("""SELECT deso, num_jobs, num_stops, gsi FROM built_env.features_deso;""", con=engine),
                        pd.read_sql("""SELECT deso, length_density FROM built_env.walk_density_deso;""", con=engine),
                        on='deso', how='left')
df = pd.merge(df, df_built_env.rename(columns={'deso': 'region'}), on='region', how='left')
df.columns

Index(['weekday', 'holiday', 'uid', 'ice_birth', 'num_coexistence',
       'number_of_locations', 'number_of_visits', 'average_displacement',
       'radius_of_gyration', 'median_distance_from_home', 'zone', 'region',
       'car_ownership', 'evenness_income_resi', 'ice_birth_resi',
       'Lowest income group', 'Not Sweden', 'Other', 'wt_p', 'cum_jobs',
       'cum_stops', 'num_jobs', 'num_stops', 'gsi', 'length_density'],
      dtype='object')

### 1.2 Missing values

In [11]:
print(f"Data length is {len(df)}")
df.dropna(how='any', inplace=True)
print(f"Data length is {len(df)} after dropping any NaN columns.")

Data length is 1204518
Data length is 1164605 after dropping any NaN columns.


### 1.3 Define features and target

In [14]:
extras = ['uid', 'zone', 'region', 'wt_p']
features = ['weekday', 'holiday',
            'number_of_locations', 'number_of_visits', 'average_displacement', 'radius_of_gyration', 'median_distance_from_home',
            'Other', 'Lowest income group', 'car_ownership',
            'cum_jobs', 'cum_stops', 'evenness_income_resi', 'ice_birth_resi',
            'num_jobs', 'num_stops', 'gsi', 'length_density']
target = 'ice_birth'

In [15]:
X = df[features]
y = df[target]

## 2. Feature analysis
### 2.1 VIF test

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

  import pandas.util.testing as tm


In [17]:
vif_data

Unnamed: 0,feature,VIF
0,weekday,1.992963
1,holiday,1.852608
2,number_of_locations,3.359803
3,number_of_visits,3.679089
4,average_displacement,2.918828
5,radius_of_gyration,3.052553
6,median_distance_from_home,1.287108
7,Other,4.036313
8,Lowest income group,3.481183
9,car_ownership,7.210569


### 2.2 Feature importance
After removing features with VIF > 10

In [18]:
features = vif_data.loc[vif_data.VIF < 10, 'feature'].values
X = df[features]

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
num_feature = len(features)
fit = SelectKBest(f_regression, k=num_feature).fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
df_F = pd.DataFrame( [X.columns] + [x for x in f_regression(X, y)]).transpose()
df_F.columns = ["f", "F", "p-value"]
df_F = df_F.merge(featureScores.nlargest(num_feature,'Score'), left_on = "f", right_on = "Specs").drop(columns=["Specs"]).sort_values(by=["Score"], ascending=False)
df_F

Unnamed: 0,f,F,p-value,Score
13,ice_birth_resi,853693.597378,0.0,853693.597378
7,Other,479476.001172,0.0,479476.001172
9,car_ownership,224208.004371,0.0,224208.004371
8,Lowest income group,86146.294363,0.0,86146.294363
12,evenness_income_resi,55532.475389,0.0,55532.475389
16,length_density,34716.959467,0.0,34716.959467
10,cum_jobs,15267.727217,0.0,15267.727217
15,num_stops,12036.861777,0.0,12036.861777
14,num_jobs,5955.952236,0.0,5955.952236
2,number_of_locations,5568.034056,0.0,5568.034056


In [20]:
df[extras + list(features) + [target]].to_parquet('results/data4model_individual.parquet', index=False)