# Construct features for predicting individual-level experienced segregation
Mixed-hexagon zones

In [1]:
# Auto-reload imported modules before each cell runs, so edits to project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path; presumably the project root, so
# that `lib` imports and relative paths such as 'results/...' resolve — adjust
# when running on another machine.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
# Load libs
import pandas as pd
import sqlalchemy
import numpy as np
from tqdm import tqdm
from lib import preprocess
from lib import metrics

In [3]:
# Database connection: settings are read from preprocess.keys_manager['database']
db_cfg = preprocess.keys_manager['database']
user, password, port, db_name = (db_cfg[key] for key in ('user', 'password', 'port', 'name'))
engine = sqlalchemy.create_engine(
    f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
)

## 1. Load data
### 1.1 Individual features

In [4]:
# One row per uid, restricted to records with weekday=1 and holiday=0.
# NOTE(review): DISTINCT ON without ORDER BY lets PostgreSQL keep an arbitrary
# row per uid — fine only if the retained columns are constant within a uid; confirm.
individual_sql = """SELECT DISTINCT ON (uid)
                            *
                            FROM segregation.mobi_seg_deso_individual
                            WHERE weekday=1 AND holiday=0;"""
# Columns that are not carried forward from this table.
unused_columns = ['weekday', 'holiday', 'time_seq',
                  'evenness_income', 'ice_birth', 'num_coexistence']
df_ind = pd.read_sql(individual_sql, con=engine).drop(columns=unused_columns)

### 1.2 Nativity segregation results

In [5]:
def group_data(data, threshold=0.2, min_records=4):
    """Summarise one individual's records into segregation group labels.

    Designed to be applied to each ``uid`` group of the record table
    (e.g. ``df.groupby('uid').apply(group_data)``).

    Parameters
    ----------
    data : pandas.DataFrame
        All records of one individual. Must contain the columns ``ice_birth``,
        ``wt_p`` and ``ice_birth_resi``; the latter two are read from the
        first row only.
    threshold : float, default 0.2
        Cut-off passed to ``metrics.ice_group`` for the record-based group and
        applied symmetrically (``> threshold`` / ``< -threshold``) to the
        residential value.
    min_records : int, default 4
        Minimum number of records required to call ``metrics.ice_group``;
        individuals with fewer records get ``grp = 'NN'``.

    Returns
    -------
    pandas.Series
        ``grp`` (record-based group or ``'NN'``), ``wt_p`` (weight),
        ``ice_r`` (residential value) and ``grp_r`` (``'D'``, ``'F'`` or ``'N'``).
    """
    if len(data) >= min_records:
        grp = metrics.ice_group(data['ice_birth'].values, threshold=threshold)
    else:
        # Too few records to classify this individual.
        grp = 'NN'

    wt = data['wt_p'].values[0]
    ice_r = data['ice_birth_resi'].values[0]

    # NOTE: a missing (NaN) ice_r fails both comparisons and is labelled 'N'.
    if ice_r > threshold:
        grp_r = 'D'
    elif ice_r < -threshold:
        grp_r = 'F'
    else:
        grp_r = 'N'
    return pd.Series(dict(grp=grp, wt_p=wt, ice_r=ice_r, grp_r=grp_r))

In [6]:
# Load the per-individual records and attach each individual's weight and
# residential value from df_ind.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share — presumably only 'uid'; confirm against the table schema.
df = (
    pd.read_sql("""SELECT * FROM segregation.mobi_seg_hex_individual_w1h0;""", con=engine)
    .merge(df_ind[['uid', 'wt_p', 'ice_birth_resi']], how='left')
)

In [None]:
# Register `progress_apply` on pandas objects, then summarise every individual
# with group_data (one output row per uid).
tqdm.pandas()
records_by_uid = df.groupby('uid')
df_grp = records_by_uid.progress_apply(group_data).reset_index()

In [10]:
# Drop individuals labelled 'NN' (the label group_data assigns when there are
# too few records) and report how many individuals remain.
is_classified = df_grp['grp'] != 'NN'
df_grp = df_grp[is_classified]
len(df_grp)

322880

In [11]:
# Weighted share (%) of individuals for each combination of residential group
# (grp_r) and record-based group (grp).
weight_total = df_grp['wt_p'].sum()
weight_by_resi_and_grp = df_grp.groupby(['grp_r', 'grp'])['wt_p'].sum()
weight_by_resi_and_grp / weight_total * 100

grp_r  grp
D      D      12.501663
       F       2.258645
       N      28.339690
F      D       0.118234
       F      14.027115
       N       3.116443
N      D       1.575910
       F       9.748560
       N      28.313740
Name: wt_p, dtype: float64

In [9]:
# Weighted share (%) of individuals in each record-based group (grp).
weight_total = df_grp['wt_p'].sum()
weight_by_grp = df_grp.groupby('grp')['wt_p'].sum()
weight_by_grp / weight_total * 100

grp
D     14.195441
F     26.033649
N     59.768330
NN     0.002580
Name: wt_p, dtype: float64

### 1.3 Merge data

In [13]:
# Attach the group labels to the individual features. Both frames carry `wt_p`,
# so the result holds `wt_p_x` (from df_ind) and `wt_p_y` (from df_grp).
df_ind = df_ind.merge(df_grp, how='left', on='uid')

In [None]:
# Mean `ice_birth` over each individual's records.
# The built-in grouped mean replaces `progress_apply(np.mean)`: it avoids one
# Python-level call per uid and no longer requires `tqdm.pandas()` to have been
# run in an earlier cell.
df_stats = df.groupby('uid')['ice_birth'].mean().reset_index()
df_stats.head()

In [None]:
# Add the per-individual mean of `ice_birth` (exposed as `ice_e`), drop the
# columns duplicated by the earlier merge, and keep a single `wt_p` column.
df_ind = (
    df_ind
    .merge(df_stats, how='left', on='uid')
    .drop(columns=['ice_birth_resi', 'wt_p_x'])
    .rename(columns={'ice_birth': 'ice_e', 'wt_p_y': 'wt_p'})
)
# Show the first record as a quick sanity check of the resulting columns.
df_ind.iloc[0]

In [None]:
# Remove individuals with any missing value and report the size before/after.
# `dropna(how='any')` works along rows by default, so the message says "rows".
print(f"Data length is {len(df_ind)}")
df_ind = df_ind.dropna(how='any')
print(f"Data length is {len(df_ind)} after dropping rows with any NaN value.")

In [None]:
# Display the available column names (the export lists below select from these).
df_ind.columns

## 2. Save data

In [19]:
# Identifier and weight columns stored alongside the model features.
extras = [
    'uid',
    'zone',
    'region',
    'wt_p',
]

# Columns exported as model features.
features = [
    # Residential value/group and record-based mean value/group
    'ice_r', 'grp_r', 'ice_e', 'grp',
    # Visit counts
    'number_of_locations', 'number_of_visits',
    # Travel-distance summaries
    'median_distance_from_home', 'average_displacement', 'radius_of_gyration',
    # Remaining attributes taken from the individual table
    'Other', 'Lowest income group', 'car_ownership', 'Not Sweden',
    'cum_jobs_pt', 'cum_jobs_car', 'evenness_income_resi',
]

In [20]:
# Write identifiers + features to parquet (path is relative to the working directory).
export_columns = [*extras, *features]
df_ind[export_columns].to_parquet('results/data4model_individual_hex_w1h0.parquet', index=False)