# Home detection
Temporal rules for home detection among top 3 visited clusters:

We infer the home area of each individual as their most frequently visited cluster between 10:00 p.m. and 6:00 a.m.

In [1]:
# Load the autoreload extension; mode 2 reloads all modules before each cell is executed.
%load_ext autoreload
%autoreload 2
# Change the working directory to the project folder (presumably so that the local
# `lib` package imported below can be found — confirm).
# NOTE(review): absolute, machine-specific Windows path; adjust when running elsewhere.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [45]:
# Load libs
import pandas as pd
import geopandas as gpd
import sqlalchemy
from tqdm import tqdm
from lib import preprocess as preprocess
from shapely.geometry import MultiPoint
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# 'seaborn-white' was renamed to 'seaborn-v0_8-white' in Matplotlib 3.6 and the old
# name was later removed. Prefer the new name and fall back to the old one on
# Matplotlib versions that do not know it yet.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

## 1. Load temporal profiles

In [3]:
# Data location
# Database connection: settings are read from the project's keys manager
db_keys = preprocess.keys_manager['database']
user, password = db_keys['user'], db_keys['password']
port, db_name = db_keys['port'], db_keys['name']
connection_url = f'postgresql://{user}:{password}@localhost:{port}/{db_name}'
engine = sqlalchemy.create_engine(connection_url)

In [4]:
# Read the whole temporal-profile table into a DataFrame.
tempo_query = """SELECT * FROM description.tempo_top3;"""
df = pd.read_sql_query(sql=tempo_query, con=engine)

In [5]:
# Preview the first rows of the temporal profiles.
df.head(n=5)

Unnamed: 0,half_hour,freq,freq_wt,uid,cluster
0,0,0.0,0.0,00008608-f79e-414d-bf1c-25632d6bc059,3
1,1,0.0,0.0,00008608-f79e-414d-bf1c-25632d6bc059,3
2,2,0.0,0.0,00008608-f79e-414d-bf1c-25632d6bc059,3
3,3,0.0,0.0,00008608-f79e-414d-bf1c-25632d6bc059,3
4,4,0.0,0.0,00008608-f79e-414d-bf1c-25632d6bc059,3


### 1.1 Summarise the weighted frequency count for the half-hour sequence 0-11 and 44-47

In [6]:
# Half-hour slots counted as home time: 0-11 and 44-47
# (the 10:00 p.m. to 6:00 a.m. window described at the top of the notebook).
tempo_range = list(range(0, 12)) + list(range(44, 48))


def sum_home_stay(data, half_hours=None):
    """Summarise how much of one group's weighted frequency falls in home hours.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group; must provide `half_hour` and `freq_wt` columns.
    half_hours : iterable of int, optional
        Half-hour slots treated as home time. Defaults to the module-level
        `tempo_range`.

    Returns
    -------
    pandas.Series
        `home_freq`: sum of `freq_wt` over the home-time slots.
        `home_share`: `home_freq` as a percentage of the group's total `freq_wt`,
        or 0.0 when that total is zero.
    """
    if half_hours is None:
        half_hours = tempo_range
    home_freq = data.loc[data.half_hour.isin(half_hours), 'freq_wt'].sum()
    total_freq = data.freq_wt.sum()
    # A group without any weighted frequency would otherwise yield 0/0 = NaN
    # (and a RuntimeWarning); report a zero share instead.
    home_share = home_freq / total_freq * 100 if total_freq != 0 else 0.0
    return pd.Series(dict(home_freq=home_freq, home_share=home_share))
# Enable tqdm's `progress_apply`, then summarise home-time stay per (uid, cluster).
tqdm.pandas()
profiles_by_cluster = df.groupby(['uid', 'cluster'])
df_h = profiles_by_cluster.progress_apply(sum_home_stay).reset_index()

100%|██████████| 569366/569366 [07:05<00:00, 1339.29it/s]


In [7]:
# Preview the per-cluster home-time summary.
df_h.head(n=5)

Unnamed: 0,uid,cluster,home_freq,home_share
0,00008608-f79e-414d-bf1c-25632d6bc059,3,0.0,0.0
1,00008608-f79e-414d-bf1c-25632d6bc059,11,0.641834,7.271018
2,00008608-f79e-414d-bf1c-25632d6bc059,15,3.666667,100.0
3,00009689-c524-4a99-95d8-a2397d87db62,1,0.8,9.184845
4,00009689-c524-4a99-95d8-a2397d87db62,9,0.0,0.0


In [8]:
# Persist the per-cluster home-time summary, replacing any existing table.
df_h.to_sql(
    name='home',
    con=engine,
    schema='description',
    if_exists='replace',
    index=False,
    chunksize=10000,
    method='multi',
)

## 2. Select home cluster and add DeSO zone information

In [9]:
def ind_select(data):
    """Pick the cluster with the largest home-time frequency within one group.

    If several clusters tie on `home_freq`, the first one in row order wins.
    Returns a Series holding `home` (the cluster id), `home_freq` and `home_share`.
    """
    is_top = data['home_freq'].eq(data['home_freq'].max())
    top_rows = data.loc[is_top, ['cluster', 'home_freq', 'home_share']].to_numpy()
    cluster_id, freq, share = top_rows[0]
    return pd.Series({'home': cluster_id, 'home_freq': freq, 'home_share': share})
# Reduce to one selected cluster per individual and store its id as an integer.
tqdm.pandas()
home_per_uid = df_h.groupby('uid').progress_apply(ind_select).reset_index()
df_h_selected = home_per_uid.astype({"home": int})

100%|██████████| 198916/198916 [04:47<00:00, 691.79it/s]


In [10]:
# Preview the selected home cluster per individual.
df_h_selected.head(n=5)

Unnamed: 0,uid,home,home_freq,home_share
0,00008608-f79e-414d-bf1c-25632d6bc059,15,3.666667,100.0
1,00009689-c524-4a99-95d8-a2397d87db62,1,0.8,9.184845
2,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,1,0.0,0.0
3,0000cd68-c931-4e3c-96f6-7c5837f59b08,20,7.741087,42.844024
4,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,4,6.748871,48.747843


### 2.1 Keep the top clusters with non-zero home-time stay

In [11]:
# Fraction of individuals whose selected cluster has a non-zero home-time frequency.
n_with_home_time = int((df_h_selected['home_freq'] > 0).sum())
n_with_home_time / len(df_h_selected)

0.8945836433469404

In [12]:
# Keep only individuals with a non-zero home-time frequency in the selected cluster.
has_home_time = df_h_selected['home_freq'] > 0
df_h_selected = df_h_selected[has_home_time]

### 2.2 Add cluster characteristics

In [13]:
# Load the per-cluster characteristics table and preview it.
clusters_query = """SELECT * FROM description.clusters_top3_wt;"""
df_cls = pd.read_sql_query(sql=clusters_query, con=engine)
df_cls.head(n=5)

Unnamed: 0,uid,cluster,holiday,freq,freq_wt,dur
0,00008608-f79e-414d-bf1c-25632d6bc059,3,0,197.0,606.919326,5621.9
1,00008608-f79e-414d-bf1c-25632d6bc059,15,0,2.0,133.333333,61.033333
2,00008608-f79e-414d-bf1c-25632d6bc059,11,0,20.0,98.748821,658.95
3,00009689-c524-4a99-95d8-a2397d87db62,1,0,8.0,54.1,423.566667
4,00009689-c524-4a99-95d8-a2397d87db62,14,0,1.0,12.0,56.35


In [14]:
# Attach the characteristics of each individual's selected home cluster.
# NOTE(review): assumes df_cls has at most one row per (uid, cluster); otherwise
# the merge multiplies rows — confirm.
df_home = df_h_selected.merge(df_cls, left_on=['uid', 'home'], right_on=['uid', 'cluster'])
df_home.head(n=5)

Unnamed: 0,uid,home,home_freq,home_share,cluster,holiday,freq,freq_wt,dur
0,00008608-f79e-414d-bf1c-25632d6bc059,15,3.666667,100.0,15,0,2.0,133.333333,61.033333
1,00009689-c524-4a99-95d8-a2397d87db62,1,0.8,9.184845,1,0,8.0,54.1,423.566667
2,0000cd68-c931-4e3c-96f6-7c5837f59b08,20,7.741087,42.844024,20,0,175.0,950.455111,7115.833333
3,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,4,6.748871,48.747843,4,0,15.0,123.688345,788.45
4,000115f0-937a-4716-8d8b-09b1ed54c5ce,4,2.18205,13.002723,4,0,73.0,478.875373,6023.35


Get cluster centroids

In [19]:
# Load stop coordinates and keep only the stops belonging to each individual's home cluster.
stops_query = """SELECT uid, lat, lng, cluster FROM stops_subset;"""
df_stops = pd.read_sql_query(sql=stops_query, con=engine)
home_keys = df_home[['uid', 'cluster']]
df_stops = df_stops.merge(home_keys, how='inner', on=['uid', 'cluster'])
df_stops.head(n=5)

Unnamed: 0,uid,lat,lng,cluster
0,0d3a07aa-e73c-4f98-b10e-cb8d3da77d79,60.000009,15.791768,1
1,0d3a07aa-e73c-4f98-b10e-cb8d3da77d79,60.000009,15.791768,1
2,0d3a07aa-e73c-4f98-b10e-cb8d3da77d79,60.000694,15.788607,1
3,0d3a07aa-e73c-4f98-b10e-cb8d3da77d79,60.000009,15.791768,1
4,0d3a07aa-e73c-4f98-b10e-cb8d3da77d79,60.000009,15.791768,1


In [20]:
def centroid_calculation(data):
    """Compute the centroid of one group's stop coordinates.

    Parameters
    ----------
    data : pandas.DataFrame
        Stops of a single group; must provide `lng` and `lat` columns.

    Returns
    -------
    pandas.Series
        `lng` and `lat` of the centroid of all points in `data`.
    """
    coords = data.loc[:, ['lng', 'lat']].values
    # Build the geometry and its centroid once rather than once per coordinate.
    centroid = MultiPoint(coords).centroid  # x = longitude, y = latitude
    return pd.Series({'lng': centroid.x, 'lat': centroid.y})
# One centroid per (uid, cluster), joined back onto the home table.
tqdm.pandas()
stops_by_cluster = df_stops.groupby(['uid', 'cluster'])
df_centroids = stops_by_cluster.progress_apply(centroid_calculation).reset_index()
df_home = df_home.merge(df_centroids, how='left', on=['uid', 'cluster'])

100%|██████████| 177947/177947 [03:46<00:00, 784.59it/s]


### 2.3 Add DeSO zone information

In [22]:
# Zone polygons, reprojected to EPSG:4326 (the CRS passed for the home points below).
zones_query = "SELECT deso, geom FROM public.zones"
gdf = gpd.GeoDataFrame.from_postgis(zones_query, con=engine).to_crs(4326)
# Convert the home table to a GeoDataFrame (presumably point geometries built from
# lng/lat — confirm in lib.preprocess) and spatially join it to the zones.
gdf_home = preprocess.df2gdf_point(df_home, 'lng', 'lat', crs=4326, drop=True)
gdf_home = gpd.sjoin(gdf_home, gdf)
# Bring the zone code back onto the tabular home data.
# NOTE(review): a point matching more than one zone would duplicate its uid here — confirm.
home_zone = gdf_home[['uid', 'deso']]
df_home_deso = df_home.merge(home_zone, how='inner', on='uid')

Save the data.

In [23]:
# Store the detected homes without the 'cluster' and 'holiday' columns, replacing any existing table.
home_out = df_home_deso.drop(columns=['cluster', 'holiday'])
home_out.to_sql('home', engine, schema='public', if_exists='replace',
                index=False, chunksize=10000, method='multi')

## 3. Population representativeness

In [37]:
# Percentage of homes whose home-cluster `freq` is at least 3.
n_freq_ok = int((df_home_deso['freq'] >= 3).sum())
n_freq_ok / len(df_home_deso) * 100

79.05874760101224

In [55]:
# Count how many individuals share each exact (lat, lng) within three selected DeSO zones,
# most frequent coordinates first.
deso_to_check = ['0180C4040', '0180C4010', '2361A0020']
df_home_deso_center = df_home_deso.loc[df_home_deso.deso.isin(deso_to_check), :]
centroid_counts = df_home_deso_center.groupby(['deso', 'lat', 'lng'])['uid'].count()
df_home_deso_center = centroid_counts.sort_values(ascending=False).reset_index()

In [58]:
# Abnormal home centroids as [lat, lng] pairs, keyed by DeSO zone code.
abnormal_centroids_dict = {
    '2361A0020': [62.0, 15.0],
    '0180C4040': [59.3333, 18.05],
    '0180C4010': [59.3247, 18.056],
}
abnormal_centroids = list(abnormal_centroids_dict.values())

1) Keep those that have at least 3 nights at home (implemented below as `freq >= 3` for the home cluster).
2) Remove homes located at abnormal centroids, which suggest place-based location resolution rather than more precise GPS coordinates.

In [59]:
# Keep homes whose home-cluster `freq` is at least 3.
meets_min_freq = df_home_deso['freq'] >= 3
df_home_deso = df_home_deso[meets_min_freq]

In [60]:
# Drop homes located at one of the abnormal [lat, lng] centroids.
# The centroid coordinates are computed floats, so exact `==` can silently miss a
# match because of rounding; compare with a tiny absolute tolerance instead.
COORD_ATOL = 1e-9  # in coordinate units (degrees for EPSG:4326)
for ab_lat, ab_lng in abnormal_centroids:
    at_abnormal = (
        np.isclose(df_home_deso['lat'], ab_lat, rtol=0, atol=COORD_ATOL)
        & np.isclose(df_home_deso['lng'], ab_lng, rtol=0, atol=COORD_ATOL)
    )
    df_home_deso = df_home_deso.loc[~at_abnormal, :]

In [61]:
# Number of homes remaining after both filters.
df_home_deso.shape[0]

136065

In [65]:
# Store the filtered homes without the 'cluster' and 'holiday' columns, replacing any existing table.
home_sub_out = df_home_deso.drop(columns=['cluster', 'holiday'])
home_sub_out.to_sql('home_sub', engine, schema='public', if_exists='replace',
                    index=False, chunksize=10000, method='multi')