# Home detection
Temporal rules for home detection among top 3 visited clusters:

We infer the home area of each individual using its most common visited cluster between the hours of 10:00 p.m. and 6:00 a.m.

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [23]:
# Load libs
import pandas as pd
import numpy as np
import geopandas as gpd
import sqlalchemy
from tqdm import tqdm
from lib import preprocess as preprocess
from shapely.geometry import MultiPoint
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

## 1. Load temporal profiles

In [3]:
# Data location
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}')

In [4]:
df = pd.read_sql_query(sql="""SELECT * FROM description.tempo_top3_p;""", con=engine)

In [5]:
df.head()

Unnamed: 0,half_hour,freq,freq_wt,uid,loc
0,0,1.0,1.0,00008608-f79e-414d-bf1c-25632d6bc059,1
1,1,1.0,1.0,00008608-f79e-414d-bf1c-25632d6bc059,1
2,2,1.0,1.0,00008608-f79e-414d-bf1c-25632d6bc059,1
3,3,1.0,1.0,00008608-f79e-414d-bf1c-25632d6bc059,1
4,4,1.0,1.0,00008608-f79e-414d-bf1c-25632d6bc059,1


### 1.1 Summarise the weighted frequency count for the half-hour sequence 0-11 and 44-47

In [6]:
tempo_range = list(range(0, 12)) + list(range(44, 48))
def sum_home_stay(data):
    home_freq = data.loc[data.half_hour.isin(tempo_range), 'freq_wt'].sum()
    home_share = home_freq / data.freq_wt.sum() * 100
    return pd.Series(dict(home_freq=home_freq, home_share=home_share))
tqdm.pandas()
df_h = df.groupby(['uid', 'loc']).progress_apply(sum_home_stay).reset_index()

  after removing the cwd from sys.path.
100%|██████████| 1407134/1407134 [16:03<00:00, 1459.94it/s]


In [7]:
df_h.head()

Unnamed: 0,uid,loc,home_freq,home_share
0,00008608-f79e-414d-bf1c-25632d6bc059,1,7.089974,100.0
1,00008608-f79e-414d-bf1c-25632d6bc059,2,6.809171,100.0
2,00008608-f79e-414d-bf1c-25632d6bc059,3,7.0,100.0
3,00009689-c524-4a99-95d8-a2397d87db62,1,10.376228,83.20198
4,00009689-c524-4a99-95d8-a2397d87db62,2,6.295741,100.0


## 2. Select home cluster and add zone information

In [8]:
def ind_select(data):
    home_ = data.loc[data.home_freq == data.home_freq.max(), ['loc', 'home_freq', 'home_share']].values[0]
    return pd.Series(dict(home=home_[0], home_freq=home_[1], home_share=home_[2]))
tqdm.pandas()
df_h_selected = df_h.groupby('uid').progress_apply(ind_select).reset_index().astype({"home": int})

100%|██████████| 503003/503003 [09:04<00:00, 923.78it/s] 


In [9]:
df_h_selected.head()

Unnamed: 0,uid,home,home_freq,home_share
0,00008608-f79e-414d-bf1c-25632d6bc059,1,7.089974,100.0
1,00009689-c524-4a99-95d8-a2397d87db62,18,12.0,92.307692
2,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,4,8.846676,100.0
3,0000cd68-c931-4e3c-96f6-7c5837f59b08,2,9.790556,72.68895
4,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,7,9.842917,77.576692


### 2.1 Keep the top clusters with non-zero home-time stay

In [10]:
len(df_h_selected.loc[df_h_selected.home_freq > 0, :]) / len(df_h_selected)

0.9913419999483104

In [11]:
df_h_selected = df_h_selected.loc[df_h_selected.home_freq > 0, :]

### 2.2 Add cluster characteristics

In [12]:
df_cls = pd.read_sql_query(sql="""SELECT * FROM description.clusters_top3_wt_p;""", con=engine)
df_cls.head()

Unnamed: 0,uid,loc,holiday_s,freq,freq_wt,dur
0,00008608-f79e-414d-bf1c-25632d6bc059,1,0,39.0,929.240152,7060.083333
1,00008608-f79e-414d-bf1c-25632d6bc059,2,0,6.0,122.617716,970.183333
2,00008608-f79e-414d-bf1c-25632d6bc059,3,0,3.0,70.060781,539.95
3,00009689-c524-4a99-95d8-a2397d87db62,1,0,24.0,491.830045,5360.35
4,00009689-c524-4a99-95d8-a2397d87db62,18,0,1.0,54.189186,364.35


In [13]:
df_home = pd.merge(df_h_selected, df_cls, left_on=['uid', 'home'], right_on=['uid', 'loc'])
df_home.head()

Unnamed: 0,uid,home,home_freq,home_share,loc,holiday_s,freq,freq_wt,dur
0,00008608-f79e-414d-bf1c-25632d6bc059,1,7.089974,100.0,1,0,39.0,929.240152,7060.083333
1,00009689-c524-4a99-95d8-a2397d87db62,18,12.0,92.307692,18,0,1.0,54.189186,364.35
2,0000c837-ef82-4dfd-b2a5-00bdc8680b0b,4,8.846676,100.0,4,0,10.0,1224.285714,2170.783333
3,0000cd68-c931-4e3c-96f6-7c5837f59b08,2,9.790556,72.68895,2,0,3.0,34.387364,816.9
4,0000f6ad-ffa4-4af2-9c2a-49d6dc86ec3a,7,9.842917,77.576692,7,0,55.0,637.063179,10490.7


Get cluster centroids

In [14]:
df_stops = pd.read_sql_query(sql="""SELECT uid, lat, lng, loc FROM stops_p;""", con=engine)
df_stops = pd.merge(df_stops, df_home.loc[:, ['uid', 'loc']], on=['uid', 'loc'], how='inner')
df_stops.head()

Unnamed: 0,uid,lat,lng,loc
0,00008608-f79e-414d-bf1c-25632d6bc059,56.174205,12.569499,1
1,00008608-f79e-414d-bf1c-25632d6bc059,56.174205,12.569499,1
2,00008608-f79e-414d-bf1c-25632d6bc059,56.174205,12.569499,1
3,00008608-f79e-414d-bf1c-25632d6bc059,56.174205,12.569499,1
4,00008608-f79e-414d-bf1c-25632d6bc059,56.174205,12.569499,1


In [15]:
df_stops = df_stops.drop_duplicates(subset=['uid', 'lat', 'lng', 'loc'])
df_stops.uid.nunique(), len(df_stops)

(498648, 498648)

In [16]:
df_home = pd.merge(df_home, df_stops, on=['uid', 'loc'], how='left')

### 2.3 Add zone information - grid

In [17]:
gdf = gpd.GeoDataFrame.from_postgis("SELECT zone, geom FROM public.grids", con=engine).to_crs(4326)
gdf_home = preprocess.df2gdf_point(df_home, 'lng', 'lat', crs=4326, drop=True)
gdf_home = gpd.sjoin(gdf_home, gdf)
df_home_deso = pd.merge(df_home, gdf_home.loc[:, ['uid', 'zone']], on='uid', how='inner')

### 2.4 Add zone information - DeSO zone

In [18]:
gdf = gpd.GeoDataFrame.from_postgis("SELECT deso, geom FROM public.zones", con=engine).to_crs(4326)
gdf_home_ds = preprocess.df2gdf_point(df_home, 'lng', 'lat', crs=4326, drop=True)
gdf_home_ds = gpd.sjoin(gdf_home_ds, gdf)
df_home_deso = pd.merge(df_home_deso, gdf_home_ds.loc[:, ['uid', 'deso']], on='uid', how='inner')

### 2.5 Filtering

In [19]:
share_reliable_home = len(df_home_deso.loc[df_home_deso.freq >= 3, :]) / len(df_home_deso) * 100
print("Share of detected home locations with at least 3 appearances: %.2f %%"%share_reliable_home)

Share of detected home locations with at least 3 appearances: 74.44 %


In [20]:
df_home_deso = df_home_deso.loc[df_home_deso.freq >= 3, :]
len(df_home_deso)

322920

## 3. Individual weight
The inverse ratio of device number over population has extreme values in some regions, where only a few devices are included.
Weight trimming technique is applied:
Van de Kerckhove, Wendy, Leyla Mohadjer, and Thomas Krenzke. "A Weight Trimming Approach to Achieve a Comparable Increase to Bias Across Countries in the Programme for the International Assessment of Adult Competencies." JSM Proceedings, Survey Research Methods Section. Alexandria, VA: American Statistical Association (2014): 655-666. [Link](http://www.asasrms.org/Proceedings/y2014/files/311170_87007.pdf)

In [25]:
df_deso = pd.read_sql("SELECT deso, befolkning FROM public.zones", con=engine)
df_deso = df_deso.rename(columns={'befolkning': 'pop'})
df_deso_m = df_home_deso.groupby('deso').size().reset_index(name='count')
df_deso_m = pd.merge(df_deso, df_deso_m, how='left')
df_deso_m.fillna(0, inplace=True)
df_deso_m.loc[:, 'wt_p'] = df_deso_m.loc[:, 'pop'] / df_deso_m.loc[:, 'count']
df_deso_m.head()

Unnamed: 0,deso,pop,count,wt_p
0,0114A0010,790,19,41.578947
1,0114C1010,1608,12,134.0
2,0114C1020,1610,26,61.923077
3,0114C1120,2148,620,3.464516
4,0180C4390,1111,32,34.71875


### 3.1 Weight trimming

In [26]:
w0 = ((np.std(df_deso_m.loc[:, 'wt_p']) / np.mean(df_deso_m.loc[:, 'wt_p'])) ** 2 + 1) ** 0.5 * 3.5 * np.median(df_deso_m.loc[:, 'wt_p'])
df_deso_m.loc[df_deso_m['wt_p'] > w0, 'wt_p'] = w0

### 3.2 Get weight

In [27]:
df_home_deso = pd.merge(df_home_deso, df_deso_m[['deso', 'wt_p']], on='deso', how='left')

Save the data.

In [28]:
df_home_deso.drop(columns=['loc', 'holiday_s']).to_sql('home_p', engine, schema='public', index=False, method='multi', if_exists='replace', chunksize=10000)