# DiD visits data processing
Organize the visit data so that it is ready for difference-in-differences (DiD) modeling.

In [1]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path. The relative 'dbs/...' paths used
# in the cells below are resolved against this working directory.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from tqdm import tqdm
import workers
import sqlalchemy
import numpy as np
import wquantiles
import time
from statsmodels.stats.weightstats import DescrStatsW

In [3]:
# Database connection: all settings are read from the project's key manager
_db_keys = workers.keys_manager['database']
user, password = _db_keys['user'], _db_keys['password']
port, db_name = _db_keys['port'], _db_keys['name']
engine = sqlalchemy.create_engine(
    f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
)

In [4]:
# Folder holding the stop files (file names look like 'stops_<number>.parquet').
data_folder = 'dbs/poi2visits_day_did/'
# Only the top-level file names are needed, so take the first os.walk() entry
# instead of materialising a walk over the entire directory tree.
stop_files = next(os.walk(data_folder))[2]
# Map the integer suffix of each file name to the file's path.
paths2stops = {int(name.split('_')[-1].split('.')[0]): os.path.join(data_folder, name)
               for name in stop_files}
paths2stops_list = list(paths2stops.values())
paths2stops_list[0]

'dbs/poi2visits_day_did/stops_0.parquet'

In [5]:
# Category table: expose 'category' / 'subcategory' under the names 'theme' / 'label'
df_cat = pd.read_excel('dbs/poi/categories.xlsx')
df_cat = df_cat.rename(columns={'category': 'theme', 'subcategory': 'label'})
# Distinct labels, in order of first appearance in the table
label_list = df_cat['label'].unique()

In [6]:
def ice(ai=None, bi=None, popi=None, share_a=0.25, share_b=0.25):
    """Share-normalised contrast between two groups (used below as the segregation metric).

    Every count is divided by its reference share before the groups are compared::

        (ai/share_a - bi/share_b) / (ai/share_a + bi/share_b + oi/share_o)

    where ``oi = popi - ai - bi`` and ``share_o = 1 - share_a - share_b``.
    NOTE(review): presumably an "index of concentration at the extremes" -- confirm.

    Parameters
    ----------
    ai, bi : number or array-like
        Counts (or weighted counts) of group A and group B.
    popi : number or array-like
        Total count including A, B and everyone else.
    share_a, share_b : float
        Reference shares of groups A and B; the remainder is the share of "others".

    Returns
    -------
    number or array-like
        The index. For scalar inputs whose normalised total is zero (e.g. all
        counts are 0) NaN is returned instead of raising ``ZeroDivisionError``.
    """
    oi = popi - ai - bi
    share_o = 1 - share_a - share_b
    scaled_a, scaled_b = ai / share_a, bi / share_b
    denominator = scaled_a + scaled_b + oi / share_o
    # Plain Python numbers raise on 0/0 while NumPy scalars yield NaN; make the
    # scalar case consistently NaN. Array inputs keep NumPy's element-wise result.
    if np.ndim(denominator) == 0 and denominator == 0:
        return np.nan
    return (scaled_a - scaled_b) / denominator

## 1. Load data

In [7]:
# Matched places table and the distinct OSM ids it contains
df_osm = pd.read_parquet('dbs/places_matching/matched_places_wt.parquet')
osm_ids = list(pd.unique(df_osm['osm_id']))

In [8]:
def load_label_visits(lb=None, paths2stops_list=None):
    """Read every parquet file in ``paths2stops_list`` and stack the results.

    Parameters
    ----------
    lb : optional
        When given, only rows whose 'label' column equals ``lb`` are kept from
        each file; when None, all rows are kept.
    paths2stops_list : list
        Paths of the parquet files to read (a progress bar is shown while reading).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the (optionally filtered) files, in list order.
    """
    def _read_one(path):
        visits = pd.read_parquet(path)
        if lb is None:
            return visits
        return visits.loc[visits['label'] == lb, :]

    progress = tqdm(paths2stops_list, desc=f'Getting {lb}')
    return pd.concat([_read_one(path) for path in progress])

In [9]:
# Read every stop file (no label filter) and keep only rows whose osm_id is in osm_ids
df_t = load_label_visits(lb=None, paths2stops_list=paths2stops_list).loc[
    lambda visits: visits['osm_id'].isin(osm_ids), :
]

Getting None: 100%|██████████| 300/300 [01:08<00:00,  4.39it/s]


In [10]:
# Create the scratch folder up front so the first write cannot fail on a fresh checkout.
os.makedirs('dbs/temp', exist_ok=True)
# One file per label in label_list (a label without rows still gets a file).
for lb in tqdm(label_list, desc='Writing by label'):
    df_t.loc[df_t.label==lb,:].to_parquet(f'dbs/temp/{lb}.parquet', index=False)

Writing by label: 100%|██████████| 52/52 [02:05<00:00,  2.42s/it]


## 2. Calculate visitation attributes - daily DiD

In [11]:
def visit_patterns(data):
    """Collapse one group of visit records into a single row of metrics.

    Intended to be used with ``groupby(...).apply``; the group frame is only
    read, never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Records of one group. Columns read: 'osm_id', 'date', 'year', 'month',
        'weekday', 'theme', 'label', 'precipitation', 'pt_station_num', 'wt_p'
        (record weight), 'device_aid', 'dur', 'd_h' and 'grdi_grp'.

    Returns
    -------
    pandas.Series
        Group attributes (from the first record, 'date' as a string) followed by
        'num_visits_wt', 'num_unique_device', 'dur_total_wt', the weighted
        quartiles 'd_h25_wt' / 'd_h50_wt' / 'd_h75_wt', 'ice' and 'd_ha_wt'
        (weighted geometric mean of 'd_h'). The distance metrics are NaN when
        the group has no record with ``d_h > 0`` or their weights sum to zero.
    """
    metrics_dict = dict()
    # osm_id info: taken from the first record of the group
    for var in ('osm_id', 'date', 'year', 'month', 'weekday', 'theme', 'label', 'precipitation', 'pt_station_num'):
        metrics_dict[var] = data[var].values[0]
    # Stringify the date without assigning back into the caller's frame
    # (only the first value is needed, so only that one is converted).
    metrics_dict['date'] = data['date'].iloc[:1].astype(str).values[0]

    # Visits
    metrics_dict['num_visits_wt'] = data['wt_p'].sum()
    metrics_dict['num_unique_device'] = data.device_aid.nunique()
    # Duration (min); summing the NumPy array keeps NaN propagation of the plain sum
    metrics_dict['dur_total_wt'] = (data['dur'] * data['wt_p']).to_numpy().sum()

    # Distance from home: only records with a positive distance are used
    has_distance = data['d_h'] > 0
    d, wt = data.loc[has_distance, 'd_h'], data.loc[has_distance, 'wt_p']
    # np.average raises when the weights sum to zero, and there is nothing to
    # summarise for an empty selection, so fall back to NaN in both cases.
    if len(d) > 0 and wt.sum() > 0:
        ## Weighted percentiles
        bds = DescrStatsW(d, weights=wt, ddof=1).quantile([0.25, 0.5, 0.75]).values
        ## Weighted average on the log10 scale, transformed back
        d_ha = 10**np.average(np.log10(d), weights=wt)
    else:
        bds = (np.nan, np.nan, np.nan)
        d_ha = np.nan
    metrics_dict['d_h25_wt'], metrics_dict['d_h50_wt'], metrics_dict['d_h75_wt'] = bds[0], bds[1], bds[2]

    # Segregation metric: weighted totals of the 'H' and 'L' groups vs. all records
    pop = np.sum(data.wt_p)
    a = np.sum(data.loc[data.grdi_grp=='H', 'wt_p'])
    b = np.sum(data.loc[data.grdi_grp=='L', 'wt_p'])
    metrics_dict['ice'] = ice(ai=a, bi=b, popi=pop, share_a=0.25, share_b=0.25)

    metrics_dict['d_ha_wt'] = d_ha
    return pd.Series(metrics_dict)

In [12]:
# NOTE(review): these two lists are not used in this cell; kept in case later cells rely on them.
loc_number_list = []
df_list = []
# Registering progress_apply is loop-invariant, so do it once instead of per label.
tqdm.pandas()
# Create the output folder up front so a long aggregation cannot fail at write time.
os.makedirs('dbs/visits_day_did', exist_ok=True)
for lb in label_list:
    print(lb)
    # Per-label input written by the earlier "Writing by label" step
    df_t = pd.read_parquet(f'dbs/temp/{lb}.parquet')
    # One output row per (osm_id, date_time) group
    df_v = df_t.groupby(['osm_id', 'date_time']).progress_apply(visit_patterns).reset_index(drop=True)
    df_v.to_parquet(f"dbs/visits_day_did/{lb}.parquet", index=False)

Automotive and services


100%|██████████| 519078/519078 [43:26<00:00, 199.15it/s] 


Home & Lifestyle


100%|██████████| 227694/227694 [18:57<00:00, 200.19it/s]


Office


100%|██████████| 535370/535370 [44:24<00:00, 200.91it/s]  


Supermarket


100%|██████████| 703459/703459 [58:13<00:00, 201.35it/s]  


Accomodations


100%|██████████| 778114/778114 [1:05:04<00:00, 199.29it/s]


Art & Culture


100%|██████████| 155058/155058 [12:51<00:00, 201.05it/s]


Café


100%|██████████| 406410/406410 [33:36<00:00, 201.55it/s] 


Entertainment venues


100%|██████████| 18908/18908 [01:26<00:00, 218.54it/s]


Fast food


100%|██████████| 526409/526409 [43:45<00:00, 200.47it/s]  


Games and activities


100%|██████████| 13685/13685 [01:03<00:00, 215.72it/s]


Health care and services


100%|██████████| 845791/845791 [1:10:41<00:00, 199.41it/s] 


Historic


100%|██████████| 748741/748741 [1:02:03<00:00, 201.06it/s]


Information and services


100%|██████████| 338035/338035 [28:03<00:00, 200.79it/s] 


Nightclub


100%|██████████| 62567/62567 [05:45<00:00, 181.32it/s]


Parks and gardens


100%|██████████| 6234/6234 [00:28<00:00, 216.37it/s]


Recreation & Sports Centres


100%|██████████| 526062/526062 [43:46<00:00, 200.32it/s]  


Recreational facilities


100%|██████████| 404626/404626 [33:13<00:00, 202.94it/s] 


Retail stores


100%|██████████| 566364/566364 [47:35<00:00, 198.35it/s]  


Tourist attractions


100%|██████████| 63071/63071 [04:58<00:00, 211.45it/s]


Viewing and observation


100%|██████████| 16047/16047 [01:21<00:00, 196.58it/s]


Water Sports


100%|██████████| 32147/32147 [02:43<00:00, 196.90it/s]


Wellness & Relaxation


100%|██████████| 59489/59489 [05:08<00:00, 192.83it/s]


Adventure & Wildlife


100%|██████████| 377/377 [00:01<00:00, 189.69it/s]


Animal


100%|██████████| 6443/6443 [00:32<00:00, 199.07it/s]


Ball Sports


100%|██████████| 10851/10851 [00:50<00:00, 215.68it/s]


Beverages


100%|██████████| 150924/150924 [12:12<00:00, 206.08it/s]


College


100%|██████████| 42750/42750 [03:55<00:00, 181.33it/s]


Community center


100%|██████████| 289065/289065 [23:57<00:00, 201.10it/s]


Cosmetics and beauty


100%|██████████| 368759/368759 [30:47<00:00, 199.57it/s] 


Equestrian & Riding


100%|██████████| 12746/12746 [01:01<00:00, 205.62it/s]


Events and fairs


0it [00:00, ?it/s]


Extreme & Adventure Sports


100%|██████████| 16674/16674 [01:23<00:00, 200.87it/s]


Fashion and clothing


100%|██████████| 241592/241592 [20:48<00:00, 193.51it/s]


Financial services


100%|██████████| 395858/395858 [35:14<00:00, 187.18it/s] 


Food shop


100%|██████████| 660469/660469 [1:00:05<00:00, 183.17it/s]


Kindergarten and childcare


100%|██████████| 564410/564410 [49:36<00:00, 189.64it/s]  


Library


100%|██████████| 61900/61900 [05:47<00:00, 178.37it/s]


Medical supplies


100%|██████████| 89000/89000 [08:08<00:00, 182.04it/s]


Other facilities


100%|██████████| 114548/114548 [10:05<00:00, 189.03it/s]


Place of worship


100%|██████████| 1486380/1486380 [2:06:50<00:00, 195.30it/s]  


Pub


100%|██████████| 541894/541894 [45:08<00:00, 200.10it/s] 


Public services


100%|██████████| 616964/616964 [51:39<00:00, 199.04it/s]  


Racket & Martial Arts


100%|██████████| 13922/13922 [01:02<00:00, 224.54it/s]


Rentals and sharing


100%|██████████| 139653/139653 [11:01<00:00, 211.09it/s]


Restaurant


100%|██████████| 1860272/1860272 [2:23:38<00:00, 215.84it/s]  


School


100%|██████████| 287655/287655 [20:56<00:00, 228.95it/s]


Services & Repairs


100%|██████████| 148458/148458 [10:41<00:00, 231.27it/s]


Social facilities


100%|██████████| 388404/388404 [28:06<00:00, 230.34it/s] 


Technology & Hobbies


100%|██████████| 159257/159257 [11:28<00:00, 231.39it/s]


Training center


100%|██████████| 56098/56098 [04:03<00:00, 230.22it/s]


Wellness and fitness


100%|██████████| 20357/20357 [01:28<00:00, 231.23it/s]


Winter Sports


100%|██████████| 148/148 [00:00<00:00, 236.56it/s]
