# DiD visits data processing
Prepare the daily visit data for difference-in-differences (DiD) modeling.

In [1]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Make the project root the working directory; the relative `dbs/...` paths below depend on it.
# NOTE(review): machine-specific absolute path - adjust when running on another machine.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from tqdm import tqdm
import workers
import sqlalchemy
import numpy as np
import wquantiles
import time
from statsmodels.stats.weightstats import DescrStatsW

In [3]:
# Data location
# Connection settings are read from the project's key store (`workers.keys_manager`).
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
# Assemble the URL from its parts rather than with an f-string: URL.create escapes
# reserved characters (e.g. '@', '/', ':') in the user name and password, which would
# otherwise produce a malformed connection string.
engine = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        drivername='postgresql',
        username=user,
        password=password,
        host='localhost',
        port=port,
        database=db_name,
        query={'gssencmode': 'disable'},
    )
)

In [4]:
# Folder with the visit batches.
data_folder = 'dbs/poi2visits_day_did/'
# Map batch number -> file path. The batch number is the text between the last '_'
# of the file name and the following '.', e.g. 'stops_0.parquet' -> 0.
# Only files sitting directly in `data_folder` are used (first entry yielded by os.walk).
paths2stops = dict(
    (int(fname.split('_')[-1].split('.')[0]), os.path.join(data_folder, fname))
    for fname in list(os.walk(data_folder))[0][2]
)
paths2stops_list = [*paths2stops.values()]
paths2stops_list[0]

'dbs/poi2visits_day_did/stops_0.parquet'

In [6]:
# Category table; `category` is exposed as `theme` and `subcategory` as `label`.
df_cat = pd.read_excel('dbs/poi/categories.xlsx')
df_cat = df_cat.rename(columns=dict(category='theme', subcategory='label'))
# Distinct labels, in order of first appearance; used to split the data label by label.
label_list = df_cat['label'].unique()

In [12]:
def ice(ai=None, bi=None, popi=None, share_a=0.25, share_b=0.25):
    """Share-normalised contrast between two groups within a total.

    Each count is divided by its group's share (the remainder `popi - ai - bi` is
    divided by `1 - share_a - share_b`); the result is the difference of the scaled
    a and b counts over the sum of all three scaled counts.
    NOTE(review): presumably an "index of concentration at the extremes" - confirm.

    :param ai: count (or weight) of group a
    :param bi: count (or weight) of group b
    :param popi: total count (or weight), including groups a and b
    :param share_a: reference share of group a
    :param share_b: reference share of group b
    :return: 1 when only group a is present, -1 when only group b is present,
        0 when the scaled a and b counts are equal
    """
    # Whatever belongs to neither group a nor group b.
    oi = popi - ai - bi
    share_o = 1 - share_a - share_b
    scaled_a = ai / share_a
    scaled_b = bi / share_b
    scaled_o = oi / share_o
    return (scaled_a - scaled_b) / (scaled_a + scaled_b + scaled_o)

## 1. Load data

In [7]:
# Matched places table; its distinct `osm_id` values are used to filter the visits.
df_osm = pd.read_parquet('dbs/places_matching/matched_places.parquet')
osm_ids = list(df_osm.osm_id.unique())

In [9]:
def load_label_visits(lb=None, paths2stops_list=None):
    """Read all visit batches and stack them into one data frame.

    :param lb: if given, keep only rows whose `label` column equals this value;
        with None every row is kept
    :param paths2stops_list: iterable of parquet file paths to read
    :return: concatenation of the (optionally filtered) batches
    """
    def _read_batch(path):
        # One parquet file, reduced to the requested label when a label is set.
        batch = pd.read_parquet(path)
        if lb is None:
            return batch
        return batch.loc[batch['label'] == lb, :]

    batches = [_read_batch(path) for path in tqdm(paths2stops_list, desc=f'Getting {lb}')]
    return pd.concat(batches)

In [10]:
# Read every batch without a label filter, then keep only visits to the matched places.
df_t = load_label_visits(paths2stops_list=paths2stops_list, lb=None)
df_t = df_t[df_t['osm_id'].isin(osm_ids)]

Getting None: 100%|██████████| 300/300 [01:25<00:00,  3.50it/s]


In [11]:
# One parquet file per label, so the metrics step can process a single label at a time.
# A file is written for every label in `label_list`, even when no visit carries it.
for lb in tqdm(label_list, desc='Writing by label'):
    df_t[df_t['label'] == lb].to_parquet(f'dbs/temp/{lb}.parquet', index=False)

Writing by label: 100%|██████████| 52/52 [01:43<00:00,  1.98s/it]


## 2. Calculate visitation attributes - daily DiD

In [13]:
def visit_patterns(data):
    """Collapse the visit records of one group into a single row of metrics.

    Written for use with ``groupby(...).apply``. The input frame is only read, never
    modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Visit records of one group. Columns read: osm_id, date, year, month, weekday,
        theme, label, precipitation, pt_station_num, wt_p, device_aid, dur, d_h,
        grdi_grp.

    Returns
    -------
    pandas.Series
        Keys, in order: the nine descriptor columns (``date`` as text), num_visits_wt,
        num_unique_device, dur_total_wt, d_h25_wt, d_h50_wt, d_h75_wt, ice, d_ha_wt.
        The distance metrics (d_h*_wt, d_ha_wt) are NaN when no record in the group has
        ``d_h > 0``, because weighted quantiles and means are undefined for an empty
        sample.
    """
    metrics_dict = dict()
    # Descriptors are taken from the first record of the group
    # (assumes they are constant within a group - TODO confirm).
    for var in ('osm_id', 'date', 'year', 'month', 'weekday', 'theme', 'label', 'precipitation', 'pt_station_num'):
        metrics_dict[var] = data[var].values[0]
    # Report the date as text. The conversion works on a new Series so the group frame
    # handed in by groupby.apply is left untouched (mutating it is not supported by pandas).
    metrics_dict['date'] = data['date'].astype(str).values[0]

    # Visits
    metrics_dict['num_visits_wt'] = data['wt_p'].sum()
    metrics_dict['num_unique_device'] = data.device_aid.nunique()
    # Duration
    metrics_dict['dur_total_wt'] = sum(data['dur'] * data['wt_p'])   # min

    # Distance from home: only records with a positive distance take part.
    positive_dist = data['d_h'] > 0
    d, wt = data.loc[positive_dist, 'd_h'], data.loc[positive_dist, 'wt_p']
    ## Weighted percentiles
    if d.empty:
        bds = (np.nan, np.nan, np.nan)
    else:
        bds = DescrStatsW(d, weights=wt, ddof=1).quantile([0.25, 0.5, 0.75]).values
    metrics_dict['d_h25_wt'], metrics_dict['d_h50_wt'], metrics_dict['d_h75_wt'] = bds[0], bds[1], bds[2]

    # Segregation metric: weight of the 'H' group against the 'L' group of `grdi_grp`.
    pop = np.sum(data.wt_p)
    a = np.sum(data.loc[data.grdi_grp == 'H', 'wt_p'])
    b = np.sum(data.loc[data.grdi_grp == 'L', 'wt_p'])
    metrics_dict['ice'] = ice(ai=a, bi=b, popi=pop, share_a=0.25, share_b=0.25)

    ## Weighted average of log10 distance, transformed back (weighted geometric mean).
    if d.empty:
        metrics_dict['d_ha_wt'] = np.nan
    else:
        metrics_dict['d_ha_wt'] = 10**np.average(np.log10(d), weights=wt)
    return pd.Series(metrics_dict)

In [17]:
loc_number_list = []
df_list = []
# Register `progress_apply` on pandas objects once, before the loop.
tqdm.pandas()
# For each label: read its visits, compute one row of metrics per (place, date_time)
# group and save the result. NOTE: `df_t` is overwritten on every iteration.
for lb in label_list:
    print(lb)
    df_t = pd.read_parquet(f'dbs/temp/{lb}.parquet')
    df_v = (
        df_t
        .groupby(['osm_id', 'date_time'])
        .progress_apply(visit_patterns)
        .reset_index(drop=True)
    )
    df_v.to_parquet(f"dbs/visits_day_did/{lb}.parquet", index=False)

Automotive and services


100%|██████████| 301059/301059 [18:43<00:00, 268.05it/s]


Home & Lifestyle


100%|██████████| 141067/141067 [08:43<00:00, 269.23it/s]


Office


100%|██████████| 444412/444412 [27:52<00:00, 265.76it/s] 


Supermarket


100%|██████████| 521469/521469 [32:57<00:00, 263.76it/s]  


Accomodations


100%|██████████| 491209/491209 [30:25<00:00, 269.13it/s] 


Art & Culture


100%|██████████| 126767/126767 [07:50<00:00, 269.64it/s]


Café


100%|██████████| 349417/349417 [21:32<00:00, 270.32it/s] 


Entertainment venues


100%|██████████| 16466/16466 [01:00<00:00, 272.61it/s]


Fast food


100%|██████████| 468679/468679 [28:58<00:00, 269.54it/s] 


Games and activities


100%|██████████| 11207/11207 [00:40<00:00, 275.82it/s]


Health care and services


100%|██████████| 675504/675504 [41:44<00:00, 269.70it/s]  


Historic


100%|██████████| 555349/555349 [35:03<00:00, 264.03it/s]  


Information and services


100%|██████████| 176005/176005 [10:53<00:00, 269.37it/s]


Nightclub


100%|██████████| 53847/53847 [03:18<00:00, 271.60it/s]


Parks and gardens


100%|██████████| 2724/2724 [00:10<00:00, 271.31it/s]


Recreation & Sports Centres


100%|██████████| 377535/377535 [23:56<00:00, 262.90it/s] 


Recreational facilities


100%|██████████| 254372/254372 [15:55<00:00, 266.08it/s]


Retail stores


100%|██████████| 485275/485275 [30:06<00:00, 268.61it/s] 


Tourist attractions


100%|██████████| 42769/42769 [02:36<00:00, 272.50it/s]


Viewing and observation


100%|██████████| 10345/10345 [00:37<00:00, 274.04it/s]


Water Sports


100%|██████████| 19645/19645 [01:18<00:00, 251.60it/s]


Wellness & Relaxation


100%|██████████| 49132/49132 [03:01<00:00, 271.36it/s]


Adventure & Wildlife


100%|██████████| 99/99 [00:00<00:00, 267.57it/s]


Animal


100%|██████████| 3092/3092 [00:11<00:00, 277.56it/s]


Ball Sports


100%|██████████| 6124/6124 [00:24<00:00, 249.04it/s]


Beverages


100%|██████████| 100791/100791 [06:17<00:00, 266.84it/s]


College


100%|██████████| 41459/41459 [02:33<00:00, 270.01it/s]


Community center


100%|██████████| 170238/170238 [10:30<00:00, 269.97it/s]


Cosmetics and beauty


100%|██████████| 297304/297304 [18:18<00:00, 270.66it/s]


Equestrian & Riding


100%|██████████| 3483/3483 [00:12<00:00, 273.39it/s]


Events and fairs


0it [00:00, ?it/s]


Extreme & Adventure Sports


100%|██████████| 8324/8324 [00:30<00:00, 274.97it/s]


Fashion and clothing


100%|██████████| 220630/220630 [13:33<00:00, 271.07it/s]


Financial services


100%|██████████| 307452/307452 [18:58<00:00, 269.99it/s]


Food shop


100%|██████████| 484534/484534 [29:55<00:00, 269.91it/s] 


Kindergarten and childcare


100%|██████████| 397897/397897 [24:38<00:00, 269.17it/s] 


Library


100%|██████████| 52875/52875 [03:14<00:00, 272.18it/s]


Medical supplies


100%|██████████| 82253/82253 [05:03<00:00, 270.92it/s]


Other facilities


100%|██████████| 107827/107827 [06:38<00:00, 270.67it/s]


Place of worship


100%|██████████| 778276/778276 [48:16<00:00, 268.69it/s]  


Pub


100%|██████████| 466079/466079 [29:15<00:00, 265.43it/s] 


Public services


100%|██████████| 475993/475993 [29:27<00:00, 269.36it/s] 


Racket & Martial Arts


100%|██████████| 9918/9918 [00:36<00:00, 272.75it/s]


Rentals and sharing


100%|██████████| 111762/111762 [06:49<00:00, 273.15it/s]


Restaurant


100%|██████████| 1289712/1289712 [1:20:04<00:00, 268.42it/s] 


School


100%|██████████| 207627/207627 [12:51<00:00, 269.07it/s]


Services & Repairs


100%|██████████| 132463/132463 [08:09<00:00, 270.58it/s]


Social facilities


100%|██████████| 321183/321183 [19:46<00:00, 270.64it/s]


Technology & Hobbies


100%|██████████| 131068/131068 [08:01<00:00, 272.23it/s]


Training center


100%|██████████| 46196/46196 [02:50<00:00, 270.75it/s]


Wellness and fitness


100%|██████████| 16875/16875 [01:02<00:00, 268.03it/s]


Winter Sports


100%|██████████| 123/123 [00:00<00:00, 262.82it/s]
