# DiD visits data processing
Organize the visit data so that it is ready for difference-in-differences (DiD) modeling.

In [1]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path; the relative 'dbs/...' paths used below resolve against it.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from tqdm import tqdm
import workers
import sqlalchemy
import numpy as np
from multiprocessing import cpu_count
from p_tqdm import p_map
from statsmodels.stats.weightstats import DescrStatsW

In [3]:
# Data location
from urllib.parse import quote_plus

# Connection settings are read from the project's key store.
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
# Percent-encode the credentials before embedding them in the URL; a raw
# '@', '/' or ':' in the user name or password would otherwise be parsed as a
# URL delimiter and produce a wrong host/credential split.
engine = sqlalchemy.create_engine(
    f'postgresql://{quote_plus(str(user))}:{quote_plus(str(password))}'
    f'@localhost:{port}/{db_name}?gssencmode=disable'
)

In [4]:
data_folder = 'dbs/poi2visits_day_did/'
# Map the integer chunk index parsed from '<prefix>_<index>.parquet' to the file path.
# Only regular *.parquet files directly inside data_folder are considered, so a
# stray file (e.g. a temp or hidden file) cannot break the int() parsing.
paths2stops = {
    int(x.split('_')[-1].split('.')[0]): os.path.join(data_folder, x)
    for x in os.listdir(data_folder)
    if x.endswith('.parquet') and os.path.isfile(os.path.join(data_folder, x))
}
# Order the paths by chunk index so the load order does not depend on the
# file-system listing order.
paths2stops_list = [paths2stops[idx] for idx in sorted(paths2stops)]
paths2stops_list[0]

'dbs/poi2visits_day_did/stops_0.parquet'

In [5]:
# Category table: 'category' / 'subcategory' are renamed to 'theme' / 'label'.
df_cat = (
    pd.read_excel('dbs/poi/categories.xlsx')
    .rename(columns={'category': 'theme', 'subcategory': 'label'})
)
# Distinct labels, in order of first appearance.
label_list = df_cat.label.unique()

In [6]:
def ice(ai=None, bi=None, popi=None, share_a=0.25, share_b=0.25):
    """Share-normalised contrast between group a and group b, in [-1, 1] for valid counts.

    Presumably the Index of Concentration at the Extremes (the caller labels it
    a segregation metric) -- confirm against the study's definition.

    Each count is divided by its group's expected share; the remainder
    ``popi - ai - bi`` is divided by ``1 - share_a - share_b``. The result is
    the difference of the scaled a and b counts over the sum of all three
    scaled counts.

    :param ai: (weighted) count of group a
    :param bi: (weighted) count of group b
    :param popi: (weighted) total count, including a and b
    :param share_a: expected share of group a
    :param share_b: expected share of group b
    :return: (a' - b') / (a' + b' + o') with primes denoting share-scaled counts
    """
    scaled_a = ai / share_a
    scaled_b = bi / share_b
    scaled_other = (popi - ai - bi) / (1 - share_a - share_b)
    return (scaled_a - scaled_b) / (scaled_a + scaled_b + scaled_other)

## 1. Load data

In [7]:
# Matched places, and the (larger) place table whose ids define the analysis set.
df_osm = pd.read_parquet('dbs/places_matching/matched_places_wt.parquet')
df_osm_a = pd.read_parquet('dbs/places_matching/places_co_ys.parquet')
# Number of distinct places in each table.
print(df_osm['osm_id'].nunique(), df_osm_a['osm_id'].nunique())
# Distinct place ids used later to filter the visit records.
osm_ids = [*df_osm_a['osm_id'].unique()]

145267 412767


In [8]:
def load_label_visits(lb=None, paths2stops_list=None):
    """Read all parquet files in ``paths2stops_list`` and stack them into one frame.

    :param lb: if not None, keep only rows whose 'label' column equals this value
    :param paths2stops_list: iterable of parquet file paths to read
    :return: concatenation of the (optionally filtered) per-file frames
    """
    def _read_one(path):
        # Load a single file and apply the optional label filter.
        frame = pd.read_parquet(path)
        return frame if lb is None else frame.loc[frame['label'] == lb, :]

    return pd.concat([_read_one(p) for p in tqdm(paths2stops_list, desc=f'Getting {lb}')])

In [11]:
# Load every stop file without a label filter ...
df_t = load_label_visits(paths2stops_list=paths2stops_list, lb=None)
# ... and keep only the visits to places listed in osm_ids.
df_t = df_t[df_t['osm_id'].isin(osm_ids)]

Getting None: 100%|██████████| 300/300 [01:13<00:00,  4.08it/s]


In [12]:
# Write one parquet file per label so later cells can process labels one at a time.
for lb in tqdm(label_list, desc='Writing by label'):
    label_rows = df_t[df_t['label'] == lb]
    label_rows.to_parquet(f'dbs/temp/{lb}.parquet', index=False)

Writing by label: 100%|██████████| 52/52 [07:12<00:00,  8.32s/it]


## 2. Calculate visitation attributes - daily DiD

In [8]:
def visit_patterns(data):
    """Summarise one group of visit rows into a single record.

    The callers in this notebook pass one ``(osm_id, date_time)`` group at a
    time. The input frame is not modified.

    Columns read: osm_id, date, year, month, weekday, theme, label,
    precipitation, pt_station_num, wt_p, device_aid, dur, d_h, grdi_grp.

    :param data: non-empty DataFrame holding the rows of one group
    :return: pd.Series with the identifying attributes (taken from the first
        row, 'date' as a string) followed by
        num_visits_wt, num_unique_device, dur_total_wt,
        d_h25_wt / d_h50_wt / d_h75_wt (weighted quartiles of d_h > 0),
        ice, H, L, M (weighted shares by grdi_grp) and
        d_ha_wt (weighted geometric mean of d_h > 0).
        The distance metrics are NaN when the group has no row with d_h > 0.
    """
    metrics_dict = dict()
    # Group-level attributes: taken from the first row of the group.
    for var in ('osm_id', 'date', 'year', 'month', 'weekday', 'theme', 'label', 'precipitation', 'pt_station_num'):
        metrics_dict[var] = data[var].values[0]
    # Store the date as a string without writing back into the caller's frame
    # (an in-place .loc assignment mutates the group and may keep the old dtype).
    metrics_dict['date'] = data['date'].astype(str).values[0]

    # Visits
    metrics_dict['num_visits_wt'] = data['wt_p'].sum()
    metrics_dict['num_unique_device'] = data.device_aid.nunique()
    # Duration (minutes, per the original comment); NaN still propagates as with builtin sum.
    metrics_dict['dur_total_wt'] = (data['dur'] * data['wt_p']).sum(skipna=False)

    # Distance from home: only rows with a positive distance are used.
    positive = data['d_h'] > 0
    d, wt = data.loc[positive, 'd_h'], data.loc[positive, 'wt_p']
    has_distance = len(d) > 0

    ## Weighted percentiles
    if has_distance:
        bds = DescrStatsW(d, weights=wt, ddof=1).quantile([0.25, 0.5, 0.75]).values
        metrics_dict['d_h25_wt'], metrics_dict['d_h50_wt'], metrics_dict['d_h75_wt'] = bds[0], bds[1], bds[2]
    else:
        # No usable distances: the quantile computation would raise on empty input.
        metrics_dict['d_h25_wt'], metrics_dict['d_h50_wt'], metrics_dict['d_h75_wt'] = np.nan, np.nan, np.nan

    # Segregation metric
    pop = np.sum(data.wt_p)
    a = np.sum(data.loc[data.grdi_grp == 'H', 'wt_p'])
    b = np.sum(data.loc[data.grdi_grp == 'L', 'wt_p'])
    metrics_dict['ice'] = ice(ai=a, bi=b, popi=pop, share_a=0.25, share_b=0.25)
    metrics_dict['H'], metrics_dict['L'], metrics_dict['M'] = a / pop, b / pop, (pop - a - b) / pop

    ## Weighted average in log10 space, i.e. a weighted geometric mean.
    if has_distance and wt.sum() > 0:
        metrics_dict['d_ha_wt'] = 10 ** np.average(np.log10(d), weights=wt)
    else:
        # np.average raises ZeroDivisionError when the weights sum to zero.
        metrics_dict['d_ha_wt'] = np.nan
    return pd.Series(metrics_dict)

In [16]:
# Define the function you want to apply in parallel
def visit_patterns_parallel(group):
    """Worker entry point handed to p_map: forwards one group to visit_patterns."""
    metrics = visit_patterns(group)
    return metrics
loc_number_list = []
df_list = []
for lb in label_list:
    print(lb)
    df_t = pd.read_parquet(f'dbs/temp/{lb}.parquet')
    # One sub-frame per (place, day); each becomes one output row.
    grouped_data = [group for _, group in df_t.groupby(['osm_id', 'date_time'])]

    # Apply the function in parallel using p_map; the result is a list of pd.Series.
    df_v = p_map(visit_patterns_parallel, grouped_data, num_cpus=cpu_count())

    # Build the frame with one row per Series. pd.concat on a list of Series
    # would stack them end-to-end into a single long Series, which has no
    # to_parquet method.
    df_v = pd.DataFrame(df_v).reset_index(drop=True)
    # Serial alternative:
    # df_t.groupby(['osm_id', 'date_time']).apply(visit_patterns).reset_index(drop=True)
    df_v.to_parquet(f"dbs/visits_day_did/{lb}.parquet", index=False)

Automotive and services


KeyboardInterrupt: 

In [None]:
# Function to process a batch of groups
def process_batch(batch):
    """Compute visit metrics for every group in ``batch``.

    :param batch: list of DataFrames, each holding the rows of one group
    :return: DataFrame with one row per group (empty if the batch is empty)
    """
    # Imports kept inside the function, presumably so that worker processes
    # have them available -- TODO confirm they are still required.
    import pandas as pd
    from statsmodels.stats.weightstats import DescrStatsW
    import numpy as np
    import workers
    # visit_patterns returns a pd.Series per group; collect them as rows.
    # pd.concat on the Series would instead produce one long Series.
    return pd.DataFrame([visit_patterns(group) for group in batch])

for lb in label_list:
    print(lb)
    df_t = pd.read_parquet(f'dbs/temp/{lb}.parquet')
    # Split the DataFrame into groups, one per (place, day)
    grouped_data = [group for _, group in df_t.groupby(['osm_id', 'date_time'])]
    if not grouped_data:
        # Nothing to compute; a zero batch size would make range() raise below.
        print(f'No visits for {lb}, skipped')
        continue

    # Determine the batch size based on available CPU cores
    num_batches = cpu_count() * 2  # Adjust this multiplier based on your system's capacity
    batch_size = int(np.ceil(len(grouped_data) / num_batches))

    # Create batches of groups (consecutive slices of at most batch_size groups)
    batches = [grouped_data[i:i + batch_size] for i in range(0, len(grouped_data), batch_size)]

    # Process each batch in parallel using p_map
    df_v_batches = p_map(process_batch, batches, num_cpus=cpu_count())

    # Concatenate all batch results and write one file per label
    df_v = pd.concat(df_v_batches).reset_index(drop=True)
    df_v.to_parquet(f"dbs/visits_day_did/{lb}.parquet", index=False)

Automotive and services


  0%|          | 0/40 [00:00<?, ?it/s]