# Visitation patterns
Output: `dbs/visits_day_sg/`

In [2]:
%load_ext autoreload
%autoreload 2
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [1]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import workers
import h3
from tqdm import tqdm
import sqlalchemy
import numpy as np
from p_tqdm import p_map

In [3]:
# Pyspark set up
os.environ['JAVA_HOME'] = "C:/Java/jdk-1.8"
from pyspark.sql import SparkSession
import sys
from pyspark import SparkConf
# Set up pyspark
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# Create new context
spark_conf = SparkConf().setMaster("local[18]").setAppName("MobiSeg")
spark_conf.set("spark.executor.heartbeatInterval","3600s")
spark_conf.set("spark.network.timeout","7200s")
spark_conf.set("spark.sql.files.ignoreCorruptFiles","true")
spark_conf.set("spark.driver.memory", "56g")
spark_conf.set("spark.driver.maxResultSize", "0")
spark_conf.set("spark.executor.memory","8g")
spark_conf.set("spark.memory.fraction", "0.6")
spark_conf.set("spark.sql.session.timeZone", "UTC")
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
java_version = spark._jvm.System.getProperty("java.version")
print(f"Java version used by PySpark: {java_version}")
print('Web UI:', spark.sparkContext.uiWebUrl)

Java version used by PySpark: 1.8.0_401
Web UI: http://C19YUEI.net.chalmers.se:4040


In [12]:
# Data location
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Device filtering

In [15]:
df_d = pd.read_sql("""SELECT device_aid, "2019", "2022", "2023", grid_1km FROM home_g;""", con=engine)

In [16]:
# Share of devices that only appear in one year
for year in ('2019', '2022', '2023'):
    print(f'Share of devices with only year {year}', len(df_d.loc[df_d[year] == 100, :]) / len(df_d))

Share of devices with only year 2019 0.14504647621383096
Share of devices with only year 2022 0.1471900843238135
Share of devices with only year 2023 0.372127896291974


In [17]:
print(f'Share of devices with 2022 and 2019', len(df_d.loc[(df_d['2019'] > 0) & (df_d['2022'] > 0), :]) / len(df_d))

Share of devices with 2022 and 2019 0.016036688410012042


In [18]:
print(f'Share of devices with 2022 and 2023', len(df_d.loc[(df_d['2023'] > 0) & (df_d['2022'] > 0), :]) / len(df_d))

Share of devices with 2022 and 2023 0.3192819005186756


In [19]:
print(f'Share of devices with all years', len(df_d.loc[(df_d['2019'] > 0) & (df_d['2022'] > 0) & (df_d['2023'] > 0), :]) / len(df_d))

Share of devices with all years 0.011414662610191844


### 2.1 Devices sharing same home grids for three years

In [20]:
def year_coverage(data):
    y1, y2, y3 = data['2019'].sum(), data['2022'].sum(), data['2023'].sum()
    if (y1 > 0) & (y2 > 0) & (y3 > 0):
        return pd.Series(dict(yr_c=1))
    else:
        return pd.Series(dict(yr_c=0))

tqdm.pandas()
df_d_yc = df_d.groupby('grid_1km').progress_apply(year_coverage, include_groups=False).reset_index()

100%|██████████| 156450/156450 [00:54<00:00, 2893.09it/s]


In [21]:
shared_grids = df_d_yc.loc[df_d_yc.yr_c == 1, 'grid_1km'].unique()
devices2keep = df_d.loc[df_d['grid_1km'].isin(shared_grids), 'device_aid'].unique()
len(devices2keep)

22209798

## 2. Find the devices having records Mar-May 2022 and 2023

In [3]:
data_folder = os.path.join('dbs/combined_hex2visits_day/')
paths2stops = {int(x.split('_')[-1].split('.')[0]): os.path.join(data_folder, x)\
               for x in list(os.walk(data_folder))[0][2]}
paths2stops_list = list(paths2stops.values())

In [8]:
def devices_period_stats(file_path=None):
    import pandas as pd
    def period_stats(data):
        return pd.Series(dict(no_active_days=data['date'].nunique(),
                              no_rec=len(data),
                              no_hex=data['h3_id'].nunique()))
    df_g = pd.read_parquet(file_path)
    df_g['date'] = pd.to_datetime(df_g['date'])
    # Define filtering condition (Month: 3 to 5, Year: 2022 or 2023)
    filtered_df = df_g[
        ((df_g['date'].dt.month >= 3) & (df_g['date'].dt.month <= 5)) &  # Months: March to May
        (df_g['date'].dt.year.isin([2022, 2023]))                     # Years: 2022 & 2023
    ]
    return filtered_df.groupby('device_aid').apply(period_stats).reset_index()

In [9]:
# Use p_map for parallel processing with progress bar
df_indi_list = p_map(devices_period_stats, paths2stops_list, num_cpus=18)

  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
df_indi = pd.concat(df_indi_list)
print("No. of individual devices covered", len(df_indi))

No. of individual devices covered 15372651


In [14]:
df_indi.to_sql('stops_hex_indi_dt', engine, schema='data_desc', index=False, method='multi', if_exists='replace', chunksize=10000)

15372651

In [15]:
df_indi[['no_active_days', 'no_rec', 'no_hex']].describe()

Unnamed: 0,no_active_days,no_rec,no_hex
count,15372650.0,15372650.0,15372650.0
mean,12.35823,20.1381,6.851908
std,12.81696,29.00836,6.127666
min,1.0,1.0,1.0
25%,4.0,5.0,3.0
50%,8.0,11.0,5.0
75%,16.0,24.0,9.0
max,167.0,1182.0,173.0


In [16]:
df_indi['no_rec'].sum()

309575943

## 3. Individual weight
We only use individuals with over 15 nights at home.
### 3.1 Focus on a subset of individual devices


In [18]:
df_home = pd.read_sql("""SELECT device_aid, grid_1km, pop_1km FROM home_g;""", con=engine)
df_h = pd.read_sql("""SELECT device_aid, count FROM home WHERE count > 14;""", con=engine)
df_home = pd.merge(df_home, df_h, on='device_aid', how='left')
df_home.drop(columns=['count'], inplace=True)
tqdm.pandas()
df_home_s = df_home.groupby('grid_1km').progress_apply(lambda x: pd.Series(dict(count=len(x))), include_groups=False).reset_index()
df_home_s = pd.merge(df_home, df_home_s, on='grid_1km', how='left')
df_home_s.loc[:, 'wt_p'] = df_home_s.loc[:, 'pop_1km'] / df_home_s.loc[:, 'count']

100%|██████████| 156450/156450 [00:33<00:00, 4622.33it/s]


In [19]:
w0 = ((np.std(df_home_s.loc[:, 'wt_p']) / np.mean(df_home_s.loc[:, 'wt_p'])) ** 2 + 1) ** 0.5 * 3.5 * np.median(df_home_s.loc[:, 'wt_p'])
df_home_s.loc[df_home_s['wt_p'] > w0, 'wt_p'] = w0
df_home_s[['device_aid', 'wt_p']].to_sql('weight', engine, schema='public', index=False,
                                    method='multi', if_exists='replace', chunksize=10000)

22738298

## 4. Device filtering for h3 grids

In [10]:
data_folder = os.path.join('dbs/combined_hex2visits_day/')
paths2stops = {int(x.split('_')[-1].split('.')[0]): os.path.join(data_folder, x)\
               for x in list(os.walk(data_folder))[0][2]}
paths2stops_list = list(paths2stops.values())

In [23]:
df_indi = pd.read_sql("""SELECT device_aid, grdi, net_rent_100m FROM home_g;""", con=engine)
df_indi.loc[df_indi['grdi'] < 0, 'grdi'] = 0
df_wt = pd.read_sql("""SELECT * FROM weight;""", con=engine)
df_indi = pd.merge(df_wt, df_indi, on='device_aid', how='left')
grdi = df_indi['grdi'].median()
net_rent_100m = df_indi['net_rent_100m'].median()
df_indi['grdi'] = df_indi['grdi'].fillna(grdi)
df_indi['net_rent_100m'] = df_indi['net_rent_100m'].fillna(net_rent_100m)

In [24]:
h3_id_list = []
for k, v in tqdm(paths2stops.items(), desc='Adding individual group/weight to devices'):
    df = pd.read_parquet(v)
    if 'wt_p' not in df.columns:
        df = pd.merge(df, df_indi[['device_aid', 'grdi', 'net_rent_100m', 'wt_p']], on='device_aid', how='left')
        df.dropna(inplace=True)
        df = df.loc[df.device_aid.isin(devices2keep), :]
        h3_id_list += list(df['h3_id'].unique())
        h3_id_list = list(set(h3_id_list))
        df.to_parquet(f'dbs/combined_hex2visits_day_sg/stops_{k}.parquet', index=False)

Adding individual group/weight to devices: 100%|██████████| 300/300 [2:38:42<00:00, 31.74s/it]  


In [25]:
df_h3 = pd.DataFrame(h3_id_list, columns=['h3_id'])
upper_reso = 3
tqdm.pandas()
df_h3.loc[:, f'h3_parent_{upper_reso}'] = df_h3['h3_id'].progress_apply(lambda x: h3.cell_to_parent(x, upper_reso))
print(df_h3[f'h3_parent_{upper_reso}'].nunique())
df_h3.to_parquet('dbs/combined_hex2visits_day_sg_h3_batches.parquet', index=False)

100%|██████████| 306353/306353 [00:00<00:00, 553981.84it/s]


49
