In [82]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Optional
import time
import pytz
from datetime import datetime

import warnings 
warnings.filterwarnings("ignore")

# Intermediate Home-Visit table generation

In [70]:
def home_visit(timezone: str, start_date: int, end_date: int, *,
    demo_path: str = "./demo_data/demo_raw.csv",
    cbg_path: str = "./demo_data/TCMA_cbg.geojson",) -> pd.DataFrame:

    if start_date > end_date:
        raise ValueError("start_date must be ≤ end_date")

    # Load data
    demo_raw = pd.read_csv(demo_path, usecols=["device_key", "unix_time", "Target_CBG", "Visitor_CBG"])
    msa_cbg = gpd.read_file(cbg_path)[["GISJOIN"]]

    # Convert timestamps *vectorised and select time range
    true_tz = pytz.timezone(timezone)
    demo_raw['real_time'] = demo_raw['unix_time'].apply(lambda x: datetime.fromtimestamp(x,true_tz))
    demo_raw["hour"] = demo_raw["real_time"].dt.floor("H")
    demo_raw["day"]  = demo_raw["real_time"].dt.day
    demo_raw = demo_raw.loc[demo_raw["day"].between(start_date, end_date)]

    if demo_raw.empty:
        return pd.DataFrame(
            columns=["Time", "Target_CBG", "Visitor_CBG","Num_visitor", "Num_visit"])

    # Aggregate visits 
    hourly_counts = (demo_raw
        .groupby(["Target_CBG", "Visitor_CBG", "hour"], as_index=False)
        .agg(Num_visit=("device_key", "size"), Num_visitor=("device_key", "nunique")))

    # Attach missing CBGs (left join)
    result = (msa_cbg.merge(hourly_counts, left_on="GISJOIN", right_on="Target_CBG",how="left")
              .dropna(subset=["Num_visit", "Num_visitor"]))

    # ISO-style timestamp (yyyy-mm-dd HH)
    result["Time"] = result["hour"].dt.strftime("%Y-%m-%d %H")

    # reorder & clean
    result = result[["Time", "Target_CBG", "Visitor_CBG", "Num_visitor", "Num_visit"]].sort_values(["Time", "Target_CBG", "Visitor_CBG"]).reset_index(drop=True)
    return result

In [71]:
# TCMA (Central time), Juy 19-25 (all records in demo raw)
demo_homevisit = home_visit('US/Central', 19, 25)

In [72]:
demo_homevisit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95225 entries, 0 to 95224
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Time         95225 non-null  object 
 1   Target_CBG   95225 non-null  object 
 2   Visitor_CBG  95225 non-null  object 
 3   Num_visitor  95225 non-null  float64
 4   Num_visit    95225 non-null  float64
dtypes: float64(2), object(3)
memory usage: 3.6+ MB


In [76]:
demo_homevisit.to_csv('./demo_data/demo_homevisit.csv')

# Visitor census in one month -- visitor-based or visit-based

In [91]:
def visitor_census_aggregation(agg_type: str, *,
    homevisit_path: str = "./demo_data/demo_homevisit.csv",
    census_path: str = "./demo_data/census_attribute.csv",
    cbg_path: str = "./demo_data/TCMA_cbg.geojson",
    attrs: Optional[List[str]] = None,) -> gpd.GeoDataFrame:
    # Sanity checks
    if agg_type not in {"visitor", "visit"}:
        raise ValueError("agg_type must be 'visitor' or 'visit'")

    # Fall-back list of census columns
    if attrs is None:
        attrs = ["Income", "Wp", "Pbb", "Upp"]

    weight_col = "Num_visitor" if agg_type == "visitor" else "Num_visit"

    # Load data
    home_visit_tab = pd.read_csv(homevisit_path).drop(columns="Unnamed: 0", errors="ignore")
    attribute_tab  = pd.read_csv(census_path)   .drop(columns="Unnamed: 0", errors="ignore")

    # Link visitor attributes onto each visit record
    link = (home_visit_tab.merge(attribute_tab, left_on="Visitor_CBG", right_on="CBG_ID", how="left")
        .dropna(subset = attrs + [weight_col]))  # keep rows with data

    if link.empty:
        raise ValueError("No rows left after merging & dropping NaNs; check inputs.")

    # Group & compute weighted means
    def _wavg(group: pd.DataFrame) -> pd.Series:
        w = group[weight_col].to_numpy()
        out = {}
        for col in attrs:
            out[f"{col}_v"] = np.average(group[col].to_numpy(), weights=w)
        return pd.Series(out)

    visitor_attribute = (link.groupby("Target_CBG", sort=False)   # preserve input order
            .apply(_wavg).reset_index())

    # merge back onto full CBG
    msa_cbg = gpd.read_file(cbg_path)[["GISJOIN"]]
    visitor_census = (msa_cbg.merge(visitor_attribute, left_on="GISJOIN", right_on="Target_CBG", how="left")
               .drop(columns="Target_CBG")
               .rename(columns={"GISJOIN": "CBG_ID_v"}))

    return visitor_census

## Visitor-based aggregation

In [92]:
visitor_census_1 = visitor_census_aggregation('visitor')

In [93]:
visitor_census_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2641 entries, 0 to 2640
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CBG_ID_v  2641 non-null   object 
 1   Income_v  2560 non-null   float64
 2   Wp_v      2560 non-null   float64
 3   Pbb_v     2560 non-null   float64
 4   Upp_v     2560 non-null   float64
dtypes: float64(4), object(1)
memory usage: 103.3+ KB


In [94]:
visitor_census_1.to_csv('./demo_data/demo_visitor.csv')

## Visit-based aggregation

In [95]:
visitor_census_2 = visitor_census_aggregation('visit')

In [96]:
visitor_census_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2641 entries, 0 to 2640
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CBG_ID_v  2641 non-null   object 
 1   Income_v  2560 non-null   float64
 2   Wp_v      2560 non-null   float64
 3   Pbb_v     2560 non-null   float64
 4   Upp_v     2560 non-null   float64
dtypes: float64(4), object(1)
memory usage: 103.3+ KB


In [97]:
visitor_census_2.to_csv('./demo_data/demo_visit.csv')