In [71]:
import csv
from io import StringIO
from typing import Dict, Iterator, Literal, List, Optional, TypedDict, cast
from typing_extensions import overload, TypeAlias, assert_never
from datetime import date
import datetime
from typing import Generic, TypeVar
from abc import ABC, abstractmethod
import requests
import polars as pl
import pandas as pd
from pathlib import Path

# Type parameter for the value returned by ``ToJSON.to_json``.
_T = TypeVar("_T")

# URL template of the daily passenger traffic CSV; ``{locale}`` is filled in
# with one of the ``Locale`` codes below.
DAILY_PASSENGERS_TRAFFIC_URL = "https://www.immd.gov.hk/opendata/{locale}/transport/immigration_clearance/statistics_on_daily_passenger_traffic.csv"

# Locale codes that may be substituted into the URL template.
# NOTE(review): judging by the direction labels used elsewhere in this file,
# "hkt" is Traditional Chinese and "hks" is Simplified Chinese — confirm.
Locale: TypeAlias = Literal["eng", "hkt", "hks"]


class DailyPassengersTrafficTypeDef(TypedDict):
    """One parsed row of the daily passenger traffic CSV.

    Rows are produced by ``_get_daily_traffics``: the date column is parsed
    from ``DD-MM-YYYY`` text and the four count columns are converted with
    ``int``; the remaining columns are passed through as text.
    """

    # Calendar day the counts refer to.
    date: date
    # Control point name, copied verbatim from the CSV (so it is localized).
    control_point: str
    # Direction label, copied verbatim from the CSV. It is only
    # "Arrival"/"Departure" for the "eng" locale; other locales carry
    # localized text (e.g. "入境"), hence ``str`` rather than a ``Literal``.
    arrival_departure: str
    hk_residents_count: int
    mainland_china_residents_count: int
    other_visitors_count: int
    total_count: int


class ToJSON(ABC, Generic[_T]):
    """Abstract interface for objects exposing a ``to_json`` export.

    The class is generic in ``_T``, the type ``to_json`` returns.
    """

    @abstractmethod
    def to_json(self) -> _T:
        """Return the exported value; concrete subclasses must implement this."""


class ToPD(ABC):
    """Abstract interface for objects exposing a ``to_pd`` export."""

    @abstractmethod
    def to_pd(self) -> "pd.DataFrame":
        """Return the data as a pandas DataFrame; subclasses must implement this."""


class ToPL(ABC):
    """Abstract interface for objects exposing a ``to_pl`` export."""

    @abstractmethod
    def to_pl(self) -> "pl.DataFrame":
        """Return the data as a polars DataFrame; subclasses must implement this."""


class ToFile(ABC):
    """Abstract interface for objects exposing a ``to_file`` export."""

    @abstractmethod
    def to_file(self, path: "Path") -> None:
        """Write the data to ``path``; subclasses must implement this."""


class DailyPassengersTrafficContainer(
    ToJSON[List[DailyPassengersTrafficTypeDef]], ToPD, ToPL, ToFile
):
    """Holds parsed traffic rows and exports them in several formats.

    The rows are kept as the iterator handed to ``__init__`` and are never
    cached: every export method drains that iterator, so only the first
    export called on a given container sees any rows.
    """

    def __init__(
        self, stats_iter: Iterator[DailyPassengersTrafficTypeDef], header: List[str]
    ):
        """Store the row iterator and the CSV column labels.

        Args:
            stats_iter: Iterator yielding one dict per data row.
            header: Column labels, used as DataFrame column names and as the
                first line of files written by ``to_file``.
        """
        self._stats_iter = stats_iter
        self._header = header

    def to_json(self) -> List[DailyPassengersTrafficTypeDef]:
        """Return all remaining rows as a list of dicts."""
        return list(self._stats_iter)

    def to_pd(self) -> "pd.DataFrame":
        """Return the rows as a pandas DataFrame labelled with the CSV header.

        Columns are renamed positionally (first column -> first header label,
        and so on). When there are no rows, an empty frame carrying the
        header labels as columns is returned instead.
        """
        df = pd.DataFrame.from_records(self.to_json())
        if df.empty:
            return pd.DataFrame(columns=self._header)
        return df.rename(columns=dict(zip(df.columns, self._header)))

    def to_pl(self) -> "pl.DataFrame":
        """Return the rows as a polars DataFrame.

        The frame is built by serializing ``to_pd()`` to an in-memory CSV and
        parsing that text with ``pl.read_csv``.
        """
        with StringIO() as s:
            self.to_pd().to_csv(s, index=False)
            s.seek(0)
            return pl.read_csv(s)

    def to_pl_lazy(self, path: Path) -> "pl.LazyFrame":
        """Write the rows to ``path`` and return a lazy polars scan of that file.

        The returned LazyFrame reads from ``path``, so the file must still
        exist when the frame is collected.
        """
        self.to_file(path)
        return pl.scan_csv(path)

    def to_file(self, path: Path) -> None:
        """Write the header and all remaining rows to ``path`` as CSV.

        Rows are streamed straight from the iterator; each row's values are
        written in the dict's own key order.
        """
        # newline="" is what the csv module requires (otherwise "\r\n" row
        # endings get translated again and yield blank rows on Windows), and
        # an explicit UTF-8 encoding keeps localized labels writable
        # regardless of the platform's default encoding.
        with path.open("w", newline="", encoding="utf-8") as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(self._header)
            for stats in self._stats_iter:
                csv_writer.writerow(stats.values())


# Per-locale text of the direction column, keyed by the English direction name
# accepted by the public API. Used to translate ``only_direction`` into the
# label that actually appears in the downloaded CSV for that locale.
# NOTE(review): the "hks" column header used later in this notebook reads
# "入境 / 出境" — confirm that departure rows really carry "离境"/"離境" and
# not "出境", otherwise the Departure filter matches nothing for those locales.
_direction_locale_map: Dict[Locale, Dict[Literal["Arrival", "Departure"], str]] = {
    "eng": {"Arrival": "Arrival", "Departure": "Departure"},
    "hkt": {"Arrival": "入境", "Departure": "離境"},
    "hks": {"Arrival": "入境", "Departure": "离境"},
}


def _get_daily_traffics(
    since: date,
    till: Optional[date] = None,
    locale: Locale = "eng",
    control_point_filter: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
) -> DailyPassengersTrafficContainer:
    """Download the daily traffic CSV and wrap the matching rows in a container.

    Args:
        since: First day (inclusive) to keep.
        till: Last day (inclusive) to keep. ``None`` means "today", resolved
            on every call rather than once at import time.
        locale: Locale code substituted into the download URL.
        control_point_filter: When given, keep only rows whose control point
            column equals this text exactly (the text is locale-specific).
        only_direction: When given, keep only rows of that direction; the
            English name is translated to the locale's label before comparing.

    Returns:
        A container holding the CSV header and a lazy generator of parsed
        rows. Rows are parsed only when the container is exported.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response body is empty. Malformed rows (wrong
            number of columns, bad date or count) also raise ``ValueError``,
            but only later, while the returned rows are being consumed.
    """
    last_day = date.today() if till is None else till

    url = DAILY_PASSENGERS_TRAFFIC_URL.format(locale=locale)

    # The session is closed as soon as the body has been read. The request is
    # not streamed, so the whole body is already in memory at this point.
    with requests.Session() as session:
        response = session.get(url)
        response.raise_for_status()
        raw_lines = response.content.splitlines()

    line_iter = iter(raw_lines)
    first_line = next(line_iter, None)
    if first_line is None:
        raise ValueError(f"No data returned from {url}")

    #! We need to remove the BOM from the first line
    header = first_line.decode("utf-8-sig").split(",")

    # Translate the requested direction once instead of once per row.
    wanted_direction = (
        None
        if only_direction is None
        else _direction_locale_map[locale][only_direction]
    )

    def _gen(lines: Iterator[bytes]) -> Iterator[DailyPassengersTrafficTypeDef]:
        for line in lines:
            if not line:
                continue

            # Trailing commas are dropped before splitting so that rows
            # padded with empty columns still unpack into the seven fields.
            (
                date_str,
                control_point,
                arrival_departure,
                hk_residents_count,
                mainland_china_residents_count,
                other_visitors_count,
                total_count,
            ) = line.decode("utf-8").rstrip(",").split(",")

            day = datetime.datetime.strptime(date_str, "%d-%m-%Y").date()

            if day < since:
                continue

            # Stops at the first row past the range.
            # NOTE(review): this relies on the CSV being sorted by ascending
            # date — confirm against the data source.
            if day > last_day:
                break

            if (
                control_point_filter is not None
                and control_point != control_point_filter
            ):
                continue

            if wanted_direction is not None and arrival_departure != wanted_direction:
                continue

            yield cast(
                DailyPassengersTrafficTypeDef,
                {
                    "date": day,
                    "control_point": control_point,
                    "arrival_departure": arrival_departure,
                    "hk_residents_count": int(hk_residents_count),
                    "mainland_china_residents_count": int(
                        mainland_china_residents_count
                    ),
                    "other_visitors_count": int(other_visitors_count),
                    "total_count": int(total_count),
                },
            )

    return DailyPassengersTrafficContainer(_gen(line_iter), header)


# Typing overloads for ``get_daily_traffics``: the return type depends on
# ``format``. Only the "json" overload gives ``format`` a default, because
# "json" is the runtime default; the others require ``format`` explicitly so
# the overloads do not overlap. ``file_path`` is required wherever a file is
# written. ``till`` uses ``...`` so nothing is evaluated in these stubs.


# format="json" (or omitted): list of row dicts.
@overload
def get_daily_traffics(
    *,
    since: date,
    till: date = ...,
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    locale: Locale = "eng",
    format: Literal["json"] = "json",
) -> List[DailyPassengersTrafficTypeDef]: ...


# format="pd": pandas DataFrame.
@overload
def get_daily_traffics(
    *,
    since: date,
    till: date = ...,
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    locale: Locale = "eng",
    format: Literal["pd"],
) -> pd.DataFrame: ...


# format="pl": polars DataFrame.
@overload
def get_daily_traffics(
    *,
    since: date,
    till: date = ...,
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    locale: Locale = "eng",
    format: Literal["pl"],
) -> pl.DataFrame: ...


# format="pl_lazy": rows are written to ``file_path`` and scanned lazily.
@overload
def get_daily_traffics(
    *,
    since: date,
    till: date = ...,
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    locale: Locale = "eng",
    format: Literal["pl_lazy"],
    file_path: Path,
) -> pl.LazyFrame: ...


# format="file": rows are written to ``file_path``; nothing is returned.
@overload
def get_daily_traffics(
    *,
    since: date,
    till: date = ...,
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    locale: Locale = "eng",
    format: Literal["file"],
    file_path: Path,
) -> None: ...


def get_daily_traffics(
    *,
    since: date,
    till: Optional[date] = None,
    locale: Locale = "eng",
    format: Literal["json", "pd", "pl", "pl_lazy", "file"] = "json",
    control_point: Optional[str] = None,
    only_direction: Optional[Literal["Arrival", "Departure"]] = None,
    file_path: Optional[Path] = None,
):
    """Fetch daily passenger traffic rows and return them in the chosen format.

    Args:
        since: First day (inclusive) to include.
        till: Last day (inclusive) to include. ``None`` means "today",
            resolved on every call rather than once at import time.
        locale: Locale code of the CSV to download.
        format: Output format: ``"json"`` (list of dicts), ``"pd"`` (pandas
            DataFrame), ``"pl"`` (polars DataFrame), ``"pl_lazy"`` (polars
            LazyFrame scanning ``file_path``) or ``"file"`` (CSV written to
            ``file_path``).
        control_point: When given, keep only rows for this control point.
        only_direction: When given, keep only rows of this direction.
        file_path: Destination CSV; required for ``"pl_lazy"`` and ``"file"``.

    Returns:
        The data in the requested format; ``None`` for ``format="file"``.

    Raises:
        ValueError: If ``format`` is not supported, or if it needs a file and
            ``file_path`` is missing. Both are checked before any download.
    """
    # Validate the arguments first so a bad call fails before the download.
    if format not in ("json", "pd", "pl", "pl_lazy", "file"):
        raise ValueError(f"Unsupported format: {format!r}")
    if format in ("pl_lazy", "file") and file_path is None:
        raise ValueError(f"file_path must be provided when format is {format!r}")

    container = _get_daily_traffics(
        since=since,
        till=date.today() if till is None else till,
        locale=locale,
        control_point_filter=control_point,
        only_direction=only_direction,
    )
    if format == "json":
        return container.to_json()
    if format == "pd":
        return container.to_pd()
    if format == "pl":
        return container.to_pl()

    # Only the file-backed formats remain, and the check above guarantees a
    # path for them; the cast merely informs static type checkers.
    target = cast(Path, file_path)
    if format == "pl_lazy":
        return container.to_pl_lazy(target)
    if format == "file":
        container.to_file(target)
        # Return explicitly: falling through would hit assert_never, which
        # raises at runtime even though the file was written successfully.
        return None
    # Static exhaustiveness check only; unreachable after the validation above.
    assert_never(format)

In [80]:
from tempfile import NamedTemporaryFile, TemporaryDirectory

# Download the "hks" data since 2024-01-01 into a scratch CSV, scan it lazily
# and collect only the rows passing the three filters below.
# A temporary directory is used instead of an open NamedTemporaryFile because
# writing to a still-open temporary file by name does not work on every
# platform. The frame is collected inside the ``with`` block, while the CSV
# still exists.
with TemporaryDirectory() as tmp_dir:
    df = (
        get_daily_traffics(
            since=date(2024, 1, 1),
            locale="hks",
            format="pl_lazy",
            file_path=Path(tmp_dir) / "daily_passenger_traffic.csv",
        )
        .filter(
            pl.col("管制站") == "深圳湾",
            pl.col("入境 / 出境") == "入境",
            pl.col("内地访客") > pl.lit(3000),
        )
        .collect()
    )

In [81]:
# Show the "内地访客" and "日期" columns, sorted ascending by "内地访客".
df.select(["内地访客", "日期"]).sort("内地访客", descending=False)

内地访客,日期
i64,str
4778,"""2024-09-06"""
6205,"""2024-04-01"""
6572,"""2024-04-23"""
7073,"""2024-05-21"""
7314,"""2024-02-09"""
…,…
25571,"""2024-05-02"""
25843,"""2024-05-01"""
26483,"""2024-10-03"""
31412,"""2024-10-02"""
