In [1]:
import random
from datetime import datetime

import numpy as np

from geotrees import find_nearest

In [2]:
# Bounds of the time window used for the randomly generated datetimes
START_DATETIME = datetime(year=2003, month=1, day=1)
END_DATETIME = datetime(year=2013, month=12, day=31, hour=23, minute=59)

# Total length of the window as a timedelta
DATETIME_DIFF = END_DATETIME - START_DATETIME

In [3]:
def generate_random_datetime(n: int = 1) -> list[datetime]:
    """
    Draw random datetimes between START_DATETIME and END_DATETIME.

    Parameters
    ----------
    n : int
        Number of datetimes to generate.

    Returns
    -------
    list[datetime]
        n datetimes, each START_DATETIME plus a random fraction (drawn with
        np.random.rand) of DATETIME_DIFF. The list is not sorted.
    """
    fractions = np.random.rand(n)
    return [START_DATETIME + DATETIME_DIFF * fraction for fraction in fractions]

In [4]:
# Sizes of the search list and of the random part of the query list
N1, N2 = 50_000, 300

# Search list: must be sorted for the bisection / both-sorted approaches
all_dates = generate_random_datetime(N1)
all_dates.sort()

# Query list: random values followed by a few hand-picked ones that exercise
# the edges (first and last entry of all_dates) and two fixed datetimes
query_dates = [
    *generate_random_datetime(N2),
    all_dates[0],
    all_dates[-1],
    datetime(2008, 6, 12, 14, 23),
    datetime(2011, 4, 17, 8, 21),
]

# Sorted copy of the queries, needed by the both-lists-sorted test
s_query_dates = sorted(query_dates)

In [5]:
def find_nearest_both_sorted(
    all_dates: list[datetime],
    query_dates: list[datetime],
) -> list[datetime]:
    """
    Find the nearest datetime in a list of dates for each datetime value
    in a list of query datetimes, if both lists are sorted.

    Because both inputs are sorted, the position reached in all_dates for one
    query is a valid starting point for the next query, so all_dates is
    walked through only once.

    Parameters
    ----------
    all_dates : list[datetime]
        Sorted list of datetimes to search. This list must be sorted, this
        function does not check sortedness, nor does it sort the list.
    query_dates : list[datetime]
        Sorted list of query datetimes. This list must be sorted, this
        function does not check sortedness, nor does it sort the list.

    Returns
    -------
    result : list[datetime]
        The nearest datetime value in all_dates for each datetime value in
        query_dates. If a query is equally close to two values, the earlier
        one is returned.

    Raises
    ------
    ValueError
        If all_dates is empty while query_dates is not.
    """
    n = len(all_dates)
    if n == 0 and len(query_dates) > 0:
        raise ValueError("all_dates must contain at least one datetime")

    result = []
    i = 0
    for m_date in query_dates:
        # The bounds check has to come first: a query later than every entry
        # of all_dates pushes i to n, and all_dates[n] must not be evaluated.
        while i < n and all_dates[i] < m_date:
            i += 1

        # i is now the first index whose value is >= m_date (or n if there is
        # none), so the nearest value is either that entry or the one just
        # before it. The slice clamps itself at the end of the list.
        candidates = all_dates[max(0, i - 1) : i + 1]

        # min() keeps the first of several equally close candidates, i.e. the
        # earlier datetime.
        result.append(min(candidates, key=lambda x: abs(m_date - x)))
    return result

In [6]:
# Spot check of a single query: show the query date, the index returned by
# find_nearest, and the entries of all_dates around that index.
i = 55
print(f"{query_dates[i] = }")
pos = find_nearest(all_dates, query_dates[i])
# Reuse the index found above instead of running the search a second time
print(f"fast result = {pos}")
# Print the match with its immediate neighbours, one per line. Slicing keeps
# this valid when the match is the first or last entry of all_dates (a plain
# pos - 1 would wrap around to the end, pos + 1 would raise IndexError).
print(*all_dates[max(pos - 1, 0) : pos + 2], sep="\n")

query_dates[i] = datetime.datetime(2008, 1, 6, 11, 21, 10, 51601)
fast result = 22770
2008-01-06 07:54:25.317797
2008-01-06 14:12:24.600544
2008-01-06 14:32:59.053522


## Greedy approach

Compare every query date with every entry of `all_dates` and keep the closest one.

### Naive list approach

In [7]:
%%time
# Brute force reference: for each query date x, compute the absolute
# difference to every entry of all_dates and keep the entry with the smallest
# one. This performs len(query_dates) * len(all_dates) comparisons in pure
# Python.
greedy_results = [
    all_dates[np.argmin([np.abs(x - y) for y in all_dates])]
    for x in query_dates
]

CPU times: user 10.4 s, sys: 75.6 ms, total: 10.4 s
Wall time: 10.4 s


### NumPy `ufunc`

In [8]:
%%time
# Same exhaustive comparison, but all pairwise differences are built in one
# call: np.subtract.outer gives one row per entry of all_dates and one column
# per query date. argmin over axis 0 then yields, for each query date, the
# index of the closest entry of all_dates.
greedy_fast_results = [
    all_dates[i]
    for i in np.argmin(
        np.abs(np.subtract.outer(all_dates, query_dates)), axis=0
    )
]

CPU times: user 1.36 s, sys: 371 ms, total: 1.73 s
Wall time: 1.86 s


## Bisection

In [9]:
%%timeit
# One find_nearest call per query date; the returned index is used to look up
# the matching entry of all_dates.
# NOTE(review): check_sorted=False presumably skips validating that all_dates
# is sorted - confirm against the geotrees documentation.
[all_dates[find_nearest(all_dates, x, check_sorted=False)] for x in query_dates]



636 μs ± 6.19 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit
# A single find_nearest call with the whole list of query dates; it is
# iterated here as a sequence of indices into all_dates.
[all_dates[i] for i in find_nearest(all_dates, query_dates, check_sorted=False)]

546 μs ± 1.39 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
%%timeit d = random.choice(query_dates)
# Time a single lookup. The query d is drawn with random.choice in the
# %%timeit setup statement on the line above, so picking it is not part of the
# measured statement.
find_nearest(all_dates, d, check_sorted=False)

2 μs ± 53.4 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


## Both lists are sorted

In [12]:
%%timeit
# Single pass over both lists; uses the pre-sorted copy of the queries
# (s_query_dates) because find_nearest_both_sorted requires sorted input.
find_nearest_both_sorted(all_dates, s_query_dates)

2.19 ms ± 205 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# greedy_results and greedy_fast_results were already assigned in the %%time
# cells above. The %%timeit cells only time bare expressions and store
# nothing, so the bisection and both-sorted answers are computed here for the
# comparisons that follow.
bisect_results = [
    all_dates[idx] for idx in find_nearest(all_dates, query_dates)
]
sorted_results = find_nearest_both_sorted(
    all_dates,
    s_query_dates,
)

In [14]:
# Both greedy variants must agree. List equality also compares the lengths,
# whereas zip() would silently stop at the shorter list and could report True
# for a truncated result.
greedy_results == greedy_fast_results

True

In [15]:
# Bisection must reproduce the brute force answers. List equality also
# compares the lengths, whereas zip() would silently stop at the shorter list.
greedy_results == bisect_results

True

In [16]:
# sorted_results follows the order of the sorted queries, so the brute force
# answers are reordered by their query date before comparing. List equality
# also compares the lengths, which the previous zip() based check did not.
sorted_results == [
    nearest for _, nearest in sorted(zip(query_dates, greedy_results))
]

True

In [17]:
%%timeit vals = generate_random_datetime(N1)
# Time sorting N1 random datetimes; vals is generated in the %%timeit setup
# statement on the line above, so only the sort itself is measured.
# NOTE(review): presumably included to gauge the up-front cost of sorting that
# the sorted-input approaches rely on - confirm intent.
sorted(vals)

6.34 ms ± 161 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
