/
createLocationID.R
158 lines (135 loc) · 4.84 KB
/
createLocationID.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#' @title Create one or more unique locationIDs
#' @description A locationID is created for each incoming \code{longitude} and
#' \code{latitude}. Each locationID is unique to within a certain spatial scale.
#' With \code{algorithm = "geohash"}, the
#' \code{precision} argument determines the size of a geohash grid cell. At the
#' equator, the following grid cell sizes apply for different precision levels:
#'
#' \preformatted{
#' precision (maximum grid cell X axis, in m)
#' 5 ± 2400
#' 6 ± 610
#' 7 ± 76
#' 8 ± 19
#' 9 ± 2.4
#' 10 ± 0.6
#' }
#'
#' Invalid locations will be assigned a locationID specified by the user with
#' the \code{invalidID} argument, typically \code{NA}.
#'
#' @details
#' When the \code{"geohash"} algorithm is specified,
#' the following code is used to generate each locationID:
#'
#' \preformatted{
#' locationID <-
#' geohashTools::gh_encode(latitude, longitude, precision)
#' }
#'
#' When the \code{"digest"} algorithm is specified,
#' the following code is used:
#'
#' \preformatted{
#' # Retain accuracy up to ~.1m
#' locationString <- paste0(
#' sprintf("\%.7f", longitude),
#' "_",
#' sprintf("\%.7f", latitude)
#' )
#' # Avoid collisions until billions of records
#' locationID <- digest::digest(locationString, algo = "xxhash64")
#' }
#'
#' See the references for details on either algorithm.
#'
#' @note The \code{"geohash"} algorithm is preferred but the \code{"digest"}
#' algorithm is retained because several existing databases
#' use the \code{"digest"} algorithm as a unique identifier.
#'
#' @param longitude Vector of longitudes in decimal degrees E.
#' @param latitude Vector of latitudes in decimal degrees N.
#' @param algorithm Algorithm to use -- either \code{"geohash"} or \code{"digest"}.
#' @param precision \code{precision} argument used when encoding with \code{"geohash"}.
#' @param invalidID Identifier to use for invalid locations. This can be a
#' character string or \code{NA}.
#' @return Vector of character locationIDs.
#' @examples
#' library(MazamaCoreUtils)
#'
#' longitude <- c(-122.5, 0, NA, -122.5, -122.5)
#' latitude <- c( 47.5, 0, 47.5, NA, 47.5)
#'
#' createLocationID(longitude, latitude)
#' createLocationID(longitude, latitude, invalidID = "bad")
#' createLocationID(longitude, latitude, algorithm = "digest")
#'
#' @references \url{https://en.wikipedia.org/wiki/Decimal_degrees}
#' @references \url{https://www.johndcook.com/blog/2017/01/10/probability-of-secure-hash-collisions/}
#' @references \url{https://michaelchirico.github.io/geohashTools/index.html}
#' @rdname createLocationID
#' @export
#'
createLocationID <- function(
  longitude = NULL,
  latitude = NULL,
  algorithm = c("geohash", "digest"),
  precision = 10,
  invalidID = as.character(NA)
) {

  # ----- Validate parameters --------------------------------------------------

  stopIfNull(longitude)
  stopIfNull(latitude)
  precision <- setIfNull(precision, 10)
  algorithm <- match.arg(algorithm)

  # Fail early on mismatched vectors: locationID is sized from longitude, so
  # silently-recycled input would otherwise yield wrong or truncated results.
  if ( length(longitude) != length(latitude) ) {
    stop(sprintf(
      "longitude (%d) and latitude (%d) must have the same length",
      length(longitude), length(latitude)
    ))
  }

  # Coerce to numeric; un-parseable values become NA and are caught by the mask
  suppressWarnings({
    longitude <- as.numeric(longitude)
    latitude <- as.numeric(latitude)
  })

  # ----- Create locationID ----------------------------------------------------

  # Initialize every element to invalidID; valid positions are overwritten below
  locationID <- rep(as.character(invalidID), times = length(longitude))

  mask <- createLocationMask(longitude, latitude)

  # If all locations are bad, return immediately
  if ( sum(mask) == 0 ) return(locationID)

  # Some valid locations, so continue.
  # NOTE: match.arg() above guarantees algorithm is one of the two choices.
  if ( algorithm == "digest" ) {

    # meters per decimal degree:
    #   https://en.wikipedia.org/wiki/Decimal_degrees
    # Retain accuracy up to ~.1m
    locationString <- paste0(
      sprintf("%.7f", longitude),
      "_",
      sprintf("%.7f", latitude)
    )

    # Explanation of collision frequency:
    #   https://www.johndcook.com/blog/2017/01/10/probability-of-secure-hash-collisions/
    #
    # > a hash function with range of size N can hash on the order of sqrt(N) values
    # > before running into collisions.
    #
    # A 32 bit hash will run into collisions at (2^32)^0.5 = 65,536
    # A 64 bit hash will run into collisions at (2^64)^0.5 = 4,294,967,296
    #
    # One can imagine a table with 60K known locations so it looks like a 32 bit
    # hash is not quite safe enough.

    # digest::digest() is not vectorized; vapply() guarantees a character(1)
    # result per element (type-stable, unlike mapply(SIMPLIFY = TRUE))
    locationID[mask] <- vapply(
      locationString[mask],
      function(x) { digest::digest(x, algo = "xxhash64") },
      FUN.VALUE = character(1),
      USE.NAMES = FALSE
    )

  } else {

    # geohashTools::gh_encode() is vectorized over lat/lon, so no apply loop
    # is needed
    locationID[mask] <-
      geohashTools::gh_encode(latitude[mask], longitude[mask], precision)

  }

  # ----- Return ---------------------------------------------------------------

  return(locationID)

}