/
convertUSCensusCBSA.R
202 lines (168 loc) · 7.26 KB
/
convertUSCensusCBSA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#' @importFrom rlang .data
#' @export
#'
#' @title Convert US Core Based Statistical Areas shapefile
#'
#' @description Returns a simple features data frame for US CBSAs
#'
#' The full resolution file will be named "USCensusCBSA.rda". In addition,
#' "_05", _02" and "_01" versions of the file will be created that that are
#' simplified to 5\%, 2\% and 1\%. Simplified versions will greatly improve the
#' speed of both searching and plotting.
#'
#' @details A US Core Based Statistical Areas (CBSA) shapefile is downloaded and converted to a
#' simple features data frame with additional columns of data. The resulting file
#' will be created in the spatial data directory which is set with
#' \code{setSpatialDataDir()}.
#'
#' @note From the source documentation:
#'
#' Metropolitan and Micropolitan Statistical Areas are together termed Core Based
#' Statistical Areas (CBSAs) and are defined by the Office of Management and Budget
#' (OMB) and consist of the county or counties or equivalent entities associated
#' with at least one urban core (urbanized area or urban cluster) of at least 10,000
#' population, plus adjacent counties having a high degree of social and economic
#' integration with the core as measured through commuting ties with the counties
#' containing the core. Categories of CBSAs are: Metropolitan Statistical Areas,
#' based on urbanized areas of 50,000 or more population; and Micropolitan Statistical
#' Areas, based on urban clusters of at least 10,000 population but less than 50,000
#' population.
#'
#' The CBSA boundaries are those defined by OMB based on the 2010 Census,
#' published in 2013, and updated in 2020.
#'
#' @return Name of the datasetName being created.
#'
#' @references \url{https://www2.census.gov/geo/tiger/TIGER2021/CBSA/}
#'
#' @seealso setSpatialDataDir
#' @seealso getUSCounty
convertUSCensusCBSA <- function() {
# ----- Setup ----------------------------------------------------------------
loadSpatialData("USCensusStates")
# Use package internal data directory
dataDir <- getSpatialDataDir()
# Specify the name of the dataset and file being created
datasetName <- 'USCensusCBSA'
# ----- Get the data ---------------------------------------------------------
# Build appropriate request URL for US County Borders data
url <- 'https://www2.census.gov/geo/tiger/TIGER2021/CBSA/tl_2021_us_cbsa.zip'
filePath <- file.path(dataDir, basename(url))
utils::download.file(url, filePath)
# NOTE: This zip file has no directory so extra subdirectory needs to be created
utils::unzip(filePath, exdir = file.path(dataDir, 'cbsa'))
# ----- Convert to SFDF ------------------------------------------------------
# Convert shapefile into simple features data frame
# NOTE: The 'cbsa' directory has been created
dsnPath <- file.path(dataDir,'cbsa')
shpName <- 'tl_2021_us_cbsa'
SFDF <- convertLayer(
dsn = dsnPath,
layer = shpName
)
# ----- Select useful columns and rename -------------------------------------
# > dplyr::glimpse(SFDF, width = 75)
# Rows: 939
# Columns: 13
# $ CSAFP <chr> "122", "122", "428", "426", "258", "532", "194", NA, NA,…
# $ CBSAFP <chr> "12020", "12060", "12100", "12120", "12140", "12180", "1…
# $ GEOID <chr> "12020", "12060", "12100", "12120", "12140", "12180", "1…
# $ NAME <chr> "Athens-Clarke County, GA", "Atlanta-Sandy Springs-Alpha…
# $ NAMELSAD <chr> "Athens-Clarke County, GA Metro Area", "Atlanta-Sandy Sp…
# $ LSAD <chr> "M1", "M1", "M1", "M2", "M2", "M2", "M1", "M1", "M2", "M…
# $ MEMI <chr> "1", "1", "1", "2", "2", "2", "1", "1", "2", "2", "1", "…
# $ MTFCC <chr> "G3110", "G3110", "G3110", "G3110", "G3110", "G3110", "G…
# $ ALAND <dbl> 2654607902, 22495873026, 1438775279, 2448595161, 9397319…
# $ AWATER <dbl> 26109459, 386782308, 301270067, 20024887, 2657419, 44569…
# $ INTPTLAT <chr> "+33.9439840", "+33.6937280", "+39.4693555", "+31.122286…
# $ INTPTLON <chr> "-083.2138965", "-084.3999113", "-074.6337591", "-087.16…
# $ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-83.36003 3..., MULTIPOLYGO…
#
# Data Dictionary:
# $ CSAFP -----> (drop)
# $ CBSAFP -----> CBSAFP
# $ GEOID -----> (drop)
# $ NAME -----> CBSAName
# $ NAMELSAD -----> (drop)
# $ LSAD -----> (drop)
# $ MEMI -----> sizeClass
# $ MTFCC -----> (drop)
# $ ALAND -----> landArea
# $ AWATER -----> waterArea
# $ INTPTLAT -----> latitude
# $ INTPTLON -----> longitude
# Convert lat/lon to numeric
SFDF$INTPTLAT <- as.numeric(SFDF$INTPTLAT)
SFDF$INTPTLON <- as.numeric(SFDF$INTPTLON)
# We can use longitude and latitude to get one state code for each polygon.
# Validation plot -- check if lon/lat are polygon centroids
if ( FALSE ) {
tx <- dplyr::filter(SFDF, stringr::str_detect(SFDF$NAME, "TX"))
plot(tx)
points(tx$INTPTLON, tx$INTPTLAT, pch = 16, col = 'red')
}
SFDF$stateCode <- getStateCode(SFDF$INTPTLON, SFDF$INTPTLAT, datasetName = 'USCensusStates', useBuffering = TRUE)
SFDF$countryCode <- "US"
# Get CBSAName and allStateCodes from the CBSAName column
nameMatrix <- stringr::str_split_fixed(SFDF$NAME, ',', 2)
SFDF$CBSAName <- nameMatrix[, 1]
# allStateCodes is a comma-separate list of stateCodes
SFDF$allStateCodes <- stringr::str_trim( stringr::str_replace_all(nameMatrix[,2], '-',',') )
# Convert MEMI to explicitly indicate Micropolitan and Metropolitan classes
metroMask <- SFDF$MEMI == "1"
SFDF$MEMI[metroMask] <- "metro"
SFDF$MEMI[!metroMask] <- "micro"
# Create the new dataframe in a specific column order
SFDF <-
dplyr::select(
.data = SFDF,
countryCode = .data$countryCode,
stateCode = .data$stateCode,
allStateCodes = .data$allStateCodes,
CBSAFP = .data$CBSAFP,
CBSAName = .data$NAME,
sizeClass = .data$MEMI,
landArea = .data$ALAND,
waterArea = .data$AWATER,
latitude = .data$INTPTLAT,
longitude = .data$INTPTLON
)
# ----- Simplify and save ----------------------------------------------------
uniqueIdentifier <- "CBSAFP"
simplifyAndSave(
SFDF = SFDF,
datasetName = datasetName,
uniqueIdentifier = uniqueIdentifier,
dataDir = dataDir
)
# ----- Clean up and return --------------------------------------------------
# Clean up
unlink(filePath, force = TRUE)
unlink(dsnPath, recursive = TRUE, force = TRUE)
return(invisible(datasetName))
}
# ===== TEST ===================================================================
if ( FALSE ) {
library(sf)
# Look or horizontal lines from polygons that cross the dateline.
# NOTE: These are sometimes created by sf::st_make_valid()
loadSpatialData(datasetName)
SFDF <- get(paste0(datasetName, ""))
SFDF_05 <- get(paste0(datasetName, "_05"))
SFDF_02 <- get(paste0(datasetName, "_02"))
SFDF_01 <- get(paste0(datasetName, "_01"))
plot(SFDF_01$geometry)
dev.off(dev.list()["RStudioGD"])
plot(SFDF_02$geometry)
dev.off(dev.list()["RStudioGD"])
plot(SFDF_05$geometry)
dev.off(dev.list()["RStudioGD"])
#plot(SFDF$geometry)
# Try out getSpatialData()
lons <- c(-120:-110, 0:10)
lats <- c(30:40, 30:40)
df <- getSpatialData(lons, lats, SFDF_01)
df <- getSpatialData(lons, lats, SFDF_02)
df <- getSpatialData(lons, lats, SFDF_05)
df <- getSpatialData(lons, lats, SFDF)
}