Misc. documentation improvements

restyle with styler, use em dashes, name cleaned objects differently from original objects for clearer code in snake_case, ensure all functions appended with (). group two-table verbs as setops and join per dplyr
NickCH-K · Aug 12, 2019 · a0640ef · a0640ef
1 parent 757f1e4
commit a0640ef
Show file tree

Hide file tree

Showing 17 changed files with 1,606 additions and 978 deletions.
diff --git a/R/panel_consistency.R b/R/panel_consistency.R
@@ -545,11 +545,11 @@ fixed_check <- function(.df, .var = NULL, .within = NULL) {
   # Pull out variable names
   .varcall <- tidyselect::vars_select(names(.df), {{ .var }})
   if (length(.varcall) == 0) {
-    stop('.var must be specified as variable(s) in .df.')
+    stop(".var must be specified as variable(s) in .df.")
   }
   .withincall <- tidyselect::vars_select(names(.df), {{ .within }})
   if (length(.withincall) == 0) {
-    stop('.within must be specified as variable(s) in df.')
+    stop(".within must be specified as variable(s) in df.")
   }
 
   # if .var is unspecified
@@ -619,12 +619,12 @@ fixed_force <- function(.df, .var = NULL, .within = NULL, .resolve = mode_order,
   # Pull out variable names
   .varcall <- tidyselect::vars_select(names(.df), {{ .var }})
   if (length(.varcall) == 0) {
-    stop('.var must be specified as variable(s) in .df.')
+    stop(".var must be specified as variable(s) in .df.")
   }
   .withincall <- tidyselect::vars_select(names(.df), {{ .within }})
 
   if (length(.withincall) == 0) {
-    stop('.within must be specified as variable(s) in .df.')
+    stop(".within must be specified as variable(s) in .df.")
   }
 
   # if .var is unspecified

diff --git a/R/tbl_pb_methods.R b/R/tbl_pb_methods.R
@@ -212,8 +212,17 @@ bind_cols.tbl_pb <- function(.data, ...) {
 }
 
 ##### BIND_ROWS WHY WON'T YOU CALL BIND_ROWS.tbl_pb???
+#' Set operations
+#'
+#' These functions overwrite the set functions provided in base to make them generic to be used to
+#' join pibbles. See  \link[dplyr]{setops} for details.
+#'
+#' @rdname setops
+#' @inheritParams dplyr::setops
+#' @name setops
+NULL
 
-#' @rdname pibble_methods
+#' @rdname setops
 #' @importFrom dplyr intersect
 #' @method intersect tbl_pb
 #' @export
@@ -244,7 +253,7 @@ greatest_hits <- function() {
   }
 }
 
-#' @rdname pibble_methods
+#' @rdname setops
 #' @importFrom dplyr union
 #' @method union tbl_pb
 #' @export
@@ -260,7 +269,7 @@ union.tbl_pb <- function(x, y, ...) {
   return(build_pibble(dplyr::union(x, y, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname setops
 #' @importFrom dplyr union_all
 #' @method union_all tbl_pb
 #' @export
@@ -276,7 +285,7 @@ union_all.tbl_pb <- function(x, y, ...) {
   return(build_pibble(dplyr::union_all(x, y, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname setops
 #' @importFrom dplyr setdiff
 #' @method setdiff tbl_pb
 #' @export
@@ -293,7 +302,16 @@ setdiff.tbl_pb <- function(x, y, ...) {
 }
 
 
-#' @rdname pibble_methods
+#' Join two pibbles together
+#'
+#' These are generic functions that dispatch to individual pibble methods. See  \link[dplyr]{join} for
+#' complete documentation.
+#'
+#' @rdname join
+#' @inheritParams dplyr::join
+#' @name join.tbl_pb
+NULL
+
 #' @importFrom dplyr left_join
 #' @method left_join tbl_pb
 #' @export
@@ -309,7 +327,7 @@ left_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y
   return(build_pibble(dplyr::left_join(x, y, by, copy, suffix, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr inner_join
 #' @method inner_join tbl_pb
 #' @export
@@ -325,7 +343,7 @@ inner_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".
   return(build_pibble(dplyr::inner_join(x, y, by, copy, suffix, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr right_join
 #' @method right_join tbl_pb
 #' @export
@@ -341,7 +359,7 @@ right_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".
   return(build_pibble(dplyr::right_join(x, y, by, copy, suffix, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr full_join
 #' @method full_join tbl_pb
 #' @export
@@ -357,7 +375,7 @@ full_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y
   return(build_pibble(dplyr::full_join(x, y, by, copy, suffix, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr semi_join
 #' @method semi_join tbl_pb
 #' @export
@@ -373,7 +391,7 @@ semi_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, ...) {
   return(build_pibble(dplyr::semi_join(x, y, by, copy, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr nest_join
 #' @method nest_join tbl_pb
 #' @export
@@ -389,7 +407,7 @@ nest_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, keep = FALSE, name =
   return(build_pibble(dplyr::nest_join(x, y, by, copy, keep, name, ...), .i, .t, .d))
 }
 
-#' @rdname pibble_methods
+#' @rdname join
 #' @importFrom dplyr anti_join
 #' @method anti_join tbl_pb
 #' @export

diff --git a/README.Rmd b/README.Rmd
@@ -22,7 +22,7 @@ knitr::opts_chunk$set(
 [![Codecov test coverage](https://codecov.io/gh/nickch-k/pmdplyr/branch/master/graph/badge.svg)](https://codecov.io/gh/nickch-k/pmdplyr?branch=master)
 <!-- badges: end -->
 
-The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data.
+The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `_join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data.
 
 Unlike other panel data packages, functions in `pmdplyr` are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country.
 
@@ -38,7 +38,7 @@ devtools::install_github("NickCH-K/pmdplyr")
 ```
 ## College Scorecard Example
 
-Let's start with the fairly straightforward `Scorecard` data, which is uniquely identified by college ID `unitid` and year `year`, and which describes how well students who attended that college are doing years after attendance.
+Let's start with the fairly straightforward `Scorecard` data, which describes how well students who attended that college are doing years after attendance. `Scorecard` observations are uniquely identified by college ID `unitid` and year `year`.
 
 ```{r}
 # Note that pmdplyr automatically loads dplyr as well
@@ -59,17 +59,19 @@ unemp_data <- data.frame(
 I am interested in measuring the differences in ex-student earnings `earnings_med` between two-year and four-year colleges (`pred_degree_awarded_ipeds == 2` or `3`, respectively). But before we can do that we need to clean the data.
 
 ```{r}
-Scorecard <- Scorecard %>%
+Scorecard %>%
   # We want pred_degree_awarded_ipeds to be consistent within college. No changers!
   # So let's drop them by using fixed_check with .resolve = "drop" to lose inconsistencies
-  fixed_force(.var = pred_degree_awarded_ipeds,
-              .within = unitid,
-              .resolve = "drop") %>%
+  fixed_force(
+    .var = pred_degree_awarded_ipeds,
+    .within = unitid,
+    .resolve = "drop"
+  ) %>%
   # Then, get rid of pred_degree_awarded_ipeds == 1
   # And simplify our terms
-  filter(pred_degree_awarded_ipeds %in% c(2,3)) %>%
+  filter(pred_degree_awarded_ipeds %in% c(2, 3)) %>%
   mutate(FourYear = pred_degree_awarded_ipeds == 3) %>%
-  # earnings_med has some missing values - let's fill them in with 
+  # earnings_med has some missing values - let's fill them in with
   # the most recent nonmissing observations we have
   # - panel_locf respects the panel structure declared above with as_pibble()
   mutate(earnings_med = panel_locf(earnings_med)) %>%
@@ -84,23 +86,24 @@ Scorecard <- Scorecard %>%
   # But that's okay! We just pick a .resolve function to handle disagreements.
   # (We could also do this straight in the regression model itself)
   mutate(lag_state_earnings = tlag(earnings_med,
-                                      .i = state_abbr,
-                                      .t = year,
-                                      .resolve = mean))
+    .i = state_abbr,
+    .t = year,
+    .resolve = mean
+  )) -> scorecard_clean
 
-# Now we can run a basic regression. 
+# Now we can run a basic regression.
 
-summary(lm(
-  earnings_med ~ 
-    FourYear +
-    unemp + 
+lm(
+  earnings_med ~
+  FourYear +
+    unemp +
     lag_state_earnings,
-  data = Scorecard
-))
-
+  data = scorecard_clean
+) %>% 
+  summary()
 ```
 
-We could even improve that code - why not run the `anti_join` and `inexact_left_join` using `safe_join`? When we do the `inexact_left_join`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year` - is it really? `safe_join` would check for us and minimize error.
+We could even improve that code - why not run the `anti_join()` and `inexact_left_join()` using `safe_join()`? When we do the `inexact_left_join()`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year`—is it really? `safe_join()` would check for us and minimize error.
 
 ## Spanish Rail Example
 
@@ -113,33 +116,35 @@ We have some difficulties to cover: making the ID and time variables behave, acc
 ```{r}
 data(SPrail)
 
-SPrail <- SPrail %>%
+SPrail %>%
   # We have two ID variables - origin and destination.
   # pmdplyr has no problem with this, but maybe we want to export
   # to something like plm later, which can't handle it.
   # So let's use id_variable to combine them into one
   mutate(route_ID = id_variable(origin, destination)) %>%
   # We have a time variable down to the minute. Too fine-grained!
-  # Let's back things up to the daily level, and 
+  # Let's back things up to the daily level, and
   # create a nice integer time variable that's easy to use
   mutate(day = time_variable(insert_date, .method = "day")) %>%
   # Now we can declare a pibble
   as_pibble(.i = route_ID, .t = day) %>%
   # We want to account for between-route differences in price,
   # so let's isolate the within variation
   mutate(price_w = within_i(price)) %>%
-  # We want to compare to the cheapo option, so let's use 
+  # We want to compare to the cheapo option, so let's use
   # mutate_subset to get the average price of the cheapo option
   # and propogate that to the other options for comparison
-  mutate_subset(cheapo_price = mean(price, na.rm = TRUE), 
-                .filter = train_class == "Turista con enlace") %>%
+  mutate_subset(
+    cheapo_price = mean(price, na.rm = TRUE),
+    .filter = train_class == "Turista con enlace"
+  ) %>%
   mutate(premium = price - cheapo_price) %>%
   filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>%
   # Now let's compare premia
   group_by(train_class) %>%
-  summarize(premium = mean(premium, na.rm = TRUE))
+  summarize(premium = mean(premium, na.rm = TRUE)) -> sprail_compare_premia
 
-SPrail
+sprail_compare_premia
 ```
 
-And so there we have it - `Preferente` will really set you back relative to the cheapo ticket on the same route.
+And so there we have it—`Preferente` will really set you back relative to the cheapo ticket on the same route.