Skip to content

Commit

Permalink
Misc. documentation improvements
Browse files Browse the repository at this point in the history
restyle with styler, use em dashes, name cleaned objects differently from original objects for clearer code in snake_case, ensure all functions appended with (). group two-table verbs as setops and join per dplyr
  • Loading branch information
philip-khor committed Aug 12, 2019
1 parent 757f1e4 commit a0640ef
Show file tree
Hide file tree
Showing 17 changed files with 1,606 additions and 978 deletions.
8 changes: 4 additions & 4 deletions R/panel_consistency.R
Expand Up @@ -545,11 +545,11 @@ fixed_check <- function(.df, .var = NULL, .within = NULL) {
# Pull out variable names
.varcall <- tidyselect::vars_select(names(.df), {{ .var }})
if (length(.varcall) == 0) {
stop('.var must be specified as variable(s) in .df.')
stop(".var must be specified as variable(s) in .df.")
}
.withincall <- tidyselect::vars_select(names(.df), {{ .within }})
if (length(.withincall) == 0) {
stop('.within must be specified as variable(s) in df.')
stop(".within must be specified as variable(s) in df.")
}

# if .var is unspecified
Expand Down Expand Up @@ -619,12 +619,12 @@ fixed_force <- function(.df, .var = NULL, .within = NULL, .resolve = mode_order,
# Pull out variable names
.varcall <- tidyselect::vars_select(names(.df), {{ .var }})
if (length(.varcall) == 0) {
stop('.var must be specified as variable(s) in .df.')
stop(".var must be specified as variable(s) in .df.")
}
.withincall <- tidyselect::vars_select(names(.df), {{ .within }})

if (length(.withincall) == 0) {
stop('.within must be specified as variable(s) in .df.')
stop(".within must be specified as variable(s) in .df.")
}

# if .var is unspecified
Expand Down
40 changes: 29 additions & 11 deletions R/tbl_pb_methods.R
Expand Up @@ -212,8 +212,17 @@ bind_cols.tbl_pb <- function(.data, ...) {
}

##### BIND_ROWS WHY WON'T YOU CALL BIND_ROWS.tbl_pb???
#' Set operations
#'
#' These functions overwrite the set functions provided in base to make them generic to be used to
#' join pibbles. See \link[dplyr]{setops} for details.
#'
#' @rdname setops
#' @inheritParams dplyr::setops
#' @name setops
NULL

#' @rdname pibble_methods
#' @rdname setops
#' @importFrom dplyr intersect
#' @method intersect tbl_pb
#' @export
Expand Down Expand Up @@ -244,7 +253,7 @@ greatest_hits <- function() {
}
}

#' @rdname pibble_methods
#' @rdname setops
#' @importFrom dplyr union
#' @method union tbl_pb
#' @export
Expand All @@ -260,7 +269,7 @@ union.tbl_pb <- function(x, y, ...) {
return(build_pibble(dplyr::union(x, y, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname setops
#' @importFrom dplyr union_all
#' @method union_all tbl_pb
#' @export
Expand All @@ -276,7 +285,7 @@ union_all.tbl_pb <- function(x, y, ...) {
return(build_pibble(dplyr::union_all(x, y, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname setops
#' @importFrom dplyr setdiff
#' @method setdiff tbl_pb
#' @export
Expand All @@ -293,7 +302,16 @@ setdiff.tbl_pb <- function(x, y, ...) {
}


#' @rdname pibble_methods
#' Join two pibbles together
#'
#' These are generic functions that dispatch to individual pibble methods. See \link[dplyr]{join} for
#' complete documentation.
#'
#' @rdname join
#' @inheritParams dplyr::join
#' @name join.tbl_pb
NULL

#' @importFrom dplyr left_join
#' @method left_join tbl_pb
#' @export
Expand All @@ -309,7 +327,7 @@ left_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y
return(build_pibble(dplyr::left_join(x, y, by, copy, suffix, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr inner_join
#' @method inner_join tbl_pb
#' @export
Expand All @@ -325,7 +343,7 @@ inner_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".
return(build_pibble(dplyr::inner_join(x, y, by, copy, suffix, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr right_join
#' @method right_join tbl_pb
#' @export
Expand All @@ -341,7 +359,7 @@ right_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".
return(build_pibble(dplyr::right_join(x, y, by, copy, suffix, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr full_join
#' @method full_join tbl_pb
#' @export
Expand All @@ -357,7 +375,7 @@ full_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y
return(build_pibble(dplyr::full_join(x, y, by, copy, suffix, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr semi_join
#' @method semi_join tbl_pb
#' @export
Expand All @@ -373,7 +391,7 @@ semi_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, ...) {
return(build_pibble(dplyr::semi_join(x, y, by, copy, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr nest_join
#' @method nest_join tbl_pb
#' @export
Expand All @@ -389,7 +407,7 @@ nest_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, keep = FALSE, name =
return(build_pibble(dplyr::nest_join(x, y, by, copy, keep, name, ...), .i, .t, .d))
}

#' @rdname pibble_methods
#' @rdname join
#' @importFrom dplyr anti_join
#' @method anti_join tbl_pb
#' @export
Expand Down
61 changes: 33 additions & 28 deletions README.Rmd
Expand Up @@ -22,7 +22,7 @@ knitr::opts_chunk$set(
[![Codecov test coverage](https://codecov.io/gh/nickch-k/pmdplyr/branch/master/graph/badge.svg)](https://codecov.io/gh/nickch-k/pmdplyr?branch=master)
<!-- badges: end -->

The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data.
The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `_join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data.

Unlike other panel data packages, functions in `pmdplyr` are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country.

Expand All @@ -38,7 +38,7 @@ devtools::install_github("NickCH-K/pmdplyr")
```
## College Scorecard Example

Let's start with the fairly straightforward `Scorecard` data, which is uniquely identified by college ID `unitid` and year `year`, and which describes how well students who attended that college are doing years after attendance.
Let's start with the fairly straightforward `Scorecard` data, which describes how well students who attended that college are doing years after attendance. `Scorecard` observations are uniquely identified by college ID `unitid` and year `year`.

```{r}
# Note that pmdplyr automatically loads dplyr as well
Expand All @@ -59,17 +59,19 @@ unemp_data <- data.frame(
I am interested in measuring the differences in ex-student earnings `earnings_med` between two-year and four-year colleges (`pred_degree_awarded_ipeds == 2` or `3`, respectively). But before we can do that we need to clean the data.

```{r}
Scorecard <- Scorecard %>%
Scorecard %>%
# We want pred_degree_awarded_ipeds to be consistent within college. No changers!
# So let's drop them by using fixed_check with .resolve = "drop" to lose inconsistencies
fixed_force(.var = pred_degree_awarded_ipeds,
.within = unitid,
.resolve = "drop") %>%
fixed_force(
.var = pred_degree_awarded_ipeds,
.within = unitid,
.resolve = "drop"
) %>%
# Then, get rid of pred_degree_awarded_ipeds == 1
# And simplify our terms
filter(pred_degree_awarded_ipeds %in% c(2,3)) %>%
filter(pred_degree_awarded_ipeds %in% c(2, 3)) %>%
mutate(FourYear = pred_degree_awarded_ipeds == 3) %>%
# earnings_med has some missing values - let's fill them in with
# earnings_med has some missing values - let's fill them in with
# the most recent nonmissing observations we have
# - panel_locf respects the panel structure declared above with as_pibble()
mutate(earnings_med = panel_locf(earnings_med)) %>%
Expand All @@ -84,23 +86,24 @@ Scorecard <- Scorecard %>%
# But that's okay! We just pick a .resolve function to handle disagreements.
# (We could also do this straight in the regression model itself)
mutate(lag_state_earnings = tlag(earnings_med,
.i = state_abbr,
.t = year,
.resolve = mean))
.i = state_abbr,
.t = year,
.resolve = mean
)) -> scorecard_clean
# Now we can run a basic regression.
# Now we can run a basic regression.
summary(lm(
earnings_med ~
FourYear +
unemp +
lm(
earnings_med ~
FourYear +
unemp +
lag_state_earnings,
data = Scorecard
))
data = scorecard_clean
) %>%
summary()
```

We could even improve that code - why not run the `anti_join` and `inexact_left_join` using `safe_join`? When we do the `inexact_left_join`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year` - is it really? `safe_join` would check for us and minimize error.
We could even improve that code - why not run the `anti_join()` and `inexact_left_join()` using `safe_join()`? When we do the `inexact_left_join()`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year`is it really? `safe_join()` would check for us and minimize error.

## Spanish Rail Example

Expand All @@ -113,33 +116,35 @@ We have some difficulties to cover: making the ID and time variables behave, acc
```{r}
data(SPrail)
SPrail <- SPrail %>%
SPrail %>%
# We have two ID variables - origin and destination.
# pmdplyr has no problem with this, but maybe we want to export
# to something like plm later, which can't handle it.
# So let's use id_variable to combine them into one
mutate(route_ID = id_variable(origin, destination)) %>%
# We have a time variable down to the minute. Too fine-grained!
# Let's back things up to the daily level, and
# Let's back things up to the daily level, and
# create a nice integer time variable that's easy to use
mutate(day = time_variable(insert_date, .method = "day")) %>%
# Now we can declare a pibble
as_pibble(.i = route_ID, .t = day) %>%
# We want to account for between-route differences in price,
# so let's isolate the within variation
mutate(price_w = within_i(price)) %>%
# We want to compare to the cheapo option, so let's use
# We want to compare to the cheapo option, so let's use
# mutate_subset to get the average price of the cheapo option
# and propogate that to the other options for comparison
mutate_subset(cheapo_price = mean(price, na.rm = TRUE),
.filter = train_class == "Turista con enlace") %>%
mutate_subset(
cheapo_price = mean(price, na.rm = TRUE),
.filter = train_class == "Turista con enlace"
) %>%
mutate(premium = price - cheapo_price) %>%
filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>%
# Now let's compare premia
group_by(train_class) %>%
summarize(premium = mean(premium, na.rm = TRUE))
summarize(premium = mean(premium, na.rm = TRUE)) -> sprail_compare_premia
SPrail
sprail_compare_premia
```

And so there we have it - `Preferente` will really set you back relative to the cheapo ticket on the same route.
And so there we have it`Preferente` will really set you back relative to the cheapo ticket on the same route.

0 comments on commit a0640ef

Please sign in to comment.