From a0640ef3c6701f53ae9067653176e30e5ccd4afb Mon Sep 17 00:00:00 2001 From: Philip Khor Date: Mon, 12 Aug 2019 14:54:13 +0800 Subject: [PATCH] Misc. documentation improvements restyle with styler, use em dashes, name cleaned objects differently from original objects for clearer code in snake_case, ensure all functions appended with (). group two-table verbs as setops and join per dplyr --- R/panel_consistency.R | 8 +- R/tbl_pb_methods.R | 40 +- README.Rmd | 61 +- README.md | 72 +- _pkgdown.yml | 4 +- docs/articles/pmdplyr.html | 1059 +++++++++++++++------------- docs/index.html | 162 +++-- docs/pkgdown.yml | 2 +- docs/reference/index.html | 18 +- docs/reference/join.html | 224 ++++++ docs/reference/pibble_methods.html | 38 - docs/reference/setops.html | 181 +++++ man/join.Rd | 63 ++ man/pibble_methods.Rd | 38 - man/setops.Rd | 29 + tests/testthat/test-bad_input.R | 61 +- vignettes/pmdplyr.Rmd | 524 ++++++++------ 17 files changed, 1606 insertions(+), 978 deletions(-) create mode 100644 docs/reference/join.html create mode 100644 docs/reference/setops.html create mode 100644 man/join.Rd create mode 100644 man/setops.Rd diff --git a/R/panel_consistency.R b/R/panel_consistency.R index e705335..1ab2649 100644 --- a/R/panel_consistency.R +++ b/R/panel_consistency.R @@ -545,11 +545,11 @@ fixed_check <- function(.df, .var = NULL, .within = NULL) { # Pull out variable names .varcall <- tidyselect::vars_select(names(.df), {{ .var }}) if (length(.varcall) == 0) { - stop('.var must be specified as variable(s) in .df.') + stop(".var must be specified as variable(s) in .df.") } .withincall <- tidyselect::vars_select(names(.df), {{ .within }}) if (length(.withincall) == 0) { - stop('.within must be specified as variable(s) in df.') + stop(".within must be specified as variable(s) in df.") } # if .var is unspecified @@ -619,12 +619,12 @@ fixed_force <- function(.df, .var = NULL, .within = NULL, .resolve = mode_order, # Pull out variable names .varcall <- tidyselect::vars_select(names(.df), {{ .var }}) if (length(.varcall) == 0) { - stop('.var must be specified as variable(s) in .df.') + stop(".var must be specified as variable(s) in .df.") } .withincall <- tidyselect::vars_select(names(.df), {{ .within }}) if (length(.withincall) == 0) { - stop('.within must be specified as variable(s) in .df.') + stop(".within must be specified as variable(s) in .df.") } # if .var is unspecified diff --git a/R/tbl_pb_methods.R b/R/tbl_pb_methods.R index 55c7ab9..ddbde35 100644 --- a/R/tbl_pb_methods.R +++ b/R/tbl_pb_methods.R @@ -212,8 +212,17 @@ bind_cols.tbl_pb <- function(.data, ...) { } ##### BIND_ROWS WHY WON'T YOU CALL BIND_ROWS.tbl_pb??? +#' Set operations +#' +#' These functions overwrite the set functions provided in base to make them generic to be used to +#' join pibbles. See \link[dplyr]{setops} for details. +#' +#' @rdname setops +#' @inheritParams dplyr::setops +#' @name setops +NULL -#' @rdname pibble_methods +#' @rdname setops #' @importFrom dplyr intersect #' @method intersect tbl_pb #' @export @@ -244,7 +253,7 @@ greatest_hits <- function() { } } -#' @rdname pibble_methods +#' @rdname setops #' @importFrom dplyr union #' @method union tbl_pb #' @export @@ -260,7 +269,7 @@ union.tbl_pb <- function(x, y, ...) { return(build_pibble(dplyr::union(x, y, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname setops #' @importFrom dplyr union_all #' @method union_all tbl_pb #' @export @@ -276,7 +285,7 @@ union_all.tbl_pb <- function(x, y, ...) 
{ return(build_pibble(dplyr::union_all(x, y, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname setops #' @importFrom dplyr setdiff #' @method setdiff tbl_pb #' @export @@ -293,7 +302,16 @@ setdiff.tbl_pb <- function(x, y, ...) { } -#' @rdname pibble_methods +#' Join two pibbles together +#' +#' These are generic functions that dispatch to individual pibble methods. See \link[dplyr]{join} for +#' complete documentation. +#' +#' @rdname join +#' @inheritParams dplyr::join +#' @name join.tbl_pb +NULL + #' @importFrom dplyr left_join #' @method left_join tbl_pb #' @export @@ -309,7 +327,7 @@ left_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y return(build_pibble(dplyr::left_join(x, y, by, copy, suffix, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr inner_join #' @method inner_join tbl_pb #' @export @@ -325,7 +343,7 @@ inner_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ". return(build_pibble(dplyr::inner_join(x, y, by, copy, suffix, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr right_join #' @method right_join tbl_pb #' @export @@ -341,7 +359,7 @@ right_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ". return(build_pibble(dplyr::right_join(x, y, by, copy, suffix, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr full_join #' @method full_join tbl_pb #' @export @@ -357,7 +375,7 @@ full_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y return(build_pibble(dplyr::full_join(x, y, by, copy, suffix, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr semi_join #' @method semi_join tbl_pb #' @export @@ -373,7 +391,7 @@ semi_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, ...) { return(build_pibble(dplyr::semi_join(x, y, by, copy, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr nest_join #' @method nest_join tbl_pb #' @export @@ -389,7 +407,7 @@ nest_join.tbl_pb <- function(x, y, by = NULL, copy = FALSE, keep = FALSE, name = return(build_pibble(dplyr::nest_join(x, y, by, copy, keep, name, ...), .i, .t, .d)) } -#' @rdname pibble_methods +#' @rdname join #' @importFrom dplyr anti_join #' @method anti_join tbl_pb #' @export diff --git a/README.Rmd b/README.Rmd index 3c3a1a7..a0dc90a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -22,7 +22,7 @@ knitr::opts_chunk$set( [![Codecov test coverage](https://codecov.io/gh/nickch-k/pmdplyr/branch/master/graph/badge.svg)](https://codecov.io/gh/nickch-k/pmdplyr?branch=master) -The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data. +The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `_join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you *analyze* panel data. 
Unlike other panel data packages, functions in `pmdplyr` are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country. @@ -38,7 +38,7 @@ devtools::install_github("NickCH-K/pmdplyr") ``` ## College Scorecard Example -Let's start with the fairly straightforward `Scorecard` data, which is uniquely identified by college ID `unitid` and year `year`, and which describes how well students who attended that college are doing years after attendance. +Let's start with the fairly straightforward `Scorecard` data, which describes how well students who attended that college are doing years after attendance. `Scorecard` observations are uniquely identified by college ID `unitid` and year `year`. ```{r} # Note that pmdplyr automatically loads dplyr as well @@ -59,17 +59,19 @@ unemp_data <- data.frame( I am interested in measuring the differences in ex-student earnings `earnings_med` between two-year and four-year colleges (`pred_degree_awarded_ipeds == 2` or `3`, respectively). But before we can do that we need to clean the data. ```{r} -Scorecard <- Scorecard %>% +Scorecard %>% # We want pred_degree_awarded_ipeds to be consistent within college. No changers! # So let's drop them by using fixed_check with .resolve = "drop" to lose inconsistencies - fixed_force(.var = pred_degree_awarded_ipeds, - .within = unitid, - .resolve = "drop") %>% + fixed_force( + .var = pred_degree_awarded_ipeds, + .within = unitid, + .resolve = "drop" + ) %>% # Then, get rid of pred_degree_awarded_ipeds == 1 # And simplify our terms - filter(pred_degree_awarded_ipeds %in% c(2,3)) %>% + filter(pred_degree_awarded_ipeds %in% c(2, 3)) %>% mutate(FourYear = pred_degree_awarded_ipeds == 3) %>% - # earnings_med has some missing values - let's fill them in with + # earnings_med has some missing values - let's fill them in with # the most recent nonmissing observations we have # - panel_locf respects the panel structure declared above with as_pibble() mutate(earnings_med = panel_locf(earnings_med)) %>% @@ -84,23 +86,24 @@ Scorecard <- Scorecard %>% # But that's okay! We just pick a .resolve function to handle disagreements. # (We could also do this straight in the regression model itself) mutate(lag_state_earnings = tlag(earnings_med, - .i = state_abbr, - .t = year, - .resolve = mean)) + .i = state_abbr, + .t = year, + .resolve = mean + )) -> scorecard_clean -# Now we can run a basic regression. +# Now we can run a basic regression. -summary(lm( - earnings_med ~ - FourYear + - unemp + +lm( + earnings_med ~ + FourYear + + unemp + lag_state_earnings, - data = Scorecard -)) - + data = scorecard_clean +) %>% + summary() ``` -We could even improve that code - why not run the `anti_join` and `inexact_left_join` using `safe_join`? When we do the `inexact_left_join`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year` - is it really? `safe_join` would check for us and minimize error. +We could even improve that code - why not run the `anti_join()` and `inexact_left_join()` using `safe_join()`? When we do the `inexact_left_join()`, for example, we're assuming that `unemp_data` is uniquely identified by `unemp_year`—is it really? `safe_join()` would check for us and minimize error. 
## Spanish Rail Example @@ -113,14 +116,14 @@ We have some difficulties to cover: making the ID and time variables behave, acc ```{r} data(SPrail) -SPrail <- SPrail %>% +SPrail %>% # We have two ID variables - origin and destination. # pmdplyr has no problem with this, but maybe we want to export # to something like plm later, which can't handle it. # So let's use id_variable to combine them into one mutate(route_ID = id_variable(origin, destination)) %>% # We have a time variable down to the minute. Too fine-grained! - # Let's back things up to the daily level, and + # Let's back things up to the daily level, and # create a nice integer time variable that's easy to use mutate(day = time_variable(insert_date, .method = "day")) %>% # Now we can declare a pibble @@ -128,18 +131,20 @@ SPrail <- SPrail %>% # We want to account for between-route differences in price, # so let's isolate the within variation mutate(price_w = within_i(price)) %>% - # We want to compare to the cheapo option, so let's use + # We want to compare to the cheapo option, so let's use # mutate_subset to get the average price of the cheapo option # and propogate that to the other options for comparison - mutate_subset(cheapo_price = mean(price, na.rm = TRUE), - .filter = train_class == "Turista con enlace") %>% + mutate_subset( + cheapo_price = mean(price, na.rm = TRUE), + .filter = train_class == "Turista con enlace" + ) %>% mutate(premium = price - cheapo_price) %>% filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>% # Now let's compare premia group_by(train_class) %>% - summarize(premium = mean(premium, na.rm = TRUE)) + summarize(premium = mean(premium, na.rm = TRUE)) -> sprail_compare_premia -SPrail +sprail_compare_premia ``` -And so there we have it - `Preferente` will really set you back relative to the cheapo ticket on the same route. +And so there we have it—`Preferente` will really set you back relative to the cheapo ticket on the same route. diff --git a/README.md b/README.md index 1109333..3315620 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ coverage](https://codecov.io/gh/nickch-k/pmdplyr/branch/master/graph/badge.svg)] The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the -`dplyr` `mutate` and `join` functions that address common panel data +`dplyr` `mutate` and `_join` functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy `pibble` panel data object, which you can `panel_convert()` for use in one of the many packages that help you @@ -45,10 +45,10 @@ devtools::install_github("NickCH-K/pmdplyr") ## College Scorecard Example -Let’s start with the fairly straightforward `Scorecard` data, which is -uniquely identified by college ID `unitid` and year `year`, and which +Let’s start with the fairly straightforward `Scorecard` data, which describes how well students who attended that college are doing years -after attendance. +after attendance. `Scorecard` observations are uniquely identified by +college ID `unitid` and year `year`. ``` r # Note that pmdplyr automatically loads dplyr as well @@ -72,17 +72,19 @@ I am interested in measuring the differences in ex-student earnings can do that we need to clean the data. ``` r -Scorecard <- Scorecard %>% +Scorecard %>% # We want pred_degree_awarded_ipeds to be consistent within college. No changers! 
# So let's drop them by using fixed_check with .resolve = "drop" to lose inconsistencies - fixed_force(.var = pred_degree_awarded_ipeds, - .within = unitid, - .resolve = "drop") %>% + fixed_force( + .var = pred_degree_awarded_ipeds, + .within = unitid, + .resolve = "drop" + ) %>% # Then, get rid of pred_degree_awarded_ipeds == 1 # And simplify our terms - filter(pred_degree_awarded_ipeds %in% c(2,3)) %>% + filter(pred_degree_awarded_ipeds %in% c(2, 3)) %>% mutate(FourYear = pred_degree_awarded_ipeds == 3) %>% - # earnings_med has some missing values - let's fill them in with + # earnings_med has some missing values - let's fill them in with # the most recent nonmissing observations we have # - panel_locf respects the panel structure declared above with as_pibble() mutate(earnings_med = panel_locf(earnings_med)) %>% @@ -97,23 +99,25 @@ Scorecard <- Scorecard %>% # But that's okay! We just pick a .resolve function to handle disagreements. # (We could also do this straight in the regression model itself) mutate(lag_state_earnings = tlag(earnings_med, - .i = state_abbr, - .t = year, - .resolve = mean)) + .i = state_abbr, + .t = year, + .resolve = mean + )) -> scorecard_clean -# Now we can run a basic regression. +# Now we can run a basic regression. -summary(lm( - earnings_med ~ - FourYear + - unemp + +lm( + earnings_med ~ + FourYear + + unemp + lag_state_earnings, - data = Scorecard -)) + data = scorecard_clean +) %>% + summary() #> #> Call: #> lm(formula = earnings_med ~ FourYear + unemp + lag_state_earnings, -#> data = Scorecard) +#> data = scorecard_clean) #> #> Residuals: #> Min 1Q Median 3Q Max @@ -134,10 +138,10 @@ summary(lm( #> F-statistic: 430.6 on 3 and 2474 DF, p-value: < 2.2e-16 ``` -We could even improve that code - why not run the `anti_join` and -`inexact_left_join` using `safe_join`? When we do the -`inexact_left_join`, for example, we’re assuming that `unemp_data` is -uniquely identified by `unemp_year` - is it really? `safe_join` would +We could even improve that code - why not run the `anti_join()` and +`inexact_left_join()` using `safe_join()`? When we do the +`inexact_left_join()`, for example, we’re assuming that `unemp_data` is +uniquely identified by `unemp_year`—is it really? `safe_join()` would check for us and minimize error. ## Spanish Rail Example @@ -161,14 +165,14 @@ how to compare each price to the cheapo price. ``` r data(SPrail) -SPrail <- SPrail %>% +SPrail %>% # We have two ID variables - origin and destination. # pmdplyr has no problem with this, but maybe we want to export # to something like plm later, which can't handle it. # So let's use id_variable to combine them into one mutate(route_ID = id_variable(origin, destination)) %>% # We have a time variable down to the minute. Too fine-grained! 
- # Let's back things up to the daily level, and + # Let's back things up to the daily level, and # create a nice integer time variable that's easy to use mutate(day = time_variable(insert_date, .method = "day")) %>% # Now we can declare a pibble @@ -176,18 +180,20 @@ SPrail <- SPrail %>% # We want to account for between-route differences in price, # so let's isolate the within variation mutate(price_w = within_i(price)) %>% - # We want to compare to the cheapo option, so let's use + # We want to compare to the cheapo option, so let's use # mutate_subset to get the average price of the cheapo option # and propogate that to the other options for comparison - mutate_subset(cheapo_price = mean(price, na.rm = TRUE), - .filter = train_class == "Turista con enlace") %>% + mutate_subset( + cheapo_price = mean(price, na.rm = TRUE), + .filter = train_class == "Turista con enlace" + ) %>% mutate(premium = price - cheapo_price) %>% filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>% # Now let's compare premia group_by(train_class) %>% - summarize(premium = mean(premium, na.rm = TRUE)) + summarize(premium = mean(premium, na.rm = TRUE)) -> sprail_compare_premia -SPrail +sprail_compare_premia #> # A tibble: 3 x 2 #> train_class premium #> @@ -196,5 +202,5 @@ SPrail #> 3 Turista Plus 14.4 ``` -And so there we have it - `Preferente` will really set you back relative +And so there we have it—`Preferente` will really set you back relative to the cheapo ticket on the same route. diff --git a/_pkgdown.yml b/_pkgdown.yml index ee2f507..4e57280 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -23,7 +23,7 @@ reference: - fixed_check - fixed_force -- title: Joins +- title: Two-table verbs contents: - inexact_inner_join - inexact_left_join @@ -33,6 +33,8 @@ reference: - inexact_nest_join - inexact_anti_join - safe_join + - join.tbl_pb + - setops - title: Mutate variations contents: diff --git a/docs/articles/pmdplyr.html b/docs/articles/pmdplyr.html index fff6eab..26ff6a7 100644 --- a/docs/articles/pmdplyr.html +++ b/docs/articles/pmdplyr.html @@ -73,7 +73,7 @@

pmdplyr: Panel Maneuvers in dplyr

Nick Huntington-Klein, Philip Khor

-2019-08-10
+2019-08-12

Source: vignettes/pmdplyr.Rmd

library(pmdplyr)
-The pmdplyr package is an extension to dplyr designed for cleaning and managing panel and hierarchical data. It contains variations on the dplyr mutate and join functions that address common panel data needs, and contains functions for managing and cleaning panel data.
+The pmdplyr package is an extension to dplyr designed for cleaning and managing panel and hierarchical data. It contains variations on the dplyr::mutate() and dplyr::join() functions that address common panel data needs, and contains functions for managing and cleaning panel data.

Unlike other panel data packages, functions in pmdplyr are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country.

pmdplyr contains a long list of functions for working with panel data, described below.


@@ -98,42 +98,49 @@

pibble() and as_pibble()

pibbles can be declared in two main ways: raw, via pibble():

+ .i = NULL, + .t = NULL, + .d = 1, + .uniqcheck = FALSE +)

or by transforming an existing data.frame, list, or tbl_df using as_pibble():

-

Both functions work exactly as tibble::tibble and tibble::as_tibble do, except that they also take the arguments .i, .t, and .d, with .i and .t accepting either unquoted or quoted variable names. If you’d like your pibble checked to see if .i and .t uniquely identify your observations, set .uniqcheck = TRUE. It will do this automatically the first time in each R session you create a pibble, but if you’d like it to keep doing it, use uniqcheck.

+ .i = NULL, + .t = NULL, + .d = 1, + .uniqcheck = FALSE, + ... +) +

Both functions work exactly as tibble::tibble() and tibble::as_tibble() do, except that they also take the arguments .i, .t, and .d, with .i and .t accepting either unquoted or quoted variable names. If you’d like your pibble checked to see if .i and .t uniquely identify your observations, set .uniqcheck = TRUE. It will do this automatically the first time in each R session you create a pibble, but if you’d like it to keep doing the check, set .uniqcheck = TRUE each time.

As a side bonus, you can check if the variables a, b, c uniquely identify the observations in data set d by running as_pibble(d, .i = c(a, b, c), .uniqcheck = TRUE). No warning? It’s uniquely identified!
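A quick hedged sketch of that trick, using the Scorecard data that ships with pmdplyr (unitid and year are its college ID and year variables):

data(Scorecard)
# Runs silently if unitid and year uniquely identify the rows;
# warns about duplicates if they do not
as_pibble(Scorecard, .i = unitid, .t = year, .uniqcheck = TRUE)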

+basic_pibble <- pibble( + a = c(1, 1, 1, 2, 2, 2), + b = c(1, 2, 3, 2, 3, 3), + c = 1:6, + .i = a, + .t = b +) + +data(SPrail) +# In SPrail, insert_date does not imply regular gaps between +# time periods, so we set .d = 0 +declared_pibble <- as_pibble(SPrail, + .i = c(origin, destination), + .t = insert_date, + .d = 0 +)

panel_convert()

pmdplyr also has the function panel_convert() which allows you to convert between different popular R panel data objects, including pibble. This can come in handy for creating pibbles, or exporting your cleaned pibble to use with a package that does panel data analysis (which pmdplyr does not):

panel_convert(data, to, ...)

Where data is a panel data object, either pibble, tsibble, pdata.frame, or panel_data, and to is the type of object you’d like returned, which you can refer to by object name, object class, or package name: get a pibble with "pmdplyr", "pibble", or "tbl_pb", a tsibble with "tsibble" or "tbl_ts", a pdata.frame with "plm" or "pdata.frame", or a panel_data with "panelr" or "panel_data". ... sends additional arguments to the functions used to declare those objects.
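For instance, a minimal sketch (assuming the plm package is installed; the objects here are made up for illustration):

pd <- pibble(
  i = c(1, 1, 2, 2),
  t = c(1, 2, 1, 2),
  x = 1:4,
  .i = i,
  .t = t
)

# Send the pibble to plm's pdata.frame format for analysis...
pd_plm <- panel_convert(pd, to = "plm")
# ...and convert it back to a pibble afterwards
pd_back <- panel_convert(pd_plm, to = "pibble")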

When using panel_convert(), be aware that any grouping will be lost, and you must have the relevant package for your to option installed (tsibble, plm, or panelr). When your data object is a pdata.frame, it is recommended to also have sjlabelled installed.

All valid objects of the non-pibble types can be converted to pibbles, but the reverse is not true, since pibble does not enforce some strict requirements that other types do:

@@ -215,8 +222,9 @@

id_variable()

id_variable() syntax follows:

+ .method = "number", + .minwidth = FALSE +)

where ... is the set of identity variables that you want to combine into a single one (or, potentially, a single variable you’d like to encode numerically).

.method describes the way in which you’d like the variable encoded:

    @@ -227,52 +235,61 @@

  • .method = character preserves all original information and combines the variables together into a string, adding spacing to ensure uniqueness. Set .minwidth = TRUE to remove the spacing, although this may lead to non-uniqueness in some cases.
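A small hedged sketch of the default "number" encoding and the "character" encoding described above (ids and its columns are made up for illustration):

ids <- data.frame(
  origin = c("Madrid", "Madrid", "Sevilla"),
  destination = c("Sevilla", "Valencia", "Madrid")
)

ids %>%
  dplyr::mutate(
    route_number = id_variable(origin, destination),
    route_character = id_variable(origin, destination, .method = "character")
  )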
time_variable()

Where ... is the set of variables that you want to combine into a single, integer-class time variable. The rest of the options determine how the variable(s) will be read or transformed; the need for each varies depending on the structure of the original data and which .method is used.

.method can take the values:

  • -.method="present" will assume that, even if each individual may have some missing periods, each period is present in your data somewhere, and so simply numbers, in order, all the time periods observed in the data.
  • +.method = "present" will assume that, even if each individual may have some missing periods, each period is present in your data somewhere, and so simply numbers, in order, all the time periods observed in the data.
  • -.method="year" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()) and will extract the year from it. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year in YY or YYYY format. If combined with .breaks or .skip, will instead set the earliest year in the data to 1 rather than returning the actual year.
  • +.method = "year" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()) and will extract the year from it. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year in YY or YYYY format. If combined with .breaks or .skip, will instead set the earliest year in the data to 1 rather than returning the actual year.
  • -.method="month" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed month in the data set a value of 1, and will increment from there. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year and month in YYMM or YYYYMM format (note that if your variable is in MMYYYY format, for example, you can just give a .datepos argument like c(3:6,1:2)). Months turn over on the .start day of the month, which is by default 1.
  • +.method = "month" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed month in the data set a value of 1, and will increment from there. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year and month in YYMM or YYYYMM format (note that if your variable is in MMYYYY format, for example, you can just give a .datepos argument like c(3:6,1:2)). Months turn over on the .start day of the month, which is by default 1.
  • -.method="week" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed week in the data set a value of 1, and will increment from there. Weeks turn over on the .start day, which is by default 1 (Monday). Note that this method always starts weeks on the same day of the week, which is different from standard lubridate procedure of counting sets of 7 days starting from January 1.
  • +.method = "week" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed week in the data set a value of 1, and will increment from there. Weeks turn over on the .start day, which is by default 1 (Monday). Note that this method always starts weeks on the same day of the week, which is different from standard lubridate procedure of counting sets of 7 days starting from January 1.
  • -.method="day" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed day in the data set a value of 1, and increment from there. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year and month in YYMMDD or YYYYMMDD format. To skip certain days of the week, such as weekends, use the .skip option.
  • +.method = "day" can be used with a single Date/POSIX/etc.-type variable (anything that allows lubridate::date()). It will give the earliest-observed day in the data set a value of 1, and increment from there. Or, use it with a character or numeric variable and indicate with .datepos the character/digit positions that hold the year and month in YYMMDD or YYYYMMDD format. To skip certain days of the week, such as weekends, use the .skip option.
  • -.method="turnover" can be used when you have more than one variable in variable and they are all numeric nonnegative integers. Set the .turnover option to indicate the highest value each variable takes before it starts over, and set .turnover_start to indicate what value it takes when it starts over. Cannot be combined with .skip or .breaks. Doesn’t work with any variable for which the turnover values change, i.e. it doesn’t play well with days-in-month - if you’d like to do something like year-month-day-hour, I recommend running .method="day" once with just the year-month-day variable, and then taking the result and combining that with hour in .method="turnover".
  • +.method = "turnover" can be used when you have more than one variable in variable and they are all numeric nonnegative integers. Set the .turnover option to indicate the highest value each variable takes before it starts over, and set .turnover_start to indicate what value it takes when it starts over. Cannot be combined with .skip or .breaks. Doesn’t work with any variable for which the turnover values change, i.e. it doesn’t play well with days-in-month - if you’d like to do something like year-month-day-hour, I recommend running .method = "day" once with just the year-month-day variable, and then taking the result and combining that with hour in .method = "turnover".
data(SPrail)
 
@@ -288,67 +305,71 @@ 

) # Let's see what we've got -head(SPrail %>% select(insert_date, ends_with("time_id"))) -#> # A tibble: 6 x 5 -#> insert_date year_time_id month_time_id week_time_id day_time_id -#> <dttm> <int> <int> <int> <int> -#> 1 2019-04-12 20:17:04 2019 1 1 2 -#> 2 2019-04-16 09:33:08 2019 1 2 6 -#> 3 2019-05-08 09:04:07 2019 2 5 28 -#> 4 2019-04-16 06:21:42 2019 1 2 6 -#> 5 2019-05-02 07:03:34 2019 2 4 22 -#> 6 2019-04-13 06:03:43 2019 1 1 3 - -# Perhaps I'd like quarterly data -# (although in this case there are only two months, not much variation there) -SPrail <- SPrail %>% - dplyr::mutate(quarter_time_id = time_variable(insert_date, - .method = "month", - .breaks = c(1, 4, 7, 10) - )) -# Should line up properly with month -table(SPrail$month_time_id, SPrail$quarter_time_id, dnn = c('Month', 'Quarter')) -#> Quarter -#> Month 1 -#> 1 1633 -#> 2 367 - -# Maybe I'd like Monday to come immediately after Friday! -SPrail <- SPrail %>% - dplyr::mutate(weekday_time_id = time_variable(insert_date, - .method = "day", - .skip = c(6, 7) - )) - -# Perhaps I'm interested in ANY time period in the data and just want to enumerate them in order -SPrail <- SPrail %>% - dplyr::mutate(any_present_time_id = time_variable(insert_date, - .method = "present" - )) - -# Note the weekday_time_id NAs - these are weekends! We told it to skip those. -head(SPrail %>% select(insert_date, day_time_id, weekday_time_id, any_present_time_id)) -#> # A tibble: 6 x 4 -#> insert_date day_time_id weekday_time_id any_present_time_id -#> <dttm> <int> <int> <int> -#> 1 2019-04-12 20:17:04 2 NA 96 -#> 2 2019-04-16 09:33:08 6 4 461 -#> 3 2019-05-08 09:04:07 28 20 1899 -#> 4 2019-04-16 06:21:42 6 4 446 -#> 5 2019-05-02 07:03:34 22 16 1670 -#> 6 2019-04-13 06:03:43 3 NA 130 - -# Maybe instead of being given a nice time variable, I was given it in string form -SPrail <- SPrail %>% dplyr::mutate(time_string = as.character(insert_date)) -# As long as the character positions are consistent we can still use it -SPrail <- SPrail %>% - dplyr::mutate(day_from_string_id = time_variable(time_string, - .method = "day", - .datepos = c(3, 4, 6, 7, 9, 10) - )) -# Results are identical from using the actual Date variable -cor(SPrail$day_time_id, SPrail$day_from_string_id) -#> [1] 1

+SPrail %>% + select(insert_date, ends_with("time_id")) %>% + head() +#> # A tibble: 6 x 5 +#> insert_date year_time_id month_time_id week_time_id day_time_id +#> <dttm> <int> <int> <int> <int> +#> 1 2019-04-12 20:17:04 2019 1 1 2 +#> 2 2019-04-16 09:33:08 2019 1 2 6 +#> 3 2019-05-08 09:04:07 2019 2 5 28 +#> 4 2019-04-16 06:21:42 2019 1 2 6 +#> 5 2019-05-02 07:03:34 2019 2 4 22 +#> 6 2019-04-13 06:03:43 2019 1 1 3 + +# Perhaps I'd like quarterly data +# (although in this case there are only two months, not much variation there) +SPrail <- SPrail %>% + dplyr::mutate(quarter_time_id = time_variable(insert_date, + .method = "month", + .breaks = c(1, 4, 7, 10) + )) +# Should line up properly with month +SPrail %>% + count(month_time_id, quarter_time_id) +#> # A tibble: 2 x 3 +#> month_time_id quarter_time_id n +#> <int> <int> <int> +#> 1 1 1 1633 +#> 2 2 1 367 + +# Maybe I'd like Monday to come immediately after Friday! +SPrail <- SPrail %>% + dplyr::mutate(weekday_time_id = time_variable(insert_date, + .method = "day", + .skip = c(6, 7) + )) + +# Perhaps I'm interested in ANY time period in the data and just want to enumerate them in order +SPrail <- SPrail %>% + dplyr::mutate(any_present_time_id = time_variable(insert_date, + .method = "present" + )) + +# Note the weekday_time_id NAs - these are weekends! We told it to skip those. +head(SPrail %>% select(insert_date, day_time_id, weekday_time_id, any_present_time_id)) +#> # A tibble: 6 x 4 +#> insert_date day_time_id weekday_time_id any_present_time_id +#> <dttm> <int> <int> <int> +#> 1 2019-04-12 20:17:04 2 NA 96 +#> 2 2019-04-16 09:33:08 6 4 461 +#> 3 2019-05-08 09:04:07 28 20 1899 +#> 4 2019-04-16 06:21:42 6 4 446 +#> 5 2019-05-02 07:03:34 22 16 1670 +#> 6 2019-04-13 06:03:43 3 NA 130 + +# Maybe instead of being given a nice time variable, I was given it in string form +SPrail <- SPrail %>% dplyr::mutate(time_string = as.character(insert_date)) +# As long as the character positions are consistent we can still use it +SPrail <- SPrail %>% + dplyr::mutate(day_from_string_id = time_variable(time_string, + .method = "day", + .datepos = c(3, 4, 6, 7, 9, 10) + )) +# Results are identical from using the actual Date variable +cor(SPrail$day_time_id, SPrail$day_from_string_id) +#> [1] 1
@@ -362,40 +383,43 @@

panel_fill()

panel_fill() will fill in gaps between time periods for individuals. For example, if person 1 has observations in period 1 and period 3, but not period 2, then panel_fill() will add an observation to the data for person 1 in time period 2. If there is more than one observation for person 1 in period 1, then all of them will be copied for period 2.

+ .set_NA = FALSE, + .min = NA, + .max = NA, + .backwards = FALSE, + .group_i = TRUE, + .flag = NA, + .i = NULL, + .t = NULL, + .d = 1, + .uniqcheck = FALSE, + .setpanel = TRUE +)

panel_fill() will give us some newly-created observations, and we need to decide what to fill them in with. By default, it will fill in values using what we see in the most recent non-missing observation. But we can set .backwards = TRUE to use the next non-missing observation instead, or use .set_NA to fill the new observations with missing data.

.set_NA is a character vector of variable names that should be set to NA for newly-created observations, or set to TRUE to set everything except .i and .t to NA. You can also create a new variable indicating which observations are newly-created with .flag.

+df <- pibble( + i = c(1, 1, 1, 2, 2, 2), + t = c(2, 4, 5, 1, 2, 3), + x = 1:6, + y = 7:12, + .i = i, + .t = t +) + +panel_fill(df, .set_NA = "y", .flag = "new_obs") +#> # A pibble: 7 x 5 +#> i t x y new_obs +#> <dbl> <dbl> <int> <int> <lgl> +#> 1 1 2 1 7 FALSE +#> 2 1 3 1 NA TRUE +#> 3 1 4 2 8 FALSE +#> 4 1 5 3 9 FALSE +#> 5 2 1 4 10 FALSE +#> 6 2 2 5 11 FALSE +#> 7 2 3 6 12 FALSE +panel_fill(df, .set_NA = "y", .backwards = TRUE)$x +#> [1] 1 2 2 3 4 5 6

By default, panel_fill() will only fill in gaps between existing observations. However, commonly we might want to create new observations outside of the existing range, perhaps to create a fully balanced panel for ourselves. .min and .max will ensure that each individual has observations at least as far back as .min, and at least as far out as .max. Set .min = min(t) and .max = max(t) (where t is your time variable) to ensure a fully balanced panel.

Data for the outside-the-observed-range values will be taken from the closest observed value.

panel_fill(df, .min = min(df$t), .max = max(df$t))
@@ -419,74 +443,83 @@ 

panel_locf()

panel_locf() (“last observation carried forward”) will fill in explicit NA values using recently available data. It is very similar to zoo::na.locf() except that it respects panel structure and is more flexible.

+ .df = get(".", envir = parent.frame()), + .fill = NA, + .backwards = FALSE, + .resolve = "error", + .group_i = TRUE, + .i = NULL, + .t = NULL, + .d = 1, + .uniqcheck = FALSE +)

where .var is the variable to be filled in, and .df is the data set that variable lives in. If the data set is being passed in via %>%, then .df will automatically pick it up and you don’t need to specify it.


You have a fair amount of control over how filling-in works. By default, data will be filled in using the most recent previous observation. But .backwards = TRUE will use the next upcoming observation instead. Also, by default, only NA values will be overwritten. But .fill will allow you to specify a vector of values (perhaps including NA) to be overwritten. This can be handy if you’re working with data that uses missingness indicators other than NA.


panel_locf() will work even if .i and .t don’t uniquely identify the observations. However, this presents a problem! If there are different values of .var for a given combination of .i and .t, then which value do we choose to use for the purpose of filling in other observations? .resolve makes this choice. By default, there will be an “error” if values of .var are inconsistent within .i and .t. Or, set .resolve to a summary function like .resolve = mean or .resolve = function(x) mean(x, na.rm = TRUE) to resolve inconsistencies before filling in. If you have some .i/.t combinations with both missing and non-missing values, the missing values will be filled in using the same function.
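A hedged sketch of these options, using a small made-up pibble in which -99 also marks missingness:

locf_df <- pibble(
  i = c(1, 1, 1, 2, 2, 2),
  t = c(1, 2, 3, 1, 2, 3),
  x = c(1, NA, 3, -99, 5, NA),
  .i = i,
  .t = t
)

locf_df %>%
  dplyr::mutate(
    x_fwd = panel_locf(x),                    # fill NA from the most recent value
    x_bwd = panel_locf(x, .backwards = TRUE), # fill from the next value instead
    x_all = panel_locf(x, .fill = c(NA, -99)) # treat -99 as missing too
  )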


The rest of the options include .group_i (by default, if .i can be found, data will be filled within-individual. Set .group_i = FALSE to ignore this), and standard arguments related to declaring the panel structure of the data (.i, .t, .d, .uniqcheck, see the “pibble” section above).


@@ -496,25 +529,28 @@

Panel Consistency

In panel data, and especially hierarchical data, there are some variables that should be fixed within values of other variables. And if they’re not, you have a problem!

For example, consider the data set

-
df <- data.frame(continent = c("Asia", "Europe", "Europe", "S America", "S America"),
-           country = c("France", "France", "France", "Brazil", "Brazil"),
-           year = c(2000, 2001, 2002, 2000, 2001))
-
-df
-#>   continent country year
-#> 1      Asia  France 2000
-#> 2    Europe  France 2001
-#> 3    Europe  France 2002
-#> 4 S America  Brazil 2000
-#> 5 S America  Brazil 2001
+
df <- data.frame(
+  continent = c("Asia", "Europe", "Europe", "S America", "S America"),
+  country = c("France", "France", "France", "Brazil", "Brazil"),
+  year = c(2000, 2001, 2002, 2000, 2001)
+)
+
+df
+#>   continent country year
+#> 1      Asia  France 2000
+#> 2    Europe  France 2001
+#> 3    Europe  France 2002
+#> 4 S America  Brazil 2000
+#> 5 S America  Brazil 2001

The variable continent should never change within values of country - a country can’t change the continent it’s on! The fact that France changes continents from year to year in this data should be regarded as very fishy. It will be handy to spot these sorts of potential errors in your data set, and fix them if you think you know how.

fixed_check()

fixed_check() will look in your data .df for inconsistencies in the value of some variables .var within values of other variables .within.

+ .var = NULL, + .within = NULL +)

You should pick variables for .var that are supposed to be constant within combinations of .within.

If your data has problems and is inconsistent, fixed_check() will return a list of data sets, one for each .var variable, containing the subset of the data that gives you problems. For our df with the France problem, that’s all of the France observations!

fixed_check(df, .var = continent, .within = country)$continent
@@ -524,17 +560,19 @@ 

#> 1 Asia France 2000 #> 2 Europe France 2001 #> 3 Europe France 2002

-

If your data is fine, and all .var variables are indeed constant within combinations of .within, then fixed_check() will return TRUE.

-
consistent_df <- data.frame(state = c(1, 1, 1, 2, 2, 2),
-                            year = c(2000, 2001, 2001, 2000, 2000, 2001),
-                            treatment = c(F, T, T, T, T, F),
-                            outcome = c(4.4, 3.2, 3.4, 5.5, 5.6, 8))
-
-# Since this policy treatment is administered on the state level,
-# everyone in the same state/year should get the same treatment.
-# And they do!
-fixed_check(consistent_df, .var = treatment, .within = c(state, year))
-#> [1] TRUE
+

If your data is fine, and all .var variables are indeed constant within combinations of .within, then fixed_check() will return TRUE.

+
consistent_df <- data.frame(
+  state = c(1, 1, 1, 2, 2, 2),
+  year = c(2000, 2001, 2001, 2000, 2000, 2001),
+  treatment = c(F, T, T, T, T, F),
+  outcome = c(4.4, 3.2, 3.4, 5.5, 5.6, 8)
+)
+
+# Since this policy treatment is administered on the state level,
+# everyone in the same state/year should get the same treatment.
+# And they do!
+fixed_check(consistent_df, .var = treatment, .within = c(state, year))
+#> [1] TRUE

Some handy fixed_check() tips:

  1. @@ -547,10 +585,11 @@

    fixed_force()

    fixed_force() will take a data set .df, find any inconsistencies in the variables .var within combinations of the variables .within, and will “fix” those inconsistencies, using the function .resolve to select the correct values. It will flag any changed values with a new variable named .flag.

    + .var = NULL, + .within = NULL, + .resolve = mode_order, + .flag = NA +)

The default resolution function is mode_order() (see the Additional Calculations section), which calculates the mode, selecting the first-ordered value in the data if there are ties. The mode seems most relevant here, since the most likely (and responsible) use for fixed_force() is when you have data that is mostly correct but just has a few odd values that are likely miscodes. mode_order() is also not limited to numeric variables.

    Continuing with our France-in-Asia data set,

    fixed_force(df, .var = continent, .within = country, .flag = "altered")
    @@ -575,45 +614,61 @@ 

    Joins

    -

    pmdplyr offers a set of wrappers for the dplyr::join functions.

    +

    pmdplyr offers a set of wrappers for the dplyr::join() functions.

    -inexact_join

    -

    The set of inexact_join functions maps directly onto the set of dplyr::join functions: inexact_inner_join, inexact_left_join, inexact_right_join, inexact_full_join, inexact_semi_join, inexact_nest_join, and inexact_anti_join.

    -

    Here we will focus specifically on inexact_left_join; for the differences between the functions see the descriptions of the original join functions at help(join, package = "dplyr").

    -

    join functions take two data sets and join them based on matching values of a set of shared variables.

    - +inexact_join()

    +

    The set of inexact_join() functions maps directly onto the set of dplyr::join() functions:

inexact_inner_join(), inexact_left_join(), inexact_right_join(), inexact_full_join(), inexact_semi_join(), inexact_nest_join(), and inexact_anti_join().

    Here we will focus specifically on inexact_left_join(). For the differences between the functions, see dplyr::join().

    +

    join() functions take two data sets and join them based on matching values of a set of shared variables.

    +

    However, it is common (especially in a panel data context) to want to join two data frames where one of the variables does not line up exactly. For example, maybe we want those t = 1 values in left_df to pick up the t = 0 values in right_df.
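(The code that built the example data did not survive this rendering; judging from the joined output shown further down, the two data sets are presumably along these lines:)

left_df <- data.frame(
  i = c(1, 1, 1, 2, 2, 2),
  t = c(1, 2, 3, 1, 2, 3),
  v1 = 1:6
)
right_df <- data.frame(
  i = c(1, 1, 1, 2, 2, 2),
  t = c(0, 2, 4, 0, 2, 4),
  v2 = 7:12
)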

    -

    We can do this, in a few different ways with an inexact_join:

    +

    We can do this, in a few different ways with an inexact_join():

    -

    The first arguments: x, y, by, copy, suffix, ..., are standard arguments to be passed to left_join. x and y are our left-hand and right-hand data sets, respectively. See help(left_join, package = "dplyr") for the rest.

    + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + var = NULL, + jvar = NULL, + method, + exact = TRUE +)
    +

    The first arguments: x, y, by, copy, suffix, ..., are standard arguments to be passed to left_join(). x and y are our left-hand and right-hand data sets, respectively. See dplyr::left_join() for the rest.

    We’ve added on here var, jvar, method, and exact.

    var is the variable in the left-hand data set that you would like to match inexactly on, and jvar is the variable(s) in the right-hand data set that you would like to match inexactly on. It’s important that the names of these variables aren’t shared, because the resulting data set will show how var and jvar line up. So let’s prepare our data by renaming t in right_df to something else so it’s not t in both data sets.

    right_df <- right_df %>%
    @@ -623,47 +678,50 @@ 

  2. method = "last" matches var to the closest value of jvar that is lower, so those t = 1 observations will get matched to t_right = 0, and t = 3 will get matched to t_right = 2 (meaning that t_right = 2 will get matched to both t = 2 and t = 3):
  3. - +
    • method = "next" matches var to the closest value of jvar that is higher, so now t = 1 will get matched to t_right = 2, and t = 3 will get matched to t_right = 4:
    - +
    • method = "closest" will match var to the closest value of jvar in either direction. If there’s a tie, it will pick the lower value of jvar. So now t = 1 will pick t_right = 0 (out of a tie between 0 and 2), and t = 3 will match to t = 2:
    - +
    • Finally, method = "between" is for matching var to a set of two jvars that define the beginning and end of a range. Make sure that the ranges are non-overlapping within the joining variables, or else you will get strange results (specifically, it should join to the earliest-starting range). So now, given the way we define t_bottom and t_top below, t = 1 should go in the range t_bottom = 0, t_top = 2, and t = 2 and t = 3 should both go in the range t_bottom = 2, t_top = 4.
    @@ -671,31 +729,33 @@

    rename(t_bottom = t_right) %>% mutate(t_top = t_bottom + 2) -inexact_left_join(left_df, - right_df, - var = t, jvar = c(t_bottom, t_top), - method = "between") -#> i t v1 t_bottom v2 t_top -#> 1 1 1 1 0 7 2 -#> 2 1 2 2 2 8 4 -#> 3 1 3 3 2 8 4 -#> 4 2 1 4 0 10 2 -#> 5 2 2 5 2 11 4 -#> 6 2 3 6 2 11 4

    +inexact_left_join(left_df, + right_df, + var = t, jvar = c(t_bottom, t_top), + method = "between" +) +#> i t v1 t_bottom v2 t_top +#> 1 1 1 1 0 7 2 +#> 2 1 2 2 2 8 4 +#> 3 1 3 3 2 8 4 +#> 4 2 1 4 0 10 2 +#> 5 2 2 5 2 11 4 +#> 6 2 3 6 2 11 4

    So that leaves us with exact. exact determines whether or not an exact match is an acceptable match, and interprets "last" as “this value or earlier” and "next" as “this value or later”. Generally, for joining purposes, you’ll want this to be TRUE. But perhaps you don’t! Maybe you want “earlier” or “later” only to get something like “the most recent previous value” for method = "last". In that case, set this to FALSE.

    -

    In the case of method = "between", it’s especially important to keep track of exact because it’s common for one range to start at the exact endpoint of another. If the end of one range is the exact start of another, exact = c(TRUE,FALSE) or exact = c(FALSE,TRUE) is recommended to avoid overlaps. Defaults to exact = c(TRUE,FALSE).

    +

    In the case of method = "between", it’s especially important to keep track of exact because it’s common for one range to start at the exact endpoint of another. If the end of one range is the exact start of another, exact = c(TRUE, FALSE) or exact = c(FALSE, TRUE) is recommended to avoid overlaps. Defaults to exact = c(TRUE, FALSE).
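A hedged sketch of the single-variable methods, using a fresh hypothetical right-hand table (right_single stands in for right_df, which was reshaped above):

right_single <- data.frame(
  i = c(1, 1, 1, 2, 2, 2),
  t_right = c(0, 2, 4, 0, 2, 4),
  v2 = 7:12
)

# method = "last": t = 1 matches t_right = 0, and t = 3 matches t_right = 2
inexact_left_join(left_df, right_single,
  var = t, jvar = t_right,
  method = "last"
)

# method = "next": t = 1 matches t_right = 2, and t = 3 matches t_right = 4
inexact_left_join(left_df, right_single,
  var = t, jvar = t_right,
  method = "next"
)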

    safe_join()

    When joining two data sets x and y on a set of shared variables by, there are four ways in which they can be matched: one-to-many (by uniquely identifies rows in x but not y, so each observation in x will be matched to several in y), many-to-one (by uniquely identifies rows in y but not x, so each observation in y will be matched to several in x), one-to-one (by uniquely identifies rows in both x and y, so each observation in x will be matched to exactly one in y), and many-to-many (by does not uniquely identify rows in either x or y).

    -

    Unfortunately, when you perform a join or inexact_join, it doesn’t tell you which of those you’ve just done! This can be especially problematic if you’ve accidentally done a many-to-many join, since many-to-many join often leads to unexpected results.

    -

    safe_join() is a wrapper for all join and inexact_join functions which tells you whether you are, in fact, doing the join you expect to be doing, and returns an error if you’re not.

    +

    Unfortunately, when you perform a join() or inexact_join(), it doesn’t tell you which of those you’ve just done! This can be especially problematic if you’ve accidentally done a many-to-many join, since many-to-many join often leads to unexpected results.

    +

    safe_join() is a wrapper for all join() and inexact_join() functions which tells you whether you are, in fact, doing the join you expect to be doing, and returns an error if you’re not.

    -

    x, y, and ... are the standard join/inexact_join arguments that you would normally use. See help(join, package = "dplyr") or the inexact_join section above to see what arguments might go in ... to pass through to those functions, such as suffix or var.

    + expect = NULL, + join = NULL, + ... +)
    +

    x, y, and ... are the standard join()/inexact_join() arguments that you would normally use. See help(join, package = "dplyr") or the inexact_join section above to see what arguments might go in ... to pass through to those functions, such as suffix or var.

    expect is a character variable where you specify the type of join you think you’re about to do. You can specify this either as one-to-many / many-to-one / one-to-one directly, or you can specify which of the two data sets (x or y) you think should be uniquely identified by the joining variables.

    • @@ -708,10 +768,10 @@

      expect = "no m:m" indicates that you don’t care whether you’re one-to-one, one-to-many, or many-to-one, as long as you’re not many-to-many.

    • There is no expect option that allows you to run a many-to-many join.
    -

    safe_join will return an error if your data do not match your expect selection.

    +

    safe_join() will return an error if your data do not match your expect selection.

    If your data does match your expect option, then it will look to your join. join is the function for the join or inexact_join you’d like to run, for example join = inexact_left_join.

    If run without a join specified, safe_join() will return TRUE if you’re good to go. If run with a join specified, then instead safe_join() will pass your data on to the function and actually run the join for you.

    -

    There is little reason to run any join or inexact_join without going through safe_join(). It will help you avoid some nasty surprises!

    +

    There is little reason to run any join() or inexact_join() without going through safe_join(). It will help you avoid some nasty surprises!

    # left is panel data and i does not uniquely identify observations
     left <- data.frame(
       i = c(1, 1, 2, 2),
    @@ -760,38 +820,41 @@ 

    If you only have a few such comparisons to make, mutate_subset() lets you make them without fully widening the data. Just make a “value at the beginning of the sample” variable, if that’s all you need, without having to bother fully widening.

    Another common use is to make specific comparisons within groups. If I want to know how your earnings compare to the average earnings in your state, I can just do a within_i() calculation (see Additional Calculations section). But what if I want to know how your earnings compare to the average earnings of college graduates in your state? That’s harder. But mutate_subset() makes it easy.

    + ..., + .filter, + .group_i = TRUE, + .i = NULL, + .t = NULL, + .d = NA, + .uniqcheck = FALSE, + .setpanel = TRUE +)

    where .df is the data set being mutated and ... is a set of name-value pairs of expressions in the style of dplyr::mutate. Note that, since the idea here is to get a summary measure from a filtered group, expressions should be written such that they would be valid arguments in dplyr::summarize().

    .filter is a logical condition that describes the observations that you want to perform the ... calculations on.

    Let’s perform the analysis we described above, comparing an individual’s earnings to the average earnings of college graduates in their state:

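(The original code block is missing here; a hedged sketch, where df, earnings, college_grad, and state are all hypothetical names:)

df %>%
  mutate_subset(
    grad_earnings = mean(earnings, na.rm = TRUE), # a summarize-style expression
    .filter = college_grad == TRUE,               # computed among graduates only
    .i = state
  ) %>%
  dplyr::mutate(earnings_gap = earnings - grad_earnings)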

The rest of the options include .group_i (by default, if .i can be found, analysis will be performed within-individual. Set .group_i = FALSE to ignore this), and standard arguments related to declaring the panel structure of the data (.i, .t, .d, .uniqcheck, see the “pibble” section above). The .d = NA will become .d = 1 if either .i or .t are declared. .setpanel ensures that if you declare the panel structure in the mutate_subset() call, it will be maintained in the object you get back.

    @@ -801,24 +864,27 @@

    In effect, you can think of mutate_cascade() as behaving much like cumsum(), cumprod(), cummax() or cummin(), except that it (1) respects the panel structure of the data, (2) works when you have multiple observations per .i/.t, (3) is much more flexible, and (4) is much slower.

    As of this writing mutate_cascade() is pretty darn slow (after all, if you have T time periods, you’re running T separate mutate commands in a loop!), so be careful in using it.

    + ..., + .skip = TRUE, + .backwards = FALSE, + .group_i = TRUE, + .i = NULL, + .t = NULL, + .d = NA, + .uniqcheck = FALSE, + .setpanel = TRUE +)

    where .df is the data set being mutated, and ... is the list of expressions to be passed to dplyr::mutate().

.skip instructs mutate_cascade() to skip over the first time period (or the last time period if .backwards = TRUE). This should usually be set to TRUE, since most uses of mutate_cascade() involve a tlag(), and the tlag() of something in the first time period is usually NA. If you don’t skip that period, its calculation fills it in with NA, the tlag() in period 2 then picks up that NA, and the missingness cascades down to make your whole data set NA.

    .backwards, unsurprisingly, tells mutate_cascade() to start with the last time period and work backwards.

    Let’s do a very simple example and use mutate_cascade() to build a present discounted value. We have an asset with a payout each period, and we have a discount factor .95. We can build a present discounted value PDV by taking the PDV in the next period, multiplying it by .95, and adding on the current payout. But we need to calculate PDV one period at a time, so that we can use each period’s calculation to calculate the previous one.

    -
    df <- pibble(t = c(1, 2, 3, 4, 5),
    -             payout = c(3, 4, 2, 2, 4),
    -             .t = t) %>%
    -  mutate(PDV = payout) %>%
    -  mutate_cascade(PDV = payout + .95*tlag(PDV, .n = -1), .backwards = TRUE)
    +
    df <- pibble(
    +  t = c(1, 2, 3, 4, 5),
    +  payout = c(3, 4, 2, 2, 4),
    +  .t = t
    +) %>%
    +  mutate(PDV = payout) %>%
    +  mutate_cascade(PDV = payout + .95 * tlag(PDV, .n = -1), .backwards = TRUE)

    As expected, the PDV in period 5 is just the payout: 4. In period 4 it’s 2 + .95*4 = 5.8. Then in period 3 it’s 2 + .95*5.8 = 7.51, and so on.

The rest of the options include .group_i (by default, if .i can be found, analysis will be performed within-individual. Set .group_i = FALSE to ignore this), and standard arguments related to declaring the panel structure of the data (.i, .t, .d, .uniqcheck, see the “pibble” section above). The .d = NA will become .d = 1 if either .i or .t are declared. .setpanel ensures that if you declare the panel structure in the mutate_cascade() call, it will be maintained in the object you get back.


    @@ -829,66 +895,73 @@

    tlag()

    tlag() is a function that lags a variable in time. It respects the panel structure of the data, works with multiple observations per combination of .i/.t, and, unlike plm::lag(), doesn’t run into masking problems by sharing a name with dplyr::lag(). Do remember that dplyr::lag() does not lag data in time, it lags data in the order of the data set.

 tlag(.var,
+  .df = get(".", envir = parent.frame()),
+  .n = 1,
+  .default = NA,
+  .quick = FALSE,
+  .resolve = "error",
+  .group_i = TRUE,
+  .i = NULL,
+  .t = NULL,
+  .d = NA,
+  .uniqcheck = FALSE
+)

where .var is the variable being lagged, and .df is the data set that variable lives in. If the data set is being passed in via %>%, then .df will automatically pick it up and you don’t need to specify it.

    .n is the number of periods to lag. Negative values of .n imply a lead instead of a lag (as in the example in mutate_cascade() in the Mutate Variations section). There’s not a separate tlead() function.

    .default is the value to use if a lag does not exist. By default, this is NA. So if you have data in periods 1 and 3 but not 2, then the tlag in the third period will produce NA.

    .quick is a setting you can use if your data is very nicely structured, with rows uniquely identified by .i/.t and there are either no gaps between time periods or .d = 0. tlag() will run more quickly with .quick = TRUE, but will produce incorrect results if these conditions are not met.

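Here is a minimal editorial sketch of .n and .default on hypothetical toy data (the original example block did not survive extraction):

```r
df <- pibble(
  i = c(1, 1, 1),
  t = c(1, 2, 4),
  x = c(2, 4, 8),
  .i = i,
  .t = t
)

df %>%
  mutate(
    lag_x = tlag(x), # NA, 2, NA: the t = 4 row has no t = 3 to lag from
    lead_x = tlag(x, .n = -1), # 4, NA, NA: a negative .n is a lead
    lag_x_zero = tlag(x, .default = 0), # 0, 2, 0: .default replaces the NAs
    row_lag = dplyr::lag(x) # NA, 2, 4: for contrast, lags by row order, ignoring the gap
  )
```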

If .var is not constant within combinations of .i and .t, we have a problem! Which value do we choose to use for the purpose of calculating lags for other observations? .resolve makes this choice. By default, there will be an “error” if values of .var are inconsistent within .i and .t. Or, set .resolve to a summary function like .resolve = mean or .resolve = function(x) mean(x, na.rm = TRUE) to resolve inconsistencies before lagging.

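And a sketch of .resolve when .i/.t don’t uniquely identify rows (hypothetical data):

```r
df <- pibble(
  i = c(1, 1, 1),
  t = c(1, 1, 2),
  x = c(1, 3, 5),
  .i = i,
  .t = t
)

# The two t = 1 rows disagree, so resolve them with their mean:
# the t = 2 row gets lag_x = mean(c(1, 3)) = 2
df %>% mutate(lag_x = tlag(x, .resolve = mean))
```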

The rest of the options include .group_i (by default, if .i can be found, lags will be performed within-individual; set .group_i = FALSE to ignore this) and standard arguments related to declaring the panel structure of the data (.i, .t, .d, .uniqcheck, see the “pibble” section above). The default .d = NA will become .d = 1 if either .i or .t is declared. Since tlag() returns a vector rather than a new data set, there is no .setpanel option here.


    @@ -904,40 +977,43 @@

\[between\_i(x) = \bar{x}_i - \bar{x}\]

    where \(\bar{x}_i\) is the mean of x within the .i groups, and \(\bar{x}\) is the grand mean of x over all observations.

    Be aware that this is different from plm::between(), which returns \(\bar{x}_i\) and does not subtract out \(\bar{x}\).

    -

    The syntax for between_i is:

    +

    The syntax for between_i() is:

 between_i(.var,
+  .df = get(".", envir = parent.frame()),
+  .fcn = function(x) mean(x, na.rm = TRUE),
+  .i = NULL,
+  .t = NULL,
+  uniqcheck = FALSE
+)

    Where .var is the variable on which the transformation is performed, and .df is the data set. If the data set is being passed in via %>%, then .df will automatically pick it up and you don’t need to specify it. .fcn is the function applied to calculate the group and grand values, i.e. \(.fcn(x) = \bar{x}\). The standard definition of the between transformation is for this to be the mean, but it has been left flexible.

    The rest of the options include standard arguments related to declaring the panel structure of the data (.i, .t, .uniqcheck, see the “pibble” section above). .d is omitted because it is irrelevant to the calculation.

    An example of the between transformation follows:

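The original example block did not survive extraction here, so the following is an editorial sketch on hypothetical data:

```r
df <- pibble(
  i = c(1, 1, 2, 2),
  t = c(1, 2, 1, 2),
  x = c(1, 2, 4, 5),
  .i = i,
  .t = t
)

# Group means are 1.5 and 4.5, and the grand mean is 3, so
# between_i() returns -1.5, -1.5, 1.5, 1.5
# (plm::between() would instead return the group means 1.5, 1.5, 4.5, 4.5)
df %>% mutate(x_btw = between_i(x))
```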

    @@ -948,36 +1024,39 @@

    where \(\bar{x}_i\) is the mean of x within the .i groups.

The syntax for within_i() is:

 within_i(.var,
+  .df = get(".", envir = parent.frame()),
+  .fcn = function(x) mean(x, na.rm = TRUE),
+  .i = NULL,
+  .t = NULL,
+  uniqcheck = FALSE
+)

    Where .var is the variable on which the transformation is performed, and .df is the data set. If the data set is being passed in via %>%, then .df will automatically pick it up and you don’t need to specify it. .fcn is the function applied to calculate the group values, i.e. \(.fcn(x) = \bar{x}\). The standard definition of the within transformation is for this to be the mean, but it has been left flexible.

    The rest of the options include standard arguments related to declaring the panel structure of the data (.i, .t, .uniqcheck, see the “pibble” section above). .d is omitted because it is irrelevant to the calculation.

An example of the within transformation follows:

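Again an editorial sketch, reusing the hypothetical data from the between_i() example above:

```r
# within_i() subtracts each group's mean, so with group means 1.5 and 4.5
# the result is -0.5, 0.5, -0.5, 0.5
df %>% mutate(x_wi = within_i(x))
```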

    @@ -1029,7 +1108,7 @@

Joins

diff --git a/docs/index.html b/docs/index.html
index d863e31..42fd91b 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -79,7 +79,7 @@ Panel Maneuvers in dplyr (pmdplyr)
-

    The pmdplyr package is an extension to dplyr designed for cleaning and managing panel and hierarchical data. It contains variations on the dplyr mutate and join functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy pibble panel data object, which you can panel_convert() for use in one of the many packages that help you analyze panel data.

    +

    The pmdplyr package is an extension to dplyr designed for cleaning and managing panel and hierarchical data. It contains variations on the dplyr mutate and _join functions that address common panel data needs, and contains functions for managing and cleaning panel data. The goal is to get you a nice tidy pibble panel data object, which you can panel_convert() for use in one of the many packages that help you analyze panel data.

    Unlike other panel data packages, functions in pmdplyr are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country.

    Examples of pmdplyr use are below. These examples only cover some of the functionality of the package. See the Reference pkgdown page for a full list of functions, or help(pmdplyr).

    @@ -92,7 +92,7 @@

    College Scorecard Example

    -

    Let’s start with the fairly straightforward Scorecard data, which is uniquely identified by college ID unitid and year year, and which describes how well students who attended that college are doing years after attendance.

    +

    Let’s start with the fairly straightforward Scorecard data, which describes how well students who attended that college are doing years after attendance. Scorecard observations are uniquely identified by college ID unitid and year year.

    # Note that pmdplyr automatically loads dplyr as well
     library(pmdplyr)
     
    @@ -107,67 +107,71 @@ 

    unemp = c(.017, .036, .048, .040, .028, .025, .020) )

    I am interested in measuring the differences in ex-student earnings earnings_med between two-year and four-year colleges (pred_degree_awarded_ipeds == 2 or 3, respectively). But before we can do that we need to clean the data.

    -
    Scorecard <- Scorecard %>%
    +
    Scorecard %>%
       # We want pred_degree_awarded_ipeds to be consistent within college. No changers!
       # So let's drop them by using fixed_check with .resolve = "drop" to lose inconsistencies
    -  fixed_force(.var = pred_degree_awarded_ipeds,
    -              .within = unitid,
    -              .resolve = "drop") %>%
    -  # Then, get rid of pred_degree_awarded_ipeds == 1
    -  # And simplify our terms
    -  filter(pred_degree_awarded_ipeds %in% c(2,3)) %>%
    -  mutate(FourYear = pred_degree_awarded_ipeds == 3) %>%
    -  # earnings_med has some missing values - let's fill them in with 
    -  # the most recent nonmissing observations we have
    -  # - panel_locf respects the panel structure declared above with as_pibble()
    -  mutate(earnings_med = panel_locf(earnings_med)) %>%
    -  # Now let's bring in that unemployment data!
    -  # Since it's every other year, it won't line up properly
    -  # in the join. So let's use inexact_join to get the MOST RECENT
    -  # year to join with
    -  inexact_left_join(unemp_data, var = year, jvar = unemp_year, method = "last") %>%
    -  # To adjust for state-level trends, let's also control for a tlag of
    -  # average earnings within state.
    -  # The lag is at the state level, and state-year doesn't uniquely identify,
    -  # But that's okay! We just pick a .resolve function to handle disagreements.
    -  # (We could also do this straight in the regression model itself)
    -  mutate(lag_state_earnings = tlag(earnings_med,
    -                                      .i = state_abbr,
    -                                      .t = year,
    -                                      .resolve = mean))
    -
    -# Now we can run a basic regression. 
    -
    -summary(lm(
    -  earnings_med ~ 
    -    FourYear +
    -    unemp + 
    -    lag_state_earnings,
    -  data = Scorecard
    -))
    -#> 
    -#> Call:
    -#> lm(formula = earnings_med ~ FourYear + unemp + lag_state_earnings, 
    -#>     data = Scorecard)
    +  fixed_force(
    +    .var = pred_degree_awarded_ipeds,
    +    .within = unitid,
    +    .resolve = "drop"
    +  ) %>%
    +  # Then, get rid of pred_degree_awarded_ipeds == 1
    +  # And simplify our terms
    +  filter(pred_degree_awarded_ipeds %in% c(2, 3)) %>%
    +  mutate(FourYear = pred_degree_awarded_ipeds == 3) %>%
    +  # earnings_med has some missing values - let's fill them in with
    +  # the most recent nonmissing observations we have
    +  # - panel_locf respects the panel structure declared above with as_pibble()
    +  mutate(earnings_med = panel_locf(earnings_med)) %>%
    +  # Now let's bring in that unemployment data!
    +  # Since it's every other year, it won't line up properly
    +  # in the join. So let's use inexact_join to get the MOST RECENT
    +  # year to join with
    +  inexact_left_join(unemp_data, var = year, jvar = unemp_year, method = "last") %>%
    +  # To adjust for state-level trends, let's also control for a tlag of
    +  # average earnings within state.
    +  # The lag is at the state level, and state-year doesn't uniquely identify,
    +  # But that's okay! We just pick a .resolve function to handle disagreements.
    +  # (We could also do this straight in the regression model itself)
    +  mutate(lag_state_earnings = tlag(earnings_med,
    +    .i = state_abbr,
    +    .t = year,
    +    .resolve = mean
    +  )) -> scorecard_clean
    +
    +# Now we can run a basic regression.
    +
    +lm(
    +  earnings_med ~
    +  FourYear +
    +    unemp +
    +    lag_state_earnings,
    +  data = scorecard_clean
    +) %>% 
    +  summary()
     #> 
    -#> Residuals:
    -#>    Min     1Q Median     3Q    Max 
    -#> -25341  -4917   -528   4091  54587 
    +#> Call:
    +#> lm(formula = earnings_med ~ FourYear + unemp + lag_state_earnings, 
    +#>     data = scorecard_clean)
     #> 
    -#> Coefficients:
    -#>                      Estimate Std. Error t value Pr(>|t|)    
    -#> (Intercept)         5.933e+03  1.772e+03   3.348 0.000826 ***
    -#> FourYearTRUE        9.088e+03  3.511e+02  25.886  < 2e-16 ***
    -#> unemp              -4.564e+04  2.529e+04  -1.805 0.071215 .  
    -#> lag_state_earnings  7.348e-01  4.027e-02  18.244  < 2e-16 ***
    -#> ---
    -#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    -#> 
    -#> Residual standard error: 8209 on 2474 degrees of freedom
    -#>   (25161 observations deleted due to missingness)
    -#> Multiple R-squared:  0.3431, Adjusted R-squared:  0.3423 
    -#> F-statistic: 430.6 on 3 and 2474 DF,  p-value: < 2.2e-16
    -

    We could even improve that code - why not run the anti_join and inexact_left_join using safe_join? When we do the inexact_left_join, for example, we’re assuming that unemp_data is uniquely identified by unemp_year - is it really? safe_join would check for us and minimize error.

+#> Residuals:
+#>    Min     1Q Median     3Q    Max 
+#> -25341  -4917   -528   4091  54587 
+#> 
+#> Coefficients:
+#>                      Estimate Std. Error t value Pr(>|t|)    
+#> (Intercept)         5.933e+03  1.772e+03   3.348 0.000826 ***
+#> FourYearTRUE        9.088e+03  3.511e+02  25.886  < 2e-16 ***
+#> unemp              -4.564e+04  2.529e+04  -1.805 0.071215 .  
+#> lag_state_earnings  7.348e-01  4.027e-02  18.244  < 2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Residual standard error: 8209 on 2474 degrees of freedom
+#>   (25161 observations deleted due to missingness)
+#> Multiple R-squared:  0.3431, Adjusted R-squared:  0.3423 
+#> F-statistic: 430.6 on 3 and 2474 DF,  p-value: < 2.2e-16
    +

    We could even improve that code - why not run the anti_join() and inexact_left_join() using safe_join()? When we do the inexact_left_join(), for example, we’re assuming that unemp_data is uniquely identified by unemp_year—is it really? safe_join() would check for us and minimize error.
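One hedged sketch of how that check could look (the expect and method values are assumptions based on the surrounding code, not part of the original README):

```r
# Error out unless unemp_data really is uniquely identified by unemp_year;
# expect = "y" asserts that the right-hand data set should be uniquely identified
safe_join(Scorecard, unemp_data,
  expect = "y",
  join = inexact_left_join,
  var = year, jvar = unemp_year, method = "last"
)
```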

    @@ -177,14 +181,14 @@

    We have some difficulties to cover: making the ID and time variables behave, accounting for the between-route differences, and figuring out how to compare each price to the cheapo price.

    data(SPrail)
     
    -SPrail <- SPrail %>%
    +SPrail %>%
       # We have two ID variables - origin and destination.
       # pmdplyr has no problem with this, but maybe we want to export
       # to something like plm later, which can't handle it.
       # So let's use id_variable to combine them into one
       mutate(route_ID = id_variable(origin, destination)) %>%
       # We have a time variable down to the minute. Too fine-grained!
    -  # Let's back things up to the daily level, and 
    +  # Let's back things up to the daily level, and
       # create a nice integer time variable that's easy to use
       mutate(day = time_variable(insert_date, .method = "day")) %>%
       # Now we can declare a pibble
    @@ -192,25 +196,27 @@ 

 # We want to account for between-route differences in price,
 # so let's isolate the within variation
 mutate(price_w = within_i(price)) %>%
-  # We want to compare to the cheapo option, so let's use 
+  # We want to compare to the cheapo option, so let's use
   # mutate_subset to get the average price of the cheapo option
   # and propogate that to the other options for comparison
-  mutate_subset(cheapo_price = mean(price, na.rm = TRUE),
-                .filter = train_class == "Turista con enlace") %>%
-  mutate(premium = price - cheapo_price) %>%
-  filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>%
-  # Now let's compare premia
-  group_by(train_class) %>%
-  summarize(premium = mean(premium, na.rm = TRUE))
-
-SPrail
-#> # A tibble: 3 x 2
-#>   train_class  premium
-#>   <fct>          <dbl>
-#> 1 Preferente     29.9 
-#> 2 Turista         8.36
-#> 3 Turista Plus   14.4

    -

    And so there we have it - Preferente will really set you back relative to the cheapo ticket on the same route.

+  mutate_subset(
+    cheapo_price = mean(price, na.rm = TRUE),
+    .filter = train_class == "Turista con enlace"
+  ) %>%
+  mutate(premium = price - cheapo_price) %>%
+  filter(train_class %in% c("Preferente", "Turista", "Turista Plus")) %>%
+  # Now let's compare premia
+  group_by(train_class) %>%
+  summarize(premium = mean(premium, na.rm = TRUE)) -> sprail_compare_premia
+
+sprail_compare_premia
+#> # A tibble: 3 x 2
+#>   train_class  premium
+#>   <fct>          <dbl>
+#> 1 Preferente     29.9 
+#> 2 Turista         8.36
+#> 3 Turista Plus   14.4

    +

    And so there we have it—Preferente will really set you back relative to the cheapo ticket on the same route.

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 02ee9fa..550bd99 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -1,4 +1,4 @@
-pandoc: '2.6'
+pandoc: 2.7.2
 pkgdown: 1.3.0.9100
 pkgdown_sha: 1829398a4e97056fe7fb332d0e8b952784b49321
 articles:

diff --git a/docs/reference/index.html b/docs/reference/index.html
index 2189cf7..33f4abd 100644
--- a/docs/reference/index.html
+++ b/docs/reference/index.html
@@ -223,7 +223,7 @@

    -

    Joins

    +

    Two-table verbs

    @@ -239,6 +239,18 @@

    safe_join()

    Join two data frames safely

+inner_join(<tbl_pb>) right_join(<tbl_pb>) full_join(<tbl_pb>) semi_join(<tbl_pb>) nest_join(<tbl_pb>) anti_join(<tbl_pb>)
+
+Join two pibbles together
+
+intersect(<tbl_pb>) union(<tbl_pb>) union_all(<tbl_pb>) setdiff(<tbl_pb>)
+
+Set operations

    @@ -324,7 +336,7 @@

    mutate(<tbl_pb>) mutate_all(<tbl_pb>) mutate_at(<tbl_pb>) mutate_if(<tbl_pb>) distinct(<tbl_pb>) group_by(<tbl_pb>) group_by_all(<tbl_pb>) group_by_all(<tbl_pb>) group_by_at(<tbl_pb>) group_by_if(<tbl_pb>) ungroup(<tbl_pb>) bind_cols(<tbl_pb>) intersect(<tbl_pb>) union(<tbl_pb>) union_all(<tbl_pb>) setdiff(<tbl_pb>) left_join(<tbl_pb>) inner_join(<tbl_pb>) right_join(<tbl_pb>) full_join(<tbl_pb>) semi_join(<tbl_pb>) nest_join(<tbl_pb>) anti_join(<tbl_pb>) select(<tbl_pb>) select_all(<tbl_pb>) select_at(<tbl_pb>) select_if(<tbl_pb>) rename(<tbl_pb>) rename_all(<tbl_pb>) rename_at(<tbl_pb>) rename_if(<tbl_pb>) summarize(<tbl_pb>) summarize_all(<tbl_pb>) summarize_at(<tbl_pb>) summarize_if(<tbl_pb>) summarise(<tbl_pb>) summarise_all(<tbl_pb>) summarise_at(<tbl_pb>) summarise_if(<tbl_pb>) transmute(<tbl_pb>) transmute_all(<tbl_pb>) transmute_at(<tbl_pb>) transmute_if(<tbl_pb>)

    +

    mutate(<tbl_pb>) mutate_all(<tbl_pb>) mutate_at(<tbl_pb>) mutate_if(<tbl_pb>) distinct(<tbl_pb>) group_by(<tbl_pb>) group_by_all(<tbl_pb>) group_by_all(<tbl_pb>) group_by_at(<tbl_pb>) group_by_if(<tbl_pb>) ungroup(<tbl_pb>) bind_cols(<tbl_pb>) select(<tbl_pb>) select_all(<tbl_pb>) select_at(<tbl_pb>) select_if(<tbl_pb>) rename(<tbl_pb>) rename_all(<tbl_pb>) rename_at(<tbl_pb>) rename_if(<tbl_pb>) summarize(<tbl_pb>) summarize_all(<tbl_pb>) summarize_at(<tbl_pb>) summarize_if(<tbl_pb>) summarise(<tbl_pb>) summarise_all(<tbl_pb>) summarise_at(<tbl_pb>) summarise_if(<tbl_pb>) transmute(<tbl_pb>) transmute_all(<tbl_pb>) transmute_at(<tbl_pb>) transmute_if(<tbl_pb>)

    pibble methods

    @@ -340,7 +352,7 @@

    Contents

ID and time variables
Filling in data
Panel consistency
-Joins
+Two-table verbs
Mutate variations
Time lag
Additional calculations
diff --git a/docs/reference/join.html b/docs/reference/join.html
new file mode 100644
index 0000000..b477328
--- /dev/null
+++ b/docs/reference/join.html
@@ -0,0 +1,224 @@
+Join two pibbles together — join.tbl_pb • pmdplyr
+These are generic functions that dispatch to individual pibble methods. See join for
+complete documentation.

    # S3 method for tbl_pb
    +inner_join(x, y, by = NULL, copy = FALSE,
    +  suffix = c(".x", ".y"), ...)
    +
    +# S3 method for tbl_pb
    +right_join(x, y, by = NULL, copy = FALSE,
    +  suffix = c(".x", ".y"), ...)
    +
    +# S3 method for tbl_pb
    +full_join(x, y, by = NULL, copy = FALSE,
    +  suffix = c(".x", ".y"), ...)
    +
    +# S3 method for tbl_pb
    +semi_join(x, y, by = NULL, copy = FALSE, ...)
    +
    +# S3 method for tbl_pb
    +nest_join(x, y, by = NULL, copy = FALSE,
    +  keep = FALSE, name = NULL, ...)
    +
    +# S3 method for tbl_pb
    +anti_join(x, y, by = NULL, copy = FALSE, ...)
    + +

    Arguments

x, y: tbls to join

by: a character vector of variables to join by. If NULL, the default, *_join() will do a natural join, using all variables with common names across the two tables. A message lists the variables so that you can check they're right (to suppress the message, simply explicitly list the variables that you want to join). To join by different variables on x and y use a named vector. For example, by = c("a" = "b") will match x.a to y.b.

copy: If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.

suffix: If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.

...: other parameters passed onto methods, for instance, na_matches to control how NA values are matched. See join.tbl_df for more.

keep: If TRUE the by columns are kept in the nesting joins.

name: the name of the list column nesting joins create. If NULL the name of y is used.

diff --git a/docs/reference/pibble_methods.html b/docs/reference/pibble_methods.html
index 6e96634..6970fd5 100644
--- a/docs/reference/pibble_methods.html
+++ b/docs/reference/pibble_methods.html
@@ -157,44 +157,6 @@

    pibble methods

 # S3 method for tbl_pb
 bind_cols(.data, ...)
 
-# S3 method for tbl_pb
-intersect(x, y, ...)
-
-# S3 method for tbl_pb
-union(x, y, ...)
-
-# S3 method for tbl_pb
-union_all(x, y, ...)
-
-# S3 method for tbl_pb
-setdiff(x, y, ...)
-
-# S3 method for tbl_pb
-left_join(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-# S3 method for tbl_pb
-inner_join(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-# S3 method for tbl_pb
-right_join(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-# S3 method for tbl_pb
-full_join(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-# S3 method for tbl_pb
-semi_join(x, y, by = NULL, copy = FALSE, ...)
-
-# S3 method for tbl_pb
-nest_join(x, y, by = NULL, copy = FALSE,
-  keep = FALSE, name = NULL, ...)
-
-# S3 method for tbl_pb
-anti_join(x, y, by = NULL, copy = FALSE, ...)
-
 # S3 method for tbl_pb
 select(.data, ...)

diff --git a/docs/reference/setops.html b/docs/reference/setops.html
new file mode 100644
index 0000000..7c5f787
--- /dev/null
+++ b/docs/reference/setops.html
@@ -0,0 +1,181 @@
+Set operations — setops • pmdplyr

+These functions overwrite the set functions provided in base to make them generic to be used to
+join pibbles. See setops for details.

    # S3 method for tbl_pb
    +intersect(x, y, ...)
    +
    +# S3 method for tbl_pb
    +union(x, y, ...)
    +
    +# S3 method for tbl_pb
    +union_all(x, y, ...)
    +
    +# S3 method for tbl_pb
    +setdiff(x, y, ...)

    Arguments

x, y: objects to perform set function on (ignoring order)

...: other arguments passed on to methods
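A brief editorial usage sketch (not part of the generated reference page), on hypothetical pibbles sharing a panel declaration:

```r
A <- pibble(i = c(1, 2), t = c(1, 1), .i = i, .t = t)
B <- pibble(i = c(2, 3), t = c(1, 1), .i = i, .t = t)

union(A, B) # rows (1,1), (2,1), (3,1); the result keeps the .i/.t declaration
setdiff(A, B) # the row in A but not in B: (1,1)
```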

diff --git a/man/join.Rd b/man/join.Rd
new file mode 100644
index 0000000..49091e1
--- /dev/null
+++ b/man/join.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tbl_pb_methods.R
+\name{join.tbl_pb}
+\alias{join.tbl_pb}
+\alias{inner_join.tbl_pb}
+\alias{right_join.tbl_pb}
+\alias{full_join.tbl_pb}
+\alias{semi_join.tbl_pb}
+\alias{nest_join.tbl_pb}
+\alias{anti_join.tbl_pb}
+\title{Join two pibbles together}
+\usage{
+\method{inner_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
+  suffix = c(".x", ".y"), ...)
+
+\method{right_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
+  suffix = c(".x", ".y"), ...)
+
+\method{full_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
+  suffix = c(".x", ".y"), ...)
+
+\method{semi_join}{tbl_pb}(x, y, by = NULL, copy = FALSE, ...)
+
+\method{nest_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
+  keep = FALSE, name = NULL, ...)
+
+\method{anti_join}{tbl_pb}(x, y, by = NULL, copy = FALSE, ...)
+}
+\arguments{
+\item{x}{tbls to join}
+
+\item{y}{tbls to join}
+
+\item{by}{a character vector of variables to join by. If \code{NULL}, the
+default, \code{*_join()} will do a natural join, using all variables with
+common names across the two tables. A message lists the variables so
+that you can check they're right (to suppress the message, simply
+explicitly list the variables that you want to join).
+
+To join by different variables on x and y use a named vector.
+For example, \code{by = c("a" = "b")} will match \code{x.a} to
+\code{y.b}.}
+
+\item{copy}{If \code{x} and \code{y} are not from the same data source,
+and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
+same src as \code{x}. This allows you to join tables across srcs, but
+it is a potentially expensive operation so you must opt into it.}
+
+\item{suffix}{If there are non-joined duplicate variables in \code{x} and
+\code{y}, these suffixes will be added to the output to disambiguate them.
+Should be a character vector of length 2.}
+
+\item{...}{other parameters passed onto methods, for instance, \code{na_matches}
+to control how \code{NA} values are matched. See \link{join.tbl_df} for more.}
+
+\item{keep}{If \code{TRUE} the by columns are kept in the nesting joins.}
+
+\item{name}{the name of the list column nesting joins create. If \code{NULL} the name of \code{y} is used.}
+}
+\description{
+These are generic functions that dispatch to individual pibble methods. See \link[dplyr]{join} for
+complete documentation.
+}

diff --git a/man/pibble_methods.Rd b/man/pibble_methods.Rd
index 5694710..b408706 100644
--- a/man/pibble_methods.Rd
+++ b/man/pibble_methods.Rd
@@ -13,17 +13,6 @@
 \alias{group_by_if.tbl_pb}
 \alias{ungroup.tbl_pb}
 \alias{bind_cols.tbl_pb}
-\alias{intersect.tbl_pb}
-\alias{union.tbl_pb}
-\alias{union_all.tbl_pb}
-\alias{setdiff.tbl_pb}
-\alias{left_join.tbl_pb}
-\alias{inner_join.tbl_pb}
-\alias{right_join.tbl_pb}
-\alias{full_join.tbl_pb}
-\alias{semi_join.tbl_pb}
-\alias{nest_join.tbl_pb}
-\alias{anti_join.tbl_pb}
 \alias{select.tbl_pb}
 \alias{select_all.tbl_pb}
 \alias{select_at.tbl_pb}
@@ -74,33 +63,6 @@
 \method{bind_cols}{tbl_pb}(.data, ...)
 
-\method{intersect}{tbl_pb}(x, y, ...)
-
-\method{union}{tbl_pb}(x, y, ...)
-
-\method{union_all}{tbl_pb}(x, y, ...)
-
-\method{setdiff}{tbl_pb}(x, y, ...)
-
-\method{left_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-\method{inner_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-\method{right_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-\method{full_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
-  suffix = c(".x", ".y"), ...)
-
-\method{semi_join}{tbl_pb}(x, y, by = NULL, copy = FALSE, ...)
-
-\method{nest_join}{tbl_pb}(x, y, by = NULL, copy = FALSE,
-  keep = FALSE, name = NULL, ...)
-
-\method{anti_join}{tbl_pb}(x, y, by = NULL, copy = FALSE, ...)
-
 \method{select}{tbl_pb}(.data, ...)
 
 \method{select_all}{tbl_pb}(.tbl, .funs = list(), ...)

diff --git a/man/setops.Rd b/man/setops.Rd
new file mode 100644
index 0000000..a5154b9
--- /dev/null
+++ b/man/setops.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tbl_pb_methods.R
+\name{setops}
+\alias{setops}
+\alias{intersect.tbl_pb}
+\alias{union.tbl_pb}
+\alias{union_all.tbl_pb}
+\alias{setdiff.tbl_pb}
+\title{Set operations}
+\usage{
+\method{intersect}{tbl_pb}(x, y, ...)
+
+\method{union}{tbl_pb}(x, y, ...)
+
+\method{union_all}{tbl_pb}(x, y, ...)
+
+\method{setdiff}{tbl_pb}(x, y, ...)
+}
+\arguments{
+\item{x}{objects to perform set function on (ignoring order)}
+
+\item{y}{objects to perform set function on (ignoring order)}
+
+\item{...}{other arguments passed on to methods}
+}
+\description{
+These functions overwrite the set functions provided in base to make them generic to be used to
+join pibbles. See \link[dplyr]{setops} for details.
+}

diff --git a/tests/testthat/test-bad_input.R b/tests/testthat/test-bad_input.R
index 0d840df..cf68e5a 100644
--- a/tests/testthat/test-bad_input.R
+++ b/tests/testthat/test-bad_input.R
@@ -2,9 +2,11 @@
 # Note that the data.table warning is not tested so as to avoid needing the package
 
 ### BETWEEN_WITHIN
-df <- pibble(i = 1:3,
-             x = 1:3,
-             .i = i)
+df <- pibble(
+  i = 1:3,
+  x = 1:3,
+  .i = i
+)
 
 test_that("between_i input failstates", {
   expect_error(df %>% dplyr::mutate(y = between_i(.)))
@@ -35,16 +37,18 @@ test_that("safe_join input failstates", {
 
 test_that("inexact_join input failstates", {
   expect_error(pmdplyr:::inexact_join_prep(left, right, var = x, jvar = y, method = "last"))
-  expect_error(pmdplyr:::inexact_join_prep(left, right, var = 'x', jvar = y, method = "last"))
+  expect_error(pmdplyr:::inexact_join_prep(left, right, var = "x", jvar = y, method = "last"))
   expect_error(inexact_left_join(left, right, var = x, jvar = c(y, z, a), method = "last"))
   expect_error(inexact_left_join(left, right, var = x, jvar = y, method = 2))
   expect_error(inexact_left_join(left, right, var = x, jvar = y, method = "last", exact = 2))
   expect_error(inexact_left_join(left, right, var = i, jvar = i, method = "last"))
   expect_error(inexact_left_join(left, right, var = x, jvar = i, method = "last"))
   expect_error(inexact_left_join(left %>% dplyr::mutate(x = c("hey", "ho")),
-    right, var = x, jvar = y, method = "closest"))
+    right,
+    var = x, jvar = y, method = "closest"
+  ))
   expect_error(inexact_left_join(left, right %>%
-    mutate(y = c("hey", "ho")), var = x, jvar = y, method = "closest"))
+    mutate(y = c("hey", "ho")), var = x, jvar = y, method = "closest"))
   expect_warning(inexact_left_join(left, right, var = x, jvar = y, method = "closest", exact = FALSE))
   expect_error(inexact_left_join(left, right, var = x, jvar = y, method = "foo"))
   expect_error(inexact_left_join(left, right, var = x, jvar = c(y, z), method = "between", exact = FALSE))
@@ -133,29 +137,34 @@ test_that("fixed_force input failstates", {
 })
 
 ### UNEXPORTED_SHARED_FUNCTIONS
-df <- data.frame(i = 1:3,
-                 t = 1:3)
+df <- data.frame(
+  i = 1:3,
+  t = 1:3
+)
 
 test_that("declare_in_fcn_check input failstates", {
   expect_error(declare_in_fcn_check(df,
-    .i = "i",
-    .t = "t",
-    .d = 1,
-    .uniqcheck = 2,
-    .setpanel = TRUE,
-    .noneed = FALSE))
+    .i = "i",
+    .t = "t",
+    .d = 1,
+    .uniqcheck = 2,
+    .setpanel = TRUE,
+    .noneed = FALSE
+  ))
   expect_error(declare_in_fcn_check(df,
-    .i = "i",
-    .t = "t",
-    .d = 1,
-    .uniqcheck = FALSE,
-    .setpanel = 2,
-    .noneed = FALSE))
+    .i = "i",
+    .t = "t",
+    .d = 1,
+    .uniqcheck = FALSE,
+    .setpanel = 2,
+    .noneed = FALSE
+  ))
   expect_error(declare_in_fcn_check(df,
-    .i = NA,
-    .t = NA,
-    .d = 1,
-    .uniqcheck = FALSE,
-    .setpanel = 2,
-    .noneed = FALSE))
+    .i = NA,
+    .t = NA,
+    .d = 1,
+    .uniqcheck = FALSE,
+    .setpanel = 2,
+    .noneed = FALSE
+  ))
 })

diff --git a/vignettes/pmdplyr.Rmd b/vignettes/pmdplyr.Rmd
index 6fee575..b77cecc 100644
--- a/vignettes/pmdplyr.Rmd
+++ b/vignettes/pmdplyr.Rmd
@@ -23,7 +23,7 @@ knitr::opts_chunk$set(
 library(pmdplyr)
 ```
 
-The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr` `mutate` and `join` functions that address common panel data needs, and contains functions for managing and cleaning panel data.
+The `pmdplyr` package is an extension to `dplyr` designed for cleaning and managing panel and hierarchical data. It contains variations on the `dplyr::mutate()` and `dplyr::join()` functions that address common panel data needs, and contains functions for managing and cleaning panel data.
 
 Unlike other panel data packages, functions in `pmdplyr` are all designed to work even if there is more than one observation per individual per period. This comes in handy if each individual is observed multiple times per period - for example, multiple classes per student per term; or if you have hierarchical data - for example, multiple companies per country.
 
@@ -45,43 +45,48 @@ Most functions in `pmdplyr` will allow you to declare `.i` and `.t` in the funct
 
 ```{r, eval=FALSE}
 pibble(...,
-       .i = NULL,
-       .t = NULL,
-       .d = 1,
-       .uniqcheck = FALSE)
+  .i = NULL,
+  .t = NULL,
+  .d = 1,
+  .uniqcheck = FALSE
+)
 ```
 
 or by transforming an existing `data.frame`, `list`, or `tbl_df` using `as_pibble()`:
 
 ```{r, eval=FALSE}
 as_pibble(x,
-          .i = NULL,
-          .t = NULL,
-          .d = 1,
-          .uniqcheck = FALSE,
-          ...)
+  .i = NULL,
+  .t = NULL,
+  .d = 1,
+  .uniqcheck = FALSE,
+  ...
+)
 ```
 
-Both functions work exactly as `tibble::tibble` and `tibble::as_tibble` do, except that they also take the arguments `.i`, `.t`, and `.d`, with `.i` and `.t` accepting either unquoted or quoted variable names. If you'd like your `pibble` checked to see if `.i` and `.t` uniquely identify your observations, set `.uniqcheck = TRUE`. It will do this automatically the first time in each R session you create a `pibble`, but if you'd like it to keep doing it, use `uniqcheck`.
+Both functions work exactly as `tibble::tibble()` and `tibble::as_tibble()` do, except that they also take the arguments `.i`, `.t`, and `.d`, with `.i` and `.t` accepting either unquoted or quoted variable names. If you'd like your `pibble` checked to see if `.i` and `.t` uniquely identify your observations, set `.uniqcheck = TRUE`. It will do this automatically the first time in each R session you create a `pibble`, but if you'd like it to keep doing it, use `uniqcheck`.
 
 As a side bonus, you can check if the variables `a, b, c` uniquely identify the observations in data set `d` by running `as_pibble(d, .i = c(a, b, c), .uniqcheck = TRUE)`. No warning? It's uniquely identified!
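As a quick editorial sketch of that uniqueness check (hypothetical data, not part of the vignette diff):

```r
d <- data.frame(a = c(1, 1, 2), b = c(1, 2, 2), x = 1:3)

# No warning here: a and b uniquely identify the rows of d
as_pibble(d, .i = c(a, b), .uniqcheck = TRUE)
```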
```{r}
# .d = 1 by default, so in this data,
# a = 1, b = 3 comes one period after a = 1, b = 2.
-basic_pibble <- pibble(a = c(1, 1, 1, 2, 2, 2),
-                       b = c(1, 2, 3, 2, 3, 3),
-                       c = 1:6,
-                       .i = a,
-                       .t = b)
+basic_pibble <- pibble(
+  a = c(1, 1, 1, 2, 2, 2),
+  b = c(1, 2, 3, 2, 3, 3),
+  c = 1:6,
+  .i = a,
+  .t = b
+)
 
 data(SPrail)
 # In SPrail, insert_date does not imply regular gaps between
 # time periods, so we set .d = 0
 declared_pibble <- as_pibble(SPrail,
-                             .i = c(origin, destination),
-                             .t = insert_date,
-                             .d = 0)
+  .i = c(origin, destination),
+  .t = insert_date,
+  .d = 0
+)
```

## panel_convert()

@@ -89,9 +94,11 @@
 
 `pmdplyr` also has the function `panel_convert()` which allows you to convert between different popular R panel data objects, including `pibble`. This can come in handy for creating `pibbles`, or exporting your cleaned `pibble` to use with a package that does panel data *analysis* (which `pmdplyr` does not):
 
 ```{r, eval = FALSE}
-panel_convert(data,
-              to,
-              ...)
+panel_convert(
+  data,
+  to,
+  ...
+)
 ```
 
 Where `data` is a panel data object, either `pibble`, `tsibble`, `pdata.frame`, or `panel_data`, and `to` is the type of object you'd like returned, which you can refer to by object name, object class, or package name: get a `pibble` with `"pmdplyr"`, `"pibble"`, or `"tbl_pb"`, a `tsibble` with `"tsibble"` or `"tbl_ts"`, a `pdata.frame` with `"plm"` or `"pdata.frame"`, or a `panel_data` with `"panelr"` or `"panel_data"`. `...` sends additional arguments to the functions used to declare those objects.

@@ -101,15 +108,15 @@ When using `panel_convert`, be aware that any grouping will be lost, and you mus
 
 All valid objects of the non-`pibble` types can be converted to `pibbles`, but the reverse is not true, since `pibble` does not enforce some strict requirements that other types do:
 
-Feature/Requirement | `pibble` | `tsibble` | `pdata.frame` | `panel_data`
----------------------|-----------|----------------|--------------|------------
-ID | `.i` | `key` | `index[1]` | `id`
-Time | `.t` | `index` | `index[2]` | `wave`
-Gap control | `.d` | `regular` | No | No
-ID must exist | No | No | Yes | Yes
-Time must exist | No | Yes | Yes | Yes[1]
-Only one ID variable[2]| No | No | Yes | Yes
-Unique identification | No | Yes | No[3] | No[3]
+Feature/Requirement    | `pibble`  | `tsibble` | `pdata.frame`  | `panel_data`
+-----------------------|-----------|-----------|----------------|------------
+ID                     | `.i`      | `key`     | `index[1]`     | `id`
+Time                   | `.t`      | `index`   | `index[2]`     | `wave`
+Gap control            | `.d`      | `regular` | No             | No
+ID must exist          | No        | No        | Yes            | Yes
+Time must exist        | No        | Yes       | Yes            | Yes[1]
+Only one ID variable[2]| No        | No        | Yes            | Yes
+Unique identification  | No        | Yes       | No[3]          | No[3]
 
 [1] `pdata.frame` does not require that time be provided, but if not provided will create it based on original ordering of the data. The `pdata.frame` option to set `index` equal to an integer for a balanced panel and have it figure out the rest by itself is not supported.

@@ -135,8 +142,9 @@ Many panel data packages, including `pmdplyr`, prefer a time variable that is an
 
 ```{r, eval=FALSE}
 id_variable(...,
-            .method = "number",
-            .minwidth = FALSE)
+  .method = "number",
+  .minwidth = FALSE
+)
 ```
 
 where `...` is the set of identity variables that you want to combine into a single one (or, potentially, a single variable you'd like to encode numerically).
@@ -148,13 +156,21 @@ where `...` is the set of identity variables that you want to combine into a sin
 
 * `.method = character` preserves all original information and combines the variables together into a string, adding spacing to ensure uniqueness. Set `.minwidth = TRUE` to remove the spacing, although this may lead to non-uniqueness in some cases.
 
 ```{r}
-df <- data.frame(country = c("US", "US", "US", "US",
-                             "GBR", "GBR", "GBR", "GBR"),
-                 city = c("NYC", "NYC", "Cambridge", "NYC",
-                          "Cambridge", "London", "Manchester", "Manchester")) %>%
-  mutate(numeric_ID = id_variable(country, city),
-         random_ID = id_variable(country, city, .method = "random"),
-         char_ID = id_variable(country, city, .method = "character"))
+df <- data.frame(
+  country = c(
+    "US", "US", "US", "US",
+    "GBR", "GBR", "GBR", "GBR"
+  ),
+  city = c(
+    "NYC", "NYC", "Cambridge", "NYC",
+    "Cambridge", "London", "Manchester", "Manchester"
+  )
+) %>%
+  mutate(
+    numeric_ID = id_variable(country, city),
+    random_ID = id_variable(country, city, .method = "random"),
+    char_ID = id_variable(country, city, .method = "character")
+  )
 
 df
 ```

@@ -165,25 +181,26 @@ df
 
 ```{r, eval = FALSE}
 time_variable(...,
-              .method = "present",
-              .datepos = NA,
-              .start = 1,
-              .skip = NA,
-              .breaks = NA,
-              .turnover = NA,
-              .turnover_start = NA)
+  .method = "present",
+  .datepos = NA,
+  .start = 1,
+  .skip = NA,
+  .breaks = NA,
+  .turnover = NA,
+  .turnover_start = NA
+)
 ```
 
 Where `...` is the set of variables that you want to combine into a single, `integer`-class time variable. The rest of the options determine how the variable(s) will be read or transformed; the need for each varies depending on the structure of the original data and which `.method` is used.
 
 `.method` can take the values:
 
-* `.method="present"` will assume that, even if each individual may have some missing periods, each period is present in your data *somewhere*, and so simply numbers, in order, all the time periods observed in the data.
-* `.method="year"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`) and will extract the year from it. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year in YY or YYYY format. If combined with `.breaks` or `.skip`, will instead set the earliest year in the data to 1 rather than returning the actual year.
-* `.method="month"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed month in the data set a value of `1`, and will increment from there. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year and month in YYMM or YYYYMM format (note that if your variable is in MMYYYY format, for example, you can just give a `.datepos` argument like `c(3:6,1:2)`). Months turn over on the `.start` day of the month, which is by default 1.
-* `.method="week"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed week in the data set a value of `1`, and will increment from there. Weeks turn over on the `.start` day, which is by default 1 (Monday). Note that this method always starts weeks on the same day of the week, which is different from standard `lubridate` procedure of counting sets of 7 days starting from January 1.
-* `.method="day"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed day in the data set a value of `1`, and increment from there. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year and month in YYMMDD or YYYYMMDD format. To skip certain days of the week, such as weekends, use the `.skip` option.
-* `.method="turnover"` can be used when you have more than one variable in variable and they are all numeric nonnegative integers. Set the `.turnover` option to indicate the highest value each variable takes before it starts over, and set `.turnover_start` to indicate what value it takes when it starts over. Cannot be combined with `.skip` or `.breaks`. Doesn't work with any variable for which the turnover values change, i.e. it doesn't play well with days-in-month - if you'd like to do something like year-month-day-hour, I recommend running `.method="day"` once with just the year-month-day variable, and then taking the result and combining *that* with hour in `.method="turnover"`.
+* `.method = "present"` will assume that, even if each individual may have some missing periods, each period is present in your data *somewhere*, and so simply numbers, in order, all the time periods observed in the data.
+* `.method = "year"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`) and will extract the year from it. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year in YY or YYYY format. If combined with `.breaks` or `.skip`, will instead set the earliest year in the data to 1 rather than returning the actual year.
+* `.method = "month"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed month in the data set a value of `1`, and will increment from there. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year and month in YYMM or YYYYMM format (note that if your variable is in MMYYYY format, for example, you can just give a `.datepos` argument like `c(3:6,1:2)`). Months turn over on the `.start` day of the month, which is by default 1.
+* `.method = "week"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed week in the data set a value of `1`, and will increment from there. Weeks turn over on the `.start` day, which is by default 1 (Monday). Note that this method always starts weeks on the same day of the week, which is different from standard `lubridate` procedure of counting sets of 7 days starting from January 1.
+* `.method = "day"` can be used with a single `Date`/`POSIX`/etc.-type variable (anything that allows `lubridate::date()`). It will give the earliest-observed day in the data set a value of `1`, and increment from there. Or, use it with a character or numeric variable and indicate with `.datepos` the character/digit positions that hold the year and month in YYMMDD or YYYYMMDD format. To skip certain days of the week, such as weekends, use the `.skip` option.
+* `.method = "turnover"` can be used when you have more than one variable in variable and they are all numeric nonnegative integers.
Set the `.turnover` option to indicate the highest value each variable takes before it starts over, and set `.turnover_start` to indicate what value it takes when it starts over. Cannot be combined with `.skip` or `.breaks`. Doesn't work with any variable for which the turnover values change, i.e. it doesn't play well with days-in-month - if you'd like to do something like year-month-day-hour, I recommend running `.method="day"` once with just the year-month-day variable, and then taking the result and combining *that* with hour in `.method = "turnover"`.
 
 ```{r}
 data(SPrail)

@@ -200,7 +217,9 @@ SPrail <- SPrail %>%
 )
 
 # Let's see what we've got
-head(SPrail %>% select(insert_date, ends_with("time_id")))
+SPrail %>%
+  select(insert_date, ends_with("time_id")) %>%
+  head()
 
 # Perhaps I'd like quarterly data
 # (although in this case there are only two months, not much variation there)
@@ -210,7 +229,8 @@ SPrail <- SPrail %>%
   .breaks = c(1, 4, 7, 10)
 ))
 
 # Should line up properly with month
-table(SPrail$month_time_id, SPrail$quarter_time_id, dnn = c('Month', 'Quarter'))
+SPrail %>%
+  count(month_time_id, quarter_time_id)
 
 # Maybe I'd like Monday to come immediately after Friday!
 SPrail <- SPrail %>%
@@ -254,17 +274,18 @@ You may wish to fill in either of these kinds of missing data using the data you
 
 ```{r, eval = FALSE}
 panel_fill(.df,
-           .set_NA = FALSE,
-           .min = NA,
-           .max = NA,
-           .backwards = FALSE,
-           .group_i = TRUE,
-           .flag = NA,
-           .i = NULL,
-           .t = NULL,
-           .d = 1,
-           .uniqcheck = FALSE,
-           .setpanel = TRUE)
+  .set_NA = FALSE,
+  .min = NA,
+  .max = NA,
+  .backwards = FALSE,
+  .group_i = TRUE,
+  .flag = NA,
+  .i = NULL,
+  .t = NULL,
+  .d = 1,
+  .uniqcheck = FALSE,
+  .setpanel = TRUE
+)
 ```
 
 `panel_fill()` will give us some newly-created observations, and we need to decide what to fill them in with. By default, it will fill in values using what we see in the most recent non-missing observation. But we can set `.backwards = TRUE` to use the *next* non-missing observation instead, or use `.set_NA` to fill the new observations with missing data.

@@ -273,12 +294,14 @@
 
 ```{r}
 # Note the gap between periods 2 and 4 for person 1.
-df <- pibble(i = c(1, 1, 1, 2, 2, 2),
-             t = c(2, 4, 5, 1, 2, 3),
-             x = 1:6,
-             y = 7:12,
-             .i = i,
-             .t = t)
+df <- pibble(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(2, 4, 5, 1, 2, 3),
+  x = 1:6,
+  y = 7:12,
+  .i = i,
+  .t = t
+)
 
 panel_fill(df, .set_NA = "y", .flag = "new_obs")
 panel_fill(df, .set_NA = "y", .backwards = TRUE)$x

@@ -300,51 +323,59 @@ The rest of the options include `.group_i` (by default, if `.i` can be found, da
 
 ```{r, eval = FALSE}
 panel_locf(.var,
-           .df = get(".", envir = parent.frame()),
-           .fill = NA,
-           .backwards = FALSE,
-           .resolve = "error",
-           .group_i = TRUE,
-           .i = NULL,
-           .t = NULL,
-           .d = 1,
-           .uniqcheck = FALSE)
+  .df = get(".", envir = parent.frame()),
+  .fill = NA,
+  .backwards = FALSE,
+  .resolve = "error",
+  .group_i = TRUE,
+  .i = NULL,
+  .t = NULL,
+  .d = 1,
+  .uniqcheck = FALSE
+)
 ```
 
 where `.var` is the variable to be filled in, and `.df` is the data set that variable lives in. If the data set is being passed in via `%>%`, then `.df` will automatically pick it up and you don't need to specify it.
```{r}
-df <- pibble(i = c(1, 1, 1, 2, 2, 2),
-             t = c(1, 2, 3, 2, 3, 4),
-             x = c(1, NA, 3, NA, -3, 4),
-             .i = i,
-             .t = t)
+df <- pibble(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(1, 2, 3, 2, 3, 4),
+  x = c(1, NA, 3, NA, -3, 4),
+  .i = i,
+  .t = t
+)
 
 # Notice that the fourth observation doesn't get filled in
 # because it's the first observation for person 2, so nothing to fill in from
-df %>% 
+df %>%
   mutate(x_filled = panel_locf(x))
```

You have a fair amount of control over how filling-in works. By default, data will be filled in using the most recent previous observation. But `.backwards = TRUE` will use the *next upcoming* observation instead. Also, by default, only `NA` values will be overwritten. But `.fill` will allow you to specify a vector of values (perhaps including `NA`) to be overwritten. This can be handy if you're working with data that uses missingness indicators other than `NA`.

```{r}
-df %>% mutate(x_filled = panel_locf(x, .backwards = TRUE),
-              x_no_neg3 = panel_locf(x, .backwards = TRUE, .fill = c(NA, -3)))
+df %>% mutate(
+  x_filled = panel_locf(x, .backwards = TRUE),
+  x_no_neg3 = panel_locf(x, .backwards = TRUE, .fill = c(NA, -3))
+)
```

`panel_locf()` will work even if `.i` and `.t` don't uniquely identify the observations. However, this presents a problem! If there are *different values* of `.var` for a given combination of `.i` and `.t`, then which value do we choose to use for the purpose of filling in other observations? `.resolve` makes this choice. By default, there will be an "error" if values of `.var` are inconsistent within `.i` and `.t`. Or, set `.resolve` to a summary function like `.resolve = mean` or `.resolve = function(x) mean(x, na.rm = TRUE)` to resolve inconsistencies before filling in. If you have some `.i`/`.t` combinations with both missing and non-missing values, the missing values will be filled in using the same function.

```{r}
-inconsistent_df <- pibble(i = c(1, 1, 1, 2, 2, 2),
-                          t = c(1, 1, 2, 1, 2, 3),
-                          x = c(1, 2, NA, 1, 2, 3),
-                          .i = i,
-                          .t = t)
-
-inconsistent_df %>% mutate(x_filled =
-                             panel_locf(x, .resolve = mean))
+inconsistent_df <- pibble(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(1, 1, 2, 1, 2, 3),
+  x = c(1, 2, NA, 1, 2, 3),
+  .i = i,
+  .t = t
+)
 
+inconsistent_df %>% mutate(
+  x_filled =
+    panel_locf(x, .resolve = mean)
+)
```

The rest of the options include `.group_i` (by default, if `.i` can be found, data will be filled within-individual. Set `.group_i = FALSE` to ignore this), and standard arguments related to declaring the panel structure of the data (`.i`, `.t`, `.d`, `.uniqcheck`, see the "pibble" section above).
@@ -358,9 +389,11 @@ In panel data, and especially hierarchical data, there are some variables that s
 
 For example, consider the data set
 
 ```{r}
-df <- data.frame(continent = c("Asia", "Europe", "Europe", "S America", "S America"),
-                 country = c("France", "France", "France", "Brazil", "Brazil"),
-                 year = c(2000, 2001, 2002, 2000, 2001))
+df <- data.frame(
+  continent = c("Asia", "Europe", "Europe", "S America", "S America"),
+  country = c("France", "France", "France", "Brazil", "Brazil"),
+  year = c(2000, 2001, 2002, 2000, 2001)
+)
 
 df
 ```

@@ -373,8 +406,9 @@ The variable `continent` should never change within values of `country` - a coun
 
 ```{r, eval = FALSE}
 fixed_check(.df,
-            .var = NULL,
-            .within = NULL)
+  .var = NULL,
+  .within = NULL
+)
 ```
 
 You should pick variables for `.var` that are supposed to be constant within combinations of `.within`.

@@ -385,13 +419,15 @@ If your data has problems and is inconsistent, `fixed_check()` will retun a list
 
 fixed_check(df, .var = continent, .within = country)$continent
 ```
 
-If your data is fine, and all `.var` variables are indeed constant within combinations of `.within`, then `fixed_check()` will return TRUE.
+If your data is fine, and all `.var` variables are indeed constant within combinations of `.within`, then `fixed_check()` will return `TRUE`.
 
 ```{r}
-consistent_df <- data.frame(state = c(1, 1, 1, 2, 2, 2),
-                            year = c(2000, 2001, 2001, 2000, 2000, 2001),
-                            treatment = c(F, T, T, T, T, F),
-                            outcome = c(4.4, 3.2, 3.4, 5.5, 5.6, 8))
+consistent_df <- data.frame(
+  state = c(1, 1, 1, 2, 2, 2),
+  year = c(2000, 2001, 2001, 2000, 2000, 2001),
+  treatment = c(F, T, T, T, T, F),
+  outcome = c(4.4, 3.2, 3.4, 5.5, 5.6, 8)
+)
 
 # Since this policy treatment is administered on the state level,
 # everyone in the same state/year should get the same treatment.

@@ -410,10 +446,11 @@ Some handy `fixed_check()` tips:
 
 ```{r, eval = FALSE}
 fixed_force(.df,
-            .var = NULL,
-            .within = NULL,
-            .resolve = mode_order,
-            .flag = NA)
+  .var = NULL,
+  .within = NULL,
+  .resolve = mode_order,
+  .flag = NA
+)
 ```
 
 The default resolution function is `mode_order()` (see the Additional Calculations section), which calculates the mode, selecting the first-ordered value in the data if there are ties. The mode seems most relevant here, since the most likely (and responsible) use for `fixed_force()` is when you have data that is mostly correct but just has a few odd values that are likely just miscodes. `mode_order()` also is not just limited to numeric variables.

@@ -433,27 +470,35 @@ fixed_force(df, .var = continent, .within = country, .resolve = "drop")
 
 # Joins
 
-`pmdplyr` offers a set of wrappers for the `dplyr::join` functions.
+`pmdplyr` offers a set of wrappers for the `dplyr::join()` functions.
+
+## inexact_join()
 
-## inexact_join
+The set of `inexact_join()` functions maps directly onto the set of `dplyr::join()` functions:
 
-The set of `inexact_join` functions maps directly onto the set of `dplyr::join` functions: `inexact_inner_join, inexact_left_join, inexact_right_join, inexact_full_join, inexact_semi_join, inexact_nest_join`, and `inexact_anti_join`.
+- **Mutating joins**: `inexact_inner_join()`, `inexact_left_join()`, `inexact_right_join()`, `inexact_full_join()`
+- **Filtering joins**: `inexact_semi_join()`, and `inexact_anti_join()`
+- **Nesting joins**: `inexact_nest_join()`
 
-Here we will focus specifically on `inexact_left_join`; for the differences between the functions see the descriptions of the original join functions at `help(join, package = "dplyr")`.
+Here we will focus specifically on `inexact_left_join()`. For the differences between the functions, see `dplyr::join()`.
 
-`join` functions take two data sets and join them based on matching values of a set of shared variables.
+`join()` functions take two data sets and join them based on matching values of a set of shared variables.
 
 ```{r}
-left_df <- data.frame(i = c(1, 1, 1, 2, 2, 2),
-                      t = c(1, 2, 3, 1, 2, 3),
-                      v1 = 1:6)
-right_df <- data.frame(i = c(1, 1, 1, 2, 2, 2),
-                       t = c(0, 2, 4, 0, 2, 4),
-                       v2 = 7:12)
+left_df <- data.frame(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(1, 2, 3, 1, 2, 3),
+  v1 = 1:6
+)
+right_df <- data.frame(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(0, 2, 4, 0, 2, 4),
+  v2 = 7:12
+)
 
 # It automatically detects that i and t are the shared variables
 # and finds two combinations of those in left_df that are also
-# in right_df: i = 1, t = 2, and i = 2, t = 2. So it brings the 
+# in right_df: i = 1, t = 2, and i = 2, t = 2. So it brings the
 # v2 values it can match up in to the joined data.
 # Other observations don't find a match
 left_join(left_df, right_df)
 ```

@@ -461,21 +506,22 @@ However, it is common (especially in a panel data context) to want to join two data frames where one of the variables does not line up exactly. For example, maybe we want those `t = 1` values in `left_df` to pick up the `t = 0` values in `right_df`.
 
-We can do this, in a few different ways with an `inexact_join`:
+We can do this, in a few different ways with an `inexact_join()`:
 
 ```{r, eval = FALSE}
 inexact_left_join(x, y,
-                  by = NULL,
-                  copy = FALSE,
-                  suffix = c(".x", ".y"),
-                  ...,
-                  var = NULL,
-                  jvar = NULL,
-                  method,
-                  exact = TRUE)
+  by = NULL,
+  copy = FALSE,
+  suffix = c(".x", ".y"),
+  ...,
+  var = NULL,
+  jvar = NULL,
+  method,
+  exact = TRUE
+)
 ```
 
-The first arguments: `x, y, by, copy, suffix, ...`, are standard arguments to be passed to `left_join`. `x` and `y` are our left-hand and right-hand data sets, respectively. See `help(left_join, package = "dplyr")` for the rest.
+The first arguments: `x, y, by, copy, suffix, ...`, are standard arguments to be passed to `left_join()`. `x` and `y` are our left-hand and right-hand data sets, respectively. See `dplyr::left_join()` for the rest.
 
 We've added on here `var, jvar, method`, and `exact`.

@@ -491,28 +537,31 @@ right_df <- right_df %>%
 
* `method = "last"` matches `var` to the closest value of `jvar` that is *lower*, so those `t = 1` observations will get matched to `t_right = 0`, and `t = 3` will get matched to `t_right = 2` (meaning that `t_right = 2` will get matched to both `t = 2` and `t = 3`):
 
 ```{r}
-inexact_left_join(left_df, 
-                  right_df,
-                  var = t, jvar = t_right,
-                  method = "last")
+inexact_left_join(left_df,
+  right_df,
+  var = t, jvar = t_right,
+  method = "last"
+)
 ```
 
* `method = "next"` matches `var` to the closest value of `jvar` that is *higher*, so now `t = 1` will get matched to `t_right = 2`, and `t = 3` will get matched to `t_right = 4`:
 
 ```{r}
-inexact_left_join(left_df, 
-                  right_df,
-                  var = t, jvar = t_right,
-                  method = "next")
+inexact_left_join(left_df,
+  right_df,
+  var = t, jvar = t_right,
+  method = "next"
+)
 ```
 
* `method = "closest"` will match `var` to the closest value of `jvar` in either direction. If there's a tie, it will pick the lower value of `jvar`.
So now `t = 1` will pick `t_right = 0` (out of a tie between `0` and `2`), and `t = 3` will match to `t = 2`:
 
 ```{r}
-inexact_left_join(left_df, 
-                  right_df,
-                  var = t, jvar = t_right,
-                  method = "closest")
+inexact_left_join(left_df,
+  right_df,
+  var = t, jvar = t_right,
+  method = "closest"
+)
 ```
 
* Finally, `method = "between"` is for matching `var` to a set of two `jvar`s that define the beginning and end of a *range*. Make sure that the ranges are non-overlapping within the joining variables, or else you will get strange results (specifically, it should join to the earliest-starting range). So now, given the way we define `t_bottom` and `t_top` below, `t = 1` should go in the range `t_bottom = 0, t_top = 2`, and `t = 2` and `t = 3` should both go in the range `t_bottom = 2, t_top = 4`.

@@ -522,32 +571,34 @@ right_df <- right_df %>%
   rename(t_bottom = t_right) %>%
   mutate(t_top = t_bottom + 2)
 
-inexact_left_join(left_df, 
-                  right_df,
-                  var = t, jvar = c(t_bottom, t_top),
-                  method = "between")
+inexact_left_join(left_df,
+  right_df,
+  var = t, jvar = c(t_bottom, t_top),
+  method = "between"
+)
 ```
 
 So that leaves us with `exact`. `exact` determines whether or not an exact match is an acceptable match, and interprets `"last"` as "this value or earlier" and `"next"` as "this value or later". Generally, for joining purposes, you'll want this to be `TRUE`. But perhaps you don't! Maybe you want "earlier" or "later" only to get something like "the most recent previous value" for `method = "last"`. In that case, set this to `FALSE`.
 
-In the case of `method = "between"`, it's especially important to keep track of `exact` because it's common for one range to start at the exact endpoint of another. If the end of one range is the exact start of another, `exact = c(TRUE,FALSE)` or `exact = c(FALSE,TRUE)` is recommended to avoid overlaps. Defaults to `exact = c(TRUE,FALSE)`.
+In the case of `method = "between"`, it's especially important to keep track of `exact` because it's common for one range to start at the exact endpoint of another. If the end of one range is the exact start of another, `exact = c(TRUE, FALSE)` or `exact = c(FALSE, TRUE)` is recommended to avoid overlaps. Defaults to `exact = c(TRUE, FALSE)`.
 
 ## safe_join()
 
 When joining two data sets `x` and `y` on a set of shared variables `by`, there are four ways in which they can be matched: one-to-many (`by` uniquely identifies rows in `x` but not `y`, so each observation in `x` will be matched to several in `y`), many-to-one (`by` uniquely identifies rows in `y` but not `x`, so each observation in `y` will be matched to several in `x`), one-to-one (`by` uniquely identifies rows in both `x` and `y`, so each observation in `x` will be matched to exactly one in `y`), and many-to-many (`by` does not uniquely identify rows in either `x` or `y`).
 
-Unfortunately, when you perform a `join` or `inexact_join`, it doesn't tell you which of those you've just done! This can be especially problematic if you've accidentally done a many-to-many join, since many-to-many join often leads to unexpected results.
+Unfortunately, when you perform a `join()` or `inexact_join()`, it doesn't tell you which of those you've just done! This can be especially problematic if you've accidentally done a many-to-many join, since many-to-many join often leads to unexpected results.
 
-`safe_join()` is a wrapper for all `join` and `inexact_join` functions which tells you whether you are, in fact, doing the join you expect to be doing, and returns an error if you're not.
 
 ## safe_join()
 
 When joining two data sets `x` and `y` on a set of shared variables `by`, there are four ways in which they can be matched: one-to-many (`by` uniquely identifies rows in `x` but not `y`, so each observation in `x` will be matched to several in `y`), many-to-one (`by` uniquely identifies rows in `y` but not `x`, so each observation in `y` will be matched to several in `x`), one-to-one (`by` uniquely identifies rows in both `x` and `y`, so each observation in `x` will be matched to exactly one in `y`), and many-to-many (`by` does not uniquely identify rows in either `x` or `y`).
 
-Unfortunately, when you perform a `join` or `inexact_join`, it doesn't tell you which of those you've just done! This can be especially problematic if you've accidentally done a many-to-many join, since many-to-many join often leads to unexpected results.
+Unfortunately, when you perform a `join()` or `inexact_join()`, it doesn't tell you which of those you've just done! This can be especially problematic if you've accidentally done a many-to-many join, since many-to-many joins often lead to unexpected results.
 
-`safe_join()` is a wrapper for all `join` and `inexact_join` functions which tells you whether you are, in fact, doing the join you expect to be doing, and returns an error if you're not.
+`safe_join()` is a wrapper for all `join()` and `inexact_join()` functions that tells you whether you are, in fact, doing the join you expect to be doing, and returns an error if you're not.
 
 ```{r, eval = FALSE}
 safe_join(x, y,
-          expect = NULL,
-          join = NULL,
-          ...)
+  expect = NULL,
+  join = NULL,
+  ...
+)
 ```
 
-`x`, `y`, and `...` are the standard `join`/`inexact_join` arguments that you would normally use. See `help(join, package = "dplyr")` or the `inexact_join` section above to see what arguments might go in `...` to pass through to those functions, such as `suffix` or `var`.
+`x`, `y`, and `...` are the standard `join()`/`inexact_join()` arguments that you would normally use. See `dplyr::join()` or the `inexact_join()` section above for the arguments that might go in `...` and be passed through to those functions, such as `suffix` or `var`.
 
 `expect` is a character variable where you specify the type of join you *think* you're about to do. You can specify this either as one-to-many / many-to-one / one-to-one directly, or you can specify which of the two data sets (`x` or `y`) you think should be uniquely identified by the joining variables.
 
@@ -557,13 +608,13 @@ safe_join(x, y,
 * `expect = "no m:m"` indicates that you don't care whether you're one-to-one, one-to-many, or many-to-one, as long as you're not many-to-many.
 * There is no `expect` option that allows you to run a many-to-many join.
 
-`safe_join` will return an error if your data do not match your `expect` selection.
+`safe_join()` will return an error if your data do not match your `expect` selection.
 
-If your data *does* match your `expect` option, then it will look to your `join`. `join` is the function for the `join` or `inexact_join` you'd like to run, for example `join = inexact_left_join`. If run without a `join` specified, `safe_join()` will return `TRUE` if you're good to go. If run with a `join` specified, then instead `safe_join()` will pass your data on to the function and actually run the join for you.
+If your data *do* match your `expect` option, then it will look to your `join`. `join` is the `join` or `inexact_join` function you'd like to run, for example `join = inexact_left_join`. If run without a `join` specified, `safe_join()` will return `TRUE` if you're good to go. If run with a `join` specified, then instead `safe_join()` will pass your data on to the function and actually run the join for you.
 
-There is little reason to run any `join` or `inexact_join` without going through `safe_join()`. It will help you avoid some nasty surprises!
+There is little reason to run any `join()` or `inexact_join()` without going through `safe_join()`. It will help you avoid some nasty surprises!
 
 ```{r}
 # left is panel data and i does not uniquely identify observations
@@ -612,14 +663,15 @@ Another common use is to make specific comparisons within groups. If I want to k
 
 ```{r, eval = FALSE}
 mutate_subset(.df,
-              ...,
-              .filter,
-              .group_i = TRUE,
-              .i = NULL,
-              .t = NULL,
-              .d = NA,
-              .uniqcheck = FALSE,
-              .setpanel = TRUE)
+  ...,
+  .filter,
+  .group_i = TRUE,
+  .i = NULL,
+  .t = NULL,
+  .d = NA,
+  .uniqcheck = FALSE,
+  .setpanel = TRUE
+)
 ```
 
-where `.df` is the data set being mutated and `...` is a set of name-value pairs of expressions in the style of `dplyr::mutate`. Note that, since the idea here is to get a summary measure from a filtered group, expressions should be written such that they would be valid arguments in `dplyr::summarize()`.
+where `.df` is the data set being mutated and `...` is a set of name-value pairs of expressions in the style of `dplyr::mutate()`. Note that, since the idea here is to get a summary measure from a filtered group, expressions should be written such that they would be valid arguments in `dplyr::summarize()`.
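+
+To make the `summarize()`-style requirement concrete, here is a hypothetical sketch. The column names (`earn`, `college`) are placeholders matching the example data defined just below:
+
+```{r, eval = FALSE}
+# Fine: mean(earn) collapses the filtered rows to one summary value,
+# which mutate_subset() then spreads across every row of the group
+df %>% mutate_subset(avg_earn = mean(earn), .filter = college)
+
+# Not fine: earn - mean(earn) returns one value per row,
+# so it is not a valid summarize()-style expression
+df %>% mutate_subset(earn_dev = earn - mean(earn), .filter = college)
+```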
@@ -629,10 +681,12 @@ where `.df` is the data set being mutated and `...` is a set of name-value pairs
 
 Let's perform the analysis we described above, comparing an individual's earnings to the average earnings of college graduates in their state:
 
 ```{r}
-df <- pibble(state = c("CA", "CA", "CA", "NV", "NV", "NV"),
-             college = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
-             earn = c(1, 2, 3, 2, 3, 2),
-             .i = state)
+df <- pibble(
+  state = c("CA", "CA", "CA", "NV", "NV", "NV"),
+  college = c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE),
+  earn = c(1, 2, 3, 2, 3, 2),
+  .i = state
+)
 
 df %>%
   # Calculate average earnings of college grads
@@ -653,15 +707,16 @@ As of this writing `mutate_cascade()` is pretty darn slow (after all, if you hav
 
 ```{r, eval = FALSE}
 mutate_cascade(.df,
-               ...,
-               .skip = TRUE,
-               .backwards = FALSE,
-               .group_i = TRUE,
-               .i = NULL,
-               .t = NULL,
-               .d = NA,
-               .uniqcheck = FALSE,
-               .setpanel = TRUE)
+  ...,
+  .skip = TRUE,
+  .backwards = FALSE,
+  .group_i = TRUE,
+  .i = NULL,
+  .t = NULL,
+  .d = NA,
+  .uniqcheck = FALSE,
+  .setpanel = TRUE
+)
 ```
 
 where `.df` is the data set being mutated, and `...` is the list of expressions to be passed to `dplyr::mutate()`.
 
@@ -673,11 +728,13 @@ where `.df` is the data set being mutated, and `...` is the list of expressions
 
 Let's do a very simple example and use `mutate_cascade()` to build a present discounted value. We have an asset with a `payout` each period, and we have a discount factor `.95`. We can build a present discounted value `PDV` by taking the `PDV` in the next period, multiplying it by `.95`, and adding on the current `payout`. But we need to calculate `PDV` one period at a time, so that we can use each period's calculation to calculate the previous one.
 
 ```{r}
-df <- pibble(t = c(1, 2, 3, 4, 5),
-             payout = c(3, 4, 2, 2, 4),
-             .t = t) %>%
+df <- pibble(
+  t = c(1, 2, 3, 4, 5),
+  payout = c(3, 4, 2, 2, 4),
+  .t = t
+) %>%
   mutate(PDV = payout) %>%
-  mutate_cascade(PDV = payout + .95*tlag(PDV, .n = -1), .backwards = TRUE)
+  mutate_cascade(PDV = payout + .95 * tlag(PDV, .n = -1), .backwards = TRUE)
 ```
 
 As expected, the `PDV` in period `5` is just the payout: `4`. In period `4` it's `2 + .95*4 = 5.8`. Then in period `3` it's `2 + .95*5.8 = 7.51`, and so on.
 
@@ -692,16 +749,17 @@ The rest of the options include `.group_i` (by default, if `.i` can be found, an
 
 ```{r, eval = FALSE}
 tlag(.var,
-     .df = get(".", envir = parent.frame()),
-     .n = 1,
-     .default = NA,
-     .quick = FALSE,
-     .resolve = "error",
-     .group_i = TRUE,
-     .i = NULL,
-     .t = NULL,
-     .d = NA,
-     .uniqcheck = FALSE)
+  .df = get(".", envir = parent.frame()),
+  .n = 1,
+  .default = NA,
+  .quick = FALSE,
+  .resolve = "error",
+  .group_i = TRUE,
+  .i = NULL,
+  .t = NULL,
+  .d = NA,
+  .uniqcheck = FALSE
+)
 ```
 
-where `.var` is the variable being lagged, , and `.df` is the data set that variable lives in. If the data set is being passed in via `%>%`, then `.df` will automatically pick it up and you don't need to specify it.
+where `.var` is the variable being lagged, and `.df` is the data set that variable lives in. If the data set is being passed in via `%>%`, then `.df` will automatically pick it up and you don't need to specify it.
 
@@ -713,17 +771,21 @@ where `.var` is the variable being lagged, , and `.df` is the data set that vari
 
-`.quick` is a setting you can use if your data is very nicely structured, with rows uniquely identified by `.i`/`.t` and there are either no gaps between time periods or `.d = 0`. `tlag()` will run more quickly with `.quick = TRUE`, but will produce incorrect results if these conditions are not met.
+`.quick` is a setting you can use if your data is very nicely structured, with rows uniquely identified by `.i`/`.t` and either no gaps between time periods or `.d = 0`. `tlag()` will run more quickly with `.quick = TRUE`, but will produce incorrect results if these conditions are not met.
 ```{r}
-df <- pibble(i = c(1, 1, 1, 2, 2, 2),
-             t = c(1, 2, 3, 1, 2, 3),
-             x = 1:6,
-             .i = i,
-             .t = t) %>%
+df <- pibble(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(1, 2, 3, 1, 2, 3),
+  x = 1:6,
+  .i = i,
+  .t = t
+) %>%
   # A lag and a lead, filling in the lead with 0 instead of NA
-  mutate(x_lag = tlag(x),
-         x_lead = tlag(x, .n = -1, .default = 0),
-         # Our data satisfies the .quick conditions so we can
-         # do that for a little extra speed
-         x_quicklag = tlag(x, .quick = TRUE))
+  mutate(
+    x_lag = tlag(x),
+    x_lead = tlag(x, .n = -1, .default = 0),
+    # Our data satisfies the .quick conditions so we can
+    # do that for a little extra speed
+    x_quicklag = tlag(x, .quick = TRUE)
+  )
 
 df
 ```
@@ -731,11 +793,13 @@ df
 
-If `.var` is not constant within combinations of `.i` and `.t` we have a problem! Which value do we choose to use for the purpose of filling in other observations? `.resolve` makes this choice. By default, there will be an "error" if values of `.var` are inconsistent within `.i` and `.t`. Or, set `.resolve` to a summary function like `.resolve = mean` or `.resolve = function(x) mean(x, na.rm = TRUE)` to resolve inconsistencies before filling in.
+If `.var` is not constant within combinations of `.i` and `.t`, we have a problem! Which value do we use to fill in other observations? `.resolve` makes this choice. By default, there will be an "error" if values of `.var` are inconsistent within `.i` and `.t`. Or, set `.resolve` to a summary function like `.resolve = mean` or `.resolve = function(x) mean(x, na.rm = TRUE)` to resolve inconsistencies before filling in.
 
 ```{r}
-df <- pibble(i = c(1, 1, 1, 2, 2, 2),
-             t = c(1, 1, 2, 1, 1, 2),
-             x = 1:6,
-             .i = i,
-             .t = t) %>%
+df <- pibble(
+  i = c(1, 1, 1, 2, 2, 2),
+  t = c(1, 1, 2, 1, 1, 2),
+  x = 1:6,
+  .i = i,
+  .t = t
+) %>%
   mutate(x_lag = tlag(x, .resolve = mean))
 
 df
 ```
 
@@ -762,15 +826,16 @@ where $\bar{x}_i$ is the mean of `x` within the `.i` groups, and $\bar{x}$ is th
 
 Be aware that this is different from `plm::between()`, which returns $\bar{x}_i$ and does not subtract out $\bar{x}$.
 
-The syntax for `between_i` is:
+The syntax for `between_i()` is:
 
 ```{r, eval = FALSE}
 between_i(.var,
-          .df = get(".", envir = parent.frame()),
-          .fcn = function(x) mean(x, na.rm = TRUE),
-          .i = NULL,
-          .t = NULL,
-          uniqcheck = FALSE)
+  .df = get(".", envir = parent.frame()),
+  .fcn = function(x) mean(x, na.rm = TRUE),
+  .i = NULL,
+  .t = NULL,
+  uniqcheck = FALSE
+)
 ```
 
 Where `.var` is the variable on which the transformation is performed, and `.df` is the data set. If the data set is being passed in via `%>%`, then `.df` will automatically pick it up and you don't need to specify it. `.fcn` is the function applied to calculate the group and grand values, i.e. $.fcn(x) = \bar{x}$. The standard definition of the between transformation is for this to be the mean, but it has been left flexible.
 
@@ -780,9 +845,11 @@ The rest of the options include standard arguments related to declaring the pane
 
 An example of the between transformation follows:
 
 ```{r}
-df <- pibble(i = c(1, 1, 2, 2),
-             x = 1:4,
-             .i = i) %>%
+df <- pibble(
+  i = c(1, 1, 2, 2),
+  x = 1:4,
+  .i = i
+) %>%
   mutate(between_x = between_i(x))
 
 # Notice that the grand mean is...
 
 df %>%
   group_by(i) %>%
   summarize(x = mean(x))
 
-# So the between calculation should be 
+# So the between calculation should be
 # 1.5 - 2.5 = -1 and 3.5 - 2.5 = 1 for the different groups:
 df$between_x
 ```
 
@@ -811,11 +878,12 @@ The syntax for `within_i` is:
 
 ```{r, eval = FALSE}
 within_i(.var,
-         .df = get(".", envir = parent.frame()),
-         .fcn = function(x) mean(x, na.rm = TRUE),
-         .i = NULL,
-         .t = NULL,
-         uniqcheck = FALSE)
+  .df = get(".", envir = parent.frame()),
+  .fcn = function(x) mean(x, na.rm = TRUE),
+  .i = NULL,
+  .t = NULL,
+  uniqcheck = FALSE
+)
 ```
 
 Where `.var` is the variable on which the transformation is performed, and `.df` is the data set.
If the data set is being passed in via `%>%`, then `.df` will automatically pick it up and you don't need to specify it. `.fcn` is the function applied to calculate the group values, i.e. $.fcn(x) = \bar{x}_i$. The standard definition of the within transformation is for this to be the mean, but it has been left flexible.
 
@@ -825,9 +893,11 @@ The rest of the options include standard arguments related to declaring the pane
 
-An example of the between transformation follows:
+An example of the within transformation follows:
 
 ```{r}
-df <- pibble(i = c(1, 1, 2, 2),
-             x = 1:4,
-             .i = i) %>%
+df <- pibble(
+  i = c(1, 1, 2, 2),
+  x = 1:4,
+  .i = i
+) %>%
   mutate(within_x = within_i(x))
 
 # Notice that the mean within groups is...
 
 df %>%
   group_by(i) %>%
   summarize(x = mean(x))
 
-# So the between calculation should be 
-# 1 - 1.5 = -.5 and 2 - 1.5 = .5 for individual 1
-# and 3 - 3.5 = -.5 and 4 - 3.5 = .5 individual 2:
+# So the within calculation should be
+# 1 - 1.5 = -.5 and 2 - 1.5 = .5 for individual 1
+# and 3 - 3.5 = -.5 and 4 - 3.5 = .5 for individual 2:
 df$within_x
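 ```
 
+One way to see how `between_i()` and `within_i()` fit together: with the default mean-based `.fcn`, the original variable decomposes as $x = \bar{x} + (\bar{x}_i - \bar{x}) + (x - \bar{x}_i)$, i.e. the grand mean plus the between component plus the within component. A quick sketch of that check, using the `df` from this section:
+
+```{r, eval = FALSE}
+# Grand mean + between component + within component recovers x:
+# 2.5 + c(-1, -1, 1, 1) + c(-.5, .5, -.5, .5) = 1:4
+df %>%
+  mutate(x_rebuilt = mean(x) + between_i(x) + within_i(x))
+```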