In [None]:
#' rlookup - Reverse Lookup
#'
#' Performs a reverse lookup to find a unique identifier for data that has non-unique identifiers
#' 
#' The rlookup function helps when data is collected about known entities (for example, customers), but the collected data does not contain a unique
#' identifier for each entity (for example, customer survey data that does not contain the customer id). If the data contains non-unique identifiers (for example, first name, last name, age, and city), the rlookup function compares pairs of these non-unique identifiers with a lookup table (such as a customer database) to find unique matches between the two datasets. The rlookup function then adds the unique identifier (e.g. the customer id) to the
#' entities in the data that lacks the unique identifier (e.g. the survey data), so that traditional data tools can connect the two data sources.
#'
#' Version: 0.1.0
#' Author: Jacob J. Walker

In [None]:
# Initialize Packages  
# Automatically installs any packages that are not yet installed, then loads them
# When this is made as an actual package, it will need to be trimmed down to only load the truly necessary packages

if (!require('tidyverse')) install.packages('tidyverse'); library('tidyverse')          # Loads dplyr, and all the other parts of tidyverse
if (!require('available')) install.packages('available'); library('available')          # Used to see if a package name is available

In [None]:
# Features to be added (product backlog)
# --------------------------------------
# Add uid field to tbl_missing_uid if it doesn't have the field already
# Have parameter to choose whether to return all rows from the original tbl_missing_uid, or only those with uids that were found, or uids still missing
# Have the capability to do more permutations than just pairs of nuids
# Increase the efficiency of the code, so that it can run quicker over big data sets

In [None]:
# Defines rlookup function

#' Reverse lookup of a unique identifier
#'
#' For every row of `tbl_missing_uid`, each pair of non-unique identifier
#' (nuid) columns is compared with `lookup_tbl`. When a pair of values matches
#' exactly one row of `lookup_tbl`, that row's unique identifier is copied into
#' the uid column of the returned table.
#'
#' @param tbl_missing_uid Data frame or tibble whose rows need a unique
#'   identifier.
#' @param lookup_tbl Data frame or tibble that holds the unique identifier
#'   together with the non-unique identifiers.
#' @param uid Column mapping for the unique identifier, passed as the `by`
#'   argument of `common_by()`, e.g. `c("id" = "num")` where the name is the
#'   column in `tbl_missing_uid` and the value is the column in `lookup_tbl`.
#'   Only the first mapping is used. If `NULL`, `common_by()` resolves it from
#'   the column names the two tables share.
#' @param nuids Column mapping for the non-unique identifiers, in the same
#'   form as `uid`. When different pairs point at different lookup rows, the
#'   pair built from the earliest-listed nuids wins.
#' @return A copy of `tbl_missing_uid` whose uid column holds the identifier
#'   found in `lookup_tbl`, or `NA` where no pair of nuids matched exactly one
#'   lookup row. Any values previously stored in the uid column are discarded.
rlookup <- function(tbl_missing_uid, lookup_tbl, uid = NULL, nuids = NULL) {

    # Resolve the column mappings for tbl_missing_uid (x) and lookup_tbl (y)
    # from what the user entered, or from shared column names if nothing was
    # entered
    uid <- common_by(by = uid, tbl_missing_uid, lookup_tbl)
    nuids <- common_by(by = nuids, tbl_missing_uid, lookup_tbl)

    # Only the first uid mapping is used
    uid_x <- uid$x[[1]]
    uid_y <- uid$y[[1]]

    # Reverse the nuid order: later matches overwrite earlier ones in the loop
    # below, so the pairs made of the first-listed nuids must be tried last to
    # take priority in the final result
    nuid_x <- rev(nuids$x)
    nuid_y <- rev(nuids$y)

    # Fail early with a clear message when a referenced column does not exist
    absent_x <- setdiff(nuid_x, names(tbl_missing_uid))
    if (length(absent_x) > 0) {
        stop("Column(s) not found in tbl_missing_uid: ",
             paste(absent_x, collapse = ", "), call. = FALSE)
    }
    absent_y <- setdiff(c(uid_y, nuid_y), names(lookup_tbl))
    if (length(absent_y) > 0) {
        stop("Column(s) not found in lookup_tbl: ",
             paste(absent_y, collapse = ", "), call. = FALSE)
    }

    lookup_uid <- lookup_tbl[[uid_y]]

    # Work on a copy, and reset its uid column to NA values of the same type
    # as the lookup's uid so that matched identifiers can be stored without
    # type conflicts
    result <- tbl_missing_uid
    result[[uid_x]] <- rep(lookup_uid[NA_integer_], nrow(result))

    # Loop through the rows to see if any pair of nuids has a unique match in
    # lookup_tbl
    for (row in seq_len(nrow(result))) {
        for (first in seq_along(nuid_x)) {
            # Pair `first` only with the columns that come after it
            for (second in seq_along(nuid_x)[-seq_len(first)]) {
                # which() ignores NA comparisons, so missing values never match
                hits <- which(
                    lookup_tbl[[nuid_y[first]]] == result[[nuid_x[first]]][[row]] &
                        lookup_tbl[[nuid_y[second]]] == result[[nuid_x[second]]][[row]]
                )
                if (length(hits) == 1L) {
                    result[[uid_x]][row] <- lookup_uid[hits]
                }
            }
        }
    }

    # Returns the result table
    result
}

In [None]:
# Load example tbl_missing_uid
# `id` starts as a numeric NA placeholder so that it has the same type as
# `lookup_tbl$num`, the column its values are copied from by rlookup
tbl_missing_uid <- tibble(id = NA_real_,
                         first_name =  c("Barack",     "George",     "Bill",       "George",     "Ronald",     "Jimmy"     ),
                         middle_name = c("Hussein",    "Walker",     "",           "H.W.",       "Wilson",     ""          ),
                         last_name  =  c("Obama",      "Bush",       "Clinton",    "Bush",       "Reagon",     "Carter"    ),
                         dob = as.Date(c("1961-08-04", "1946-07-06", "1946-08-19", "1924-06-12", "1911-02-06", "1924-10-01")))

# Load example lookup_tbl
# Holds the unique identifier (`num`) next to the non-unique identifiers; some
# names are spelled differently from tbl_missing_uid (e.g. "Bill" vs "William")
lookup_tbl <-   tibble(num = c(44, 43, 42, 41, 40, 39),
                       fname = c("Barack",     "George",     "William",       "George",               "Ronald",     "James"     ),
                       mname = c("Hussein",    "Walker",     "Jefferson",     "Herbert Walker",       "Wilson",     "Earl"      ),
                       lname = c("Obama",      "Bush",       "Clinton",       "Bush",                 "Reagon",     "Carter"    ),
               bdate = as.Date(c("1961-08-04", "1946-07-06", "1946-08-19",    "1924-06-12",           "1911-02-06", "1924-10-01")))

In [None]:
# Fill in `id` from `lookup_tbl$num`, using the name and birth-date columns as
# the non-unique identifiers (left: tbl_missing_uid column, right: lookup_tbl column)
result <- rlookup(
    tbl_missing_uid,
    lookup_tbl,
    uid = c(id = "num"),
    nuids = c(
        first_name = "fname",
        middle_name = "mname",
        last_name = "lname",
        dob = "bdate"
    )
)

In [None]:
# Display the table returned by rlookup
result