In [None]:
#' rlookup - Reverse ID Lookup
#'
#' Looks up a unique identifier for an entity from an authoritative source 
#' by using multiple non-unique and/or partially unreliable individual identifiers
#'
#' Author: Jacob J. Walker
#'
#' Note: Currently this is a linear script, but it will be changed into a function in the future to be released as an R Package
#' Also, it is very likely that this could be done much more efficiently, but this works for now

In [None]:
# Initialize Packages
# Installs tidyverse if it is not already installed, then attaches it
# When this is made as an actual package, it will need to be trimmed down to only load the truly necessary packages

# requireNamespace() only checks that the package is available (without attaching it);
# library() does the attaching and stops with an error if the package still cannot be loaded
if (!requireNamespace("tidyverse", quietly = TRUE)) {
    install.packages("tidyverse")
}
library(tidyverse)          # Loads dplyr, and all the other parts of tidyverse

In [None]:
# Variables/Objects
#
# Data Frames / Tibbles
# ---------------------
# source - Source file which needs the unique identifier
# target - Target file where transformed source file will be saved
# lookup_table - table of unique identifiers and other identifiers
#
# Columns (Use with !! and := notation for tibbles)
# -------
# target_uid_column - name of the column that will contain the unique identifier in the target
# lookup_uid_column - name of the column that contains the unique identifier in the lookup_table
# source_nuid_columns - vector of names of the columns that will be used in combination to do the reverse lookup from the source
# lookup_nuid_columns - vector of names of the columns that will be used in combination to do the reverse lookup from the lookup_table

In [None]:
# Build the lookup_table: one row per entity, holding its unique identifier (num)
# next to its non-unique identifiers (names and birth date)

lookup_table <- tribble(
    ~num, ~fname,    ~mname,           ~lname,    ~bdate,
    44,   "Barack",  "Hussein",        "Obama",   "1961-08-04",
    43,   "George",  "Walker",         "Bush",    "1946-07-06",
    42,   "William", "Jefferson",      "Clinton", "1946-08-19",
    41,   "George",  "Herbert Walker", "Bush",    "1924-06-12",
    40,   "Ronald",  "Wilson",         "Reagon",  "1911-02-06",
    39,   "James",   "Earl",           "Carter",  "1924-10-01"
) %>%
    mutate(bdate = as.Date(bdate))   # birth dates are typed in as text, stored as Date

# Names of the lookup_table columns: the unique identifier, then the non-unique identifiers
lookup_uid_column <- "num"
lookup_nuid_columns <- c("fname", "mname", "lname", "bdate")

In [None]:
# Build the source data: the records that still need a unique identifier.
# The id column starts out as an empty string on every row.

source <- tribble(
    ~id, ~first_name, ~middle_name, ~last_name, ~dob,
    "",  "Barack",    "Hussein",    "Obama",    "1961-08-04",
    "",  "George",    "Walker",     "Bush",     "1946-07-06",
    "",  "Bill",      "",           "Clinton",  "1946-08-19",
    "",  "George",    "H.W.",       "Bush",     "1924-06-12",
    "",  "Ronald",    "Wilson",     "Reagon",   "1911-02-06",
    "",  "Jimmy",     "",           "Carter",   "1924-10-01"
) %>%
    mutate(dob = as.Date(dob))   # dates of birth are typed in as text, stored as Date

# Names of the source (and target) columns: the unique identifier, then the non-unique identifiers
target_uid_column <- "id"
source_nuid_columns <- c("first_name", "middle_name", "last_name", "dob")

In [None]:
# The target starts as a copy of the source with its uid column blanked out to NA
target <- mutate(source, !!target_uid_column := NA)

# Flip both nuid column vectors so that the first-listed columns are prioritized in the final lookup
lookup_nuid_columns <- rev(lookup_nuid_columns)
source_nuid_columns <- rev(source_nuid_columns)

# TODO (not yet implemented): transform the nuid fields to lower case, and the dates to a common format, in the source for comparisons (if needed)

In [None]:
# Reverse lookup of the unique identifier for every row of the target.
#
# For each target row, every pair of distinct nuid columns is compared against
# the lookup_table. When exactly one lookup_table row matches the target row on
# both columns of a pair, that row's unique identifier is written to the
# target's uid column. A later pair with a unique match overwrites the result
# of an earlier pair, so the pair tried last takes precedence. Pairs with no
# match, or with more than one match, leave the target row unchanged.

# The source and lookup nuid columns are paired by position, so they must line up
stopifnot(length(source_nuid_columns) == length(lookup_nuid_columns))
nuid_column_count <- length(source_nuid_columns)

# seq_len() (unlike 1:n) yields an empty sequence for an empty target or for
# fewer than two nuid columns, so those cases simply skip the loops
for (target_row in seq_len(nrow(target))) {
    for (nuid_column_1 in seq_len(max(nuid_column_count - 1, 0))) {
        target_value_1 <- target[[target_row, source_nuid_columns[nuid_column_1]]]

        # Starting at nuid_column_1 + 1 visits each unordered pair exactly once
        for (nuid_column_2 in (nuid_column_1 + 1):nuid_column_count) {
            target_value_2 <- target[[target_row, source_nuid_columns[nuid_column_2]]]

            # which() drops NA comparisons, so a missing value never counts as a match
            matching_rows <- which(
                lookup_table[[lookup_nuid_columns[nuid_column_1]]] == target_value_1 &
                    lookup_table[[lookup_nuid_columns[nuid_column_2]]] == target_value_2
            )

            # Only an unambiguous (single-row) match is trusted
            if (length(matching_rows) == 1) {
                target[target_row, target_uid_column] <-
                    lookup_table[[lookup_uid_column]][[matching_rows]]
            }
        }
    }
}

In [None]:
# Display the target table as left by the preceding cells (a bare expression is auto-printed)
target