forked from NSAPH/GPSmatching
/
create_matching.R
executable file
·135 lines (110 loc) · 4.49 KB
/
create_matching.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#' @title
#' Create pseudo population using matching casual inference approach
#'
#' @description
#' Generates pseudo population based on matching casual inference method.
#'
#' @param data_obj A list of elements. Including An original dataset as well
#' as helper vectors from estimating GPS. See [compile_pseudo_pop()] for more
#' details.
#' @param bin_seq Sequence of w (treatment) to generate pseudo population. If
#' NULL is passed the default value will be used, which is
#' `seq(min(w)+delta_n/2,max(w), by=delta_n)`.
#' @param gps_density Model type which is used for estimating GPS value, including
#' `normal` (default) and `kernel`.
#' @param nthread Number of available cores.
#' @param ... Additional arguments passed to the function.
#'
#' @return
#' Returns data.table of matched set.
#'
#' @keywords internal
#'
create_matching <- function(data_obj, exposure_col_name, bin_seq = NULL,
gps_density = "normal",
nthread = 1, ...) {
# Passing packaging check() ----------------------------
delta_n <- NULL
counter_weight <- NULL
i.counter_weight <- NULL
dist_measure <- NULL
# ------------------------------------------------------
dot_args <- list(...)
arg_names <- names(dot_args)
for (i in arg_names) {
assign(i, unlist(dot_args[i], use.names = FALSE))
}
#matching_fun <- get(matching_fun)
gps_mx <- data_obj$gps_mx
w_mx <- data_obj$w_mx
if (is.null(bin_seq)) {
bin_num<-seq(w_mx[1] + delta_n / 2, w_mx[2], by = delta_n)
used_bin <- "Default"
} else {
bin_num <- bin_seq
used_bin <- "User defined"
}
logger::log_debug(used_bin, "bin seq is used. Min: {min(bin_num)} ",
"Max: {max(bin_num)}, count: {length(bin_num)}.")
logger::log_debug("Started generating matched set ...")
st_t_m <- proc.time()
matched_set <- lapply(bin_num,
matching_fn,
dataset=data_obj$dataset,
exposure_col_name = exposure_col_name,
e_gps_pred = data_obj$dataset$e_gps_pred,
e_gps_std_pred = data_obj$dataset$e_gps_std_pred,
w_resid=data_obj$dataset$w_resid,
gps_mx = gps_mx,
w_mx = w_mx,
dist_measure = dist_measure,
gps_density = gps_density,
delta_n = delta_n,
scale = scale,
nthread = nthread)
e_t_m <- proc.time()
logger::log_debug("Finished generating matched set (Wall clock time: ",
" {(e_t_m - st_t_m)[[3]]} seconds).")
logger::log_debug("Started working on compiling ... ")
s_comp_p <- proc.time()
cp_original_data <- data_obj$dataset
# create initial freq_table
logger::log_debug("Started working on merging the frequency table ... ")
s_bindlist <- proc.time()
N <- N.x <- N.y <- id <- NULL
freq_table <- data.table(id=numeric(), N=integer())
for (i in seq(1, length(matched_set))){
if (length(matched_set[[i]]) == 0){
# bins that does not have any match.
next
}
freq_table <- merge(freq_table, matched_set[[i]],
by = "id",
all = TRUE)
row.names(freq_table) <- NULL
freq_table[is.na(freq_table)] <- 0
freq_table[, N:= N.x + N.y]
freq_table[, N.x:= NULL]
freq_table[, N.y:= NULL]
}
e_bindlist <- proc.time()
logger::log_debug(paste0("Finished binding the frequency table ",
"(Wall clock time: ",
(e_bindlist - s_bindlist)[[3]]," seconds)."))
cp_original_data["counter_weight"] <- rep(0, nrow(cp_original_data))
if (nrow(freq_table) != 0) {
c_w <- cp_original_data[, c("id", "counter_weight")]
data.table::setDT(c_w)
merged_dt <- merge(c_w, freq_table, by="id", all.x = TRUE)
merged_dt[is.na(N), N := 0]
merged_dt[, counter_weight := counter_weight + N]
c_w[merged_dt, counter_weight := i.counter_weight, on = "id"]
data.table::setDF(c_w)
cp_original_data$counter_weight <- NULL
cp_original_data <- merge(cp_original_data, c_w, by = "id")
}
e_comp_p <- proc.time()
logger::log_debug("Finished compiling (vectorized) (Wall clock time: ",
" {(e_comp_p - s_comp_p)[[3]]} seconds).")
return(data.table(cp_original_data))
}