/
tools.R
170 lines (139 loc) · 5.23 KB
/
tools.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#' Summary of a count variable
#'
#' Summary of a count variable.
#'
#' The function does a similar job to \code{table()} with more flexibility
#' introduced by the argument \code{breaks}. The user can decide how to break
#' the count values and decide to merge some cells if needed.
#'
#' @param count integer, observed count value for every individual in the
#' sample.
#' @param breaks integer, values at which the breaks should happen. The function
#' will compute the observed frequency in \code{[breaks[i], breaks[i + 1])}.
#' @param formatChar logical, should the values be converted to character and
#' formatted?
#' @return \code{matrix} with 2 rows and \code{length(breaks)} columns. The
#' column names are the cells names. The rows are the observed frequencies
#' and relative frequencies (probabilities).
#' @export
count_table <- function(count, breaks, formatChar = FALSE) {
if (missing(breaks))
breaks <- sort(unique(count))
cl <- character()
nb <- numeric()
for (i in 1:(length(breaks) - 1)) {
cl <- c(cl, .getNames_(breaks[i], breaks[i + 1]))
nb <- c(nb, .getValues_(breaks[i], breaks[i + 1], count))
}
## final count
cl <- c(cl, paste(">=", breaks[length(breaks)]))
nb <- c(nb, sum(count >= breaks[length(breaks)]))
mat <- matrix(nrow = 2, ncol = length(cl))
mat[1, ] <- nb
mat[2, ] <- nb / sum(nb)
colnames(mat) <- cl
rownames(mat) <- c("Frequency", "Relative frequency")
if (formatChar) {
mat[1, ] <- formatC(mat[1, ])
mat[2, ] <- trimws(formatC(as.numeric(mat[2, ]), digits = 2))
}
mat
}
.getNames_ <- function(start, end) {
if (end - start == 1)
return(as.character(start))
else
return(as.character(paste(start, end - 1, sep = "-")))
}
.getValues_ <- function(start, end, count) {
sum(count >= start & count < end)
}
.getValues2_ <- function(start, end, count, values) {
sum(values[which(count >= start & count < end)])
}
.getValuesMat_ <- function(start, end, count, values) {
if (missing(end)) {
## val <- values[, which(count < start), drop = FALSE]
## return(1 - rowSums(val))
val <- values[, which(count >= start), drop = FALSE]
return(rowSums(val))
} else {
val <- values[, which(count >= start & count < end), drop = FALSE]
return(rowSums(val))
}
}
.getValues3_ <- function(start, count, values) {
sum(values[which(count >= start)])
}
.computeCountRange <- function(count, count_range) {
for (i in 1:length(count_range)) {
ri <- count_range[i]
if (grepl("-", ri)) {
tmp <- unlist(strsplit(ri, "-"))
start <- as.numeric(tmp[1])
end <- as.numeric(tmp[2])
ans <- count >= start & count <= end
} else if (grepl(">=", ri)) {
tmp <- unlist(strsplit(ri, ">="))
ans <- count >= as.numeric(trimws(tmp[2]))
} else
ans <- count == as.numeric(ri)
if (ans) return(i)
}
}
.adjust_breaks <- function(breaks, res, pij) {
counts <- character()
actual <- numeric()
model <- numeric()
p <- matrix(nrow = nrow(pij), ncol = length(breaks))
for (i in 1:(length(breaks) - 1)) {
counts <- c(counts, .getNames_(breaks[i], breaks[i + 1]))
actual <- c(actual, .getValues2_(breaks[i], breaks[i + 1],
res$Counts, res$Actual)
)
model <- c(model,
.getValues2_(breaks[i], breaks[i + 1],
res$Counts, res$Predicted)
)
p[, i] <- .getValuesMat_(breaks[i], breaks[i + 1],
res$Counts, pij)
}
## produce the final counts
i <- length(breaks)
counts <- c(counts, paste(">=", breaks[i]))
actual <- c(actual, .getValues3_(breaks[i], res$Counts, res$Actual))
p[, i] <- .getValuesMat_(breaks[i],, res$Counts, pij)
model <- c(model, .getValues3_(breaks[i], res$Counts, res$Predicted))
res <- data.frame(Counts = counts, Actual = actual, Predicted = model)
colnames(p) <- counts
attr(p, "details") <- res
p
}
.get_dij_mat <- function(lhs_dat, count_range_) {
cl <- sapply(lhs_dat, .computeCountRange,
count_range = count_range_
)
.ff <- function(ind) {
res <- rep(0, length(count_range_))
res[ind] <- 1
res
}
t(sapply(cl, .ff))
}
.run_chisq_reg <- function(si, dij_pij, cols_) {
## build the regression matrix: 1_i ~ s_i + dij_pij, j =1,'' J-1
mat <- data.frame(y = 1, si, dij_pij[, -ncol(dij_pij)])
mod <- mod <- lm(y~. -1, data = mat)
stat <- nrow(mat) * summary(mod)$r.squared
## format the result
rval <- matrix(NA, nrow = 1, ncol = 3)
colnames(rval) <- c("DF", "Chisq", "Pr(>Chisq)")
rval[, 1] <- length(cols_) - 1 ## number of cells - 1
rval[, 2] <- stat
rval[, 3] <- pchisq(stat, rval[, 1], lower.tail = FALSE)
title <- "chi-square goodness-of-fit test\n"
topnote <- paste("Cells considered ", paste(cols_, collapse = " "),
sep = "", collapse = "\n")
structure(as.data.frame(rval), heading = c(title, topnote),
class = c("anova", "data.frame"))
}