# 1.载入包和定义函数

In [1]:
# Install required packages and library them
packages <- c("LaplacesDemon",
              "dplyr")

for (i in packages) {
    if (!suppressMessages(require(i, character.only = TRUE))) {
        install.packages(i)
    }
}

In [2]:
rnorm_truncated <- function(n, mean, sd, ...) {
    additional_arguments <- list(...)
    if (length(additional_arguments) == 1 & names(additional_arguments)[[1]] == 'a') {
        X = c()
        j = 0
        for (i in 1 : 10000) {
            x = rnorm(1, mean, sd)
            if (x >= additional_arguments[[1]]) {
                X = c(X, x)
                j = j + 1
            } 
            if (j == n) {
                j = 0
                break
            }
        }
    } else if (length(additional_arguments) == 1 & names(additional_arguments)[[1]] == 'b') {
        X = c()
        j = 0
        for (i in 1 : 10000) {
            x = rnorm(1, mean, sd)
            if (x <= additional_arguments[[1]]) {
                X = c(X, x)
                j = j + 1
            } 
            if (j == n) {
                j = 0
                break
            }
        }
    } else if (length(additional_arguments) == 2) {
        X = c()
        j = 0
        for (i in 1 : 10000) {
            x = rnorm(1, mean, sd)
            if (x >= additional_arguments[[1]] & x <= additional_arguments[[2]]) {
                X = c(X, x)
                j = j + 1
            } 
            if (j == n) {
                j = 0
                break
            }
        }
    } else {
        stop("wrong input")
    }
    return(X)
}

# 2.数据

## 2.1 模拟情景（原理和方法）

### （1）设计矩阵
$X_1 \sim \psi(62, 8, 18, 130; x)$
\begin{equation}
  \psi(\mu, \sigma, a, b; x) =
    \begin{cases}
      0 & \text{if $x < a$}\\
      \frac{\Phi(\mu, \sigma, x)}{\Phi(\mu, \sigma, b) - \Phi(\mu, \sigma, a)} & \text{if $a \leq x \leq b$} \\
      0 & \text{if $x > b$}
    \end{cases}       
\end{equation}
$X_2 \sim Bern(0.42 + 0.001 \times X_1)$ <br>
$X_3 \sim \nu(x, X_2)$
\begin{equation}
  \nu(x, X_2) =
    \begin{cases}
      (1 - 0.42 + 0.001 \times X_1) \times \psi(20, 5, 13, 49; x) & \text{if $X_2 = 0$}\\
      (0.42 + 0.001 \times X_1) \times \psi(23, 5, 15, 50; x) & \text{if $X_2 = 1$}\\
    \end{cases}       
\end{equation}
$X_4 \sim \psi(110, 18, 20, 300; x)$ <br>
$X_5 \sim Bern(0.13 + 0.005 \times X_2 + 0.02 \times I(X_4 > 140))$ <br>
$X_6 \sim Bern(0.23 + 0.0005 \times X_1)$ <br>
$X_7 \sim Bern(0.29 + 0.01 \times X_2)$ <br>
$X_8 \sim Bern(0.2)$ <br>
$X_9 \sim Multin(0.68, 0.2, 0.12)$ <br>
$X_{10} \sim \psi(4.4 + 0.1 \times X_2, 1.1, 0, 100; x)$ <br>
$log(X_{11}) \sim N(log(2.8), 0.2)$ <br>
$X_{12} \sim Pois(0.3)$ <br>
$X_{13} \sim Pois(0.05 + 0.01 \times X_1)$ <br>
$X_{14} \sim Bern(0.08)$ <br>
$X_{15} \sim Bern(0.02)$ <br>
$X_{16} \sim Multin(0.19, 0.50, 0.25, 0.06)$

### （2）生存结局变量
$T \sim Exp(\lambda_0 + \beta_1 \times X_1 + \beta_2 \times X_1 ^ 2 + \beta_3 \times X_2 + \beta_4 \times X_3 + \beta_5 \times X_5 + \beta_6 \times X_7 + \beta_7 \times log(X_{11}) + \beta_8 \times X_2 \times X_6 + \beta_9 \times X_2 \times X_{15} \times X_{16} + \beta_{10} \times I(X_4 > 150))$ <br>
$C_{administrative} = 7$ <br>
$C_{random} \sim U(1, 7)$ <br>
$T_{death} \sim Exp(\Lambda_0 + \alpha_1 \times X_1)$ <br>
**没有competing risk的场景** <br>
$Y = min(T, C_{administrative}, C_{random})$ <br>
$\Delta = I(Y=T)$ <br>
**有competing risk的场景** <br>
$Y = min(T, C_{administrative}, C_{random}, T_{death})$ <br>
$\Delta = I(Y=T)$

### （3）二分类结局变量
$Y \sim Bern(\theta_{0} + \theta_{1} \times X_{1} + \theta_2 \times X_2 + \theta_3 \times X_5 + \theta_4 \times I(X_9 = 1) + \theta_5 \times I(X_{10} \geq 4.5))$

## 2.2 模拟代码

In [3]:
set.seed(1234)

In [4]:
N = 5000
## Design matrix
X_1 = round(rnorm_truncated(N, 62, 5, a = 18, b = 130), 0)
X_2 = rbinom(N, 1, 0.42 + 0.001 * X_1)
X_3 = round((1 - X_2) * rnorm_truncated(N, 20, 5, a = 13, b = 49) + 
            X_2 * rnorm_truncated(N, 23, 5, a = 15, b = 50), 1)
X_4 = round(rnorm_truncated(N, 110, 18, a = 20, b = 300), 0)
X_5 = rbinom(N, 1, 0.13 + 0.005 * X_2 + 0.02 * (X_4 > 140))
X_6 = rbinom(N, 1, 0.23 + 0.0005 * X_1)
X_7 = rbinom(N, 1, 0.29 + 0.01 * X_2)
X_8 = rbinom(N, 1, 0.2)
X_9 = rcat(N, c(0.68, 0.20, 0.12))
X_10 = round(rnorm_truncated(N, 4.4 + 0.1 * X_2, 1.1, a = 0, b = 100), 2)
X_11 = round(exp(rnorm(N, log(2.8), 0.2)), 2)
X_12 = rpois(N, 0.3)
X_13 = rpois(N, 0.05 + 0.01 * X_1)
X_14 = rbinom(N, 1, 0.08)
X_15 = rbinom(N, 1, 0.02)
X_16 = rcat(N, c(0.19, 0.50, 0.25, 0.06))

In [5]:
T = round(rexp(N, rate = 0.0001 + log(1.0003) * X_1 + log(1.00002) * X_1 ^ 2 + log(1.1) * X_2 + log(1.0002) * X_3 + log(1.3) * X_5 + 
               log(1.2) * X_7 + log(1.0004) * log(X_11) + log(1.02) * X_2 * X_6 + log(1.01) * X_2 * X_15 * X_16 + 
               log(1.02) * I(X_4 > 150)), 4)
C_adm = 7
C_random = runif(N, min = 1, max = 7)
T_death = rexp(N, rate = 0.002 + log(1.02) * X_1)
Y = pmin(T, C_adm, C_random)
delta = ifelse(Y == T, 1, 0)

In [6]:
Y_binary = rbinom(N, 1, 0.02 - 0.0001 * X_1 + 0.05 * X_2 + 0.1 * X_5 + 0.04 * I(X_9 == 1) + 0.02 * I(X_10 >= 4.5))

In [7]:
dataset <- cbind(ID = 1 : N, 
                 X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14, X_15, X_16, 
                 Y, delta, Y_binary) %>% data.frame() %>% mutate(type = 'raw')

In [8]:
head(dataset, n = 10)

Unnamed: 0_level_0,ID,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_11,X_12,X_13,X_14,X_15,X_16,Y,delta,Y_binary,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,56,0,15.4,127,0,1,1,0,1,...,4.2,1,0,0,0,3,1.511,1,0,raw
2,2,63,0,14.8,122,0,0,0,0,1,...,3.72,0,0,0,0,3,2.854611,0,0,raw
3,3,67,1,28.5,130,0,1,1,1,1,...,2.57,0,0,0,0,4,1.614861,0,1,raw
4,4,50,1,23.3,85,0,0,0,0,1,...,3.1,0,0,0,0,2,5.20872,0,0,raw
5,5,64,0,17.9,109,0,0,0,0,1,...,2.61,0,1,0,0,1,4.71357,0,0,raw
6,6,65,0,18.9,82,0,0,1,0,2,...,2.57,1,0,0,0,3,1.4283,1,0,raw
7,7,59,0,20.8,139,0,1,1,1,1,...,2.09,0,1,0,0,2,5.6598,1,0,raw
8,8,59,0,14.6,97,0,0,0,0,2,...,3.48,0,3,0,0,2,2.3131,1,0,raw
9,9,59,0,25.6,66,0,1,0,0,1,...,2.78,1,2,0,0,3,4.956085,0,0,raw
10,10,58,0,17.9,88,0,1,0,0,1,...,2.85,0,1,1,0,1,6.300248,0,0,raw


# 3.外部数据

## 3.1 模拟情景 (外部验证数据集)

### （1）设计矩阵
$X_1 \sim \psi(61, 7, 18, 120; x)$
\begin{equation}
  \psi(\mu, \sigma, a, b; x) =
    \begin{cases}
      0 & \text{if $x < a$}\\
      \frac{\Phi(\mu, \sigma, x)}{\Phi(\mu, \sigma, b) - \Phi(\mu, \sigma, a)} & \text{if $a \leq x \leq b$} \\
      0 & \text{if $x > b$}
    \end{cases}       
\end{equation}
$X_2 \sim Bern(0.43 + 0.001 \times X_1)$ <br>
$X_3 \sim \nu(x, X_2)$
\begin{equation}
  \nu(x, X_2) =
    \begin{cases}
      (1 - 0.43 + 0.001 \times X_1) \times \psi(19, 6, 13, 49; x) & \text{if $X_2 = 0$}\\
      (0.43 + 0.001 \times X_1) \times \psi(24, 5, 15, 50; x) & \text{if $X_2 = 1$}\\
    \end{cases}       
\end{equation}
$X_4 \sim \psi(120, 20, 10, 400; x)$ <br>
$X_5 \sim Bern(0.14 + 0.004 \times X_2 + 0.02 \times I(X_4 > 140))$ <br>
$X_6 \sim Bern(0.23 + 0.0005 \times X_1)$ <br>
$X_7 \sim Bern(0.28 + 0.015 \times X_2)$ <br>
$X_8 \sim Bern(0.22)$ <br>
$X_9 \sim Multin(0.54, 0.3, 0.16)$ <br>
$X_{10} \sim \psi(4.5 + 0.1 \times X_2, 1.1, 0, 100; x)$ <br>
$log(X_{11}) \sim N(log(3), 0.23)$ <br>
$X_{12} \sim Pois(0.32)$ <br>
$X_{13} \sim Pois(0.07 + 0.008 \times X_1)$ <br>
$X_{14} \sim Bern(0.09)$ <br>
$X_{15} \sim Bern(0.03)$ <br>
$X_{16} \sim Multin(0.2, 0.5, 0.24, 0.06)$

### （2）结局变量
$T \sim Exp(\lambda_0 + \beta_1 \times X_1 + \beta_2 \times X_1 ^ 2 + \beta_3 \times X_2 + \beta_4 \times X_3 + \beta_5 \times X_5 + \beta_6 \times X_7 + \beta_7 \times log(X_{11}) + \beta_8 \times X_2 \times X_6 + \beta_9 \times X_2 \times X_{15} \times X_{16} + \beta_{10} \times I(X_4 > 150))$ <br>
$C_{administrative} = 7$ <br>
$C_{random} \sim U(1, 7)$ <br>
$T_{death} \sim Exp(\Lambda_0 + \alpha_1 \times X_1)$ <br>
**没有competing risk的场景** <br>
$Y = min(T, C_{administrative}, C_{random})$ <br>
$\Delta = I(Y=T)$ <br>
**有competing risk的场景** <br>
$Y = min(T, C_{administrative}, C_{random}, T_{death})$ <br>
$\Delta = I(Y=T)$

### （3）二分类结局变量

$Y \sim Bern(\theta_{0} + \theta_{1} \times X_{1} + \theta_2 \times X_2 + \theta_3 \times X_5 + \theta_4 \times I(X_9 = 1) + \theta_5 \times I(X_{10} \geq 4.7)) + \theta_6 \times X_{15}$

## 3.2 模拟代码

In [9]:
N = 3000
## Design matrix
X_1 = round(rnorm_truncated(N, 61, 7, a = 18, b = 120), 0)
X_2 = rbinom(N, 1, 0.43 + 0.001 * X_1)
X_3 = round((1 - X_2) * rnorm_truncated(N, 19, 6, a = 13, b = 49) + 
            X_2 * rnorm_truncated(N, 24, 5, a = 15, b = 50), 1)
X_4 = round(rnorm_truncated(N, 120, 20, a = 10, b = 400), 0)
X_5 = rbinom(N, 1, 0.14 + 0.004 * X_2 + 0.02 * (X_4 > 140))
X_6 = rbinom(N, 1, 0.23 + 0.0005 * X_1)
X_7 = rbinom(N, 1, 0.28 + 0.015 * X_2)
X_8 = rbinom(N, 1, 0.22)
X_9 = rcat(N, c(0.54, 0.30, 0.16))
X_10 = round(rnorm_truncated(N, 4.5 + 0.1 * X_2, 1.1, a = 0, b = 100), 2)
X_11 = round(exp(rnorm(N, log(3), 0.23)), 2)
X_12 = rpois(N, 0.32)
X_13 = rpois(N, 0.07 + 0.008 * X_1)
X_14 = rbinom(N, 1, 0.08)
X_15 = rbinom(N, 1, 0.03)
X_16 = rcat(N, c(0.20, 0.50, 0.24, 0.06))

In [10]:
T = round(rexp(N, rate = 0.0002 + log(1.0004) * X_1 + log(1.2) * X_2 + log(1.0003) * X_3 + log(1.25) * X_5 + 
               log(1.15) * X_7 + log(1.0006) * log(X_11) + log(1.03) * X_2 * X_6 + log(1.04) * I(X_4 > 140)), 4)
C_adm = 7
C_random = runif(N, min = 1, max = 7)
T_death = rexp(N, rate = 0.0015 + log(1.03) * X_1)
Y = pmin(T, C_adm, C_random)
delta = ifelse(Y == T, 1, 0)

### 二分类结局变量

$Y \sim Bern(\theta_{0} + \theta_{1} \times X_{1} + \theta_2 \times X_2 + \theta_3 \times X_5 + \theta_4 \times I(X_9 = 1) + \theta_5 \times I(X_{10} \geq 4.7)) + \theta_6 \times X_{15}$

In [11]:
Y_binary = rbinom(N, 1, 0.03 - 0.0002 * X_1 + 0.07 * X_2 + 0.1 * X_5 + 0.03 * I(X_9 == 1) + 0.01 * I(X_10 >= 4.7) + 0.1 * X_15)

In [12]:
dataset_external <- cbind(ID = 1 : N, 
                          X_1, X_2, X_3, X_4, X_5, X_6, X_7, X_8, X_9, X_10, X_11, X_12, X_13, X_14, X_15, X_16, 
                          Y, delta, Y_binary) %>% data.frame() %>% mutate(type = 'external')

In [13]:
head(dataset_external)

Unnamed: 0_level_0,ID,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_11,X_12,X_13,X_14,X_15,X_16,Y,delta,Y_binary,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,58,0,19.2,73,0,0,0,0,1,...,2.41,1,2,0,0,3,6.3683,0,0,external
2,2,67,1,17.4,142,0,1,1,0,2,...,3.12,0,0,0,0,1,4.467985,0,0,external
3,3,46,1,33.9,137,1,0,1,1,1,...,3.47,0,0,0,0,3,3.2693,1,0,external
4,4,71,1,29.0,135,0,0,0,0,3,...,2.91,0,1,0,0,1,3.867161,0,0,external
5,5,64,1,32.6,126,0,1,0,0,1,...,3.4,0,0,0,0,2,5.495322,0,0,external
6,6,63,1,26.7,133,0,0,0,1,1,...,2.98,0,1,1,0,2,2.2916,1,0,external


In [14]:
dataset_final <- rbind(dataset, dataset_external)

In [15]:
save(dataset_final, file = 'simulated_dataset.R')

In [16]:
head(dataset_final)

Unnamed: 0_level_0,ID,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_11,X_12,X_13,X_14,X_15,X_16,Y,delta,Y_binary,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,56,0,15.4,127,0,1,1,0,1,...,4.2,1,0,0,0,3,1.511,1,0,raw
2,2,63,0,14.8,122,0,0,0,0,1,...,3.72,0,0,0,0,3,2.854611,0,0,raw
3,3,67,1,28.5,130,0,1,1,1,1,...,2.57,0,0,0,0,4,1.614861,0,1,raw
4,4,50,1,23.3,85,0,0,0,0,1,...,3.1,0,0,0,0,2,5.20872,0,0,raw
5,5,64,0,17.9,109,0,0,0,0,1,...,2.61,0,1,0,0,1,4.71357,0,0,raw
6,6,65,0,18.9,82,0,0,1,0,2,...,2.57,1,0,0,0,3,1.4283,1,0,raw
