# 1.数据准备

## 1.1 Import packages

## 1.1 载入R包

In [1]:
# Attach every required package, installing any that are missing first.
# A package that had to be installed is attached right after installation,
# so all entries of `packages` are loaded once this cell has finished.
packages <- c("dplyr",
              "tableone",
              "stringr",
              "R.utils",
              "biostat3",
              "spatstat",
              "rms")

for (i in packages) {
    # require() returns FALSE (instead of raising an error) when the package is missing
    if (!suppressMessages(require(i, character.only = TRUE, quietly = TRUE))) {
        # `quiet` is the install.packages() argument that reduces console output
        install.packages(i, quiet = TRUE)
        # the failed require() above did not attach the package, so attach it now;
        # library() stops with an error if the installation did not succeed
        suppressMessages(library(i, character.only = TRUE, quietly = TRUE))
    }
}

## 1.2 Define function

## 1.2 定义函数

In [2]:
# Daly LE. Confidence Limits Made Easy: Interval Estimation Using a Substitution Method. American journal of epidemiology. 1998;147(8):783-90.
#
# Interval limits for an event count, on the count scale.
#   x          : observed number of events
#   n          : number of subjects (passed to qbinom() as `size`)
#   conf.level : confidence level, 0.95 by default
# Returns c(lower, upper): the alpha/2 and 1 - alpha/2 quantiles of a
# Binomial(n, x / n) distribution. Divide by n to obtain limits for a proportion.
# NOTE(review): these are quantiles of the fitted binomial, which does not look
# like the Clopper-Pearson construction despite the "exact" name - confirm
# against the cited reference.
exactBinomCI <- function(x, n, conf.level = 0.95) {
  tail_prob <- (1 - conf.level) / 2
  p_hat <- x / n  # observed proportion (adapted for the binary outcome)
  c(
    qbinom(tail_prob, size = n, prob = p_hat),
    qbinom(1 - tail_prob, size = n, prob = p_hat)
  )
}

In [3]:
# Incidence table for a binary outcome, overall or by exposure group.
#
# Arguments:
#   outcome : name (string) of the outcome column; its values are multiplied
#             by the weight and summed to obtain the event count
#   dataset : data frame holding the outcome (and the optional columns below)
#   digit   : number of decimals used in every round() call
#   groups  : values assigned to levels() of the exposure column; only used
#             when `exposure` is supplied
#   ...     : optional named arguments
#               weight   = name (string) of a weight column (default: weight 1)
#               exposure = name (string) of a grouping column (default: one
#                          group labelled 'all')
# Returns a data.frame with the columns group, n, event and incidence(%),
# where incidence(%) is formatted as "rate (lower-upper)".
# NOTE(review): exactBinomCI() hands n to qbinom(size = n); weighted totals
# that are not whole numbers presumably give NaN limits - confirm before
# relying on the weight option.
Tab1b_of_binary_outcome <- function(outcome, dataset, digit = 2, 
                                          groups = c('external', 'raw'), ...) {
    additional_arguments <- list(...)

    # The weight column name arrives through `...`, so it has to be read from
    # additional_arguments (a bare `weight` is not defined in this function).
    if ('weight' %in% names(additional_arguments)) {
      dataset <- dataset %>%
        mutate(weight = .data[[additional_arguments$weight]])
    } else {
      dataset$weight <- 1
    }

    if ('exposure' %in% names(additional_arguments)) {
      dataset <- dataset %>%
        mutate(exposure = .data[[additional_arguments$exposure]])

      # NOTE(review): for a factor this relabels the levels positionally; for a
      # character column it only attaches a "levels" attribute - confirm intent.
      levels(dataset$exposure) <- groups
    } else {
      # no exposure given: treat the whole dataset as a single group
      dataset$exposure <- 'all'
    }

    # Same summary for both cases; later entries reuse the n and event columns
    # created earlier in the same summarise() call.
    incid_trt <- dataset %>%
        group_by(exposure) %>%
        summarise(
              n = round(sum(weight), digit),
              event = round(sum(.data[[outcome]] * weight), digit),
              # incidence (%) = number of events / number of subjects * 100
              rate = round(event / n * 100, digit),
              lci = round(exactBinomCI(x = event, n)[1] / n * 100, digit),
              uci = round(exactBinomCI(x = event, n)[2] / n * 100, digit))

    incid_trt <- as.data.frame(incid_trt) %>%
        mutate(rate = paste0(rate, " (", lci, "-", uci, ")"))

    # Keep the four reported columns first, then rename them, so that no
    # column is left with a missing name.
    results <- incid_trt[, c('exposure', 'n', 'event', 'rate')]
    colnames(results) <- c('group', 'n', 'event', 'incidence(%)')

    return(results)
}

# 2.数据清洗和描述

## 2.1 加载数据集

## 2.1 Import dataset

In [4]:
# Restore the R objects saved in this file into the workspace
# (presumably this provides `dataset_final`, used by the next cells - verify).
load("simulated_dataset.R")

In [5]:
# Preview the first rows of the loaded data
head(dataset_final)

Unnamed: 0_level_0,ID,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_11,X_12,X_13,X_14,X_15,X_16,Y,delta,Y_binary,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,56,0,15.4,127,0,1,1,0,1,...,4.2,1,0,0,0,3,1.511,1,0,raw
2,2,63,0,14.8,122,0,0,0,0,1,...,3.72,0,0,0,0,3,2.854611,0,0,raw
3,3,67,1,28.5,130,0,1,1,1,1,...,2.57,0,0,0,0,4,1.614861,0,1,raw
4,4,50,1,23.3,85,0,0,0,0,1,...,3.1,0,0,0,0,2,5.20872,0,0,raw
5,5,64,0,17.9,109,0,0,0,0,1,...,2.61,0,1,0,0,1,4.71357,0,0,raw
6,6,65,0,18.9,82,0,0,1,0,2,...,2.57,1,0,0,0,3,1.4283,1,0,raw


In [6]:
# Replace the generic column names with descriptive ones, by position.
# The last three outcome columns are the time, the status and the binary
# outcome (the original file has no binary outcome).
dataset_final <- setNames(
    dataset_final,
    c('ID', 'age', 'male', 'BMI', 'SBP',
      'MI', 'HF', 'COPD', 'cancer', 'albuminuria',
      'TC', 'LDLC', 'No_outpatient', 'No_inpatient',
      'liver_disease', 'hypoglycemia', 'CKD_stage',
      'AKI_time', 'AKI_status', 'AKI_binary', 'type')
)

In [7]:
# Preview again to check the new column names
head(dataset_final)

Unnamed: 0_level_0,ID,age,male,BMI,SBP,MI,HF,COPD,cancer,albuminuria,...,LDLC,No_outpatient,No_inpatient,liver_disease,hypoglycemia,CKD_stage,AKI_time,AKI_status,AKI_binary,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,56,0,15.4,127,0,1,1,0,1,...,4.2,1,0,0,0,3,1.511,1,0,raw
2,2,63,0,14.8,122,0,0,0,0,1,...,3.72,0,0,0,0,3,2.854611,0,0,raw
3,3,67,1,28.5,130,0,1,1,1,1,...,2.57,0,0,0,0,4,1.614861,0,1,raw
4,4,50,1,23.3,85,0,0,0,0,1,...,3.1,0,0,0,0,2,5.20872,0,0,raw
5,5,64,0,17.9,109,0,0,0,0,1,...,2.61,0,1,0,0,1,4.71357,0,0,raw
6,6,65,0,18.9,82,0,0,1,0,2,...,2.57,1,0,0,0,3,1.4283,1,0,raw


## 2.2 分类变量的注释

## 2.2 Relabel of categorical covariates

In [8]:
# Turn the numeric codes of the two multi-level covariates into labelled
# factors; codes outside the listed levels become NA.
dataset_final <- dataset_final %>%
        mutate(
        albuminuria = factor(albuminuria,
                             levels = 1:3,
                             labels = c("normal to mild", "moderate", "severe")),
        CKD_stage = factor(CKD_stage,
                           levels = 1:4,
                           labels = c("G1-2", "G3a", "G3b", "G4")))

In [9]:
# Keep the ID, the covariates, the binary outcome and the dataset type;
# the time-to-event columns (AKI_time, AKI_status) are left out.
dataset_binary <- dataset_final %>%
    select(all_of(c('ID', 'age', 'male', 'BMI', 'SBP',
                    'MI', 'HF', 'COPD', 'cancer', 'albuminuria',
                    'TC', 'LDLC', 'No_outpatient', 'No_inpatient',
                    'liver_disease', 'hypoglycemia', 'CKD_stage',
                    'AKI_binary', 'type')))

## 2.3 表1

## 2.3 Table one

In [10]:
## All covariates reported in table one
xvars <- c(
    ## demographics
    "age", "male",
    ## physical measurements
    "BMI", "SBP",
    ## comorbidities
    "MI", "HF", "COPD", "cancer", "liver_disease", "hypoglycemia",
    ## lab tests
    "albuminuria", "CKD_stage", "TC", "LDLC",
    ## healthcare utilization
    "No_outpatient", "No_inpatient"
)

## Covariates summarised as categorical
xfactorvars <- c(
    "male",
    ## comorbidities
    "MI", "HF", "COPD", "cancer", "liver_disease", "hypoglycemia",
    ## lab tests
    "albuminuria", "CKD_stage"
)

## Categorical covariates with more than two levels
xmultilevelfactorvars <- c("albuminuria", "CKD_stage")

## Covariates passed to print() as `nonnormal` (healthcare utilization counts)
xnonnormvars <- c("No_outpatient", "No_inpatient")

In [11]:
# Table one for the whole cohort and stratified by dataset type
# (standardized mean differences instead of tests for the stratified table).
tb1.all <- CreateTableOne(xvars, data = dataset_binary, factorVars = xfactorvars, includeNA = TRUE)
tb1.all <- print(tb1.all, nonnormal = xnonnormvars, printToggle = FALSE)
tb1.part <- CreateTableOne(xvars, strata = 'type', data = dataset_binary, factorVars = xfactorvars, includeNA = TRUE)
tb1.part <- print(tb1.part, nonnormal = xnonnormvars, test = FALSE, smd = TRUE, printToggle = FALSE)

# Non-missing count per variable, preceded by the total number of rows
N <- c(N = nrow(dataset_binary), colSums(!is.na(dataset_binary[xvars])))
# Counts per level of each multi-level factor. lapply() always returns a named
# list, whereas sapply() would return a matrix if the factors had the same
# number of levels and break the [[i]] lookup below.
N_mcatv <- lapply(dataset_binary[xmultilevelfactorvars], table)
# Insert the subgroup counts directly after the entry of their variable
for (i in xmultilevelfactorvars) {
    # exact name match; a regex match could also hit other names containing `i`
    start <- which(names(N) == i)
    N_inset <- as.numeric(N_mcatv[[i]])
    names(N_inset) <- names(N_mcatv[[i]])
    N <- c(N[seq_len(start)], N_inset, N[-seq_len(start)])
}
# NOTE(review): cbind() needs N to line up row by row with the printed tables;
# presumably this holds only while the multi-level factors have no NA level - confirm.
tb1 <- cbind(N, tb1.all, tb1.part)

In [12]:
# Display the combined table one
tb1

Unnamed: 0,N,Overall,external,raw,SMD
N,8000,8000,3000,5000,
age,8000,61.56 (5.91),60.87 (7.16),61.96 (4.97),0.177
male,8000,3950 (49.4),1493 (49.8),2457 (49.1),0.013
BMI,8000,22.34 (4.79),22.57 (5.05),22.20 (4.62),0.078
SBP,8000,113.80 (19.19),119.74 (19.90),110.24 (17.84),0.503
MI,8000,1136 (14.2),447 (14.9),689 (13.8),0.032
HF,8000,2073 (25.9),800 (26.7),1273 (25.5),0.027
COPD,8000,2329 (29.1),868 (28.9),1461 (29.2),0.006
cancer,8000,1693 (21.2),636 (21.2),1057 (21.1),0.001
liver_disease,8000,626 ( 7.8),240 ( 8.0),386 ( 7.7),0.01


## 2.4 表2 

## 2.4 Table 2

In [13]:
# Rows whose type is "raw"
dataset <- filter(dataset_binary, type == "raw")

In [14]:
# Outcome summary (n, events, incidence % with interval) for the "raw" rows, as a single group
Tab1b_of_binary_outcome("AKI_binary", dataset = dataset, digit = 2)

group,n,event,incidence(%)
<chr>,<dbl>,<dbl>,<chr>
all,5000,453,9.06 (8.28-9.86)


In [15]:
# Rows whose type is "external"
dataset_external <- filter(dataset_binary, type == "external")

In [16]:
# Outcome summary for the "external" rows, as a single group
Tab1b_of_binary_outcome("AKI_binary", dataset = dataset_external, digit = 2)

group,n,event,incidence(%)
<chr>,<dbl>,<dbl>,<chr>
all,3000,267,8.9 (7.9-9.93)


In [17]:
# Outcome summary on the full data, one row per value of the `type` column
Tab1b_of_binary_outcome("AKI_binary", dataset = dataset_binary, digit = 2, exposure = "type")

group,n,event,incidence(%)
<chr>,<dbl>,<dbl>,<chr>
external,3000,267,8.9 (7.9-9.93)
raw,5000,453,9.06 (8.28-9.86)


上述的代码和表格使用了预先定义的函数*Tab1b_of_binary_outcome*，在训练集和验证集中分别汇总了结局的发生情况。从表中可以看到，模型训练集总人数为5000人，在所观察的人群中，有453人发生了结局事件(Acute Kidney Injury, AKI)，AKI的发病率为9.06% (95%CI: 8.28%-9.86%)；验证集总人数3000人，有267人发生了结局事件(AKI)，AKI的发病率为8.90% (95%CI: 7.90%-9.93%)。

## 2.5 Transformation of covariates

## 2.5 协变量的转换

### （1）Standardization 
$$
\frac{X-E(X)}{\sqrt{Var(X)}}
$$

标准化：这里给出了Z-Score标准化的公式，这种方法基于原始数据的均值(mean)和标准差(standard deviation)进行数据的标准化。经过处理的数据均值为0，标准差为1；标准化只改变数据的位置和尺度，不改变分布的形状，因此只有当原始数据服从正态分布时，处理后的数据才服从标准正态分布。

### （2）Normalization
$$
\frac{X-min(X)}{max(X)-min(X)}
$$

归一化：这里给出了Min-Max归一化(Min-Max Normalization)的公式，也称为离差标准化，是对原始数据的线性变换，使结果值映射到[0, 1]区间内。其中max(X)为样本数据的最大值，min(X)为样本数据的最小值。这种归一化方法比较适用于数值比较集中的情况。


### （3）Restricted cubic spline

$$
r(X,K) = X, S_{K,1}, ..., S_{K,(K-2)}
$$

$$
S_{Ki} = \left(\frac{X-\xi_{i}}{\gamma}\right)_{+}^{3} - \frac{(\xi_{K}-\xi_{i})\left(\frac{X-\xi_{K-1}}{\gamma}\right)_{+}^{3}}{\xi_{K}-\xi_{K-1}} + \frac{(\xi_{K-1}-\xi_{i})\left(\frac{X-\xi_{K}}{\gamma}\right)_{+}^{3}}{\xi_{K}-\xi_{K-1}}, i = 1, ..., K-2
$$

$$ \gamma=\left\{
    \begin{array}{rcl}
    	1 & & {norm=0} \\
    	\xi_{K}-\xi_{K-1} & & {norm=1} \\
    	(\xi_{K}-\xi_{1})^{\frac{2}{3}} & & {norm=2} 
    \end{array} \right.
$$

$K$ is the number of knots, parameter *nk* in function <span style="color:blue">rcspline.eval</span>. For 3 knots, the outer quantiles used are 0.10 and 0.90. For 4-6 knots, the outer quantiles used are 0.05 and 0.95. For *nk*>6, the outer quantiles are 0.025 and 0.975. The knots are equally spaced between these on the quantile scale. When *nk* knots are set and *inclx = TRUE* (as in the code below), a matrix with *nk-1* columns is returned: $X$ itself plus the *nk-2* nonlinear terms. 
<br>
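$\gamma$ is the normalization constant, controlled by parameter *norm* in function <span style="color:blue">rcspline.eval</span>. For *norm = 0*, no normalization is used. For *norm = 1*, the nonlinear terms are normalized by the cube of the spacing between the last two knots, $(\xi_{K}-\xi_{K-1})^{3}$. For *norm = 2*, the nonlinear terms are normalized by the square of the spacing between the first and last knots, $(\xi_{K}-\xi_{1})^{2}$ (equivalently, each $X-\xi$ is scaled by $(\xi_{K}-\xi_{1})^{\frac{2}{3}}$ before being cubed), which has the advantage of making all nonlinear terms be on the x-scale.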

### （3）限制性立方样条 (Restricted cubic spline，RCS)
在上述公式中，$K$为节点数，在<span style = "color:blue">rms</span>包函数<span style = "color:blue">rcspline.eval</span>中为参数*nk*。如果节点数为3，对应的最外侧节点位于0.10和0.90分位数；如果节点数为4-6个，对应的位置为0.05和0.95分位数；如果节点数大于6，则对应的位置为0.025和0.975分位数，其余节点在分位数尺度上等间距分布。如果节点数为*nk*个且*inclx = TRUE*（如下方代码所示），则返回*nk-1*列，即$X$本身加上*nk-2*个非线性项。
<br>
$\gamma$为归一化常数，由函数<span style = "color:blue">rcspline.eval</span> 的参数*norm*控制。当*norm = 0* 时，不使用归一化；当*norm = 1* 时，非线性项除以最后两个节点间距的立方$(\xi_{K}-\xi_{K-1})^{3}$；当*norm = 2* 时，非线性项除以首尾两个节点间距的平方$(\xi_{K}-\xi_{1})^{2}$（等价于在取立方之前将每个$X-\xi$除以$(\xi_{K}-\xi_{1})^{\frac{2}{3}}$），这样做的优点是使所有的非线性项与x具有相同的尺度。


In [18]:
# Derive the transformed covariates and interaction terms from dataset_binary.
# NOTE(review): the mean/sd, min/max, quartile breaks and spline knots below
# are all computed on dataset_binary, which holds both the 'raw' and the
# 'external' rows - confirm that pooling the two sets is intended.

# Restricted cubic spline basis of TC (4 knots, no normalization), evaluated
# once; with inclx = TRUE the matrix has 3 columns that are picked out below.
TC_rcs_basis <- rcspline.eval(dataset_binary$TC, nk = 4, norm = 0,
                              knots.only = FALSE, inclx = TRUE)

dataset_binary_trans <- dataset_binary %>% 
    mutate(age_square = age ^ 2) %>% ## generate square of continuous variable
    mutate(log_LDLC = log(LDLC)) %>% ## generate log of continuous variable
    mutate(age_std = (age - mean(age)) / sd(age)) %>% ## standardization
    mutate(age_nor = (age - min(age)) / (max(age) - min(age))) %>% ## normalization [0, 1]
    mutate(age_category = case_when( ## categorization of continuous variable
               age < 50 ~ '<50', 
               age >= 50 & age < 60 ~ '50-59', 
               age >= 60 & age < 70 ~ '60-69', 
               age >= 70 & age < 80 ~ '70-79',
               age >= 80 ~ '>=80'),
           age_category = factor(age_category, levels = c('<50', '50-59', '60-69', '70-79', '>=80'))) %>%
    mutate(hypertension = ifelse(SBP > 130, 1, 0)) %>% ## indicator of SBP above 130
    mutate(BMI_category = case_when(
               BMI < 18.5 ~ 'underweight', 
               BMI >= 18.5 & BMI < 25 ~ 'normal weight',
               BMI >= 25 & BMI < 30 ~ 'overweight', 
               BMI >= 30 ~ 'obesity'), 
           BMI_category = factor(BMI_category,levels = c('underweight', 'normal weight', 'overweight', 'obesity'))) %>% 
    mutate(TC_category = cut(TC, breaks = quantile(TC, probs = seq(0, 1, 0.25)), include.lowest = TRUE), ## quartile groups
           ## reset reference group; this label depends on the quartiles of the
           ## current data and must be updated if the data change
           TC_category = relevel(TC_category, ref = '(3.75,4.49]')) %>%
    mutate(TC_rcs_1 = TC_rcs_basis[, 1], ## restricted cubic spline terms computed above
           TC_rcs_2 = TC_rcs_basis[, 2], 
           TC_rcs_3 = TC_rcs_basis[, 3]) %>%
    mutate(age_male = age * (male == 1), ## generate interaction terms 
           male_cancer = male * cancer,
           male_CKD_stage_G3a = (male == 1) * (CKD_stage == "G3a"),
           male_CKD_stage_G3b = (male == 1) * (CKD_stage == "G3b"),
           male_CKD_stage_G4 = (male == 1) * (CKD_stage == "G4"),
           age_TC = age * TC,
           age_BMI_TC = age * BMI * TC)

## 2.6 预测变量池

*dataset_binary_trans*这个数据集中包含了所有原始变量和上述新生成的变量，除去患者ID和结局，现在预测变量池(candidate predictor pool)内共有35个变量，包括原始变量:**年龄、性别、身体质量指数(BMI)、收缩压(SBP)、有无心肌梗死(MI)、心力衰竭(HF)、慢性阻塞性肺病(COPD)、癌症、尿蛋白、总胆固醇(TC)、低密度脂蛋白(LDLC)、门诊就诊次数、住院次数、有无肝脏疾病、低血糖症、CKD分期**，转换后和新生成的变量:**年龄的平方、标准化年龄、归一化年龄、年龄段、对数化LDLC、有无高血压、BMI水平、TC水平、限制性立方样条转换后不同水平下的TC(TC_rcs_1/2/3)**, 交互项包括：<b>年龄$\times$性别(age_male)、性别$\times$有无癌症(male_cancer)、性别$\times$CKD分期(male_CKD_stage_G3a/G3b/G4)、年龄$\times$TC(age_TC)、年龄$\times$BMI$\times$TC(age_BMI_TC)</b>。

In [19]:
# Split the transformed data by type and store both parts in one file
dataset <- filter(dataset_binary_trans, type == 'raw')
dataset_external <- filter(dataset_binary_trans, type == 'external')
save(list = c('dataset', 'dataset_external'),
     file = 'dataset_binary_after_description.R')