In [1]:
library(readr)

# Load the dataset directly from figshare. name_repair = "minimal" leaves the
# header untouched, so a column without a header is expected to come through
# with an empty name (handled below).
url <- "https://ndownloader.figshare.com/files/62102364"
df <- read_csv(url, show_col_types = FALSE, name_repair = "minimal")

# Give unnamed columns a recognisable name so they can be dropped by name.
nms <- names(df)
nms[nms == ""] <- "row_index"
names(df) <- nms

# Drop columns whose names mark them as a serialised row index.
drop_cols <- intersect(c("Unnamed: 0", "...1", "row_index"), names(df))
df <- df[, !(names(df) %in% drop_cols), drop = FALSE]

# Also drop a leading column that is a 0-based row counter stored under some
# other name. The guards keep the check from failing when no columns are left,
# when the column is empty, or when it contains NA (in which case all() could
# return NA and `if` would raise an error).
if (ncol(df) > 0) {
  first <- df[[1]]
  is_row_counter <- is.numeric(first) &&
    length(first) > 0 &&
    !anyNA(first) &&
    all(first == seq_along(first) - 1)
  if (is_row_counter) {
    df <- df[, -1, drop = FALSE]
  }
}

In [2]:
# Show the table's dimensions (rows, columns), then preview its first rows.
print(dim(df))
print(head(df))

[1] 50000    12
# A tibble: 6 × 12
  IRSD_quintile   Age smoking_status   BMI diabetes   CKD HbA1c  eGFR   SBP
          <dbl> <dbl> <chr>          <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>
1             4  50.4 non             28.2        0     0  4.32  83.1  119.
2             3  39.2 ex              16.8        0     0  4.70  81.5  124.
3             5  55.5 non             23.5        0     0  3.67  86.8  127.
4             4  51.9 non             32.0        0     0  4.49  94.7  114.
5             1  47.1 ex              25.4        0     0  4.32  86.3  126.
6             5  56.5 non             27.6        0     0  4.52  90.9  110.
# ℹ 3 more variables: AF <dbl>, cvd_event <dbl>, cvd_time <dbl>


In [6]:
# Download the patient EMR archive into the working directory and list the
# files it contains.
url <- "https://ndownloader.figshare.com/files/62130498"
zip_file <- "DataAsset2_PatientEMR.zip"

# mode = "wb" writes the archive in binary mode. download.file() returns an
# integer status code (0 means success); stop here on a non-zero code instead
# of letting unzip() run on a missing or partial file.
status <- download.file(url, destfile = zip_file, mode = "wb")
if (status != 0) {
  stop("Download failed with status ", status, ": ", url, call. = FALSE)
}

# list = TRUE only reads the archive's table of contents; nothing is extracted.
files <- unzip(zip_file, list = TRUE)$Name
print(files)

# Read one CSV member of a zip archive into a tibble without extracting it.
#
# Arguments:
#   zip_path  - path to the zip archive.
#   inner_csv - name of the CSV file inside the archive.
#
# Returns the parsed table with index-like columns removed: columns that have
# an empty name, or are named "Unnamed: 0", "...1" or "row_index".
read_csv_in_zip <- function(zip_path, inner_csv) {
  zip_con <- unz(zip_path, inner_csv, open = "rb")
  # Release the connection however the function exits.
  on.exit(close(zip_con), add = TRUE)

  tbl <- read_csv(zip_con, show_col_types = FALSE, name_repair = "minimal")

  # Label unnamed columns so they are caught by the name filter below.
  col_names <- names(tbl)
  names(tbl) <- replace(col_names, col_names == "", "row_index")

  index_like <- c("Unnamed: 0", "...1", "row_index")
  is_index <- names(tbl) %in% index_like
  tbl[, !is_index, drop = FALSE]
}

# Read every CSV member of the archive into a list keyed by member name.
# Members whose name does not end in ".csv" (case-insensitive) are skipped.
is_csv <- grepl("\\.csv$", files, ignore.case = TRUE)
csv_members <- files[is_csv]
dfs <- lapply(csv_members, function(member) read_csv_in_zip(zip_file, member))
names(dfs) <- csv_members

[1] "Data002_PatientEMR_ChronicDiseases_RE.csv"
[2] "Data002_PatientEMR_MasterSummary_RE.csv"  
[3] "Data002_PatientEMR_MeasAndPath_RE.csv"    


In [7]:
# Return the first table in `tables` whose name matches `pattern`.
# Stops with an informative error when nothing matches, rather than silently
# yielding NULL (which is what indexing a list with a missing name gives).
pick_table <- function(tables, pattern) {
  hits <- grep(pattern, names(tables), value = TRUE)
  if (length(hits) == 0) {
    stop(
      "No table matching '", pattern, "' among: ",
      paste(names(tables), collapse = ", "),
      call. = FALSE
    )
  }
  tables[[hits[1]]]
}

# Pull the three EMR tables out of the list by a fragment of their file name.
patient_master   <- pick_table(dfs, "MasterSummary")
chronic_diseases <- pick_table(dfs, "ChronicDiseases")
meas_and_path    <- pick_table(dfs, "MeasAndPath")

# Report each table's dimensions (rows, then columns).
cat("Master:", dim(patient_master), "\n")
cat("Chronic:", dim(chronic_diseases), "\n")
cat("Meas/Path:", dim(meas_and_path), "\n")

Master: 50000 6 
Chronic: 4417 3 
Meas/Path: 210000 5 


In [8]:
# Preview the first rows of the patient master summary table.
print(head(patient_master))

# A tibble: 6 × 6
  Patient_ID Age_At_2024 SMOKING_STATUS IRSD_Quintile CVD_Event CVD_Time
       <dbl>       <dbl> <chr>                  <dbl>     <dbl> <chr>
1        269        57.4 non                        4         1 2019-12
2        272        46.2 ex                         3         0 2022-12
3        281        62.5 non                        5         0 2022-12
4        296        58.9 non                        4         0 2022-12
5        317        54.1 ex                         1         0 2022-12
6        344        63.5 non                        5         0 2022-12


In [9]:
# Preview the first rows of the chronic diseases table.
print(head(chronic_diseases))

# A tibble: 6 × 3
  Patient_ID Category   Date
       <dbl> <chr>      <chr>
1 1338163469 Diabetes   2012-05
2  150591944 Diabetes   2014-12
3  545940569 ICD10: E11 2016-10
4 7134075944 Diabetes   2013-02
5 5959722392 Diabetes   2014-09
6  354210137 Diabetes   2015-08


In [10]:
# Preview the first rows of the measurements and pathology table.
print(head(meas_and_path))

# A tibble: 6 × 5
  Patient_ID Value              Description Date    Unit
       <dbl> <chr>              <chr>       <chr>   <chr>
1  819127997 78.08730821809296  eGFR        2014-03 mL/min/1.73m²
2 3847713176 8.369465931621287  HbA1c       2015-02 %
3 3508304696 140.32800536506784 SBP         2016-09 mmHg
4 1685212472 69.9920201360966   GFR         2015-06 mL/min/1.73m²
5 1105805072 29.95              BMI         2015-04 NA
6 3049003469 24.87              BMI         2016-06 NA
