# Create Cohort and demographics

In [None]:
########################
# set working directory
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")

########################
# load libraries
library(tidyverse) # install.packages("tidyverse")

# To know the current storage capacity
memory.limit()

# Raise the memory limit to 56000 MB (about 56 GB). memory.limit() is Windows-only and a no-op stub from R 4.2 onwards.
memory.limit(size=56000)

# Path to directory with data files
data_path <- 'ICV_data/'

########################
### Load demographics.csv (the cohort) into D
D <- read.csv("ICV_data/demographics.csv", sep = ",", header = TRUE)

# Keep only the first row encountered for every PatientID
D <- D[match(unique(D$PatientID), D$PatientID), ]

# Show which columns were loaded
print(colnames(D))

########################
# Full admissions table, used for the first measured values
FULL <- read_csv("ICV_data/admissions.csv")

# Columns needed later for the qSOFA / SIRS components.
# NOTE(review): the SIRS heart-rate step further down reads D$first_hf_value,
# which is not selected here -- confirm it is supplied by demographics.csv.
FULL_columns <- c("PatientID", "first_SYS_value", "first_Temp_value", "first_AF_value", "first_leuco_value")
FULL_SIRS <- FULL[, FULL_columns]

# Restrict to patients that are present in the cohort
D_SIRS <- FULL_SIRS[is.element(FULL$PatientID, D$PatientID), ]

# Sanity check: number of admissions matched (TRUE) versus the total row count
table(is.element(FULL_SIRS$PatientID, D$PatientID))
nrow(FULL)

### Attach the first-value columns to the cohort (inner join on PatientID)
D <- merge(x = D, y = D_SIRS, by = "PatientID")

In [None]:
#######################
### data conversions

# Noradrenaline dose per kg body weight and its cardiovascular score:
# dose <= 0 -> 0, (0, 0.1] -> 3, > 0.1 -> 4.
D$nor_dose <- D$Noradrenaline / D$Weight
D$nor_score <- cut(D$nor_dose,
                   breaks = c(-Inf, 0, 0.1, Inf),
                   labels = c(0, 3, 4))

# Dopamine dose per kg body weight and its cardiovascular score:
# dose <= 5 -> 0, (5, 15] -> 3, > 15 -> 4.
D$dopa_dose <- D$Dopamine / D$Weight
D$dopa_score <- cut(D$dopa_dose,
                    breaks = c(-Inf, 5, 15, Inf),
                    labels = c(0, 3, 4))

# Numeric copies of both scores (the originals are factors). The names now
# match their content; they were swapped before, which was harmless only
# because the maximum of the two is taken.
nor <- as.numeric(as.character(D$nor_score))
dopa <- as.numeric(as.character(D$dopa_score))

# A missing dose counts as "no vasopressor"
nor[is.na(nor)] <- 0
dopa[is.na(dopa)] <- 0

# vaso score: the highest of the noradrenaline and dopamine scores
D$vaso_score <- pmax(nor, dopa)

# Cardiovascular SOFA component:
#   0 = no vasopressor and MAP >= 70, 1 = no vasopressor and MAP < 70,
#   3/4 = vasopressor score.
D$S_Vaso <- factor(rep(NA, nrow(D)), levels = c(0, 1, 2, 3, 4))
no_vaso <- D$vaso_score == 0
# >= 70 so that a MAP of exactly 70 is scored 0 instead of being left NA;
# which() drops rows where MAP is missing.
D$S_Vaso[which(no_vaso & D$MAP >= 70)] <- 0
D$S_Vaso[which(no_vaso & D$MAP < 70)] <- 1
# Subset the right-hand side with the same mask: assigning the full
# vaso_score vector recycled values onto the wrong patients.
on_vaso <- D$vaso_score > 0
D$S_Vaso[on_vaso] <- D$vaso_score[on_vaso]


########################
# Recode Gender to an integer flag: 'Man' -> 1, 'Vrouw' -> 0, anything else
# (including missing) -> NA. Done as a lookup on the text label.
D$Gender <- unname(c(Man = 1L, Vrouw = 0L)[as.character(D$Gender)])

#######################
## impute missing data: USING PMM WITH MICE PACKAGE! -> won't work, just hard coded! (mean or mode)

# Return the most frequent value of `v` (the first one encountered on ties).
#
# v      Vector (numeric, character or factor).
# na.rm  If TRUE, missing values are dropped before counting. The default
#        FALSE keeps the historical behaviour, in which NA is counted like
#        any other value and can therefore be returned as the mode.
#
# Returns a length-one vector of the same type as `v`, or a zero-length
# vector when there is nothing to count.
getmode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Impute every column except the first (PatientID):
#   - plain double columns  -> column mean
#   - character / factor    -> most frequent observed value
# NOTE(review): integer columns are deliberately left untouched, as before.
# That is why Gender (converted to integer above) still contains NA after
# this loop and is handled explicitly below.
for (var in seq_len(ncol(D))[-1]) {
  column <- D[[var]]
  missing <- is.na(column)
  # Nothing to fill, or nothing to derive a fill value from
  if (!any(missing) || all(missing)) {
    next
  }
  if (identical(class(column), "numeric")) {
    D[missing, var] <- mean(column, na.rm = TRUE)
  } else if (is.character(column) || is.factor(column)) {
    # Mode of the observed values only, so the fill value can never be NA
    D[missing, var] <- getmode(column[!missing])
  }
}

# check after imputation
print(head(D))

# Gender is an integer column and is skipped by the loop above; missing
# values are set to 1 (male).
D$Gender[is.na(D$Gender)] <- 1

########################################################################################################################
### Calculate SOFA score from components at admission to the ICU
# Each component is binned with cut() into a factor whose labels are the
# component points; the final score sums the numeric values of those labels.

# add Ventilation level yes (only relevant for test dataset but still can't hurt to be complete)
levels(D$Ventilation) <- c(levels(D$Ventilation),'YES')
# Bilirubin: (-Inf,19] -> 0, (19,32] -> 1, (32,101] -> 2, (101,204] -> 3, > 204 -> 4
D$S_BILI <- cut(D$BILI ,
                breaks=c(-Inf, 19, 32, 101, 204, Inf),
                labels=c(0,1,2,3,4))

# EMV (Glasgow Coma Scale): sum of the three components, lower sum = more points
# NOTE(review): `$` on a data frame partial-matches column names. If D has no
# columns named exactly "E", "M" and "v", these may silently resolve to other
# columns (e.g. "v" -> "vaso_score", "M" -> "MAP") -- confirm the column names.
D$GCS <- D$E + D$M + D$v
D$S_GCS <- cut(D$GCS ,
               breaks=c(-Inf, 5, 9, 12, 14, Inf),
               labels=c(4,3,2,1,0))

# Thrombocytes: lower count = more points
D$S_trombo <- cut(D$TROMBO ,
                  breaks=c(-Inf, 19, 49, 99, 149, Inf),
                  labels=c(4,3,2,1,0))

### P/F ratio SOFA SCORE
D$PaO2_mmhg <- D$PaO2_kpa *  7.50061683 # convert kpa to mmhg
D$PaO2_mmhg[is.na(D$PaO2_mmhg)] <- 1000 # recode NA's to absurdly high number so it won't be picked as the "lowest value"

# pick the lowest value from either PaO2 (mmhg) or Pao2_mmhg (kpa converted to mmhg)
# NOTE(review): columns are addressed by position (14 and 29); this breaks
# silently if the column order of D changes -- confirm which columns these are.
D$PO2 <- pmin( D[,14], D[,29]) # CREATE NEW COLUMN WITH LOWEST VALUE OF TWO COLUMNS
# calc P/F ratio (FiO2 is divided by 100, i.e. treated as a percentage)
D$PF <- D$PO2/(D$FiO2/100)

# create P/F score: (-Inf,299] -> 2, (299,400] -> 1, > 400 -> 0
D$S_PF <- cut(D$PO2/(D$FiO2/100),
               breaks=c(-Inf, 299, 400, Inf),
               labels=c(2,1,0))
levels(D$S_PF) <- c(levels(D$S_PF),3,4)
# add points 3 and 4 if patient is also mechanically ventilated
D$S_PF[D$PF < 200 & D$Ventilation == 'YES'] <- 3
D$S_PF[D$PF < 100 & D$Ventilation == 'YES'] <- 4

# Kidney score: combines creatinine and urine production in the first 24 hours
D$S_creat <- cut(D$CREAT ,
                 breaks=c(-Inf, 109, 170, 299, 440, Inf),
                 labels=c(0,1,2,3,4))
# Urine production only contributes 3 or 4 points: (-Inf,199] -> 4, (199,499] -> 3, else 0
D$S_UP <- cut(D$urine_first_24Hours ,
              breaks=c(-Inf, 199, 499, Inf),
              labels=c(4,3,0))
D$S_kidney <- as.factor(rep(NA,nrow(D)))
levels(D$S_kidney) <- c(0,1,2,3,4)
# Later assignments overwrite earlier ones, so the highest applicable score wins
D$S_kidney[D$S_UP == 0 & D$S_creat == 0] <- 0
D$S_kidney[D$S_UP == 0 & D$S_creat == 1] <- 1
D$S_kidney[D$S_UP == 0 & D$S_creat == 2] <- 2
D$S_kidney[D$S_UP == 3 | D$S_creat == 3] <- 3 # UP: 3 OR creat: 3
D$S_kidney[D$S_UP == 4 | D$S_creat == 4] <- 4 # UP: 4 OR creat: 4

# final sofa score: sum of the six components (factor labels converted back to numbers)
D$SOFA <- as.numeric(as.character(D$S_BILI)) + as.numeric(as.character(D$S_kidney)) + as.numeric(as.character(D$S_GCS)) + as.numeric(as.character(D$S_trombo)) + as.numeric(as.character(D$S_Vaso)) + as.numeric(as.character(D$S_PF))

########################################################################################################################
### Calculate qSOFA components (qSOFA GCS not possible, data not adequately available)

# qSOFA respiratory component: 1 when the first respiratory rate is >= 22,
# otherwise 0 (NA stays NA). Same outcome as the former two-step factor
# assignment, expressed as a single comparison.
D$qsofa_vent <- as.numeric(D$first_AF_value >= 22)

# qSOFA systolic blood pressure component: 1 when SBP <= 100, otherwise 0.
# The raw value is divided by 133.3224 (Pa per mmHg) before the comparison.
# Using <= fixes the previous gap where a value of exactly 100 stayed NA.
D$qsofa_sysbp <- as.numeric(D$first_SYS_value / 133.3224 <= 100)

########################################################################################################################
### Calculate SIRS components + final score
# Every component is a 0/1 factor with both levels declared explicitly.
# Calling as.factor() and then `levels<- c(0, 1)` relabelled positionally, so
# a column in which every patient scored 1 was silently turned into all 0.

# SIRS temperature: converted with the Fahrenheit -> Celsius formula.
# NOTE(review): the upper cut-off used here is 39; the SIRS definition is
# usually quoted as > 38 degrees Celsius -- confirm which one is intended.
D$temp <- (D$first_Temp_value - 32) / 1.8
D$sirs_temp <- factor(as.integer(D$temp > 39 | D$temp < 36), levels = c(0, 1))

# SIRS heart frequency: > 90
# NOTE(review): first_hf_value is not among the columns merged in from the
# admissions table earlier in this notebook -- confirm it exists in D.
D$sirs_hf <- factor(as.integer(D$first_hf_value > 90), levels = c(0, 1))

# SIRS respiratory rate: > 20
D$sirs_resp <- factor(as.integer(D$first_AF_value > 20), levels = c(0, 1))

# SIRS leucocytes: raw value scaled down by 1e6, abnormal when > 12 or < 4
D$leuco <- D$first_leuco_value / 1000000
D$sirs_leuco <- factor(as.integer(D$leuco > 12 | D$leuco < 4), levels = c(0, 1))

# final SIRS score: number of criteria met (0-4)
D$SIRS <- as.numeric(as.character(D$sirs_temp)) +
  as.numeric(as.character(D$sirs_hf)) +
  as.numeric(as.character(D$sirs_resp)) +
  as.numeric(as.character(D$sirs_leuco))

print("Finished Demographics")

### SAVE cohort and demographics dataframes

In [None]:
########################################################################################################################
### Final demographics subset: one row per patient with the derived scores
demographics_vars <- c("PatientID", "Age", "Gender", "Lengte", "Weight", "GCS", "SOFA", "qsofa_vent", "qsofa_sysbp", "SIRS", "Ventilation")
demographics <- D[demographics_vars]
head(demographics)

########################
# Export the demographics table
write.csv(demographics, file = "ICV_data/demographics_df.csv", row.names = FALSE)

########################################################################################################################
### Build the cohort table (admissions + outcome)

### All ICU admissions with the SQL inclusion criteria applied
Admissions <- read.csv("ICV_data/admissions_df.csv", sep = ",", header = TRUE)

### Only the outcome column is taken from the demographics data
cohort_vars <- c("PatientID", "Discharge")
cohort <- D[cohort_vars]

### Join the outcome onto the admissions and encode it: DEAD -> 1, ALIVE -> 0.
### Any other label goes through as.numeric() unchanged.
cohort_df <- merge(x = Admissions, y = cohort, by = c("PatientID"))
cohort_df$Discharge <- as.character(cohort_df$Discharge)
cohort_df$Discharge <- as.numeric(
  ifelse(cohort_df$Discharge == "DEAD", "1",
         ifelse(cohort_df$Discharge == "ALIVE", "0", cohort_df$Discharge))
)

### Export the cohort table
write.csv(cohort_df, file = "ICV_data/cohort_df.csv", row.names = FALSE)

# Create data from Chunks directories

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library("readr")

# To know the current storage capacity
memory.limit()

# Raise the memory limit to 56000 MB (about 56 GB). memory.limit() is Windows-only and a no-op stub from R 4.2 onwards.
memory.limit(size=56000)

########################################################################################################################
### Chunk to FULL files

# Output directory and the directory that holds one sub-directory per data type
data_path <- "ICV_data/"
chunk_path <- "ICV_data/chunks"

# Immediate sub-directories of the chunk directory, as a plain character vector
chunk_dir <- list.dirs(path = chunk_path, recursive = FALSE)
chunk_dir_list <- as.character(chunk_dir)
#chunk_dir_list[2]
#chunk_dir_list[2]

In [None]:
# For the selected chunk directory (fluids, labs, vasopressors, vitals):
# read every chunk file, stack them, and write one combined CSV named after
# the directory into data_path.
for (chunk in chunk_dir_list[4]) {                          # use [1] for fluid chunks, [2] for labs, [3] for vasopressors, [4] for vitals
  # Guard against selecting an index that does not exist
  if (is.na(chunk)) {
    warning("Selected chunk directory does not exist", call. = FALSE)
    next
  }

  path_list <- file.path(chunk, list.files(chunk))
  if (length(path_list) == 0) {
    warning("No chunk files found in ", chunk, call. = FALSE)
    next
  }

  # Read all chunk files first and bind once at the end; growing a data
  # frame with rbind() inside the loop copies it on every iteration.
  chunk_frames <- lapply(path_list, function(path) {
    print(path)
    the_df <- read_csv(path)
    print("CHUNK done")
    the_df
  })
  results_df <- do.call(rbind, chunk_frames)

  # The output file is named after the chunk directory itself
  filename <- basename(chunk)
  # Destination passed positionally: the `path` argument name is deprecated in readr
  write_csv(results_df, paste0(data_path, filename, ".csv"))
  print('CSV DONE')
} # end of (for chunk in chunk_dir_list)

### finish up
print("ALL done")

## Full DATA ACQUIRED
    
# START OF DATA PREPROCESSING

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")

# To know the current storage capacity
memory.limit()

# Raise the memory limit to 56000 MB (about 56 GB). memory.limit() is Windows-only and a no-op stub from R 4.2 onwards.
memory.limit(size=56000)

# Path to directory with data files
data_path <- 'ICV_data/'

sprintf("Finished at: %st", Sys.time())

# Cohort Interval Times

In [None]:
########################
### Import COHORT
D <- read_csv(paste0(data_path, 'admissions_df.csv'))#,sep=",", header = TRUE)

# Parse the admission start/end timestamps
D$start_time <- as.POSIXct(strptime(x = as.character(D$start_time), format = "%Y-%m-%d %H:%M:%S"))
D$end_time <- as.POSIXct(strptime(x = as.character(D$end_time), format = "%Y-%m-%d %H:%M:%S"))

# keep only complete cases whose end lies after their start
D <- na.omit(D)
D <- D[(D$end_time > D$start_time), ]

# keep only cases where the stay lasts longer than 8 hours.
# The unit is fixed explicitly: `end - start` lets R pick the unit (secs,
# mins, hours or days) from the data, so a bare `> 8` was not reliably hours.
D <- D[as.numeric(difftime(D$end_time, D$start_time, units = "hours")) > 8, ]

########################
# DATA TRANSFORMATIONS (from cohort to interval dataframe)

# Split the period from `start` to `end` into consecutive 4-hour intervals.
#
# start, end  Single POSIXct time stamps.
#
# Returns a data frame with one row per complete interval and the columns
# interval_start_time and interval_end_time. A trailing partial interval is
# dropped; a period shorter than 4 hours yields zero rows.
get_time_intervals <- function(start, end) {
  interval_times <- seq(start, end, by = '4 hour')
  # head/tail instead of index arithmetic: `1:length(x) -1` only worked
  # because index 0 is ignored, and `2:length(x)` ran backwards when a
  # single time point was generated.
  interval_start_time <- head(interval_times, -1)
  interval_end_time <- tail(interval_times, -1)
  data.frame(interval_start_time, interval_end_time)
}
# One set of 4-hour intervals per patient, built from that patient's
# admission start and end time.
interval_times_df <- do(
  group_by(D, PatientID),
  get_time_intervals(.$start_time, .$end_time)
)
dim(interval_times_df)

########################
# Export the interval table
#write_csv(interval_times_df, paste0(data_path, 'interval_times_df.csv'))
write.csv(interval_times_df, file = "ICV_data/interval_times_df.csv", row.names = FALSE)

sprintf("Finished at: %st", Sys.time())

# Vasopressors

In [None]:
########################################################################################################################
# START WITH WEIGHT AND INTERVAL TIMES
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
weight_df <- read_csv(paste0(data_path, 'Weight.csv'))

# Mean body weight, used later as the fill value for patients without a
# recorded weight. mean() must be given the Weight column: called on the
# whole data frame it returns NA with a warning.
mean_weight <- mean(weight_df$Weight, na.rm = TRUE)

# Clamp implausible weights to the range 40-200
weight_df$Weight[weight_df$Weight < 40] <- 40
weight_df$Weight[weight_df$Weight > 200] <- 200

In [None]:
# Read the vasopressor administrations; the first column holds the patient id
vassopressor_mv_df <- read_csv(paste0(data_path, 'Vasopressor.csv'))
colnames(vassopressor_mv_df)[1] <- "PatientID"

# Attach each patient's weight (mean weight where none is recorded) and
# express the dose per kg body weight: mcgkgmin = mcgmin / Weight
vassopressor_mv_df <- vassopressor_mv_df %>%
  left_join(weight_df, by = "PatientID") %>%
  replace_na(list(Weight = mean_weight)) %>%
  mutate(mcgkgmin = mcgmin / Weight)
head(vassopressor_mv_df)
nrow(vassopressor_mv_df)

In [None]:
# Maximum vasopressor dose per patient and 4-hour interval.
# An administration counts for an interval when the two periods overlap:
# it starts before the interval ends and ends after the interval starts.
# The previous condition only looked at whether the start or the end fell
# inside the interval, so an infusion running across a whole interval was
# missed for that interval.
max_vassopressor_mv <- interval_times_df %>%
  left_join(vassopressor_mv_df) %>%
  group_by(PatientID,
           interval_start_time,
           interval_end_time) %>%
  filter(StartTime < interval_end_time & EndTime > interval_start_time) %>%
  summarise(max_amount = max(mcgkgmin)) %>%
  replace_na(list(max_amount = 0)) %>%
  # doses of 0.01 or less are treated as "no vasopressor"
  mutate(max_amount = case_when(max_amount > 1e-02 ~ max_amount, TRUE ~ 0))

# Visual check
head(max_vassopressor_mv)
nrow(max_vassopressor_mv)
sprintf("Finished at: %st", Sys.time())

## Vasopressor: MIMIC vs ICV
#### CREATE VP ACTIONS
    ### MIMIC: 
    max_vassopressor_mv$discrete_VP <- as.integer(cut(max_vassopressor_mv$max_amount, breaks = c(-Inf, 0, 0.07411527, 0.200000, 0.4500000, Inf)))-1 
    unique(max_vassopressor_mv$discrete_VP)
        # action 0: dose == 0
        # action 1: 0 < dose <= 0.074
        # action 2: 0.074 < dose <= 0.2
        # action 3: 0.2 < dose <= 0.45
        # action 4: dose > 0.45

In [None]:
# Map the values of `x` to their quartile bin.
#
# x  Numeric vector; NA values are ignored when computing the quartiles and
#    stay NA in the result.
#
# Prints the five quantile boundaries (rounded to 5 digits) and returns an
# integer vector with the bin index of every element of `x`. When several
# quartile boundaries coincide (heavily tied data), duplicate boundaries are
# merged, so fewer than four bins are produced instead of cut() failing with
# "'breaks' are not unique".
num2quartile <- function(x) {
  quants <- quantile(x, probs = 0:4 / 4, na.rm = TRUE)
  print(round(quants, 5))
  bin_edges <- unique(quants)
  if (length(bin_edges) < 2) {
    # All observed values are identical: everything falls into one bin
    return(ifelse(is.na(x), NA_integer_, 1L))
  }
  as.integer(cut(x, bin_edges, include.lowest = TRUE))
}
# Quartile bin of every interval's maximum vasopressor dose (also prints the quartiles)
ICV_VP_Quantiles <- num2quartile(x = max_vassopressor_mv$max_amount)

#### MV ONLY MIMIC DATA - VASOPRESSOR QUANTILES --> Perform manual cut on ICV DATA
                  0%          25%          50%          75%         100% 
            0.00200       0.11016      0.22509      0.45035    222.73125 

In [None]:
### Bin counts when cutting at the ICV quartiles
table(
  as.integer(
    cut(max_vassopressor_mv$max_amount,
        breaks = c(-Inf, 0, 0.10771, 0.24242, 0.57143, Inf))
  ) - 1
)

### Bin counts when cutting at the MIMIC quartiles
table(
  as.integer(
    cut(max_vassopressor_mv$max_amount,
        breaks = c(-Inf, 0, 0.11016, 0.22509, 0.45035, Inf))
  ) - 1
)

In [None]:
# Bring back every interval: intervals without any vasopressor record were
# dropped when the maximum dose was computed and reappear here with NA.
action_df_VP <- max_vassopressor_mv %>%
  right_join(interval_times_df) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  summarise(max_amount = max(max_amount))

### Discrete vasopressor action (0-4) using the MIMIC quartile boundaries
action_df_VP$discrete_VP <- as.integer(
  cut(action_df_VP$max_amount,
      breaks = c(-Inf, 0, 0.11016, 0.22509, 0.45035, Inf))
) - 1

# Intervals without vasopressor data count as dose 0 / action 0
action_df_VP$max_amount <- replace(action_df_VP$max_amount, is.na(action_df_VP$max_amount), 0)
action_df_VP$discrete_VP <- replace(action_df_VP$discrete_VP, is.na(action_df_VP$discrete_VP), 0)

# Visual inspection
head(action_df_VP)
nrow(action_df_VP)
table(action_df_VP$discrete_VP)

# Import Fluids

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")

# Raise the memory limit to 56000 MB (about 56 GB). memory.limit() is Windows-only and a no-op stub from R 4.2 onwards.
memory.limit(size=56000)

# data.table improves performance
library(data.table)

# Path to directory with data files
data_path <- 'ICV_data/'

########################################################################################################################
# Read the interval table and the fluid administrations
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
fluids_df <- read_csv(paste0(data_path, 'Fluids.csv'))

# The first column holds the patient id; give it the common key name
colnames(fluids_df)[1] <- "PatientID"
colnames(fluids_df)

In [None]:
# Fluids as a data.table keyed on PatientID, keeping only doses strictly
# between 0 and 5000
fluids_dt <- filter(
  as.data.table(fluids_df, key = 'PatientID'),
  FullDose > 0 & FullDose < 5000
)
# Interval times as a data.table keyed on PatientID
interval_times_dt <- as.data.table(interval_times_df, key = 'PatientID')

# Process IV fluids

### 1. IV dose started before interval and ended after interval

In [None]:
# Infusions that started before the interval and ended after it: the full
# 4 hours (240 minutes) of the interval are charged at the per-minute rate.
# `allow.cartesian` is a data.table::merge argument, not a dplyr join
# argument; current dplyr rejects unknown arguments, so it is removed.
total_IV_during <- interval_times_dt %>%
  left_join(fluids_dt) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  filter(StartTime < interval_start_time & EndTime > interval_end_time) %>%
  summarise(total_amount_during = sum(DosePerMinute * 60 * 4))
all_IV_during <- total_IV_during %>% rename(total_amount = total_amount_during)
nrow(all_IV_during)

### 2. IV dose started before and ended within interval

In [None]:
# Infusions that started before the interval and ended within it: only the
# minutes between the interval start and the infusion end are charged.
# `allow.cartesian` is a data.table::merge argument, not a dplyr join
# argument; current dplyr rejects unknown arguments, so it is removed.
total_IV_before <- interval_times_dt %>%
  left_join(fluids_dt) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  filter(FullDose > 0 &
           StartTime < interval_start_time &
           EndTime <= interval_end_time &
           EndTime > interval_start_time)
all_IV_before <- total_IV_before %>%
  mutate(total_amount_before = round(
    as.numeric(difftime(EndTime, interval_start_time, units = 'mins')) * DosePerMinute, 3))
all_IV_before <- all_IV_before %>%
  select(PatientID, interval_start_time, interval_end_time, total_amount_before) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  summarise(total_amount = sum(total_amount_before))
nrow(all_IV_before)

### 3. IV dose started within interval and ended within interval

In [None]:
# Infusions that started and ended within the interval: the full dose is charged.
# `allow.cartesian` is a data.table::merge argument, not a dplyr join
# argument; current dplyr rejects unknown arguments, so it is removed.
all_IV_within <- interval_times_dt %>%
  left_join(fluids_dt) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  filter(FullDose > 0 &
           StartTime >= interval_start_time &
           StartTime < interval_end_time &
           EndTime <= interval_end_time &
           EndTime > interval_start_time) %>%
  summarise(total_amount = sum(FullDose))
nrow(all_IV_within)

### 4. IV Dose started within interval and ended after interval

In [None]:
# Infusions that started within the interval and ended after it: only the
# minutes between the infusion start and the interval end are charged.
# Fixes compared with the previous version:
#  - the amount is minutes-in-interval * DosePerMinute, matching the other
#    partial-overlap case; the old formula also divided by DurationMinutes,
#    which yields a fraction times a rate rather than a volume
#  - StartTime >= interval_start_time, so an infusion starting exactly at the
#    interval start and running past its end is no longer dropped by every case
#  - `allow.cartesian` (a data.table::merge argument) is not passed to the
#    dplyr join, which rejects unknown arguments in current versions
total_IV_after <- interval_times_dt %>%
  left_join(fluids_dt) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  filter(StartTime >= interval_start_time &
           StartTime < interval_end_time &
           EndTime > interval_end_time)
all_IV_after <- total_IV_after %>%
  mutate(total_amount_after = round(
    as.numeric(difftime(interval_end_time, StartTime, units = 'mins')) * DosePerMinute, 3))
all_IV_after <- all_IV_after %>%
  select(PatientID, interval_start_time, interval_end_time, total_amount_after) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  summarise(total_amount = sum(total_amount_after))
nrow(all_IV_after)

In [None]:
# Distribution of the per-interval fluid amount for each of the four overlap
# cases (before / within / after / during), for visual comparison.
summary(all_IV_before$total_amount)
summary(all_IV_within$total_amount)
summary(all_IV_after$total_amount)
summary(all_IV_during$total_amount)

### 5. Join all IV tables

In [None]:
# Stack the four per-case tables and sum the fluid amount per patient and
# interval. The tables are stacked rather than full-joined: a join without
# `by` matches on every shared column, including total_amount, so two cases
# with the same amount in the same interval were merged into one row and
# counted only once.
total_IV <- bind_rows(all_IV_during, all_IV_before, all_IV_within, all_IV_after) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  summarise(total_amount = sum(total_amount))
head(total_IV)
nrow(total_IV)
sprintf("Finished at: %st", Sys.time())

### 6. RIGHT JOIN with INTERVAL TIMES and replace NA fluid dose with zero fluid dose

In [None]:
# Restore every interval; intervals without any fluid record get amount 0
total_IV_mv <- total_IV %>%
  right_join(interval_times_df) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  replace_na(list(total_amount = 0))
head(total_IV_mv)
nrow(total_IV_mv)

## 7. Plot the dose per interval distribution

In [None]:
# Histogram of the fluid amount per interval (x axis limited to 0-5000)
hist(total_IV$total_amount, breaks = 100, xlim = c(0, 5000))

## Fluid ACTION SPACE: MIMIC vs ICV
#### Create action_df_IV
    ### MIMIC:
    # total_IV_mv$discrete_IV <- as.integer(cut(total_IV_mv$total_amount, breaks = c(-Inf, 0, 215, 474, 1000, Inf)))-1 

In [None]:
# Map the values of `x` to their quartile bin.
#
# x  Numeric vector; NA values are ignored when computing the quartiles and
#    stay NA in the result.
#
# Prints the five quantile boundaries (rounded to 5 digits) and returns an
# integer vector with the bin index of every element of `x`. When several
# quartile boundaries coincide (heavily tied data), duplicate boundaries are
# merged, so fewer than four bins are produced instead of cut() failing with
# "'breaks' are not unique".
num2quartile <- function(x) {
  quants <- quantile(x, probs = 0:4 / 4, na.rm = TRUE)
  print(round(quants, 5))
  bin_edges <- unique(quants)
  if (length(bin_edges) < 2) {
    # All observed values are identical: everything falls into one bin
    return(ifelse(is.na(x), NA_integer_, 1L))
  }
  as.integer(cut(x, bin_edges, include.lowest = TRUE))
}
# Quartile bin of every interval's total fluid amount (also prints the quartiles)
ICV_fluid_Quantiles <- num2quartile(x = total_IV$total_amount)

In [None]:
### Bin counts when cutting at the ICV quartiles
table(
  as.integer(
    cut(total_IV$total_amount,
        breaks = c(-Inf, 0, 266.5781, 463.2597, 812.0620, Inf))
  ) - 1
)

### Bin counts when cutting at the MIMIC quartiles
table(
  as.integer(
    cut(total_IV$total_amount,
        breaks = c(-Inf, 0, 39.83359, 204.40254, 635.20516, Inf))
  ) - 1
)

### MV only MIMIC DATA - IV FLUID QUANTILES --> Perform manual cut on ICV DATA:
         0%         25%         50%         75%        100% 
    0.00000    39.83359   204.40254   635.20516 24852.55100 

In [None]:
# Discrete fluid action (0-4) using the MIMIC quartile boundaries
total_IV_mv$discrete_IV <- as.integer(
  cut(total_IV_mv$total_amount,
      breaks = c(-Inf, 0, 39.83359, 204.40254, 635.20516, Inf))
) - 1

# Join the fluid actions onto the interval table so every interval is present
action_df_IV <- right_join(total_IV_mv, interval_times_df)

# Intervals without fluid data count as amount 0 / action 0
action_df_IV$total_amount <- replace(action_df_IV$total_amount, is.na(action_df_IV$total_amount), 0)
action_df_IV$discrete_IV <- replace(action_df_IV$discrete_IV, is.na(action_df_IV$discrete_IV), 0)

# CREATE ACTION SPACE

In [None]:
# check first: number of intervals per discrete fluid (IV) action and per
# discrete vasopressor (VP) action before the two are combined
table(action_df_IV$discrete_IV)
table(action_df_VP$discrete_VP)

### Merge VP and IV dataframes

In [None]:
########################################################################################################################
### Combine the fluid and vasopressor actions into one table
### (joined on the columns the two tables share)
action_df <- full_join(action_df_IV, action_df_VP) #%>% group_by(PatientID, interval_start_time, interval_end_time)
head(action_df)
nrow(action_df)

## Create final actions

    AS in MIMIC: action_df <- action_df %>% mutate(discrete_action = discrete_IV + 5 * discrete_pressor)

In [None]:
# Combined action id = discrete_IV + 5 * discrete_VP, then give the two
# amount columns descriptive names.
action_df <- action_df %>%
  mutate(discrete_action = as.numeric(discrete_IV) + 5 * as.numeric(discrete_VP)) %>%
  rename(total_IV = total_amount, max_VP = max_amount)
sprintf("Finished at: %st", Sys.time())

# Add Fluid running total

In [None]:
# Cumulative fluid amount per patient: order the intervals chronologically,
# then accumulate total_IV within each patient.
action_df <- action_df %>%
  arrange(PatientID, interval_start_time, interval_end_time) %>%
  group_by(PatientID) %>%
  mutate(Running_total_IV = cumsum(total_IV))

In [None]:
### table FINAL ACTION: number of intervals per combined action id, followed
### by the first rows of the action table for a visual check
table(action_df$discrete_action)
head(action_df)

In [None]:
# Report how many distinct combined actions occur in the data
print("amount of unique actions:")
print(as.character(n_distinct(action_df$discrete_action)))

In [None]:
########################
# WRITE DATA
# Export the final action table (one row per patient and interval) without row names
write.csv(action_df, file = "ICV_data/action_df.csv", row.names = FALSE)

# Visually inspect final action space

In [None]:
# Scatter plot of fluid amount against vasopressor dose on log-log axes,
# coloured by the combined discrete action
ggplot(action_df, aes(x = total_IV, y = max_VP, color = as.factor(discrete_action))) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = 'Total IV fluid (mL)', y = 'Max VP Dose')

# labs and Vitals

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# To know the current storage capacity
memory.limit()

# To increase the storage capacity (increases storage capacity to >>>7GB)
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'ICV_data/'

########################
### Read the cohort and the 4-hour interval table
cohort_df <- read_csv(paste0(data_path, 'admissions_df.csv'))
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
# data.table copy of the intervals, keyed on PatientID
interval_times_dt <- as.data.table(x = interval_times_df, key = 'PatientID')

## Import vitals

In [None]:
### Read the raw vitals
vitals_df <- read_csv(paste0(data_path, 'vitals.csv'))

# Only these four columns are needed downstream
keep <- c("PatientID", "Time", "Parameter", "Value")
vitals_df <- vitals_df[, keep]

# data.table copy keyed on PatientID
# TODO: add filter with mutation: all SPO2 values < 1 -> *100!
vitals_dt <- as.data.table(x = vitals_df, key = 'PatientID')

# The tibble is no longer needed; release it
rm(vitals_df)
sprintf("Finished at: %st", Sys.time())

## Clean up Vitals

In [None]:
# Per-parameter summary of the raw vitals (mean, range and variance)
vitals_dt %>%
  group_by(Parameter) %>%
  summarise(mean = mean(Value),
            min = min(Value),
            max = max(Value),
            var = var(Value))

In [None]:
# Cleanup of the vitals, per parameter. Rules are evaluated top to bottom and
# the first match wins:
#   - implausible values are marked with the sentinel -1 and removed at the end
#   - values that look like they were recorded a factor 10 too high are divided by 10
#   - FiO2 / SpO2 values recorded as a fraction are converted to a percentage
# Fix: the fraction rules now include their boundaries. Exactly 0.21 (room
# air) and exactly 1 (100%) previously matched no rule and stayed on the
# fraction scale, mixed in with percentage values.
# Note: the final filter also drops rows whose Value is NA.
vitals_dt <- vitals_dt %>% mutate(Value = case_when(Parameter == 'AF' & Value < 5 ~ -1,
                                                    Parameter == 'AF' & Value > 450 ~ -1,
                                                    Parameter == 'AF' & Value > 150 & Value < 450 ~ Value/10,
                                                    Parameter == 'AF' & Value > 45 ~ -1,

                                                    Parameter == 'TEMP' & Value < 25 ~ -1,
                                                    Parameter == 'TEMP' & Value > 400 ~ -1,
                                                    Parameter == 'TEMP' & Value > 300 & Value < 400 ~ Value/10,
                                                    Parameter == 'TEMP' & Value > 45 ~ -1,

                                                    Parameter == 'SYS' & Value < 30 ~ -1,
                                                    Parameter == 'SYS' & Value > 3000 ~ -1,
                                                    Parameter == 'SYS' & Value > 700 & Value < 3000 ~ Value/10,
                                                    Parameter == 'SYS' & Value > 350 ~ -1,

                                                    Parameter == 'DIA' & Value < 30 ~ -1,
                                                    Parameter == 'DIA' & Value > 1500 ~ -1,
                                                    Parameter == 'DIA' & Value > 400 & Value < 1500 ~ Value/10,
                                                    Parameter == 'DIA' & Value > 250 ~ -1,

                                                    Parameter == 'MAP' & Value < 30 ~ -1,
                                                    Parameter == 'MAP' & Value > 2500 ~ -1,
                                                    Parameter == 'MAP' & Value > 400 & Value < 2500 ~ Value/10,
                                                    Parameter == 'MAP' & Value > 250 ~ -1,

                                                    Parameter == 'HF' & Value < 0 ~ -1,
                                                    Parameter == 'HF' & Value > 1500 ~ -1,
                                                    Parameter == 'HF' & Value > 500 & Value < 1500 ~ Value/10,
                                                    Parameter == 'HF' & Value > 300 ~ -1,

                                                    Parameter == 'FiO2' & Value < 0 ~ -1,
                                                    Parameter == 'FiO2' & Value > 100 ~ -1,
                                                    # percentages below room air are raised to 21
                                                    Parameter == 'FiO2' & Value < 21 & Value > 1 ~ 21,
                                                    # fractions 0.21-1 become percentages (boundaries included)
                                                    Parameter == 'FiO2' & Value >= 0.21 & Value <= 1 ~ Value*100,

                                                    Parameter == 'SpO2' & Value < 0 ~ -1,
                                                    Parameter == 'SpO2' & Value > 100 ~ -1,
                                                    # fractions above 0.5 up to and including 1 become percentages
                                                    Parameter == 'SpO2' & Value > 0.5 & Value <= 1 ~ Value*100,

                                                    TRUE ~ Value
                                                   )
                                 ) %>% filter(Value != -1)
# Per-parameter summary after cleaning, to compare with the raw summary
vitals_dt %>% group_by(Parameter) %>% summarize(mean=mean(Value), min=min(Value), max=max(Value), var=var(Value))

In [None]:
# Persist the cleaned vitals so the chunked processing step can reload them
write_csv(vitals_dt, paste0(data_path, 'vitals_dt_clean.csv'))

## Process vitals in chunks

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# To know the current storage capacity
memory.limit()

# To increase the storage capacity (increases storage capacity to >>>7GB)
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'ICV_data/'

interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
interval_times_dt <- as.data.table(interval_times_df, key = 'PatientID')

vitals_dt <- read_csv(paste0(data_path, 'vitals_dt_clean.csv'))

### DO IN CHUNKS: patients are processed 100 at a time to limit the size of
### the intermediate interval x measurement join
pt_list <- unique(interval_times_dt$PatientID)
pt_list_subsets <- split(pt_list, ceiling(seq_along(pt_list)/100))

# loop through chunks; each chunk result is stored in a pre-allocated list
# and bound once at the end, instead of growing a table with rbind() on
# every iteration. seq_along() also copes with an empty patient list.
print("starting vital processing")
mean_vitals_chunks <- vector("list", length(pt_list_subsets))
for (i in seq_along(pt_list_subsets)) {
  print(paste('run ',as.character(i),' out of ',as.character(length(pt_list_subsets))))
  vitals_subset <- vitals_dt[vitals_dt$PatientID %in% pt_list_subsets[[i]], ]
  interval_times_dt_subset <- interval_times_dt[interval_times_dt$PatientID %in% pt_list_subsets[[i]], ]

  # Merge vitals with interval times (every measurement against every
  # interval of the same patient)
  merged_vitals <- interval_times_dt_subset %>% merge(vitals_subset, allow.cartesian = TRUE)

  # Keep measurements taken within (interval_start_time, interval_end_time]
  # and average them per patient, interval and parameter
  mean_vitals_chunks[[i]] <- merged_vitals[Time > interval_start_time & Time <= interval_end_time,
                                           .(mean_vital = mean(Value)), by = .(PatientID,
                                                                               interval_start_time,
                                                                               interval_end_time,
                                                                               Parameter)]
}
all_mean_vitals <- rbindlist(mean_vitals_chunks)

#########################################
# Visual check merged vitals
head(all_mean_vitals)

# save to csv
write_csv(all_mean_vitals, paste0(data_path, 'all_mean_vitals.csv'))

# finish
sprintf("Finished at: %st", Sys.time())

## Import and process labs

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# To know the current storage capacity
memory.limit()

# To increase the storage capacity (increases storage capacity to >>>7GB)
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'ICV_data/'

# Read the raw lab results
labs_dt <- read_csv(paste0(data_path, 'Labs.csv'))

# Only these four columns are needed downstream
keep <- c("PatientID", "Time", "Parameter", "Value")
labs_dt <- labs_dt[, keep]

# Convert to a data.table keyed on PatientID
labs_dt <- as.data.table(x = labs_dt, key = 'PatientID')

## Clean up Labs

In [None]:
# Per-parameter summary of the raw labs (mean, range and variance)
labs_dt %>%
  group_by(Parameter) %>%
  summarise(mean = mean(Value),
            min = min(Value),
            max = max(Value),
            var = var(Value))

In [None]:
# Clean up implausible lab values.
#
# case_when() evaluates its rules top to bottom and the FIRST matching rule
# wins, so for each parameter a rescale rule must come before the upper-bound
# rule that would otherwise discard the mis-scaled value.
#
# Rejected values are marked NA and then filtered out. The previous version
# used -1 as the marker and `filter(Value != -1)`, which also deleted genuine
# measurements equal to -1 (a value inside the accepted ANION-GAP range of
# -10..100). Rows whose Value was already NA are dropped exactly as before,
# because filter() never keeps rows whose condition is NA.
#
# Other fixes relative to the previous version:
#   * 'bicarbonaat' (lower-case) never matched the 'Bicarbonaat' parameter,
#     so the rescale rules were dead and those values were discarded.
#   * PH had `Value < 9 ~ -1`, which threw away every value below 9; it is
#     now an upper bound (`> 9`) placed after the rescale rules.
#   * PH rescale divisors now map 670..760 and 6700..7600 onto 6.7..7.6,
#     consistent with the 5..9 bounds (they used to produce 67..76).
#   * The PCO2 rules were repeated four times; one copy is kept.
labs_dt <- labs_dt %>%
  mutate(Value = case_when(
    Parameter == 'Magnesium' & Value < 0.2 ~ NA_real_,

    Parameter == 'INR' & Value > 90 & Value < 300 ~ Value/100,
    Parameter == 'INR' & Value < 0 ~ NA_real_,
    Parameter == 'INR' & Value > 20 ~ NA_real_,

    # NOTE(review): 800..1400 / 100 gives 8..14, part of which lies above the
    # upper bound of 10 used below -- confirm the intended divisor.
    Parameter == 'Ion_Ca' & Value < 0 ~ NA_real_,
    Parameter == 'Ion_Ca' & Value > 800 & Value < 1400 ~ Value/100,
    Parameter == 'Ion_Ca' & Value > 10 ~ NA_real_,

    Parameter == 'Fosfaat' & Value > 4 ~ NA_real_,

    Parameter == 'PCO2' & Value < 10 ~ NA_real_,
    Parameter == 'PCO2' & Value > 250 ~ NA_real_,

    Parameter == 'ANION-GAP' & Value < -10 ~ NA_real_,
    Parameter == 'ANION-GAP' & Value > 100 ~ NA_real_,

    Parameter == 'Bicarbonaat' & Value < -20 ~ NA_real_,
    Parameter == 'Bicarbonaat' & Value > 130 & Value < 300 ~ Value/10,
    Parameter == 'Bicarbonaat' & Value > 1300 & Value < 3000 ~ Value/100,
    Parameter == 'Bicarbonaat' & Value > 50 ~ NA_real_,

    Parameter == 'Chloor' & Value < 50 ~ NA_real_,
    Parameter == 'Chloor' & Value > 250 ~ NA_real_,

    Parameter == 'PO2' & Value < 10 ~ NA_real_,
    Parameter == 'PO2' & Value > 250 ~ NA_real_,

    # pH: restore a lost decimal point first, then apply the upper bound
    Parameter == 'PH' & Value < 5 ~ NA_real_,
    Parameter == 'PH' & Value > 6700 & Value < 7600 ~ Value/1000,
    Parameter == 'PH' & Value > 670 & Value < 760 ~ Value/100,
    Parameter == 'PH' & Value > 9 ~ NA_real_,

    # NOTE(review): the rescale window reaches 15000 (-> 1500), far above the
    # upper bound of 180 used below -- the limit was possibly meant to be
    # lower; confirm before tightening.
    Parameter == 'Natrium' & Value < 90 ~ NA_real_,
    Parameter == 'Natrium' & Value > 1000 & Value < 15000 ~ Value/10,
    Parameter == 'Natrium' & Value > 180 ~ NA_real_,

    Parameter == 'Kalium' & Value < 1 ~ NA_real_,
    Parameter == 'Kalium' & Value > 10 ~ NA_real_,

    Parameter == 'Lactaat' & Value < 0 ~ NA_real_,
    Parameter == 'Lactaat' & Value > 50 ~ NA_real_,

    # Everything else passes through unchanged
    TRUE ~ Value
  )) %>%
  filter(!is.na(Value))

# Per-parameter summary after cleaning, for comparison with the raw summary
print(labs_dt %>% group_by(Parameter) %>% summarize(mean=mean(Value), min=min(Value), max=max(Value), var=var(Value)))

In [None]:
# Per-parameter mean / min / max / variance of the lab values currently
# held in labs_dt
labs_dt %>%
  group_by(Parameter) %>%
  summarize(
    mean = mean(Value),
    min = min(Value),
    max = max(Value),
    var = var(Value)
  )

In [None]:
# Write the current labs_dt to <data_path>/labs_dt_clean.csv
write_csv(labs_dt, paste0(data_path, 'labs_dt_clean.csv'))

## Process Labs in chunks - merge with interval_times

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# Report the current memory limit, then request a higher one.
# NOTE(review): memory.limit() takes its size in MB and is documented as
# Windows-only; confirm it has any effect on the machine this runs on.
memory.limit()
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'ICV_data/'

# Interval grid, as a data.table keyed on PatientID so that merge() below
# joins on that key
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
interval_times_dt <- as.data.table(interval_times_df, key = 'PatientID')

# Cleaned labs written by the clean-up step
labs_dt <- read_csv(paste0(data_path, 'labs_dt_clean.csv'))

### DO IN CHUNKS of 100 patients; the interval x lab join is cartesian
### within a patient, so this bounds its size
pt_list <- unique(interval_times_dt$PatientID)
pt_list_subsets <- split(pt_list, ceiling(seq_along(pt_list)/100))
n_chunks <- length(pt_list_subsets)

# One result per chunk is collected in a pre-allocated list and bound once
# at the end, instead of growing a table with rbind() on every iteration.
mean_labs_chunks <- vector("list", n_chunks)

print("starting labs processing")
# seq_len() yields an empty loop when there are no patients (1:0 would not)
for (i in seq_len(n_chunks)) {
  print(paste('run ',as.character(i),' out of ',as.character(n_chunks)))
  chunk_ids <- pt_list_subsets[[i]]
  labs_subset <- labs_dt[labs_dt$PatientID %in% chunk_ids, ]
  interval_times_dt_subset <- interval_times_dt[interval_times_dt$PatientID %in% chunk_ids, ]

  # Merge labs with interval times (every lab row against every interval of
  # the same patient)
  merged_labs <- interval_times_dt_subset %>%  merge(labs_subset, allow.cartesian = TRUE)

  # Keep the measurements with interval_start_time < Time <= interval_end_time
  # and average them per patient, interval and parameter
  mean_labs_subset <- merged_labs[Time > interval_start_time & Time <= interval_end_time,
                                  .(mean_lab = mean(Value)),
                                  by = .(PatientID,
                                         interval_start_time,
                                         interval_end_time,
                                         Parameter)]
  mean_labs_chunks[[i]] <- mean_labs_subset
}

# Bind all chunk results into one table
all_mean_labs <- rbindlist(mean_labs_chunks, use.names = TRUE)

# visual check merged labs
head(all_mean_labs)

# write to csv
write_csv(all_mean_labs, paste0(data_path, 'all_mean_labs.csv'))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

## Merge vitals and labs

In [None]:
########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# Report the current memory limit, then request a higher one.
# NOTE(review): memory.limit() takes its size in MB and is documented as
# Windows-only; confirm it has any effect on the machine this runs on.
memory.limit()
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'ICV_data/'

# import data
mean_vitals <- read_csv(paste0(data_path, 'all_mean_vitals.csv'))
mean_labs <- read_csv(paste0(data_path, 'all_mean_labs.csv'))

# The interval grid is needed for the right_join below. It used to be taken
# from whatever an earlier cell left in the session, so running this section
# in a fresh session failed with "object 'interval_times_dt' not found".
# It is loaded here the same way the other sections load it.
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
interval_times_dt <- as.data.table(interval_times_df, key = 'PatientID')

# Give both tables the same value column name so they can be combined
mean_labs <- mean_labs %>% rename(mean_value = mean_lab)
mean_vitals <- mean_vitals %>% rename(mean_value = mean_vital)

# Join tables (natural join on all shared column names)
mean_labs_vitals_temp <- mean_labs %>% full_join(mean_vitals)

# take mean of duplicates (to do: find origin of duplicates)
mean_labs_vitals <- mean_labs_vitals_temp %>%
  group_by(PatientID, interval_start_time, interval_end_time, Parameter) %>%
  summarise(mean_value = mean(mean_value))

# PIVOT to one column per parameter, then right-join onto the interval grid
# so that every interval is present even when nothing was measured in it
vitals_labs_spread <- mean_labs_vitals %>%
  spread(Parameter, mean_value) %>%
  right_join(interval_times_dt)
print(c(names(vitals_labs_spread)))

In [None]:
# Exclude intervals in which no measurement was made.
# The wide table is reshaped to long form, each interval is flagged with
# `exclude` when all of its values are NA, and the flag is joined back onto
# the wide table to drop those intervals.
vitals_labs_spread_filt <- vitals_labs_spread %>%
  # Columns 4..end are gathered as measurements; this assumes columns 1-3 are
  # PatientID and the two interval bounds -- TODO confirm
  gather(Parameter, mean_value, 4:ncol(vitals_labs_spread)) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  summarise(exclude = all(is.na(mean_value))) %>%
  full_join(vitals_labs_spread) %>%
  filter(!exclude) %>%
  select(-exclude)

# visual check of combined vitals and labs spread file (almost final file)
print(names(vitals_labs_spread_filt))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

## Save vitals_and_labs

In [None]:
########################
### export data to csv
# Drop the LAST column before writing.
# NOTE(review): which column that is depends on the column order produced by
# the preceding steps -- confirm it is the one intended. Because the result
# is assigned back to the same name, re-running this cell removes one more
# column each time.
vitals_labs_spread_filt <- vitals_labs_spread_filt[ ,-ncol(vitals_labs_spread_filt)]
write_csv(vitals_labs_spread_filt, paste0(data_path, 'vitals_labs_spread_filt.csv'))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

# Create ICV_state_space

In [None]:
# Attach packages
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# Report the current memory limit, then request a higher one.
# NOTE(review): memory.limit() takes its size in MB and is documented as
# Windows-only; confirm it has any effect on the machine this runs on.
memory.limit()
memory.limit(size=99000)

# Directory that holds the intermediate CSV files
data_path <- 'ICV_data/'

########################
# Interval grid (per the original note, derived in
# 'create_action_space.ipynb'), as a tibble and as a data.table keyed on
# PatientID
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
interval_times_dt <- as.data.table(interval_times_df, key = 'PatientID')

### Wide vitals + labs table and the demographics table
vitals_labs <- read_csv(paste0(data_path, 'vitals_labs_spread_filt.csv'))
demographics <- read_csv(paste0(data_path, 'demographics_df.csv'))

# Visual inspection: print the column names of each imported table
invisible(lapply(
  list(interval_times_df, vitals_labs, demographics),
  function(tbl) print(names(tbl))
))

## FILL lab and vital values: arranged by start time (ordered by time) and grouped by patientID

In [None]:
########################
### Fill gaps in vitals and labs per patient
# Rows are ordered chronologically within each patient and fill() then
# carries the last observed value down into following NA cells (its default
# direction), for every column from AF through Ureum. Grouping by PatientID
# keeps values from leaking from one patient into the next.
vitals_labs_lvcf <- vitals_labs %>%
  group_by(PatientID) %>%
  arrange(PatientID, interval_start_time, interval_end_time) %>%
  fill(AF:Ureum)

# final merge with demographics (natural join on the shared column names)
data_filled <- vitals_labs_lvcf %>%
  full_join(demographics) %>%
  ungroup()

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())
dim(data_filled)

## ADD URINE and cumulative URINE output to ICV_state_space

In [None]:
# Read the urine output records
urine_df <- read_csv(paste0(data_path, 'UrineOutput.csv'))

# Keep only records with 0 < UrineOutput < 5000, as a data.table keyed on
# PatientID
urine_dt <- filter(
  as.data.table(urine_df, key = 'PatientID'),
  UrineOutput > 0,
  UrineOutput < 5000
)

# Pair each urine record with the intervals of the same patient
merged_urine <- merge(interval_times_dt, urine_dt, allow.cartesian = TRUE)

In [None]:
# Total urine output per interval: keep the records with
# interval_start_time < Time <= interval_end_time and SUM them per patient
# and interval (the previous header said MEAN, but the code sums)
total_urine_df <- merged_urine[Time > interval_start_time & Time <= interval_end_time,
                               .(total_UP = sum(UrineOutput)),
                               by = .(PatientID,
                                      interval_start_time,
                                      interval_end_time)]

# Right-join onto the full interval grid so intervals without any urine
# record are present too, and give those a total of 0
total_urine_df <- total_urine_df %>%
  right_join(interval_times_dt) %>%
  group_by(PatientID, interval_start_time, interval_end_time) %>%
  replace_na(list(total_UP = 0))

# add running total: cumulative sum per patient in chronological order
all_UP <- total_urine_df %>%
  group_by(PatientID) %>%
  arrange(PatientID, interval_start_time, interval_end_time) %>%
  mutate(Running_total_UP = cumsum(total_UP))

# visual check: all_UP should have one row per row of interval_times_dt
head(all_UP)
dim(all_UP)
dim(interval_times_dt)

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

## Add urine production to ICV_state_space

In [None]:
# Build the state-space table: filled vitals/labs/demographics combined with
# the per-interval urine totals (full natural join, so rows present on only
# one side are kept)
data_all <- full_join(data_filled, all_UP)

# Inspect dimensions, first rows and column names
dim(data_all)
head(data_all)
print(names(data_all))

In [None]:
#######################
### export data to csv
write_csv(data_all, paste0(data_path, 'ICV_statespace.csv'))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

# LAST STEP: Add outcome and reward

In [None]:
##########
### PREPARATION

# Attach packages
library(tidyverse)   # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# Directory that holds the intermediate CSV files
data_path <- 'ICV_data/'

########################
### Import the three inputs of the reward step
# patient cohort
cohort_df <- read_csv(paste0(data_path, 'cohort_df.csv'))
# per-interval actions
action_df <- read_csv(paste0(data_path, 'action_df.csv'))
# state space
alldata_df <- read_csv(paste0(data_path, 'ICV_statespace.csv'))

In [None]:
########################
### Outcome per patient: PatientID plus Discharge (the survival outcome)
outcome_df <- cohort_df %>%
  group_by(PatientID) %>%
  select(PatientID, Discharge)

########################
### Attach the action columns and the outcome to the state space.
### Both joins are inner natural joins, so only rows present in all three
### tables survive.
alldata_df_action <- action_df %>%
  select(
    PatientID,
    interval_start_time,
    interval_end_time,
    discrete_action,
    total_IV,
    Running_total_IV,
    max_VP
  ) %>%
  inner_join(alldata_df, .) %>%
  inner_join(outcome_df)

########################
# Terminal reward: one row per patient holding the latest
# interval_start_time, with reward -15 when Discharge is truthy and +15
# otherwise (per the original note: dead = 1, alive = 0)
temp_alldata <- alldata_df_action %>%
  group_by(PatientID, Discharge) %>%
  summarise(interval_start_time = max(interval_start_time)) %>%
  mutate(reward = ifelse(Discharge, -15, 15))

# Join the terminal reward back; every interval that did not receive one
# gets an intermediate reward of 0
temp_alldata_action_reward <- replace_na(
  left_join(alldata_df_action, temp_alldata),
  list(reward = 0)
)

# visual check
print(names(temp_alldata_action_reward))

# total_IV and max_VP Action shift (lag one state behind current state)

In [None]:
# Previous-interval actions: total_IV_prev / max_VP_prev hold the value of
# total_IV / max_VP from the patient's preceding interval, and 0 for the
# patient's first interval.
#
# The lag is ordered by interval_start_time within each patient. It used to
# be ordered by PatientID, which is constant inside a PatientID group, so
# the lag simply followed the current row order -- and the upstream full
# joins do not guarantee that rows are chronological per patient.
#
# Both columns are computed under one explicit group_by(); the second one
# used to rely on grouping left over from the first statement. The result is
# ungrouped so that later steps do not inherit the grouping.
temp_alldata_action_reward <- temp_alldata_action_reward %>%
  group_by(PatientID) %>%
  mutate(
    total_IV_prev = replace_na(lag(total_IV, order_by = interval_start_time), 0),
    max_VP_prev = replace_na(lag(max_VP, order_by = interval_start_time), 0)
  ) %>%
  ungroup()

# FINAL TABLE

In [None]:
########################
### Create final table
# 1. derive the numeric Ventilator flag (0 when Ventilation == "NO", else 1)
# 2. rename columns to their final names; pairs that map a name onto itself
#    change nothing but still require that column to exist
# 3. derive Shock_Index and PF_ratio, turning a zero denominator into NA so
#    the division yields NA instead of Inf
# 4. drop the columns that are not part of the final table
ICV_all <- temp_alldata_action_reward %>%
  mutate(Ventilator = ifelse(Ventilation == "NO", 0, 1)) %>%
  rename(
    ANION_GAP = 'ANION-GAP',
    Height = Lengte,
    Gender = Gender,
    Reward = reward,
    Temp = TEMP,
    HeartRate = HF,
    Sofa_score = SOFA,
    Sirs_score = SIRS,
    RespRate = AF,
    Lactate = Lactaat,
    PaO2 = PO2,
    PaCO2 = PCO2,
    SpO2 = SpO2,
    Art_PH = PH,
    Bicarbonaat = Bicarbonaat,
    Art_BE = Art_BE,
    Calcium = Calcium,
    Ion_Ca = Ion_Ca,
    Chloride = Chloor,
    Ureum = Ureum,
    Creat = CREAT,
    Bili = BILI,
    INR = INR,
    Albumine = Albumine,
    Trombo = TROMBO,
    FiO2 = FiO2,
    Running_total_IV = Running_total_IV
  ) %>%
  mutate(
    Shock_Index = HeartRate / na_if(SYS, 0),
    PF_ratio = PaO2 / na_if(FiO2, 0)
  ) %>%
  select(
    -GCS, -qsofa_vent, -qsofa_sysbp, -CRP, -CK,
    -Fosfaat, -Amylase, -Ventilation
  )

# Keep the first three columns in place and order the remaining columns
# alphabetically by name (per the original note, this is the order in which
# they appear in the pickled data dictionary)
ICV_all <- ICV_all[, c(
  head(names(ICV_all), 3),
  sort(names(ICV_all)[4:ncol(ICV_all)])
)]
print(names(ICV_all))

In [None]:
########################
### export data to csv
# Note: the final table goes to 'data/', not to the intermediate 'ICV_data/'
final_data_path <- 'data/'
write_csv(ICV_all, paste0(final_data_path, 'ICV_data.csv'))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())

## Merge with original admission_df to create < and => 2014 subsets

In [None]:
########################
# Attach packages
library(tidyverse) # install.packages("tidyverse") # library(devtools) # install_version("rlang", version = "0.2.2", repos = "http://cran.us.r-project.org")

########################
# IMPORT DATA
ICV_data_path <- 'ICV_data/'

# Admission date per patient.
# NOTE(review): the file is opened as 'Admissions.csv' here, whereas the
# start of the notebook reads 'ICV_data/admissions.csv' -- on a
# case-sensitive file system only one of the two spellings can exist.
ICV_df <- select(
  read_csv(paste0(ICV_data_path, 'Admissions.csv')),
  PatientID,
  AddmissionDate
)

# Final table written by the previous step
ICV_data <- read_csv('data/ICV_data.csv')

In [None]:
########################
### Attach the admission date to the final table and split it into
### admissions before 2014 and admissions from 2014 onwards
ICV_data <- ICV_data %>% left_join(ICV_df)
final_data_path <- 'data/'

# Cut-off shared by both subsets. It is built explicitly in UTC because
# readr parses date-times as UTC by default; without `tz` the cut-off was
# interpreted in the machine's local time zone, which shifts the boundary by
# the local UTC offset.
# NOTE(review): assumes AddmissionDate was parsed as a date-time -- confirm.
# Rows with a missing AddmissionDate end up in neither subset.
split_cutoff <- as.POSIXct("2014-01-01 00:00:00", tz = "UTC")

########################
# export OLD data to csv (admitted before the cut-off)
ICV_before <- ICV_data %>%
  filter(AddmissionDate < split_cutoff) %>%
  select(-AddmissionDate)
write_csv(ICV_before, paste0(final_data_path, 'ICV_olddata.csv'))

# export new data (admitted at or after the cut-off)
ICV_after <- ICV_data %>%
  filter(AddmissionDate >= split_cutoff) %>%
  select(-AddmissionDate)
write_csv(ICV_after, paste0(final_data_path, 'ICV_newdata.csv'))

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())


## Reproducibility is important

In [None]:
# Record the R version and the attached package versions for reproducibility
sessionInfo()

# Log the completion time (format string previously had a stray "t")
sprintf("Finished at: %s", Sys.time())