In [None]:
########################
# set working directory
# NOTE(review): hard-coded, machine-specific path; every relative path used
# below (e.g. data_path) is resolved against this directory.
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")
#setwd("C:/Users/l.roggeveen/SRL/rl_sepsis/SEPSIS/")
#setwd("D:/ResearchData/rl_sepsis/SEPSIS")

# load libraries
library(tidyverse) # install.packages("tidyverse") # library(devtools) # install_version("rlang", version = "0.2.2", repos = "http://cran.us.r-project.org")

# Get Cohort data and create interval_times_df

In [None]:
########################
# IMPORT DATA
# Directory (relative to the working directory) holding the input CSV files.
data_path <- 'MIMIC_data/'
# Cohort table; the next cell reads subject_id, hadm_id, icustay_id,
# window_start and window_end from it.
cohort_df <- read_csv(paste0(data_path, 'cohort.csv'))
head(cohort_df)

In [None]:
########################
# DATA TRANSFORMATIONS
# Split the window [start, end] into consecutive 4-hour intervals.
#
# start, end: scalar date-times accepted by seq() (e.g. POSIXct).
# Returns a data.frame with columns interval_start_time and interval_end_time,
# one row per complete 4-hour interval. A window shorter than 4 hours yields a
# zero-row data.frame instead of an error.
get_time_intervals <- function(start, end) {
  interval_times <- seq(start, end, by = '4 hour')
  n_times <- length(interval_times)
  # Negative indexing stays correct when only one time point exists
  # (both vectors become empty), unlike 1:(n - 1) / 2:n style ranges.
  data.frame(
    interval_start_time = interval_times[-n_times],
    interval_end_time = interval_times[-1]
  )
}
# Build one row per 4-hour interval for every ICU stay by applying
# get_time_intervals() to each (subject_id, hadm_id, icustay_id) group.
# NOTE(review): assumes exactly one cohort row per group, because seq() needs
# scalar start/end values -- confirm against cohort.csv.
interval_times_df <- cohort_df %>% 
  group_by(subject_id, hadm_id, icustay_id) %>% 
  do(get_time_intervals(.$window_start, .$window_end))

# Visual check
dim(interval_times_df)

sprintf("Finished at: %st", Sys.time())

# VASOPRESSORS
## Get vasopressor data

In [None]:
########################
# IMPORT MORE DATA
# Load the vasopressor tables: one file per source system
# (cv = CareVue, mv = MetaVision, per the section headings of this notebook).
vassopressor_cv_df <- read_csv(paste0(data_path, 'vassopressors_cv_cohort.csv'))
vassopressor_mv_df <- read_csv(paste0(data_path, 'vassopressors_mv_cohort.csv'))
sprintf("Finished at: %st", Sys.time())

## Process vasopressor data from the CareVue datatable

In [None]:
# Per interval: the highest CareVue vasopressor rate (mcgkgmin) charted in
# (interval_start_time, interval_end_time].
# The join has no explicit `by`, so dplyr matches on every column name the two
# tables share. Intervals without a charted dose are removed by the filter.
max_vassopressor_cv <- interval_times_df %>% 
  left_join(vassopressor_cv_df) %>% 
  group_by(subject_id, 
           hadm_id, 
           icustay_id, 
           interval_start_time, 
           interval_end_time) %>%
  filter(charttime > interval_start_time & charttime <= interval_end_time) %>%
  # Maxima that are not above 0.01 (including NA) are replaced by 0.
  summarise(max_amount = max(mcgkgmin)) %>% mutate(max_amount = case_when(max_amount > 1e-02 ~ max_amount, TRUE ~ 0))

summary(max_vassopressor_cv)
sprintf("Finished at: %st", Sys.time())

## Process vasopressor data from the MetaVision datatable

In [None]:
# Per interval: the highest MetaVision vasopressor rate (mcgkgmin) among the
# records whose endtime falls in (interval_start_time, interval_end_time].
# The join has no explicit `by`, so dplyr matches on every column name the two
# tables share. Intervals without a matching record are removed by the filter.
max_vassopressor_mv <- interval_times_df %>% 
  left_join(vassopressor_mv_df) %>% 
  group_by(subject_id, 
           hadm_id, 
           icustay_id, 
           interval_start_time, 
           interval_end_time) %>%
  filter(endtime > interval_start_time & endtime <= interval_end_time) %>%
  # Maxima that are not above 0.01 (including NA) are replaced by 0.
  summarise(max_amount = max(mcgkgmin)) %>% mutate(max_amount = case_when(max_amount > 1e-02 ~ max_amount, TRUE ~ 0))

summary(max_vassopressor_mv)
sprintf("Finished at: %st", Sys.time())

## Determine Quartiles

### CREATE CUT-OFFS USING METAVISION PATIENTS ONLY

In [None]:
# A function to map the data to quartiles
# Map a numeric vector to quartile bin numbers.
#
# x: numeric vector; NA values are ignored when computing the quantiles and
#    stay NA in the result.
# Returns an integer vector the same length as x with the bin index of every
# value (1..4 when all five quantiles are distinct).
# Side effect: prints the 0/25/50/75/100% quantiles rounded to 5 decimals.
num2quartile <- function(x) {
  quants <- quantile(x, probs = 0:4 / 4, na.rm = TRUE)
  print(round(quants, 5))
  # Tied quantiles (e.g. many zeros) would make cut() fail with
  # "'breaks' are not unique"; collapse them into fewer, wider bins.
  breaks <- unique(quants[!is.na(quants)])
  if (length(breaks) < 2) {
    # A single break would be read by cut() as a *number of bins*; all
    # non-missing values are identical, so they share bin 1.
    bins <- rep(NA_integer_, length(x))
    bins[!is.na(x)] <- 1L
    return(bins)
  }
  as.integer(cut(x, breaks, include.lowest = TRUE))
}
# Quartile bin per MetaVision interval; the call also prints the cut-offs
# that are copied by hand into the cells below.
MIMIC_MV_VP_Quantiles = num2quartile(max_vassopressor_mv$max_amount)

#### MV ONLY MIMIC DATA - VASOPRESSOR QUANTILES --> Perform manual cut on ICV DATA
                  0%          25%          50%          75%         100% 
            0.00200       0.11016      0.22509      0.45035    222.73125 

In [None]:
### Table inspection
# Bin counts with the hand-copied quartile cut-offs. The first variant starts
# at the observed minimum and stops at the observed maximum; the second uses
# 0 and Inf as outer breaks (the form used for the final action below).
table(as.integer(cut(max_vassopressor_mv$max_amount, breaks = c(-Inf, 0.00200, 0.11016, 0.22509, 0.45035, 222.73125)))-1 )
table(as.integer(cut(max_vassopressor_mv$max_amount, breaks = c(-Inf, 0, 0.11016, 0.22509, 0.45035, Inf)))-1 )

## Merge MV and CareVue

In [None]:
# Map to discrete vasopressor states
# Combine the CareVue and MetaVision maxima (natural join on all shared
# columns) and drop rows without an amount.
temp_VP <- max_vassopressor_cv %>% 
  full_join(max_vassopressor_mv) %>%
  filter(!is.na(max_amount))

# Merge with interval_times_df: this re-adds the intervals that were dropped by
# the filters above (their max_amount is NA at this point), then keeps one
# maximum per interval.
action_df_VP <- temp_VP %>% right_join(interval_times_df) %>% group_by(subject_id, hadm_id, icustay_id, interval_start_time, interval_end_time) %>% summarise(max_amount = max(max_amount)) 

### add action MIMIC
# Bin 0 is (-Inf, 0]; bins 1-4 use the hand-copied MetaVision quartile cut-offs.
action_df_VP$discrete_VP <- as.integer(cut(action_df_VP$max_amount, breaks = c(-Inf, 0, 0.11016, 0.22509, 0.45035, Inf)))-1 

# Change NAs (intervals without vasopressor data) to zeros
action_df_VP$max_amount[is.na(action_df_VP$max_amount)] <- 0
action_df_VP$discrete_VP[is.na(action_df_VP$discrete_VP)] <- 0

# Visual inspection
head(action_df_VP)
nrow(action_df_VP)
table(action_df_VP$discrete_VP)

# FLUIDS

In [None]:
########################
# Switch to data.table for the large fluid joins (speed)
library(data.table)
# Keyed copy of the interval table; the merge() calls below use it.
interval_times_dt <- as.data.table(interval_times_df, key = 'icustay_id')
head(interval_times_dt)

## Import and process CareVue fluids

In [None]:
########################
# load CAREVUE FLUIDS
inputevents_cv_df <- read_csv(paste0(data_path, 'inputevents_cv_cohort.csv'))
# Keep only rows recorded in 'ml' with a non-missing amount in [0, 5000).
ie_filt_cv <- inputevents_cv_df %>% filter(amountuom == 'ml' & !is.na(amount) & amount >= 0 & amount < 5000)
ie_filt_cv_dt <- as.data.table(ie_filt_cv, key = 'icustay_id')
sprintf("Finished at: %st", Sys.time())

In [None]:
########################
# preprocess CAREVUE FLUIDS
# Per interval: the summed amount of all CareVue inputs charted in
# (interval_start_time, interval_end_time]. allow.cartesian permits the
# many-to-many expansion of intervals x input events.
# NOTE(review): presumably the merge joins on the shared key icustay_id --
# confirm against data.table's merge() defaults.
total_IV_cv <- interval_times_dt %>% 
  merge(ie_filt_cv_dt, allow.cartesian = TRUE) %>% 
  group_by(subject_id,
           hadm_id, 
           icustay_id, 
           interval_start_time, 
           interval_end_time) %>%
  filter(charttime > interval_start_time & charttime <= interval_end_time) %>%
  summarise(total_amount = sum(amount))
sprintf("Finished at: %st", Sys.time())

## Import and process MetaVision fluids

In [None]:
# Load the fluids
inputevents_mv_df <- read_csv(paste0(data_path, 'inputevents_mv_cohort.csv'))


# Keep millilitre-denominated inputs (excluding '14-Oral/Gastric Intake') with
# a non-missing amount in (0, 5000) and derive how long each one ran.
# `duration` is computed in MINUTES explicitly: difftime() with its default
# units = "auto" picks secs/mins/hours/days from the data, while the
# proportional-dose cells further down multiply minute counts by
# amount/duration.
MV_input = inputevents_mv_df %>% select(subject_id, icustay_id, hadm_id, starttime, endtime, 
                                        amount, amountuom, rate, rateuom, orderid, ordercategoryname, 
                                        secondaryordercategoryname, patientweight, totalamount) %>% 
                                 filter(amountuom == 'ml' & ordercategoryname != '14-Oral/Gastric Intake' & !is.na(amount) & amount > 0 & amount < 5000) %>% 
                                 mutate(duration = as.numeric(difftime(endtime, starttime, units = 'mins'))) %>% 
                                 mutate(amount = round(amount,2)) %>% 
                                 select(-rate, -rateuom,-orderid,-ordercategoryname,-secondaryordercategoryname,-patientweight, -amountuom,-totalamount)
head(MV_input)

# Preprocess METAVISION IV fluids

### 1. METAVISION IV dose started before interval and ended after interval

In [None]:
# Doses that started before the interval and ended after it: the whole
# interval lies inside the administration window.
total_IV_during <- interval_times_dt %>% left_join(MV_input, allow.cartesian = TRUE) %>% group_by(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time) %>%
                   filter(starttime < interval_start_time & endtime > interval_end_time) 

# Share of the dose delivered inside the interval =
#   minutes in the interval * (amount / duration).
# The interval length is taken in minutes explicitly; a plain POSIXct
# subtraction would report the 4-hour interval as 4 (hours), which is
# inconsistent with the minute-based "before"/"after" cells.
# NOTE(review): assumes MV_input$duration is expressed in minutes, like the
# other proportional cells do. The hand-copied IV quartile cut-offs further
# down were derived from the previous values and should be regenerated.
all_IV_during <- total_IV_during %>% mutate(total_amount_during = as.numeric(difftime(interval_end_time, interval_start_time, units = 'mins')) * (amount/duration))

# One summed amount per interval.
all_IV_during <- all_IV_during %>% select(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time, total_amount_during) %>% 
                                   group_by(subject_id, icustay_id, hadm_id, interval_start_time ,interval_end_time) %>% 
                                   summarise(total_amount = sum(total_amount_during))
nrow(all_IV_during)
head(all_IV_during)

### 2. METAVISION IV dose started before and ended within interval

In [None]:
# IV dose started before interval and ended within interval
total_IV_before <- interval_times_dt %>% left_join(MV_input, allow.cartesian = TRUE) %>% group_by(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time) %>%
                   filter(starttime < interval_start_time & endtime > interval_start_time & endtime <= interval_end_time) 

# Share delivered inside the interval =
#   minutes from interval start to dose end * (amount / duration).
# NOTE(review): only correct if MV_input$duration is in minutes -- confirm.
all_IV_before <- total_IV_before %>% mutate(total_amount_before = as.numeric(difftime(endtime, interval_start_time, units='mins')) * (amount/duration))

# One summed amount per interval.
all_IV_before <- all_IV_before %>% select(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time, total_amount_before) %>% 
                                   group_by(subject_id, icustay_id, hadm_id, interval_start_time ,interval_end_time) %>% 
                                   summarise(total_amount = sum(total_amount_before))

nrow(all_IV_before)
head(all_IV_before)

### 3. METAVISION IV dose started within interval and ended within interval

In [None]:
# IV dose started within the interval and ended within the interval:
# the full amount is attributed to this interval.
# Start is matched on [interval_start_time, interval_end_time) and end on
# (interval_start_time, interval_end_time].
all_IV_within <- interval_times_dt %>% left_join(MV_input, allow.cartesian = TRUE) %>% group_by(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time) %>%
                 filter(starttime >= interval_start_time & starttime < interval_end_time & endtime <= interval_end_time & endtime > interval_start_time) %>%
                 summarise(total_amount = sum(amount))
nrow(all_IV_within)
head(all_IV_within)

### 4. METAVISION IV Dose started within interval and ended after interval

In [None]:
# IV dose started within interval and ended after interval.
# The start bound is inclusive (>=) so that a dose starting exactly at
# interval_start_time and running past interval_end_time is counted; with a
# strict ">" such a dose matched none of the four cases.
total_IV_after <- interval_times_dt %>% left_join(MV_input, allow.cartesian = TRUE) %>% group_by(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time) %>%
                   filter(starttime >= interval_start_time & starttime < interval_end_time & endtime > interval_end_time) 

# Share delivered inside the interval =
#   minutes from dose start to interval end * (amount / duration),
# the same formula as the "before" case (the rate is divided by duration once).
# NOTE(review): only correct if MV_input$duration is in minutes -- confirm.
# The hand-copied IV quartile cut-offs further down were derived from the
# previous values and should be regenerated.
all_IV_after <- total_IV_after %>% mutate(total_amount_after = round(as.numeric(difftime(interval_end_time, starttime, units='mins')) * (amount/duration), 3))

# One summed amount per interval.
all_IV_after <- all_IV_after %>% select(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time, total_amount_after) %>% 
                                 group_by(subject_id, icustay_id, hadm_id, interval_start_time,interval_end_time) %>% 
                                 summarise(total_amount = sum(total_amount_after))
nrow(all_IV_after)

In [None]:
# Distribution check of the per-interval amounts for each of the four cases.
summary(all_IV_before$total_amount)
summary(all_IV_within$total_amount)
summary(all_IV_after$total_amount)
summary(all_IV_during$total_amount)

### 5. Join all METAVISION IV tables

In [None]:
# Stack all four partial IV tables and add them up per interval.
# The tables are stacked with bind_rows() rather than full_join(): a join
# without `by` also matches on total_amount, so two partial tables holding the
# same amount for the same interval would collapse into one row and that
# amount would be counted only once.
total_IV <- bind_rows(all_IV_during, all_IV_before, all_IV_within, all_IV_after) %>% 
                                group_by(subject_id, icustay_id, hadm_id, interval_start_time, interval_end_time) %>%
                                summarise(total_amount = sum(total_amount))
head(total_IV)
nrow(total_IV)
sprintf("Finished at: %st", Sys.time())

## 6. Create QUARTILES actions for the IV fluid dose BASED ON MV ONLY!!!

In [None]:
# Quartile bin per MetaVision interval; the call also prints the cut-offs that
# are copied by hand into the breaks below.
MV_IV_Fluids = num2quartile(total_IV$total_amount)
# Bin counts with the observed maximum vs. Inf as the upper break.
table(as.integer(cut(total_IV$total_amount, breaks = c(-Inf, 0, 39.83359, 204.40254, 635.20516, 24852.55100)))-1 )
table(as.integer(cut(total_IV$total_amount, breaks = c(-Inf, 0, 39.83359, 204.40254, 635.20516, Inf)))-1 )

### MV only MIMIC DATA - IV FLUID QUANTILES --> Perform manual cut on ICV DATA:
         0%         25%         50%         75%        100% 
    0.00000    39.83359   204.40254   635.20516 24852.55100 

# Merge Metavision and CareVue fluids

In [None]:
# Map to discrete IV states
# Stack the CareVue and MetaVision totals. bind_rows() is used instead of a
# full_join() without `by`: such a join also matches on total_amount, so equal
# CareVue and MetaVision amounts for the same interval would collapse into a
# single row and be counted once in the sum below.
temp_IV <- bind_rows(total_IV_cv, total_IV) %>%
  filter(!is.na(total_amount))

# Merge IV fluids with interval_times_df: this re-adds the intervals without any
# fluid record (their total_amount is NA at this point), then sums per interval.
action_df_IV <- temp_IV %>% right_join(interval_times_df) %>% group_by(subject_id, hadm_id, icustay_id, interval_start_time, interval_end_time) %>% summarise(total_amount = sum(total_amount)) 

### add action MIMIC
# Bin 0 is (-Inf, 0]; bins 1-4 use the hand-copied MetaVision quartile cut-offs.
# c(-Inf, 39.83359, 204.40254, 635.20516, Inf)
action_df_IV$discrete_IV <- as.integer(cut(action_df_IV$total_amount, breaks = c(-Inf, 0, 39.83359, 204.40254, 635.20516, Inf)))-1 

# Change NAs (intervals without fluid data) to zeros
action_df_IV$total_amount[is.na(action_df_IV$total_amount)] <- 0
action_df_IV$discrete_IV[is.na(action_df_IV$discrete_IV)] <- 0

# Visual inspection
head(action_df_IV)
nrow(action_df_IV)

# Create final discrete action

In [None]:
# Combine fluid and vasopressor actions per interval
# (natural join on all column names the two tables share).
action_df <- action_df_IV %>% full_join(action_df_VP)
sprintf("Finished at: %st", Sys.time())

In [None]:
# Encode the joint action as discrete_IV + 5 * discrete_VP, then give the
# continuous dose columns their final names (total_IV, max_VP).
action_df <- action_df %>%
  mutate(discrete_action = discrete_IV + 5 * discrete_VP) %>%
  rename(total_IV = total_amount, max_VP = max_amount)
sprintf("Finished at: %st", Sys.time())

In [None]:
# check first: counts per discrete fluid bin and per discrete vasopressor bin
table(action_df_IV$discrete_IV)
table(action_df_VP$discrete_VP)

## Add Fluid running total

In [None]:
# Cumulative IV fluid per ICU stay: rows are put in chronological order and
# total_IV is accumulated within each icustay_id.
action_df <- action_df %>%
  group_by(icustay_id) %>%
  arrange(
    subject_id,
    hadm_id,
    icustay_id,
    interval_start_time,
    interval_end_time
  ) %>%
  mutate(Running_total_IV = cumsum(total_IV))

In [None]:
# Visual check of the final action table columns
names(action_df)

In [None]:
########################
# WRITE DATA
# Persist the action table and the interval table; later parts of this
# notebook read both files back from data_path.
write_csv(action_df, paste0(data_path, 'action_df.csv'))
write_csv(interval_times_df, paste0(data_path, 'interval_times_df.csv'))
sprintf("Finished at: %st", Sys.time())

## Visually inspect final action space

In [None]:
# check to make sure everything is ok: scatter of fluid amount vs. vasopressor
# dose on log10 axes, coloured by the combined discrete action
# NOTE(review): intervals with a 0 amount/dose cannot be placed on a log10
# axis -- confirm that dropping them from this plot is intended.
ggplot(action_df, aes(total_IV, max_VP, color = as.factor(discrete_action))) + 
  geom_point() + scale_y_log10() + scale_x_log10() + xlab('Total IV fluid (mL)') + ylab('Max VP Dose')
sprintf("Finished at: %st", Sys.time())

### END OF PART 1
# Part 2: vitals and labs

In [None]:
########################
### import cohort and time series
# NOTE(review): select(-X1) fails if cohort.csv has no column that readr names
# X1 (presumably an unnamed index column) -- confirm; Part 1 reads the same
# file without dropping it.
cohort_df <- read_csv(paste0(data_path, 'cohort.csv')) %>% select(-X1)
sprintf("Finished at: %st", Sys.time())

# visual check
head(cohort_df)
sprintf("Finished at: %st", Sys.time())

### VITALS

In [None]:
########################
### import vitals data
# The next cell uses the columns icustay_id, charttime, valuenum and vital_id.
vitals_df <- read_csv(paste0(data_path, 'vitals_cohort.csv'))

# visual check
head(vitals_df)
sprintf("Finished at: %st", Sys.time())

In [None]:
# Cast to data.table, keyed on icustay_id
vitals_dt <- as.data.table(vitals_df, key = 'icustay_id')
interval_times_dt <- as.data.table(interval_times_df, key = 'icustay_id')

# Assign vital signs to the corresponding interval times.
# allow.cartesian permits the many-to-many expansion intervals x measurements.
# NOTE(review): presumably the merge joins on the shared key icustay_id --
# confirm against data.table's merge() defaults.
merged_vitals <- interval_times_dt %>% 
                    merge(vitals_dt, allow.cartesian = TRUE)

# take the mean vital sign for each interval time; only measurements charted
# in (interval_start_time, interval_end_time] are used
mean_vitals <- merged_vitals[charttime > interval_start_time & charttime <= interval_end_time,
                                   .(mean_vital = mean(valuenum)), by = .(subject_id, 
                                                                          hadm_id, 
                                                                          icustay_id, 
                                                                          interval_start_time, 
                                                                          interval_end_time, 
                                                                          vital_id)]
# visual check merged vitals
head(mean_vitals)
sprintf("Finished at: %st", Sys.time())

### LABS

In [None]:
########################
# load the labs data and key it on icustay_id
labs_dt <- read_csv(paste0(data_path, 'labs_cohort.csv'))
labs_dt <- as.data.table(labs_dt, key = 'icustay_id')

# visual check
head(labs_dt)
sprintf("Finished at: %st", Sys.time())

# Assign lab value to the corresponding interval times.
# allow.cartesian permits the many-to-many expansion intervals x measurements.
merged_labs <- interval_times_dt %>% 
                    merge(labs_dt, allow.cartesian = TRUE)

# take the mean lab value for each interval time; only measurements charted
# in (interval_start_time, interval_end_time] are used
mean_labs <- merged_labs[charttime > interval_start_time & charttime <= interval_end_time,
                                   .(mean_lab = mean(valuenum)), by = .(subject_id, 
                                                                          hadm_id, 
                                                                          icustay_id, 
                                                                          interval_start_time, 
                                                                          interval_end_time, 
                                                                          lab_id)]
# visual check merged labs
head(mean_labs)
sprintf("Finished at: %st", Sys.time())

### MERGE LABS AND VITALS

In [None]:
########################
### Merge Labs and Vitals

# Give both tables the same column names (meas_id, mean_value)
mean_labs <- mean_labs %>% rename(meas_id = lab_id, mean_value = mean_lab)
mean_vitals <- mean_vitals %>% rename(meas_id = vital_id, mean_value = mean_vital)

# Join tables, turn every measurement id into its own column and re-add the
# intervals that have no measurement at all
mean_labs_vitals <- mean_labs %>% full_join(mean_vitals)
vitals_labs_spread <- mean_labs_vitals %>% 
                        spread(meas_id, mean_value) %>% 
                        right_join(interval_times_dt) 

# Exclude times in which no measurements were made.
# The feature range starts at ALAT (the same ALAT:WBC range the LVCF step in
# Part 3 uses): starting at ALBUMIN left ALAT out of the check, so an interval
# in which only ALAT was measured was flagged as empty and dropped.
vitals_labs_spread_filt <- vitals_labs_spread %>%
                                gather(lab_id, meas_value, ALAT:WBC) %>%
                                group_by(subject_id, hadm_id, icustay_id, interval_start_time, interval_end_time) %>%
                                summarise(exclude = all(is.na(meas_value))) %>%
                                full_join(vitals_labs_spread) %>%
                                filter(!exclude)

# visual check of combined vitals and labs spread file (final file)
head(vitals_labs_spread_filt)
sprintf("Finished at: %st", Sys.time())

In [None]:
# Visual check of the column names of the combined vitals/labs table
print(names(vitals_labs_spread_filt))

In [None]:
########################
### export data to csv (read back in Part 3 as vitals_labs)
write_csv(vitals_labs_spread_filt, paste0(data_path, 'vitals_labs_spread_filt.csv'))
sprintf("Finished at: %st", Sys.time())

### End of part 2

# Create MIMIC_DATA_ALL
### Part 3: Merge vitals and labs with demographics and save full as data_all.csv

In [None]:
####################################################################################
####################################################################################
####################################################################################
### PREPARATION

########################
# set working directory
# NOTE(review): hard-coded, machine-specific path; data_path below is resolved
# against this directory.
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")
#setwd("C:/Users/l.roggeveen/SRL/rl_sepsis/SEPSIS/")
#setwd("D:/ResearchData/rl_sepsis/SEPSIS")

########################
# load libraries
library(tidyverse) # install.packages("tidyverse")
library(data.table)  # install.packages("data.table")

# To know the current storage capacity
# NOTE(review): memory.limit() is a Windows-specific facility -- confirm it is
# still supported on the platform and R version used to run this notebook.
memory.limit()

# To increase the storage capacity (increases storage capacity to >>>7GB)
memory.limit(size=99000)

# Path to directory with data files
data_path <- 'MIMIC_data/'

### IMPORT
# interval_times_df and vitals_labs are the files written by Parts 1 and 2.
interval_times_df <- read_csv(paste0(data_path, 'interval_times_df.csv'))
vitals_labs <- read_csv(paste0(data_path, 'vitals_labs_spread_filt.csv'))
demographics <- read_csv(paste0(data_path, 'demographics_cohort.csv'))
urine_df <- read_csv(paste0(data_path, 'UrineOutput_cohort.csv'))

sprintf("Finished at: %st", Sys.time())

In [None]:
# Perform LVCF (last value carried forward): within each stay, rows are put in
# chronological order and missing feature values are filled downwards.
vitals_labs_lvcf <- vitals_labs %>% group_by(subject_id, hadm_id, icustay_id) %>%
                        arrange(subject_id, hadm_id, icustay_id, interval_start_time) %>%
                        fill(ALAT:WBC) # ALAT:WBC is based on the 7th up to the 40th column, corresponding to all the actual features in this dataframe

# final merge with the demographics (natural join on all shared column names);
# the column literally named "<NA>" is dropped.
# NOTE(review): select(-"<NA>") fails if no such column exists -- presumably
# it stems from measurements with a missing id; confirm.
data_all <- vitals_labs_lvcf %>% full_join(demographics) %>% ungroup() %>% select(-"<NA>")
head(data_all)
print(names(data_all))
sprintf("Finished at: %st", Sys.time())

## Urine output + CUMULATIVE

In [None]:
# load the urine output data and keep plausible volumes only (0 < value < 5000)
urine_dt <- read_csv(paste0(data_path, 'UrineOutput_cohort.csv'))
urine_dt <- as.data.table(urine_dt, key = 'icustay_id') %>% filter(value<5000 & value>0)

# Build the keyed interval table from the interval_times_df imported for this
# part, so the cell also works in a fresh session where the interval_times_dt
# of the earlier parts does not exist.
interval_times_dt <- as.data.table(interval_times_df, key = 'icustay_id')

# Merge with interval times
merged_urine <- interval_times_dt %>% merge(urine_dt, allow.cartesian = TRUE)

# SUM per interval; only urine output charted in
# (interval_start_time, interval_end_time] is counted
total_urine_df <- merged_urine[charttime > interval_start_time & charttime <= interval_end_time,
                         .(total_UP = sum(value)), by = .(subject_id,hadm_id,icustay_id, 
                                                           interval_start_time, 
                                                           interval_end_time)]
# Re-add the intervals without urine output and set their total to 0
total_urine_df <- total_urine_df %>% right_join(interval_times_dt) %>%  group_by(subject_id,hadm_id,icustay_id,interval_start_time, interval_end_time) %>%replace_na(list(total_UP = 0))

# add running total per ICU stay, in chronological order
all_UP <- total_urine_df %>% group_by(subject_id,hadm_id,icustay_id) %>% arrange(subject_id,hadm_id,icustay_id, interval_start_time, interval_end_time) %>% mutate(Running_total_UP = cumsum(total_UP))

# visual check
dim(all_UP)
dim(interval_times_dt)
tail(all_UP)

# cleanup
sprintf("Finished at: %st", Sys.time())

## FIO2

In [None]:
# load the FiO2 data and key it on icustay_id
fio2_dt <- read_csv(paste0(data_path, 'FiO2_cohort.csv'))
fio2_dt <- as.data.table(fio2_dt, key = 'icustay_id')

# Merge with interval times
merged_fio2 <- interval_times_dt %>% merge(fio2_dt, allow.cartesian = TRUE)

# MEAN per interval; only FiO2 values charted in
# (interval_start_time, interval_end_time] are used
total_fio2_df <- merged_fio2[charttime > interval_start_time & charttime <= interval_end_time,
                         .(FiO2 = mean(fio2)), by = .(subject_id,hadm_id,icustay_id, 
                                                           interval_start_time, 
                                                           interval_end_time)]
# Re-add the intervals without a FiO2 measurement (FiO2 is NA there)
total_fio2_df <- total_fio2_df %>% right_join(interval_times_dt) %>%  group_by(subject_id,hadm_id,icustay_id,interval_start_time, interval_end_time)


# Carry the last observed FiO2 forward within each stay, in chronological order
total_fio2_df <- total_fio2_df %>% group_by(subject_id, hadm_id, icustay_id) %>%
                        arrange(subject_id, hadm_id, icustay_id, interval_start_time) %>%
                        fill(FiO2)

# visual check
dim(total_fio2_df)
dim(interval_times_dt)
head(total_fio2_df)

# cleanup
sprintf("Finished at: %st", Sys.time())

## Merge Data_all with Urine output and FIO2

In [None]:
# CREATE FINAL DATASET
# Natural full joins (on all shared column names) of the vitals/labs/demographics
# table with the FiO2 table and the urine output table.
data_final <- data_all %>% full_join(total_fio2_df) %>% full_join(all_UP)
# INSPECT
dim(data_final)
head(data_final)
print(names(data_final))

In [None]:
########################
### export data to csv (read back by the "MERGE ALL DATA" cell as alldata_df)
write_csv(data_final, paste0(data_path, 'data_all.csv'))
sprintf("Finished at: %st", Sys.time())

# MERGE ALL DATA

In [None]:
########################
# set working directory
# NOTE(review): hard-coded, machine-specific path; data_path below is resolved
# against this directory.
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")
#setwd("C:/Users/l.roggeveen/SRL/rl_sepsis/SEPSIS/")
#setwd("D:/ResearchData/rl_sepsis/SEPSIS")

# load libraries
library(tidyverse) # install.packages("tidyverse") # library(devtools) # install_version("rlang", version = "0.2.2", repos = "http://cran.us.r-project.org")

########################
# IMPORT DATA
# action_df.csv and data_all.csv are the files written by the earlier parts.
data_path <- 'MIMIC_data/'
cohort_df <- read_csv(paste0(data_path, 'cohort.csv'))
action_df <- read_csv(paste0(data_path, 'action_df.csv'))
alldata_df <- read_csv(paste0(data_path, 'data_all.csv'))

########################
### Merge all data
# NOTE(review): outcome_df is not referenced again in the visible part of this
# notebook -- confirm whether it is still needed.
outcome_df <- cohort_df %>% group_by(subject_id, 
                                     hadm_id, 
                                     icustay_id)

########################
### Combine vitals and labs (data_all.csv) with actions (action_df.csv) and cohort (cohort.csv: only for discharge[0=alive])
# Both joins are inner joins on all shared column names, so only intervals
# present in every table are kept.
alldata_df_action <- alldata_df %>% inner_join(select(action_df, subject_id, 
                                                      hadm_id, 
                                                      icustay_id, 
                                                      interval_start_time, 
                                                      interval_end_time,
                                                      discrete_action,
                                                      total_IV,
                                                      Running_total_IV,
                                                      max_VP)) %>%
  inner_join(cohort_df)

########################
### Add final reward and zeros for intermediate reward
# One row per stay holding the start time of its LAST interval and the terminal
# reward: -15 when hospital_expire_flag is set, +15 otherwise.
temp_alldata <- alldata_df_action %>% group_by(subject_id, 
                                               hadm_id, 
                                               icustay_id,
                                               hospital_expire_flag) %>%
  summarise(interval_start_time = max(interval_start_time)) %>%
  mutate(reward = ifelse(hospital_expire_flag, -15, 15))

# add "intermediate reward" to dataset (all 'NA' rewards are set to 0):
# the natural join matches only the last interval of each stay, every other
# interval gets reward 0
temp_alldata_action_reward <- alldata_df_action %>% 
  left_join(temp_alldata) %>%
  replace_na(list(reward = 0))

########################
### create final table
# Drop the helper flag and the cohort timestamp columns that are no longer
# needed; select() fails if any of these columns is absent.
final_df_all <- temp_alldata_action_reward %>% select(
  -exclude, 
  -intime, 
  -outtime, 
  -window_start,
  -window_end,
  -suspected_infection_time_poe
)
# visual check
print(names(final_df_all))

# total_IV and max_VP Action shift (lag one state behind current state)

In [None]:
# Previous-interval fluid amount per ICU stay (0 for the first interval).
# The lag is ordered by interval_start_time: ordering by icustay_id is a no-op
# inside an icustay_id group, which made the result depend on the incidental
# row order of the table instead of on time.
final_df_all = final_df_all %>% 
        group_by(icustay_id) %>% 
        mutate(total_IV_prev = lag(total_IV,order_by=interval_start_time)) %>% 
        mutate(total_IV_prev = replace_na(total_IV_prev, 0)) %>%
        ungroup()

# Previous-interval vasopressor dose per ICU stay (0 for the first interval),
# ordered by time for the same reason.
final_df_all = final_df_all %>% 
        group_by(icustay_id) %>%
        mutate(max_VP_prev = lag(max_VP,order_by=interval_start_time)) %>% 
        mutate(max_VP_prev = replace_na(max_VP_prev, 0)) %>%
        ungroup()

# Normalize Feature names and data Units to match ICV
### from Raghu
        binary_fields = ['gender','mechvent','re_admission']
        
        norm_fields= ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
            'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
            'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
            'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
            'PaO2_FiO2','cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']
            
        log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
                      'input_total_tev','input_4hourly_tev','output_total','output_4hourly', 'bloc']

In [None]:
# subset features of interest: drop identifiers and unused score/race columns,
# and expose icustay_id as PatientID.
# select() fails if any of the listed columns is absent.
MIMIC_temp_all <- final_df_all %>% select(-X1, -hadm_id, -subject_id, -race_white, -race_black, -blood_culture_positive, -BANDS,
                                          -race_hispanic, -race_other, -elixhauser_hospital, -lods, -qsofa, -qsofa_gcs_score, -qsofa_sysbp_score, -qsofa_resprate_score) %>% 
                                   rename(PatientID = icustay_id)
print(names(MIMIC_temp_all))

In [None]:
########################
### transform data to ICV units 
# Renames the MIMIC columns to the ICV column names and converts the lab values
# whose units differ. rename(x = x) lines are no-ops kept to document columns
# that need no change. The per-line comments give source unit -> target unit.
MIMIC_all = MIMIC_temp_all %>%  rename(discrete_action = discrete_action) %>%             # MIMIC action   to ICV action   (no conversion needed) 
                                rename(max_VP = max_VP) %>%                               # MIMIC action   to ICV action   (no conversion needed) 
                                rename(total_IV = total_IV) %>%                           # MIMIC action   to ICV action   (no conversion needed) 
                                rename(Discharge = hospital_expire_flag) %>%              # MIMIC 1=death  to ICV 1=death  (no conversion needed) 
                                rename(Reward = reward) %>%                               # MIMIC score    to ICV score    (no conversion needed) 
                                rename(Sirs_score = sirs) %>%                             # MIMIC score    to ICV score    | ICV admission | MIMIC -> sepsis3.sql (onset)
                                rename(Sofa_score = sofa) %>%                             # MIMIC score    to ICV score    | ICV admission | MIMIC -> sepsis3.sql (onset)
                                rename(Weight = weight) %>%                               # MIMIC kg       to ICV kg       | ICV admission | MIMIC -> sepsis3.sql (onset)
                                rename(Ventilator = vent) %>%                             # MIMIC 0/1      to ICV 0/1      | (-2 to +24h)  | MIMIC -> sepsis3.sql (-4 to +24h)
                                rename(Height = height) %>%                               # MIMIC cm       to ICV cm       (no conversion needed)
                                rename(Age = age) %>%                                     # MIMIC years    to ICV years    (no conversion needed)
                                rename(Gender = is_male) %>%                              # MIMIC male=1   to ICV male=1   (no conversion needed)
                                rename(HeartRate = HeartRate) %>%                         # MIMIC /min     to ICV /min     (no conversion needed) 
                                rename(Temp = TempC) %>%                                  # MIMIC Celsius  to ICV Celsius  (no conversion needed) 
                                rename(MAP = MeanBP) %>%                                  # MIMIC mmHg     to ICV mmHg     (no conversion needed) 
                                rename(DIA = DiasBP) %>%                                  # MIMIC mmHg     to ICV mmHg     (no conversion needed) 
                                rename(SYS = SysBP) %>%                                   # MIMIC mmHg     to ICV mmHg     (no conversion needed) 
                                rename(RespRate = RespRate) %>%                           # MIMIC /min     to ICV /min     (no conversion needed) 
                                rename(SpO2 = SpO2) %>%                                   # MIMIC %        to ICV %        (no conversion needed) 
                                rename(Natrium = SODIUM) %>%                              # MIMIC mEq/L    to ICV mmol/L   (no conversion needed) 
                                rename(Chloride = CHLORIDE) %>%                           # MIMIC mEq/L    to ICV mmol/L   (no conversion needed)
                                rename(Kalium = POTASSIUM) %>%                            # MIMIC mEq/L    to ICV mmol/L   (no conversion needed) 
                                rename(Trombo = PLATELET) %>%                             # MIMIC k/ul     to ICV 10^9/L   (no conversion needed) 
                                rename(LEU = WBC) %>%                                     # MIMIC K/uL     to ICV 10e^9/L  (no conversion needed)
                                rename(ANION_GAP = 'ANION GAP') %>%                       # MIMIC mEq/L    to ICV mmol/l   (no conversion needed)
                                rename(APTT = PTT) %>%                                    # MIMIC sec      to ICV sec      (no conversion needed)
                                rename(Art_PH = PH) %>%                                   # MIMIC none     to ICV none     (unitless, no conversion needed)
                                rename(ASAT = ASAT) %>%                                   # MIMIC IU/L     to ICV IE/L     (no conversion needed)
                                rename(ALAT = ALAT) %>%                                   # MIMIC IU/L     to ICV IE/L     (no conversion needed)
                                rename(Bicarbonaat = BICARBONATE) %>%                     # MIMIC mEq/L    to ICV mmol/L   (no conversion needed)
                                rename(Art_BE = BaseExcess) %>%                           # MIMIC mEq/L    to ICV mmol/L   (no conversion needed)
                                rename(Ion_Ca = ION_CALCIUM) %>%                          # MIMIC mmol/l   to ICV mmol/L   (no conversion needed)
                                rename(Lactate = LACTATE) %>%                             # MIMIC ?        to ICV ?        (no conversion needed)
                                rename(PaCO2 = PACO2) %>%                                 # MIMIC mmHg     to ICV mmhg     (ICV SQL file converts Kpa to mmHg values, no further conversion needed)
                                rename(PaO2 = PAO2) %>%                                   # MIMIC mmHg     to ICV mmhg     (ICV SQL file converts Kpa to mmHg values, no further conversion needed)
                                mutate(Shock_Index = HeartRate / SYS) %>%                 # MIMIC score    to ICV score    (calculation of score) 
                                mutate(HB = HEMOGLOBIN * 0.6206) %>%                      # MIMIC g/dL     to ICV mmol/l   (http://www.scymed.com/en/smnxpf/pfxdq210_c.htm)
                                mutate(Bili = BILIRUBIN * (1/0.05847953)) %>%             # MIMIC mg/dl    to ICV umol/L   (http://www.endmemo.com/medical/unitconvert/Bilirubin.php)
                                mutate(Creat = CREATININE * (1/0.01131222)) %>%           # MIMIC mg/dl    to ICV umol/l   (http://www.endmemo.com/medical/unitconvert/Creatinine.php)
                                mutate(INR = PT/12) %>%                                   # MIMIC sec      to ICV INR      (normal_PT~12)
                                mutate(Ureum = BUN * 0.3571) %>%                          # MIMIC mg/dL    to ICV mmol/L   (http://www.scymed.com/en/smnxps/psxff047_c.htm)
                                mutate(Albumine = ALBUMIN * 10) %>%                       # MIMIC g/dL     to ICV g/L      (simple conversion)
                                mutate(Magnesium = MAGNESIUM * 0.411) %>%                 # MIMIC mg/dl    to ICV mmol/l   (http://www.endmemo.com/medical/unitconvert/Magnesium.php)
                                mutate(Calcium = CALCIUM * 0.25) %>%                      # MIMIC mg/dl    to ICV mmol/L   (http://www.endmemo.com/medical/unitconvert/Calcium.php)
                                # NOTE(review): PF_ratio depends on the unit of FiO2 (fraction vs. percent) -- confirm
                                mutate(PF_ratio = PaO2 / FiO2) %>%
                                # Two glucose columns exist (GLUCOSE and Glucose); their row-wise mean,
                                # ignoring NA, is converted below. When both are NA the mean is NaN.
                                rename(glucose = Glucose) %>% 
                                       mutate(glutemp = rowMeans(data.frame(GLUCOSE, glucose),na.rm = TRUE)) %>% 
                                       mutate(Glucose = glutemp * 0.0555) %>%             # MIMIC mg/dL    to ICV mmol/l   (http://www.endmemo.com/medical/unitconvert/Glucose.php)
                                select(-ALBUMIN, -CALCIUM, - CREATININE, -glucose, 
                                       -GLUCOSE, -glutemp, -BUN, -MAGNESIUM, 
                                       -HEMOGLOBIN, -BILIRUBIN,- PT)                           # remove old features

# sort columns: keep the first three columns in place, order the rest by name
MIMIC_all = MIMIC_all[,c(names(MIMIC_all)[1:3],sort(colnames(MIMIC_all[4:ncol(MIMIC_all)] )))]
print(names(MIMIC_all))
head(MIMIC_all)

In [None]:
########################
### export data to csv
# Written to 'data/' (relative to the working directory), not to data_path.
final_data_path = 'data/'
write_csv(MIMIC_all, paste0(final_data_path, 'MIMIC_data.csv'))

sprintf("Finished at: %st", Sys.time())

# Data preprocessing TO DO

    !!! MISSING: re_admission?
    !!! MISSING: GCS values (a mess in the ICV dataset, EXCLUDE FOR NOW)
    !!! MISSING: mechvent as a binary measurement over time (daily or for each 4h timestep?) instead of singular binary feature for only first 24hours
    !!! MISSING: SOFA scores over time (currently only admission Sofa score)
    !!! MISSING: fluid balance (currently only IV fluids, should we also include gastro-intestinal intake?!!?)

## Merge with original sepsis_no_exclusion to add: datasource

In [None]:
##########
### PREPARATION
# NOTE(review): data_path is not used in this cell; the reads below use
# MIMIC_data_path and a literal 'data/' path.
data_path <- 'data/'

# set working directory
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")
#setwd("C:/Users/l.roggeveen/SRL/rl_sepsis/SEPSIS/")
#setwd("D:/ResearchData/rl_sepsis/SEPSIS")

# load libraries
library(tidyverse) # install.packages("tidyverse") # library(devtools) # install_version("rlang", version = "0.2.2", repos = "http://cran.us.r-project.org")

########################
# IMPORT DATA
MIMIC_data_path <- 'MIMIC_data/'
# Only the stay id (renamed to PatientID) and its dbsource are kept.
sepsis_df <- read_csv(paste0(MIMIC_data_path, 'sepsis3-df-no-exclusions.csv')) %>% rename(PatientID = icustay_id) %>% select(PatientID, dbsource)
MIMIC_data <- read_csv('data/MIMIC_data.csv')
names(MIMIC_data)

########################
### Combine data with admission source (natural join; PatientID is the only
### column sepsis_df shares besides dbsource itself)
MIMIC_all <- MIMIC_data %>% left_join(sepsis_df)
summary(as.factor(MIMIC_all$dbsource))

# SUBSET dbsource = METAVISION

In [None]:
# Keep only the rows whose dbsource is 'metavision'; the dbsource column is
# dropped afterwards because it is constant within the subset.
MIMIC_MV <- MIMIC_all %>%
  filter(dbsource == "metavision") %>%
  select(-dbsource)

# Visual check
head(MIMIC_MV)
dim(MIMIC_MV)

########################
### Write the MetaVision subset to <final_data_path>/MIMIC_MVdata.csv
final_data_path <- "data/"
write_csv(
  MIMIC_MV,
  paste0(final_data_path, "MIMIC_MVdata.csv")
)
sprintf("Finished at: %st", Sys.time())

In [None]:
## Required packages
library(ggplot2)
library(RColorBrewer)
library(colorRamps)


## Expand the ColorBrewer palettes to one colour per discrete action.
## brewer.pal() supports at most 8 colours for "Set2" and 11 for "Spectral";
## asking for more only raises a warning and falls back to that maximum, so
## the maximum is requested directly (same colours, no warning).
colourCount <- 21 # number of actions
getPalette <- colorRampPalette(brewer.pal(8, "Set2"))
long_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(colourCount)

# Scatter plot of total_IV (x) against max_VP (y), coloured by
# discrete_action; both axes are log10-scaled and the legend is hidden.
ggplot(MIMIC_MV, aes(total_IV, max_VP, color = as.factor(discrete_action))) +
  geom_point() +
  scale_color_manual(values = long_colors) +
  scale_y_log10(limits = c(0.01, 10)) +
  scale_x_log10(limits = c(0.01, 10000), labels = scales::comma) +
  ggtitle("Fluids (IV) vs Vasopressor dose Quartile bins") +
  theme(legend.position = "none") +
  labs(
    subtitle = "MIMIC Metavision dataset",
    y = "Max VP Dose (ugram/min)",
    x = "Total IV fluid (mL)",
    caption = "X and Y Axis on log10 scale"
  )

# SUBSET SOFA>3

In [None]:
########################
# Working directory for this section (alternative machines kept commented out)
setwd("/Users/luca/Projects/rl_sepsis/SEPSIS")
#setwd("C:/Users/l.roggeveen/SRL/rl_sepsis/SEPSIS/")
#setwd("D:/ResearchData/rl_sepsis/SEPSIS")

# load libraries
library(tidyverse) # install.packages("tidyverse") # library(devtools) # install_version("rlang", version = "0.2.2", repos = "http://cran.us.r-project.org")

########################
# IMPORT DATA
data_path <- "MIMIC_data/"

# PatientID -> dbsource lookup (icustay_id renamed to PatientID)
sepsis_df <- read_csv(paste0(data_path, "sepsis3-df-no-exclusions.csv")) %>%
  rename(PatientID = icustay_id) %>%
  select(PatientID, dbsource)

# MIMIC_MV is reused from the session; uncomment to reload it from disk.
#MIMIC_MV <- read_csv('data/MIMIC_MVdata.csv')
#MIMIC_MV <- read_csv('data/MIMIC_MVdata.csv')

In [None]:
# IV fluid filter: per patient, take Running_total_IV at the latest
# interval_start_time and keep patients whose value lies strictly between
# 100 and 40000.
sick_MIMIC <- MIMIC_MV %>%
  group_by(PatientID) %>%
  filter(interval_start_time == max(interval_start_time)) %>%
  summarise(END_total_IV = max(Running_total_IV)) %>%
  filter(END_total_IV > 100, END_total_IV < 40000)

# SOFA filter: keep patients whose mean Sofa_score (NAs ignored) exceeds 3.
sofa_MIMIC <- MIMIC_MV %>%
  group_by(PatientID) %>%
  summarise(SOFA = mean(Sofa_score, na.rm = TRUE)) %>%
  filter(SOFA > 3)

# Combine filters: only the SOFA filter is currently active; the join with
# the IV fluid filter is kept commented out.
filt_MIMIC <- sofa_MIMIC # sick_MIMIC %>% inner_join(sofa_MIMIC) %>% group_by(PatientID)

# Inspect: patients before filtering, patients after, and a summary
length(unique(MIMIC_MV$PatientID))
nrow(filt_MIMIC)
summary(filt_MIMIC)

# Restrict the MetaVision data to the patients that passed the filter
filt_MIMIC_dataset <- MIMIC_MV %>%
  filter(PatientID %in% filt_MIMIC$PatientID)

########################
### Write the filtered subset to <final_data_path>/MIMIC_MVfilt_data.csv
final_data_path <- "data/"
write_csv(
  filt_MIMIC_dataset,
  paste0(final_data_path, "MIMIC_MVfilt_data.csv")
)

sprintf("Finished at: %st", Sys.time())

In [None]:
## Required packages
library(ggplot2)
library(RColorBrewer)
library(colorRamps)


## Expand the ColorBrewer palettes to one colour per discrete action.
## brewer.pal() supports at most 8 colours for "Set2" and 11 for "Spectral";
## asking for more only raises a warning and falls back to that maximum, so
## the maximum is requested directly (same colours, no warning).
colourCount <- 21 # number of actions
getPalette <- colorRampPalette(brewer.pal(8, "Set2"))
long_colors <- colorRampPalette(brewer.pal(11, "Spectral"))(colourCount)

# Scatter plot of total_IV (x) against max_VP (y) for the filtered subset,
# coloured by discrete_action; both axes are log10-scaled, legend hidden.
ggplot(filt_MIMIC_dataset, aes(total_IV, max_VP, color = as.factor(discrete_action))) +
  geom_point() +
  scale_color_manual(values = long_colors) +
  scale_y_log10(limits = c(0.01, 10)) +
  scale_x_log10(limits = c(0.01, 10000), labels = scales::comma) +
  ggtitle("Fluids (IV) vs Vasopressor dose Quartile bins") +
  theme(legend.position = "none") +
  labs(
    subtitle = "MIMIC Metavision SOFA>3 dataset",
    y = "Max VP Dose (ugram/min)",
    x = "Total IV fluid (mL)",
    caption = "X and Y Axis on log10 scale"
  )

## Reproducibility is important

In [None]:
# Record the R version, platform and attached/loaded package versions used
# for this run.
sessionInfo()