In [1]:
import sqlite3
import pandas as pd 

In [2]:
##############################################################################
####################### CONNECT TO MIMIC #####################################
connection = sqlite3.connect("C:\\Users\\Maria\\Desktop\\Projects Data Scripts\\MIMIC\\data\\mimic3.db")

#We can verify we successfully created our connection object by running:
print(connection.total_changes)
# Be sure to close the connection
#con.close()

# Create our test query
test_query = """
SELECT subject_id, hadm_id, admittime, dischtime, admission_type, diagnosis
FROM admissions
"""

# Run the query and assign the results to a variable
test = pd.read_sql_query(test_query,connection)

print("TEST:", test.head())
##############################################################################
##################### QUERYING, MORTALITY COHORT #############################


###positive cohort

query = """

WITH icu_patients AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, pat.DOB, pat.gender, adm.admittime 
, (JulianDay(OUTTIME) - JulianDay(INTIME)) as icu_length_of_stay
, (JulianDay(icu.INTIME) - JulianDay(pat.DOB))/ 364.242 as age
, adm.hospital_expire_flag as mortality 


FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN admissions adm
    ON adm.subject_id = icu.subject_id
    AND adm.hadm_id = icu.hadm_id
)

SELECT * 
FROM icu_patients icu_cohort
WHERE icu_cohort.mortality == 1 AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1

"""


positive_cohort = pd.read_sql_query(query, connection)

0
TEST:    SUBJECT_ID  HADM_ID            ADMITTIME            DISCHTIME  \
0          22   165315  2196-04-09 12:26:00  2196-04-10 15:54:00   
1          23   152223  2153-09-03 07:15:00  2153-09-08 19:10:00   
2          23   124321  2157-10-18 19:34:00  2157-10-25 14:00:00   
3          24   161859  2139-06-06 16:14:00  2139-06-09 12:48:00   
4          25   129635  2160-11-02 02:06:00  2160-11-05 14:55:00   

  ADMISSION_TYPE                                          DIAGNOSIS  
0      EMERGENCY                            BENZODIAZEPINE OVERDOSE  
1       ELECTIVE  CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...  
2      EMERGENCY                                         BRAIN MASS  
3      EMERGENCY                     INTERIOR MYOCARDIAL INFARCTION  
4      EMERGENCY                            ACUTE CORONARY SYNDROME  


In [10]:

positive_cohort = positive_cohort.copy() #keeping the original icu patients
positive_cohort.drop_duplicates()

#the dataset contains dupicate values, so i will drop them 
positive_cohort = positive_cohort.drop_duplicates(keep='first')

In [11]:
positive_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
0,31,128652,254478,2036-05-17 00:00:00,M,2108-08-22 23:27:00,7.937940,72.465499,1
1,56,181711,275642,1804-01-02 00:00:00,F,2104-01-02 02:01:00,1.849190,300.824960,1
2,61,189535,217135,2063-10-21 00:00:00,M,2119-01-04 18:12:00,2.009144,55.401808,1
3,91,121205,256972,2095-09-26 00:00:00,F,2177-04-23 00:08:00,3.475000,81.836145,1
4,101,175533,233111,2114-07-22 00:00:00,M,2196-09-26 18:36:00,9.891852,82.411628,1
...,...,...,...,...,...,...,...,...,...
5356,98669,108710,224242,2072-07-04 00:00:00,F,2147-04-07 15:09:00,6.837303,74.962888,1
5357,98753,185764,207506,2094-05-01 00:00:00,F,2172-02-11 21:26:00,4.246400,78.021019,1
5358,98753,185764,233107,2094-05-01 00:00:00,F,2172-02-11 21:26:00,2.740428,78.035799,1
5359,98768,127022,213468,2027-07-26 00:00:00,F,2112-09-15 09:45:00,2.151123,85.375677,1


In [12]:
positive_cohort.hadm_id.value_counts()

175448    5
154357    5
131118    5
153926    4
192825    4
         ..
121734    1
193415    1
127805    1
105354    1
114201    1
Name: hadm_id, Length: 4774, dtype: int64

In [13]:
positive_cohort[positive_cohort["hadm_id"] == 175448]


Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
3059,27427,175448,203143,2075-01-01 00:00:00,M,2150-11-03 22:05:00,50.860046,76.199964,1
3060,27427,175448,207593,2075-01-01 00:00:00,M,2150-11-03 22:05:00,2.49581,76.153348,1
3061,27427,175448,245197,2075-01-01 00:00:00,M,2150-11-03 22:05:00,22.85375,76.048125,1
3062,27427,175448,282192,2075-01-01 00:00:00,M,2150-11-03 22:05:00,3.115451,76.119077,1
3063,27427,175448,295239,2075-01-01 00:00:00,M,2150-11-03 22:05:00,1.814514,76.143537,1


In [14]:
positive_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
0,31,128652,254478,2036-05-17 00:00:00,M,2108-08-22 23:27:00,7.937940,72.465499,1
1,56,181711,275642,1804-01-02 00:00:00,F,2104-01-02 02:01:00,1.849190,300.824960,1
2,61,189535,217135,2063-10-21 00:00:00,M,2119-01-04 18:12:00,2.009144,55.401808,1
3,91,121205,256972,2095-09-26 00:00:00,F,2177-04-23 00:08:00,3.475000,81.836145,1
4,101,175533,233111,2114-07-22 00:00:00,M,2196-09-26 18:36:00,9.891852,82.411628,1
...,...,...,...,...,...,...,...,...,...
5356,98669,108710,224242,2072-07-04 00:00:00,F,2147-04-07 15:09:00,6.837303,74.962888,1
5357,98753,185764,207506,2094-05-01 00:00:00,F,2172-02-11 21:26:00,4.246400,78.021019,1
5358,98753,185764,233107,2094-05-01 00:00:00,F,2172-02-11 21:26:00,2.740428,78.035799,1
5359,98768,127022,213468,2027-07-26 00:00:00,F,2112-09-15 09:45:00,2.151123,85.375677,1


## Negative Cohort

In [17]:

###negative cohort

query = """

WITH icu_patients AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, pat.DOB, pat.gender, adm.admittime 
, (JulianDay(OUTTIME) - JulianDay(INTIME)) as icu_length_of_stay
, (JulianDay(icu.INTIME) - JulianDay(pat.DOB))/ 364.242 as age
, adm.hospital_expire_flag 


FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN admissions adm
    ON adm.subject_id = icu.subject_id
    AND adm.hadm_id = icu.hadm_id
)

SELECT icu_cohort.subject_id, icu_cohort.hadm_id, icu_cohort.icustay_id, icu_cohort.DOB, icu_cohort.gender, icu_cohort.admittime,  icu_cohort.icu_length_of_stay, icu_cohort.age, icu_cohort.hospital_expire_flag as mortality
FROM icu_patients icu_cohort
WHERE icu_cohort.subject_id NOT IN(
    SELECT icu_cohort.subject_id
    FROM icu_patients icu_cohort
    WHERE icu_cohort.hospital_expire_flag == 1 AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1
)
AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1
ORDER BY icu_cohort.subject_id
"""


negative_cohort = pd.read_sql_query(query, connection)

In [18]:
negative_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
0,3,145834,211552,2025-04-11 00:00:00,M,2101-10-20 19:08:00,6.064560,76.736891,0
1,4,185777,294638,2143-05-12 00:00:00,F,2191-03-16 00:28:00,1.678472,47.976402,0
2,6,107064,228232,2109-06-21 00:00:00,F,2175-05-30 07:15:00,3.672917,66.123337,0
3,11,194540,229441,2128-02-22 00:00:00,F,2178-04-16 06:18:00,1.584410,50.285974,0
4,13,143045,263738,2127-02-27 00:00:00,F,2167-01-08 18:43:00,3.666042,39.975568,0
...,...,...,...,...,...,...,...,...,...
37807,99985,176670,279638,2127-04-08 00:00:00,M,2181-01-27 02:47:00,11.299838,53.962013,0
37808,99991,151118,226241,2137-04-07 00:00:00,M,2184-12-24 08:30:00,3.142616,47.860296,0
37809,99992,197084,242052,2078-10-17 00:00:00,F,2144-07-25 18:03:00,1.974456,65.952727,0
37810,99995,137810,229633,2058-05-29 00:00:00,F,2147-02-08 08:00:00,2.161481,88.942459,0


### Preprocessing Negagive (to mortality) ICU patients (Keep the oldest date for admission)

In [46]:
# exclude patients < 18 years old
patients = negative_cohort.copy() #keeping the original icu patients
patients.drop_duplicates()

#patients = patients[patients["exclusion_age"]==0] #keep patients older than 18

#the dataset contains dupicate values, so i will drop them 
patients = patients.drop_duplicates(keep='first')


In [47]:
patients[patients["subject_id"] == 18108]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
12594,18108,163281,240938,2074-04-05 00:00:00,F,2122-02-24 14:23:00,8.816134,48.039858,0
12595,18108,163281,200137,2074-04-05 00:00:00,F,2122-02-24 14:23:00,3.345822,48.084474,0
12596,18108,163281,295394,2074-04-05 00:00:00,F,2122-02-24 14:23:00,1.495498,48.10037,0
12597,18108,163281,240543,2074-04-05 00:00:00,F,2122-02-24 14:23:00,2.681991,48.124844,0
12598,18108,163281,277556,2074-04-05 00:00:00,F,2122-02-24 14:23:00,5.083623,48.138267,0
12599,18108,163281,277859,2074-04-05 00:00:00,F,2122-02-24 14:23:00,6.145567,48.161997,0
12600,18108,163281,260163,2074-04-05 00:00:00,F,2122-02-24 14:23:00,13.260023,48.183711,0


In [48]:
patients.subject_id.value_counts().sort_values

<bound method Series.sort_values of 13033    36
11861    32
11318    21
19213    21
7809     20
         ..
40728     1
69407     1
89180     1
67741     1
2047      1
Name: subject_id, Length: 28790, dtype: int64>

In [49]:
#check if we actually dropped duplicates
patients[patients["subject_id"]==11861]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
8301,11861,142176,250513,2105-05-05 00:00:00,F,2125-07-10 04:01:00,1.486354,20.237007,0
8302,11861,108945,205880,2105-05-05 00:00:00,F,2125-08-07 17:36:00,1.864549,20.315434,0
8303,11861,172888,249009,2105-05-05 00:00:00,F,2125-09-09 14:16:00,1.095775,20.405653,0
8304,11861,110532,270433,2105-05-05 00:00:00,F,2125-09-22 15:11:00,2.033472,20.441448,0
8305,11861,102760,217247,2105-05-05 00:00:00,F,2125-10-05 16:29:00,2.127894,20.477286,0
8306,11861,131344,237405,2105-05-05 00:00:00,F,2125-10-11 15:13:00,2.643449,20.493616,0
8307,11861,123134,223323,2105-05-05 00:00:00,F,2126-02-27 20:39:00,1.855035,20.875851,0
8308,11861,141109,238230,2105-05-05 00:00:00,F,2126-03-28 06:19:00,1.333299,20.953827,0
8309,11861,173048,263259,2105-05-05 00:00:00,F,2126-04-09 16:55:00,3.150532,20.987986,0
8310,11861,188478,229368,2105-05-05 00:00:00,F,2126-06-15 20:50:00,3.141852,21.172378,0


In [50]:
#i extracted patients with multiple hadm_id, so we need to keep only the first hadm_id for each patient. and for each hadm_id all icu_Stays_id (i need them for labs and stuff)


#count how  many visits in the icu each patient has 
counts = patients.groupby(patients.subject_id).nunique()["hadm_id"]

# index of patients with more than one hadm_id
# select multi visit patients from patients
idx_pat_multi_hadmid = counts[counts.values>1].index
multi_hadmid_patients = patients[patients.subject_id.isin(list(idx_pat_multi_hadmid))]

In [51]:
#extract the one with one unique hadm_id for later concat of negative cohort
uni_hadmid_patients = patients[~patients.subject_id.isin(list(idx_pat_multi_hadmid))]

In [52]:
uni_hadmid_patients.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
0,3,145834,211552,2025-04-11 00:00:00,M,2101-10-20 19:08:00,6.06456,76.736891,0
1,4,185777,294638,2143-05-12 00:00:00,F,2191-03-16 00:28:00,1.678472,47.976402,0
2,6,107064,228232,2109-06-21 00:00:00,F,2175-05-30 07:15:00,3.672917,66.123337,0
3,11,194540,229441,2128-02-22 00:00:00,F,2178-04-16 06:18:00,1.58441,50.285974,0
4,13,143045,263738,2127-02-27 00:00:00,F,2167-01-08 18:43:00,3.666042,39.975568,0


In [53]:
multi_hadmid_patients.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.32731,0
11,23,124321,234044,2082-07-17 00:00:00,M,2157-10-18 19:34:00,1.186227,75.470392,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
21,36,122659,211200,2061-08-17 00:00:00,M,2131-05-12 19:49:00,6.859549,69.936941,0
22,36,165660,241249,2061-08-17 00:00:00,M,2134-05-10 11:30:00,4.348299,72.933068,0


In [54]:
#just checking
multi_hadmid_patients[multi_hadmid_patients['subject_id'] == 11861]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
8301,11861,142176,250513,2105-05-05 00:00:00,F,2125-07-10 04:01:00,1.486354,20.237007,0
8302,11861,108945,205880,2105-05-05 00:00:00,F,2125-08-07 17:36:00,1.864549,20.315434,0
8303,11861,172888,249009,2105-05-05 00:00:00,F,2125-09-09 14:16:00,1.095775,20.405653,0
8304,11861,110532,270433,2105-05-05 00:00:00,F,2125-09-22 15:11:00,2.033472,20.441448,0
8305,11861,102760,217247,2105-05-05 00:00:00,F,2125-10-05 16:29:00,2.127894,20.477286,0
8306,11861,131344,237405,2105-05-05 00:00:00,F,2125-10-11 15:13:00,2.643449,20.493616,0
8307,11861,123134,223323,2105-05-05 00:00:00,F,2126-02-27 20:39:00,1.855035,20.875851,0
8308,11861,141109,238230,2105-05-05 00:00:00,F,2126-03-28 06:19:00,1.333299,20.953827,0
8309,11861,173048,263259,2105-05-05 00:00:00,F,2126-04-09 16:55:00,3.150532,20.987986,0
8310,11861,188478,229368,2105-05-05 00:00:00,F,2126-06-15 20:50:00,3.141852,21.172378,0


In [55]:
#convert to datetime
multi_hadmid_patients.admittime = pd.to_datetime(multi_hadmid_patients.admittime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [56]:
multi_hadmid_patients

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0
11,23,124321,234044,2082-07-17 00:00:00,M,2157-10-18 19:34:00,1.186227,75.470392,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
21,36,122659,211200,2061-08-17 00:00:00,M,2131-05-12 19:49:00,6.859549,69.936941,0
22,36,165660,241249,2061-08-17 00:00:00,M,2134-05-10 11:30:00,4.348299,72.933068,0
...,...,...,...,...,...,...,...,...,...
37783,99883,150755,276467,2058-02-04 00:00:00,M,2131-12-24 17:41:00,2.030938,74.087387,0
37784,99883,198523,293088,2058-02-04 00:00:00,M,2132-09-15 00:36:00,1.275567,74.819541,0
37803,99982,151454,221194,2091-10-02 00:00:00,M,2156-11-28 11:56:00,7.949259,65.337051,0
37804,99982,112748,275958,2091-10-02 00:00:00,M,2157-01-05 17:27:00,4.942708,65.444599,0


In [57]:
#for the people that have multiple hadm_id, get the idx of the hadm_id with the min date
indices = multi_hadmid_patients.groupby("subject_id")['admittime'].idxmin()

In [58]:
indices

subject_id
23          10
36          20
68          40
85          51
94          54
         ...  
99660    37736
99712    37746
99781    37757
99883    37783
99982    37803
Name: admittime, Length: 4393, dtype: int64

In [59]:
#get the df where the indices are in indices
sub_cohort_multi = multi_hadmid_patients.loc[indices]

In [60]:
#checking
sub_cohort_multi[sub_cohort_multi["subject_id"] == 11861]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
8301,11861,142176,250513,2105-05-05 00:00:00,F,2125-07-10 04:01:00,1.486354,20.237007,0


In [61]:
sub_cohort_multi

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
40,68,170467,294232,2132-02-29 00:00:00,F,2173-12-15 16:16:00,3.536806,41.912036,0
51,85,116630,209562,2090-09-18 00:00:00,M,2162-03-02 14:04:00,2.235324,71.651661,0
54,94,183686,229012,2101-09-20 00:00:00,M,2176-02-25 16:49:00,1.098542,74.639117,0
...,...,...,...,...,...,...,...,...,...
37736,99660,168541,259405,2067-02-23 00:00:00,F,2142-04-25 12:04:00,1.203715,75.374350,0
37746,99712,194361,285471,2106-11-20 00:00:00,F,2159-03-30 07:15:00,1.245995,52.502051,0
37757,99781,167791,239830,2061-06-30 00:00:00,M,2133-07-23 17:11:00,5.074236,72.261618,0
37783,99883,150755,276467,2058-02-04 00:00:00,M,2131-12-24 17:41:00,2.030938,74.087387,0


In [62]:
#concating the negative cohort


negative_cohort = pd.concat([sub_cohort_multi, uni_hadmid_patients])

In [63]:
negative_cohort.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.32731,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
40,68,170467,294232,2132-02-29 00:00:00,F,2173-12-15 16:16:00,3.536806,41.912036,0
51,85,116630,209562,2090-09-18 00:00:00,M,2162-03-02 14:04:00,2.235324,71.651661,0
54,94,183686,229012,2101-09-20 00:00:00,M,2176-02-25 16:49:00,1.098542,74.639117,0


In [64]:
negative_cohort[negative_cohort['hadm_id'] == 163281]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
12594,18108,163281,240938,2074-04-05 00:00:00,F,2122-02-24 14:23:00,8.816134,48.039858,0
12595,18108,163281,200137,2074-04-05 00:00:00,F,2122-02-24 14:23:00,3.345822,48.084474,0
12596,18108,163281,295394,2074-04-05 00:00:00,F,2122-02-24 14:23:00,1.495498,48.10037,0
12597,18108,163281,240543,2074-04-05 00:00:00,F,2122-02-24 14:23:00,2.681991,48.124844,0
12598,18108,163281,277556,2074-04-05 00:00:00,F,2122-02-24 14:23:00,5.083623,48.138267,0
12599,18108,163281,277859,2074-04-05 00:00:00,F,2122-02-24 14:23:00,6.145567,48.161997,0
12600,18108,163281,260163,2074-04-05 00:00:00,F,2122-02-24 14:23:00,13.260023,48.183711,0


### Concat final cohort with negative and positive

In [65]:
negative_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
40,68,170467,294232,2132-02-29 00:00:00,F,2173-12-15 16:16:00,3.536806,41.912036,0
51,85,116630,209562,2090-09-18 00:00:00,M,2162-03-02 14:04:00,2.235324,71.651661,0
54,94,183686,229012,2101-09-20 00:00:00,M,2176-02-25 16:49:00,1.098542,74.639117,0
...,...,...,...,...,...,...,...,...,...
37807,99985,176670,279638,2127-04-08 00:00:00,M,2181-01-27 02:47:00,11.299838,53.962013,0
37808,99991,151118,226241,2137-04-07 00:00:00,M,2184-12-24 08:30:00,3.142616,47.860296,0
37809,99992,197084,242052,2078-10-17 00:00:00,F,2144-07-25 18:03:00,1.974456,65.952727,0
37810,99995,137810,229633,2058-05-29 00:00:00,F,2147-02-08 08:00:00,2.161481,88.942459,0


In [66]:
negative_cohort.hadm_id.value_counts()

163281    7
123178    6
155295    4
125400    4
195641    4
         ..
137032    1
151363    1
149314    1
175935    1
141307    1
Name: hadm_id, Length: 28790, dtype: int64

In [67]:
positive_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
0,31,128652,254478,2036-05-17 00:00:00,M,2108-08-22 23:27:00,7.937940,72.465499,1
1,56,181711,275642,1804-01-02 00:00:00,F,2104-01-02 02:01:00,1.849190,300.824960,1
2,61,189535,217135,2063-10-21 00:00:00,M,2119-01-04 18:12:00,2.009144,55.401808,1
3,91,121205,256972,2095-09-26 00:00:00,F,2177-04-23 00:08:00,3.475000,81.836145,1
4,101,175533,233111,2114-07-22 00:00:00,M,2196-09-26 18:36:00,9.891852,82.411628,1
...,...,...,...,...,...,...,...,...,...
5356,98669,108710,224242,2072-07-04 00:00:00,F,2147-04-07 15:09:00,6.837303,74.962888,1
5357,98753,185764,207506,2094-05-01 00:00:00,F,2172-02-11 21:26:00,4.246400,78.021019,1
5358,98753,185764,233107,2094-05-01 00:00:00,F,2172-02-11 21:26:00,2.740428,78.035799,1
5359,98768,127022,213468,2027-07-26 00:00:00,F,2112-09-15 09:45:00,2.151123,85.375677,1


In [68]:
patient_cohort = pd.concat([negative_cohort, positive_cohort])

In [69]:
patient_cohort[patient_cohort["subject_id"] == 27427]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
3059,27427,175448,203143,2075-01-01 00:00:00,M,2150-11-03 22:05:00,50.860046,76.199964,1
3060,27427,175448,207593,2075-01-01 00:00:00,M,2150-11-03 22:05:00,2.49581,76.153348,1
3061,27427,175448,245197,2075-01-01 00:00:00,M,2150-11-03 22:05:00,22.85375,76.048125,1
3062,27427,175448,282192,2075-01-01 00:00:00,M,2150-11-03 22:05:00,3.115451,76.119077,1
3063,27427,175448,295239,2075-01-01 00:00:00,M,2150-11-03 22:05:00,1.814514,76.143537,1


In [70]:
patient_cohort[patient_cohort["hadm_id"] == 178137]

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
7911,11318,178137,241703,2052-02-14 00:00:00,F,2119-09-21 13:38:00,3.15625,67.786166,0


In [71]:
patient_cohort

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
40,68,170467,294232,2132-02-29 00:00:00,F,2173-12-15 16:16:00,3.536806,41.912036,0
51,85,116630,209562,2090-09-18 00:00:00,M,2162-03-02 14:04:00,2.235324,71.651661,0
54,94,183686,229012,2101-09-20 00:00:00,M,2176-02-25 16:49:00,1.098542,74.639117,0
...,...,...,...,...,...,...,...,...,...
5356,98669,108710,224242,2072-07-04 00:00:00,F,2147-04-07 15:09:00,6.837303,74.962888,1
5357,98753,185764,207506,2094-05-01 00:00:00,F,2172-02-11 21:26:00,4.246400,78.021019,1
5358,98753,185764,233107,2094-05-01 00:00:00,F,2172-02-11 21:26:00,2.740428,78.035799,1
5359,98768,127022,213468,2027-07-26 00:00:00,F,2112-09-15 09:45:00,2.151123,85.375677,1


## Extracting vitals

In [86]:
query = """
WITH ce as
(
    select ce.icustay_id, ce.charttime
        , (case when itemid in (211,220045) and valuenum > 0 and valuenum < 300 then valuenum else null end) as HeartRate
        , (case when itemid in (51,442,455,6701,220179,220050) and valuenum > 0 and valuenum < 400 then valuenum else null end) as SysBP
        , (case when itemid in (8368,8440,8441,8555,220180,220051) and valuenum > 0 and valuenum < 300 then valuenum else null end) as DiasBP
        , (case when itemid in (456,52,6702,443,220052,220181,225312) and valuenum > 0 and valuenum < 300 then valuenum else null end) as MeanBP
        , (case when itemid in (615,618,220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as RespRate
        , (case when itemid in (223761,678) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
               when itemid in (223762,676) and valuenum > 10 and valuenum < 50  then valuenum else null end) as TempC
        , (case when itemid in (646,220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as SpO2
        , (case when itemid in (807,811,1529,3745,3744,225664,220621,226537) and valuenum > 0 then valuenum else null end) as Glucose
    from chartevents ce
    
    WHERE ce.itemid in
    (
    -- HEART RATE
    211, --"Heart Rate"
    220045, --"Heart Rate"

    -- Systolic/diastolic

    51, --	Arterial BP [Systolic]
    442, --	Manual BP [Systolic]
    455, --	NBP [Systolic]
    6701, --	Arterial BP #2 [Systolic]
    220179, --	Non Invasive Blood Pressure systolic
    220050, --	Arterial Blood Pressure systolic

    8368, --	Arterial BP [Diastolic]
    8440, --	Manual BP [Diastolic]
    8441, --	NBP [Diastolic]
    8555, --	Arterial BP #2 [Diastolic]
    220180, --	Non Invasive Blood Pressure diastolic
    220051, --	Arterial Blood Pressure diastolic


    -- MEAN ARTERIAL PRESSURE
    456, --"NBP Mean"
    52, --"Arterial BP Mean"
    6702, --	Arterial BP Mean #2
    443, --	Manual BP Mean(calc)
    220052, --"Arterial Blood Pressure mean"
    220181, --"Non Invasive Blood Pressure mean"
    225312, --"ART BP mean"

    -- RESPIRATORY RATE
    618,--	Respiratory Rate
    615,--	Resp Rate (Total)
    220210,--	Respiratory Rate
    224690, --	Respiratory Rate (Total)


    -- SPO2, peripheral
    646, 220277,

    -- TEMPERATURE
    223762, -- "Temperature Celsius"
    676,	-- "Temperature C"
    223761, -- "Temperature Fahrenheit"
    678 --	"Temperature F"

    )
    )
select
    icustays.hadm_id
  , ce.charttime
  , avg(HeartRate) as HeartRate
  , avg(SysBP) as SysBP
  , avg(DiasBP) as DiasBP
  , avg(MeanBP) as MeanBP
  , avg(RespRate) as RespRate
  , avg(TempC) as TempC
  , avg(SpO2) as SpO2
  , avg(Glucose) as Glucose
from icustays
left join ce
on ce.icustay_id = icustays.icustay_id
group by icustays.hadm_id, ce.charttime
order by icustays.hadm_id, ce.charttime;
"""

In [87]:
vital  = pd.read_sql_query(query, connection)

In [88]:
vital

Unnamed: 0,HADM_ID,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,100001,2117-09-11 12:57:00,122.0,,,,14.0,,,
1,100001,2117-09-11 13:00:00,118.0,,,,22.0,,,
2,100001,2117-09-11 13:01:00,,192.0,100.0,122.0,,,,
3,100001,2117-09-11 13:48:00,,,,,,36.666667,,
4,100001,2117-09-11 13:50:00,118.0,,,,22.0,,,
...,...,...,...,...,...,...,...,...,...,...
8641278,199999,2136-04-10 08:00:00,69.0,,,,24.0,36.722222,96.0,
8641279,199999,2136-04-10 09:00:00,79.0,,,,24.0,,93.0,
8641280,199999,2136-04-10 09:10:00,,128.0,81.0,89.0,,,,
8641281,199999,2136-04-10 10:00:00,72.0,,,,26.0,,97.0,


## Prescriptions

In [72]:
query = """
WITH filter_prescriptions AS
(
SELECT d.subject_id, adm.hadm_id, adm.admittime, d.icustay_id, d.drug as drug_name, CAST(d.NDC AS varchar) as NDC, d.startdate as drug_startdate, d.enddate as drug_enddate, d.prod_strength as drug_strength, d.dose_val_rx as drug_dosage, d.dose_unit_rx as drug_unit
FROM prescriptions d
INNER JOIN admissions adm
    ON adm.subject_id = d.subject_id
    AND adm.hadm_id = d.hadm_id

ORDER  BY d.SUBJECT_ID ASC
)
SELECT *
FROM filter_prescriptions 
"""

drugs = pd.read_sql_query(query, connection)

In [73]:
drugs

Unnamed: 0,subject_id,hadm_id,admittime,icustay_id,drug_name,NDC,drug_startdate,drug_enddate,drug_strength,drug_dosage,drug_unit
0,2,163353,2138-07-17 19:04:00,243653.0,NEO*IV*Gentamicin,63323017302.0,2138-07-18 00:00:00,2138-07-20 00:00:00,10mg/mL-2mL,15.5,mg
1,2,163353,2138-07-17 19:04:00,243653.0,Syringe (Neonatal) *D5W*,0.0,2138-07-18 00:00:00,2138-07-20 00:00:00,1 Syringe,2.4,ml
2,2,163353,2138-07-17 19:04:00,243653.0,Ampicillin Sodium,63323038810.0,2138-07-18 00:00:00,2138-07-21 00:00:00,500mg Vial,500,mg
3,2,163353,2138-07-17 19:04:00,243653.0,Send 500mg Vial,0.0,2138-07-18 00:00:00,2138-07-21 00:00:00,Send 500mg Vial,1,VIAL
4,4,185777,2191-03-16 00:28:00,294638.0,Iso-Osmotic Dextrose,0.0,2191-03-16 00:00:00,2191-03-16 00:00:00,200ml Bag,200,ml
...,...,...,...,...,...,...,...,...,...,...,...
4156445,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-04 00:00:00,2118-01-04 00:00:00,8mg Tablet,16,mg
4156446,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-05 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg
4156447,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-06 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg
4156448,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-07 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg


In [5]:
#del drugs


## Diagnoses

In [74]:
query = """
WITH filter_diagnoses AS 
(
    SELECT icd.subject_id, icd.hadm_id, admissions.admittime, icd.icd9_code, d_icd.short_title as icd9_title
    FROM diagnoses_icd icd
    INNER JOIN icustays, admissions
        ON icd.subject_id = icustays.subject_id
        AND icustays.hadm_id = admissions.hadm_id
    INNER JOIN d_icd_diagnoses d_icd
        ON icd.icd9_code = d_icd.icd9_code
)
SELECT *
FROM filter_diagnoses
ORDER BY subject_id
"""

diagnoses = pd.read_sql_query(query, connection)

In [75]:
diagnoses

Unnamed: 0,subject_id,hadm_id,admittime,icd9_code,icd9_title
0,2,163353,2138-07-17 19:04:00,V3001,Single lb in-hosp w cs
1,2,163353,2138-07-17 19:04:00,V053,Need prphyl vc vrl hepat
2,2,163353,2138-07-17 19:04:00,V290,NB obsrv suspct infect
3,3,145834,2101-10-20 19:08:00,0389,Septicemia NOS
4,3,145834,2101-10-20 19:08:00,78559,Shock w/o trauma NEC
...,...,...,...,...,...
1431230,99999,113369,2117-12-30 07:15:00,75612,Spondylolisthesis
1431231,99999,113369,2117-12-30 07:15:00,7861,Stridor
1431232,99999,113369,2117-12-30 07:15:00,4019,Hypertension NOS
1431233,99999,113369,2117-12-30 07:15:00,25000,DMII wo cmp nt st uncntr


## Vital cohort

In [89]:
vital_cohort = vital[vital.HADM_ID.isin(list(patient_cohort.hadm_id))]

In [90]:
p = patient_cohort.copy()
v = vital_cohort.copy()


In [91]:
v.rename(columns={"HADM_ID": "hadm_id"}, inplace=True)

In [92]:
v

Unnamed: 0,hadm_id,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,100001,2117-09-11 12:57:00,122.0,,,,14.0,,,
1,100001,2117-09-11 13:00:00,118.0,,,,22.0,,,
2,100001,2117-09-11 13:01:00,,192.0,100.0,122.0,,,,
3,100001,2117-09-11 13:48:00,,,,,,36.666667,,
4,100001,2117-09-11 13:50:00,118.0,,,,22.0,,,
...,...,...,...,...,...,...,...,...,...,...
8641278,199999,2136-04-10 08:00:00,69.0,,,,24.0,36.722222,96.0,
8641279,199999,2136-04-10 09:00:00,79.0,,,,24.0,,93.0,
8641280,199999,2136-04-10 09:10:00,,128.0,81.0,89.0,,,,
8641281,199999,2136-04-10 10:00:00,72.0,,,,26.0,,97.0,


In [93]:
p

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality
10,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0
20,36,182104,280987,2061-08-17 00:00:00,M,2131-04-30 07:15:00,1.109595,69.902529,0
40,68,170467,294232,2132-02-29 00:00:00,F,2173-12-15 16:16:00,3.536806,41.912036,0
51,85,116630,209562,2090-09-18 00:00:00,M,2162-03-02 14:04:00,2.235324,71.651661,0
54,94,183686,229012,2101-09-20 00:00:00,M,2176-02-25 16:49:00,1.098542,74.639117,0
...,...,...,...,...,...,...,...,...,...
5356,98669,108710,224242,2072-07-04 00:00:00,F,2147-04-07 15:09:00,6.837303,74.962888,1
5357,98753,185764,207506,2094-05-01 00:00:00,F,2172-02-11 21:26:00,4.246400,78.021019,1
5358,98753,185764,233107,2094-05-01 00:00:00,F,2172-02-11 21:26:00,2.740428,78.035799,1
5359,98768,127022,213468,2027-07-26 00:00:00,F,2112-09-15 09:45:00,2.151123,85.375677,1


In [94]:
f = p.merge(v, how='left', on='hadm_id')


In [95]:
f

Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,admittime,icu_length_of_stay,age,mortality,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0,2153-09-03 12:20:00,,,,,12.0,,,
1,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0,2153-09-03 12:30:00,90.0,106.0,62.0,77.0,7.0,35.0,100.0,
2,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0,2153-09-03 12:45:00,90.0,109.0,63.0,79.0,,,100.0,
3,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0,2153-09-03 13:00:00,90.0,92.0,57.0,69.0,13.0,,100.0,
4,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 07:15:00,1.264074,71.327310,0,2153-09-03 13:15:00,90.0,98.0,55.0,70.0,12.0,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163291,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:06:00,1.238171,88.234852,1,2132-12-25 22:00:00,,,,,,,61.0,
6163292,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:06:00,1.238171,88.234852,1,2132-12-25 22:01:00,,,,,,,,
6163293,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:06:00,1.238171,88.234852,1,2132-12-25 22:03:00,48.0,,,,,,,
6163294,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:06:00,1.238171,88.234852,1,2132-12-25 23:00:00,107.0,,,,34.0,,79.0,


In [96]:
f.to_csv("C:\\Users\\Maria\\Desktop\\data\\patients_vitals.csv")

In [77]:
data = pd.read_csv("C:\\Users\\Maria\\Desktop\\data\\patients_vitals.csv")

In [78]:
data 

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,icustay_id,DOB,gender,intime,icu_length_of_stay,age,mortality,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,0,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 09:38:55,1.264074,71.327310,0,2153-09-03 12:20:00,,,,,12.0,,,
1,1,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 09:38:55,1.264074,71.327310,0,2153-09-03 12:30:00,90.0,106.0,62.0,77.0,7.0,35.0,100.0,
2,2,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 09:38:55,1.264074,71.327310,0,2153-09-03 12:45:00,90.0,109.0,63.0,79.0,,,100.0,
3,3,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 09:38:55,1.264074,71.327310,0,2153-09-03 13:00:00,90.0,92.0,57.0,69.0,13.0,,100.0,
4,4,23,152223,227807,2082-07-17 00:00:00,M,2153-09-03 09:38:55,1.264074,71.327310,0,2153-09-03 13:15:00,90.0,98.0,55.0,70.0,12.0,,100.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163291,6163291,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:08:16,1.238171,88.234852,1,2132-12-25 22:00:00,,,,,,,61.0,
6163292,6163292,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:08:16,1.238171,88.234852,1,2132-12-25 22:01:00,,,,,,,,
6163293,6163293,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:08:16,1.238171,88.234852,1,2132-12-25 22:03:00,48.0,,,,,,,
6163294,6163294,98797,105447,244147,2044-12-27 00:00:00,M,2132-12-24 20:08:16,1.238171,88.234852,1,2132-12-25 23:00:00,107.0,,,,34.0,,79.0,


In [79]:
drugs

Unnamed: 0,subject_id,hadm_id,admittime,icustay_id,drug_name,NDC,drug_startdate,drug_enddate,drug_strength,drug_dosage,drug_unit
0,2,163353,2138-07-17 19:04:00,243653.0,NEO*IV*Gentamicin,63323017302.0,2138-07-18 00:00:00,2138-07-20 00:00:00,10mg/mL-2mL,15.5,mg
1,2,163353,2138-07-17 19:04:00,243653.0,Syringe (Neonatal) *D5W*,0.0,2138-07-18 00:00:00,2138-07-20 00:00:00,1 Syringe,2.4,ml
2,2,163353,2138-07-17 19:04:00,243653.0,Ampicillin Sodium,63323038810.0,2138-07-18 00:00:00,2138-07-21 00:00:00,500mg Vial,500,mg
3,2,163353,2138-07-17 19:04:00,243653.0,Send 500mg Vial,0.0,2138-07-18 00:00:00,2138-07-21 00:00:00,Send 500mg Vial,1,VIAL
4,4,185777,2191-03-16 00:28:00,294638.0,Iso-Osmotic Dextrose,0.0,2191-03-16 00:00:00,2191-03-16 00:00:00,200ml Bag,200,ml
...,...,...,...,...,...,...,...,...,...,...,...
4156445,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-04 00:00:00,2118-01-04 00:00:00,8mg Tablet,16,mg
4156446,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-05 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg
4156447,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-06 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg
4156448,99999,113369,2117-12-30 07:15:00,,Methylprednisolone,66993084225.0,2118-01-07 00:00:00,2118-01-04 00:00:00,8mg Tablet,8,mg


In [80]:
drugs_cohort = drugs[drugs.subject_id.isin(list(data.subject_id))]

In [81]:
drugs_cohort.subject_id.sort_values()

4              4
36             4
37             4
38             4
39             4
           ...  
4156411    99999
4156412    99999
4156413    99999
4156398    99999
4156449    99999
Name: subject_id, Length: 3885634, dtype: int64

In [82]:
diagnoses_cohort = diagnoses[diagnoses.subject_id.isin((data.subject_id))]

In [83]:
diagnoses.subject_id.sort_values()

0              2
1              2
2              2
3              3
4              3
           ...  
1431233    99999
1431230    99999
1431231    99999
1431232    99999
1431234    99999
Name: subject_id, Length: 1431235, dtype: int64

In [84]:
drugs_cohort.to_csv("C:\\Users\\Maria\\Desktop\\data\\drugs.csv")

In [85]:
diagnoses_cohort.to_csv("C:\\Users\\Maria\\Desktop\\data\\diagnoses.csv")