In [None]:
import os
import sqlite3

import pandas as pd

In [None]:
##############################################################################
####################### CONNECT TO MIMIC #####################################
# Location of the MIMIC SQLite file. Set the MIMIC_DB_PATH environment
# variable to point somewhere else; the fallback is the original location.
MIMIC_DB_PATH = os.environ.get(
    "MIMIC_DB_PATH",
    "C:\\Users\\Maria\\Desktop\\Projects Data Scripts\\MIMIC\\data\\mimic3.db",
)

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as a "no such table" error.
# Fail early with a clear message instead.
if not os.path.isfile(MIMIC_DB_PATH):
    raise FileNotFoundError(f"MIMIC database not found: {MIMIC_DB_PATH}")

connection = sqlite3.connect(MIMIC_DB_PATH)

# total_changes counts the rows modified through this connection, so it prints
# 0 right after connecting; a successful print only shows the object exists.
print(connection.total_changes)
# The connection is reused by the queries that follow; call
# connection.close() once every query has been run.

# Smoke-test query: a handful of columns from the admissions table.
test_query = """
SELECT subject_id, hadm_id, admittime, dischtime, admission_type, diagnosis
FROM admissions
"""

# Run the query and load the result into a DataFrame.
test = pd.read_sql_query(test_query, connection)

print("TEST:", test.head())
##############################################################################
##################### QUERYING, MORTALITY COHORT #############################

### Positive cohort
# ICU stays joined to their patient and admission rows, restricted to:
#   * hospital_expire_flag = 1 (exposed as the `mortality` column),
#   * age >= 18 at ICU intime,
#   * ICU length of stay >= 1 day.
# icu_length_of_stay is in days (difference of Julian day numbers).
# age is in years: days between ICU intime and DOB divided by 365.242, the
# mean number of days per year. (The divisor used to be 364.242, which
# overstated every age by about 0.27% and could push patients just under 18
# over the cutoff.)
query = """
WITH icu_patients AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, pat.DOB, pat.gender, icu.intime
, (JulianDay(OUTTIME) - JulianDay(INTIME)) as icu_length_of_stay
, (JulianDay(icu.INTIME) - JulianDay(pat.DOB)) / 365.242 as age
, adm.hospital_expire_flag as mortality

FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN admissions adm
    ON adm.subject_id = icu.subject_id
    AND adm.hadm_id = icu.hadm_id
)

SELECT *
FROM icu_patients icu_cohort
WHERE icu_cohort.mortality == 1 AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1
"""

positive_cohort = pd.read_sql_query(query, connection)

In [None]:

# The query result contains exact duplicate rows; keep the first occurrence of
# each. drop_duplicates() returns a new frame, so the result has to be
# assigned back (a bare call without assignment changes nothing).
positive_cohort = positive_cohort.drop_duplicates(keep='first')

In [None]:
# Display the de-duplicated positive cohort.
positive_cohort

In [None]:
# Spot-check: positive-cohort rows for subject 27427.
positive_cohort[positive_cohort["subject_id"] == 27427]


## Negative Cohort

In [None]:

### Negative cohort
# Same ICU/patient/admission join as the positive cohort. Rows are kept when:
#   * the subject_id does NOT appear in the positive set (hospital_expire_flag
#     = 1, age >= 18, ICU stay >= 1 day), and
#   * the row itself has age >= 18 and an ICU stay >= 1 day.
# hospital_expire_flag is returned under the name `mortality` so the columns
# line up with the positive cohort.
# age is in years: days between ICU intime and DOB divided by 365.242, the
# mean number of days per year. (The divisor used to be 364.242, which
# overstated every age by about 0.27% and could push patients just under 18
# over the cutoff.)
query = """
WITH icu_patients AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, pat.DOB, pat.gender, icu.intime
, (JulianDay(OUTTIME) - JulianDay(INTIME)) as icu_length_of_stay
, (JulianDay(icu.INTIME) - JulianDay(pat.DOB)) / 365.242 as age
, adm.hospital_expire_flag

FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN admissions adm
    ON adm.subject_id = icu.subject_id
    AND adm.hadm_id = icu.hadm_id
)

SELECT icu_cohort.subject_id, icu_cohort.hadm_id, icu_cohort.icustay_id, icu_cohort.DOB, icu_cohort.gender, icu_cohort.intime,  icu_cohort.icu_length_of_stay, icu_cohort.age, icu_cohort.hospital_expire_flag as mortality
FROM icu_patients icu_cohort
WHERE icu_cohort.subject_id NOT IN(
    SELECT icu_cohort.subject_id
    FROM icu_patients icu_cohort
    WHERE icu_cohort.hospital_expire_flag == 1 AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1
)
AND icu_cohort.age >= 18 AND icu_cohort.icu_length_of_stay>=1
ORDER BY icu_cohort.subject_id
"""

negative_cohort = pd.read_sql_query(query, connection)

In [None]:
# Display the negative cohort as returned by the query.
negative_cohort

### Preprocessing negative (non-mortality) ICU patients (keep the earliest admission)

In [None]:
# Work on a copy so the query result in `negative_cohort` stays untouched.
# The age >= 18 restriction is already applied inside the SQL query, so no
# age filtering happens here.
patients = negative_cohort.copy()

# The result contains exact duplicate rows; keep the first occurrence of each.
# drop_duplicates() returns a new frame, so the result has to be assigned back.
patients = patients.drop_duplicates(keep='first')


In [None]:
# Spot-check: rows for subject 18108 after de-duplication.
patients[patients["subject_id"] == 18108]

In [None]:
# Number of rows per subject_id, sorted ascending. sort_values must be called:
# without the parentheses the cell only displays the bound method object.
patients.subject_id.value_counts().sort_values()

In [None]:
# Spot-check subject 11861 to confirm that no duplicate rows remain.
patients[patients["subject_id"]==11861]

In [None]:
# Some patients have several hadm_id values. Only the first hadm_id of each
# patient should be kept, together with all icustay_id rows of that hadm_id
# (they are needed later for labs etc.).

# Number of distinct hadm_id per patient. Selecting the column before
# nunique() avoids computing the distinct count of every other column.
counts = patients.groupby("subject_id")["hadm_id"].nunique()

# subject_id values with more than one hadm_id
idx_pat_multi_hadmid = counts[counts > 1].index

# Rows of those patients. The explicit copy makes this an independent frame,
# so later column assignments do not raise SettingWithCopyWarning.
multi_hadmid_patients = patients[patients["subject_id"].isin(idx_pat_multi_hadmid)].copy()

In [None]:
# Rows of patients that are not in the multi-hadm_id group; they are
# concatenated back into the negative cohort later.
is_multi_admission = patients["subject_id"].isin(idx_pat_multi_hadmid.tolist())
uni_hadmid_patients = patients.loc[~is_multi_admission]

In [None]:
# Preview the first rows of the single-hadm_id patients.
uni_hadmid_patients.head()

In [None]:
# Preview the first rows of the multi-hadm_id patients.
multi_hadmid_patients.head()

In [None]:
# Spot-check: rows for subject 11861 among the multi-hadm_id patients.
multi_hadmid_patients[multi_hadmid_patients['subject_id'] == 11861]

In [None]:
# Parse intime into datetime64 so the earliest stay per patient can be found.
# assign() builds a new frame instead of writing into what may be a slice of
# `patients`; this avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
multi_hadmid_patients = multi_hadmid_patients.assign(
    intime=pd.to_datetime(multi_hadmid_patients["intime"])
)

In [None]:
# For each patient with several hadm_id values, find the row label of the
# ICU stay with the earliest intime.
intime_by_subject = multi_hadmid_patients.groupby("subject_id")["intime"]
indices = intime_by_subject.idxmin()

In [None]:
# Display, per subject_id, the row label of the earliest intime.
indices

In [None]:
# Keep every ICU stay that belongs to each patient's first admission.
# `indices` holds the row with the earliest intime per subject_id. Selecting
# only those rows (the previous behaviour) kept a single ICU stay per patient
# and silently dropped any other icustay_id of that same first hadm_id, which
# contradicts the stated goal of keeping all ICU stays of the first hadm_id.
first_hadm_by_subject = multi_hadmid_patients.loc[indices].set_index("subject_id")["hadm_id"]
in_first_admission = (
    multi_hadmid_patients["hadm_id"]
    == multi_hadmid_patients["subject_id"].map(first_hadm_by_subject)
)
sub_cohort_multi = multi_hadmid_patients[in_first_admission]

In [None]:
# Spot-check: rows kept for subject 11861 after the first-admission selection.
sub_cohort_multi[sub_cohort_multi["subject_id"] == 11861]

In [None]:
# Display the rows selected for the multi-hadm_id patients.
sub_cohort_multi

In [None]:
# Rebuild the negative cohort from its two parts: the selected rows of the
# multi-hadm_id patients and all rows of the single-hadm_id patients.
# intime was parsed to datetime64 only for the multi-hadm_id part, so the
# single-hadm_id part is parsed here as well; otherwise the concatenated
# column would mix Timestamp objects and strings (dtype object).
negative_cohort = pd.concat(
    [
        sub_cohort_multi,
        uni_hadmid_patients.assign(intime=pd.to_datetime(uni_hadmid_patients["intime"])),
    ]
)

In [None]:
# Preview the first rows of the rebuilt negative cohort.
negative_cohort.head()

In [None]:
# Spot-check: negative-cohort rows for subject 18108.
negative_cohort[negative_cohort["subject_id"] == 18108]

### Concatenate the negative and positive cohorts into the final cohort

In [None]:
# Display the negative cohort before merging it with the positive cohort.
negative_cohort

In [None]:
# Display the positive cohort before merging it with the negative cohort.
positive_cohort

In [None]:
# Stack the negative and positive cohorts into the final patient cohort.
# * intime is parsed to datetime64 in both parts first, so the combined column
#   has a single dtype instead of mixing Timestamp objects and strings.
# * ignore_index=True gives the result a fresh 0..n-1 index; both parts come
#   from separate query results, so their original row labels overlap and
#   would otherwise be duplicated, making label-based lookups ambiguous.
cohort_parts = [
    part.assign(intime=pd.to_datetime(part["intime"]))
    for part in (negative_cohort, positive_cohort)
]
patient_cohort = pd.concat(cohort_parts, ignore_index=True)

In [None]:
# Spot-check: final-cohort rows for subject 27427.
patient_cohort[patient_cohort["subject_id"] == 27427]

In [None]:
# Spot-check: final-cohort rows for subject 81846.
patient_cohort[patient_cohort["subject_id"] == 81846]