In [None]:
import os 
import pandas as pd 

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory holding the CSV exports read by this notebook.
folder = '../sepsis3_queries/data'
# `sepsis3` supplies the ICU stay times and outcome columns joined onto the
# notes further down; `text` holds the notes themselves.
sepsis3 = pd.read_csv(f'{folder}/sepsis-df-3.csv')
text = pd.read_csv(f'{folder}/text-df-3.csv')

In [None]:
# Preview the first rows of the sepsis3 table.
sepsis3.head()

In [None]:
# Preview the first rows of the notes table.
text.head()

CHARTDATE records the date at which the note was charted. CHARTDATE will always have a time value of 00:00:00.

CHARTTIME records the date and time at which the note was charted. If both CHARTDATE and CHARTTIME exist, then the date portions will be identical. All records have a CHARTDATE. A subset are missing CHARTTIME. More specifically, notes with a CATEGORY value of ‘Discharge Summary’, ‘ECG’, and ‘Echo’ never have a CHARTTIME, only CHARTDATE. Other categories almost always have both CHARTTIME and CHARTDATE, but there is a small amount of missing data for CHARTTIME (usually less than 0.5% of the total number of notes for that category).

STORETIME records the date and time at which a note was saved into the system. Notes with a CATEGORY value of ‘Discharge Summary’, ‘ECG’, ‘Radiology’, and ‘Echo’ never have a STORETIME. All other notes have a STORETIME.

A ‘1’ in the ISERROR column indicates that a physician has identified this note as an error.



## Exclude errors

In [None]:
# Count how many notes carry each value of the iserror column.
text.iserror.value_counts()

In [None]:
# Keep every note whose iserror value is not 1 (1 marks a note flagged as an
# error).
not_flagged = text.iserror != 1
text = text[not_flagged]

## 24 hours in the ICU

In [None]:
# Look at the notes again after the error rows were removed.
text.head()

In [None]:
# Bucket every note into whole hours since ICU admission, using charttime.

KEYS = ['hadm_id']

# Parse the note timestamps and the ICU stay boundaries into datetimes.
text[['charttime', 'chartdate']] = text[['charttime', 'chartdate']].apply(pd.to_datetime)
sepsis3[['intime', 'outtime']] = sepsis3[['intime', 'outtime']].apply(pd.to_datetime)

print("Hourly buckets")


def to_hours(x):
    """Return the whole hours contained in timedelta *x*, never below 0.

    For NaT, ``x.days`` and ``x.seconds`` are NaN, and ``max(0, nan)``
    returns 0. The vectorized computation below reproduces that result.
    Kept so that existing references to ``to_hours`` keep working.
    """
    return max(0, x.days * 24 + x.seconds // 3600)


# Index both tables by hadm_id and attach the ICU intime/outtime and
# icustay_id to every note.
sepsis3 = sepsis3.set_index(KEYS)
text = text.set_index(KEYS).join(sepsis3[['intime', 'outtime', 'icustay_id']])

# Vectorized form of `to_hours` applied to (charttime - intime): avoids a
# Python-level call per note. Negative offsets are clamped to 0 and rows with
# a missing timestamp get 0.
since_intime = text['charttime'] - text['intime']
text['hours_in'] = (
    (since_intime.dt.days * 24 + since_intime.dt.seconds // 3600)
    .clip(lower=0)
    .fillna(0)
    .astype(int)
)

In [None]:
# Inspect the joined ICU columns and the new hours_in column.
text.head()

In [None]:
# Largest hours_in value observed for each subject.
hours_by_subject = text.groupby('subject_id')['hours_in']
hours_by_subject.max()

In [None]:
# Spot-check the notes of a single subject (id 671).
text[text.subject_id == 671]

We can now see that `hours_in` holds the correct number of hours whenever the note has a `charttime`.
If `charttime` is NaT, `hours_in` is recorded as 0, no matter how many days or hours the patient had already spent in the ICU.
So I will slice again based on `chartdate` and create an additional day-level feature called `days_in`.
Hopefully, this will be good enough for filtering those notes.

In [None]:
import pandas as pd


def to_days(x, y):
    """Return the calendar days from *y* to *x*, never below 0.

    Kept so that existing references to ``to_days`` keep working; the column
    below is computed with the vectorized equivalent.
    """
    return max(0, (x.date() - y.date()).days)


# Calendar-day difference between chartdate and ICU intime, computed on whole
# columns instead of a row-wise DataFrame.apply. Both timestamps are truncated
# to midnight first, so only the date parts are compared. Negative differences
# are clamped to 0 and rows with a missing timestamp get 0.
chart_day = text['chartdate'].dt.normalize()
admit_day = text['intime'].dt.normalize()
text['days_in'] = (
    (chart_day - admit_day).dt.days
    .clip(lower=0)
    .fillna(0)
    .astype(int)
)



In [None]:
# Re-check the same subject (id 671), now with the days_in column present.
text[text.subject_id == 671]

## Save

In [None]:
# List the sepsis3 columns that are available to join onto the notes.
sepsis3.columns

In [None]:
# Columns copied from sepsis3 onto every note. Both frames are indexed by
# hadm_id at this point, so the join aligns on it.
outcome_cols = [
    'hospital_expire_flag',
    'thirtyday_expire_flag',
    'icu_los',
    'hosp_los',
    'mort_icu',
    'mort_hosp',
    'sepsis_angus',
]
text = text.join(sepsis3[outcome_cols])




In [None]:
# Check that the newly joined sepsis3 columns are present on the notes.
text.head()

In [None]:
# Discard the row_id column.
text = text.drop(columns='row_id')
# Count how many notes fall under each sepsis_angus value.
text['sepsis_angus'].value_counts()

In [None]:
# Columns that together identify a note row in the displayed view below.
KEYS_ALL = ['subject_id', 'icustay_id', 'hadm_id', 'days_in']
# Move hadm_id out of the index and back into an ordinary column, then store
# it as an integer.
text = text.reset_index()
text['hadm_id'] = text['hadm_id'].astype(int)
# NOTE: the result is not assigned, so this only displays the re-indexed view.
# `text` itself keeps KEYS_ALL as regular columns, which the later filter on
# icustay_id and the index-less CSV export both need.
text.set_index(KEYS_ALL)

In [None]:
# Spot-check the notes belonging to a single ICU stay (id 240913).
text[text.icustay_id == 240913]

In [None]:
# Write the prepared notes table to the stage-1 output directory.
folder = 'data_stage_1'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(folder, exist_ok=True)
# index=False: the row index is left out of the file.
text.to_csv(os.path.join(folder, 'text.csv'), sep=',', index=False)
