In [1]:
from Data.mimic import data_fetcher
from settings import ROOT_DIR
from Data.mimic.mimic import get_events, get_all_icustay_ids
from Data.mimic.data_fetcher import TimeSeriesData
import os
import pandas as pd
import dill
import numpy as np

# Item ids derived from the chart and lab dictionaries exposed by data_fetcher
# (presumably the MIMIC itemids behind each feature — confirm in data_fetcher).
# NOTE(review): chart_ids / lab_ids are not referenced by any cell visible here.
chart_ids = data_fetcher.return_ids(data_fetcher.chart_dict)
lab_ids = data_fetcher.return_ids(data_fetcher.lab_dict)

# Connection settings are read from <ROOT_DIR>/Data/mimic/mimic.ini.
database_config_path = os.path.join(ROOT_DIR, 'Data', 'mimic', 'mimic.ini')
engine = data_fetcher.alchemy_engine_mimic(database_config_path)
# Two cohorts used below: the default set of ICU stays and the heart_only=True subset.
stays = data_fetcher.get_all_icustay_ids(engine)
heart_stays = data_fetcher.get_all_icustay_ids(engine, heart_only=True)

In [2]:
def completely_missing(data_df, itemid):
    """Return the fraction of ICU stays that have no row at all for ``itemid``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with at least the columns ``icustay_id`` and ``itemid``.
    itemid :
        Value compared against the ``itemid`` column.

    Returns
    -------
    float
        ``(# stays without the item) / (# distinct stays in data_df)``.
        The denominator only counts stays that appear in ``data_df``.

    Raises
    ------
    ZeroDivisionError
        If ``data_df`` contains no stays.
    """
    all_stays = set(data_df.icustay_id.unique())
    has_item = data_df['itemid'] == itemid
    stays_with_item = set(data_df.loc[has_item, 'icustay_id'].unique())
    stays_without_item = all_stays - stays_with_item
    return len(stays_without_item) / len(all_stays)

def avg_timediff(data_df, itemid):
    """Summarise how far apart consecutive readings of ``itemid`` are, in minutes.

    For every ICU stay the readings of ``itemid`` are sorted by ``charttime``
    and the mean gap between neighbouring readings is converted to minutes.
    The function then returns the mean and the standard deviation of those
    per-stay means.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with the columns ``itemid``, ``icustay_id`` and ``charttime``
        (``charttime`` must support ``diff()`` yielding timedeltas).
    itemid :
        Value compared against the ``itemid`` column.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-stay mean gap in minutes.
    """
    def _mean_gap_minutes(times):
        # Sort first so each difference is between chronologically adjacent readings.
        mean_gap = times.sort_values().diff().mean()
        return mean_gap.total_seconds() / 60

    item_rows = data_df[data_df['itemid'] == itemid]
    per_stay_gap = item_rows.groupby('icustay_id').charttime.agg(_mean_gap_minutes)
    return per_stay_gap.mean(), per_stay_gap.std()


def get_mimic_stats(data_df):
    """Collect missingness and sampling-interval statistics for every item.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with the columns ``itemid``, ``icustay_id`` and ``charttime``.

    Returns
    -------
    dict
        Maps each distinct ``itemid`` to a dict with the keys
        ``'missing'`` (result of ``completely_missing``) and
        ``'diff mean'`` / ``'diff std'`` (result of ``avg_timediff``).
    """
    stat_dict = {}
    for itemid in data_df.itemid.unique():
        missing_ratio = completely_missing(data_df, itemid)
        gap_mean, gap_std = avg_timediff(data_df, itemid)
        stat_dict[itemid] = {
            'missing': missing_ratio,
            'diff mean': gap_mean,
            'diff std': gap_std,
        }
    return stat_dict


In [3]:
# Measurements for the full `stays` cohort; later cells read the itemid,
# icustay_id, charttime and valuenum columns of this frame.
mimic_df = data_fetcher.make_dataframe(
    engine,
    data_fetcher.chart_dict,
    data_fetcher.lab_dict,
    stays,
)

In [4]:
# Per-item summary over the full cohort: share of stays with no reading of the
# item, plus mean/std of the per-stay average gap between readings (minutes).
stats = get_mimic_stats(mimic_df)
print(stats)

{2: {'missing': 0.5569270902251375, 'diff mean': 53.66225889049408, 'diff std': 10.83578223686341}, 1: {'missing': 0.24267972250051983, 'diff mean': 53.99181239133394, 'diff std': 40.24956554413065}, 6: {'missing': 0.026521237783784804, 'diff mean': 937.1950235067752, 'diff std': 391.1902296497109}, 9: {'missing': 0.025821818113079147, 'diff mean': 961.3994146782151, 'diff std': 370.3195814217308}, 7: {'missing': 0.02739078656358103, 'diff mean': 868.0787342252023, 'diff std': 377.8588414266844}, 4: {'missing': 0.1519063911835315, 'diff mean': 885.5093095738139, 'diff std': 380.9314673771253}, 3: {'missing': 0.009999810967656566, 'diff mean': 57.5696514016523, 'diff std': 26.008989105040637}, 8: {'missing': 0.024082720553486703, 'diff mean': 826.2490849390813, 'diff std': 411.60916976745943}, 5: {'missing': 0.37598533109014953, 'diff mean': 520.9180169849711, 'diff std': 720.3389541792626}, 0: {'missing': 0.4105593467042211, 'diff mean': 198.3146634324688, 'diff std': 119.0364372618389

In [5]:
# Distribution of the recorded values per item (count, mean, std, quantiles, extremes).
mimic_df.groupby('itemid')['valuenum'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
itemid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1204113.0,37.055863,1.775207,-3.0,36.5,37.055599,37.599998,376.5
1,4511493.0,121.683512,148.532235,-69.0,104.0,119.0,137.0,141146.04
2,2758624.0,87.414941,55.779654,-88.0,74.0,86.0,99.0,86101.0
3,6053990.0,98.866184,2946.655023,0.0,96.0,98.0,99.0,6363333.0
4,295205.0,5.012012,1840.508865,-70.0,0.7,1.0,1.8,999999.0
5,438882.0,7.384254,0.083588,0.0,7.34,7.39,7.44,9.0
6,336713.0,139.187305,5.363066,82.0,136.0,139.0,142.0,184.0
7,352784.0,4.08769,0.623354,0.8,3.7,4.0,4.4,27.5
8,369554.0,29.891827,4.689382,0.0,26.7,29.5,32.5,71.7
9,321570.0,24.804446,5.248779,5.0,22.0,25.0,28.0,65.0


In [6]:
# Pre-built time-series array for the full cohort; indexed below with the item on axis 1.
arr = np.load(os.path.join(ROOT_DIR, 'Data', 'mimic', '48_1_arr.npy'))

# Number of entries each item contributes: axis 0 times axis 2.
cells_per_item = arr.shape[0] * arr.shape[2]

# Share of NaN entries per item across the other two axes.
missing_from_timeseries = {}
for itemid in range(arr.shape[1]):
    item_values = arr[:, itemid, :]
    n_missing = np.count_nonzero(np.isnan(item_values))
    missing_ratio = n_missing / cells_per_item
    missing_from_timeseries[itemid] = missing_ratio


In [7]:
# Show the per-item NaN share computed from the time-series array.
missing_from_timeseries

{0: 0.7888055833853173,
 1: 0.47135372362211175,
 2: 0.6656242320561048,
 3: 0.2659094346042608,
 4: 0.9433856007762929,
 5: 0.9248092348600846,
 6: 0.9385573996710838,
 7: 0.9354600259604419,
 8: 0.9242047251784781,
 9: 0.9401192794087069}

In [8]:
# Same summary as for the full cohort, restricted to the heart_only stays.
# NOTE: this cell rebinds `stats` and `missing_from_timeseries`, so the
# full-cohort results computed in earlier cells are no longer reachable by name.
mimic_heart_df = data_fetcher.make_dataframe(
    engine,
    data_fetcher.chart_dict,
    data_fetcher.lab_dict,
    heart_stays,
)
stats = get_mimic_stats(mimic_heart_df)
print(stats)

# Pre-built time-series array for the heart cohort; indexed below with the item on axis 1.
arr_heart = np.load(os.path.join(ROOT_DIR, 'Data', 'mimic', 'dataset_48_1_0510_heart_arr.npy'))

# Number of entries each item contributes: axis 0 times axis 2.
cells_per_item = arr_heart.shape[0] * arr_heart.shape[2]

# Share of NaN entries per item across the other two axes.
missing_from_timeseries = {}
for itemid in range(arr_heart.shape[1]):
    item_values = arr_heart[:, itemid, :]
    n_missing = np.count_nonzero(np.isnan(item_values))
    missing_ratio = n_missing / cells_per_item
    missing_from_timeseries[itemid] = missing_ratio

missing_from_timeseries

{5: {'missing': 0.28137639601569575, 'diff mean': 332.41963148755696, 'diff std': 440.6908288125005}, 8: {'missing': 0.018472683368548144, 'diff mean': 733.3172658274675, 'diff std': 379.135512308977}, 6: {'missing': 0.02764865680651977, 'diff mean': 1004.4314823363314, 'diff std': 403.2957963841264}, 7: {'missing': 0.03247811651071537, 'diff mean': 854.3963297348015, 'diff std': 392.64795976709365}, 9: {'missing': 0.024268035013582854, 'diff mean': 1000.3666600926814, 'diff std': 343.1088471195881}, 2: {'missing': 0.6369453667370963, 'diff mean': 50.30351255384274, 'diff std': 12.289718289230091}, 3: {'missing': 0.009538182915786297, 'diff mean': 57.01348910039766, 'diff std': 28.963201105993562}, 1: {'missing': 0.18756414126169635, 'diff mean': 47.87529693616119, 'diff std': 18.363132546197104}, 4: {'missing': 0.1993359492906731, 'diff mean': 891.5742991276345, 'diff std': 338.92600690324355}, 0: {'missing': 0.2856021732568669, 'diff mean': 165.29061463682507, 'diff std': 161.6825567

{0: 0.7161422175269142,
 1: 0.4527241171143978,
 2: 0.7250628835898983,
 3: 0.2872560116711943,
 4: 0.9475299325887916,
 5: 0.8826944360599658,
 6: 0.9450145889928564,
 7: 0.9386558003823322,
 8: 0.917505533755911,
 9: 0.9437267330717376}