In [38]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded, machine-specific project directory. The relative
# data/log/model paths and the `src` imports used in later cells resolve
# against this working directory.
%cd ~/Research/Sriram/DeepSetRNN

import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import os
import string
import pickle
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import logging
import os
import importlib

from src import convert_dot_format

# Root of the MIMIC3 data tree, given relative to the current working directory.
DATA_PATH = 'data/MIMIC3database'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/minh/Research/Sriram/DeepSetRNN


In [2]:
# Notebook-wide logger; DEBUG so that every record reaches whatever handlers
# are attached later.
_LOGGER = logging.getLogger('MIMIC_BOW_input_models')
_LOGGER.setLevel(logging.DEBUG)

# A module-level logging call configures the root logger with a default
# handler when it has none yet (see logging.basicConfig).
logging.info("starting logger")

# Preprocessing Data

In [3]:
top_n = 100  # should be the same as before

diagnoses_procedures_df = pd.read_csv(
    'data/MIMIC3database/processed/ICD9_diagnoses_procedures_mimic_idx_sentences_top_100_sorted.csv',
    index_col=0)

# Names of the per-position sentence columns in the CSV. The diagnoses side
# also takes the columns named '22'..'31' (presumably unsuffixed diagnoses
# positions -- confirm against the file that produced this CSV).
diagnoses_sentence_cols = (['{}_diagnoses'.format(pos) for pos in range(22)]
                           + [str(pos) for pos in range(22, 32)])
procedures_sentence_cols = ['{}_procedures'.format(pos) for pos in range(22)]

# Collapse each group of positional columns into a single list-valued column.
diagnoses_procedures_df['DIAGNOSES_SENTENCES'] = (
    diagnoses_procedures_df[diagnoses_sentence_cols].values.tolist())
diagnoses_procedures_df['PROCEDURES_SENTENCES'] = (
    diagnoses_procedures_df[procedures_sentence_cols].values.tolist())

# The positional columns are now redundant; remove them before writing out.
for redundant_cols in (diagnoses_sentence_cols, procedures_sentence_cols):
    diagnoses_procedures_df.drop(labels=redundant_cols, axis=1, inplace=True)

diagnoses_procedures_df.to_csv("data/MIMIC3database/processed/ICD9_diagnoses_procedures_mimic_idx_sentences_top_100_sorted_concat.csv")

In [33]:
from src.utils import get_onehot_vector

# Code frequencies over the whole table; value_counts() orders them from most
# to least frequent, so the first top_n keys are the top_n most common codes.
diagnoses_counts = diagnoses_procedures_df['ICD9_CODE_diagnoses'].value_counts()
procedures_counts = diagnoses_procedures_df['ICD9_CODE_procedures'].value_counts()

top_diagnoses = diagnoses_counts.keys()[:top_n]
top_procedures = procedures_counts.keys()[:top_n]

diagnoses_set = set(top_diagnoses)
procedures_set = set(top_procedures)

# code -> position in the frequency ranking, passed to get_onehot_vector.
diagnoses_idx_map = {code: rank for rank, code in enumerate(top_diagnoses)}
procedures_idx_map = {code: rank for rank, code in enumerate(top_procedures)}

# Nested grouping: one entry per subject, each holding that subject's rows
# grouped by (HADM_ID, ADMITTIME).
data = [
    (subject_id, list(subject_data.groupby(['HADM_ID', 'ADMITTIME'])))
    for subject_id, subject_data in list(diagnoses_procedures_df.groupby(['SUBJECT_ID']))
]

all_inputs = []
all_outputs = []
for _, admissions in data:
    input_series = []
    output_series = []
    for _, rows in admissions:
        # Keep rows whose diagnosis OR procedure code is among the top_n.
        relevant = (rows['ICD9_CODE_diagnoses'].isin(diagnoses_set)
                    | rows['ICD9_CODE_procedures'].isin(procedures_set))
        rows = rows[relevant]

        diagnoses_one_hot = get_onehot_vector(rows['ICD9_CODE_diagnoses'], diagnoses_idx_map)
        procedures_one_hot = get_onehot_vector(rows['ICD9_CODE_procedures'], procedures_idx_map)

        # skipping timesteps that are not relevant
        if len(rows) == 0:
            continue
        # Input is the diagnoses vector followed by the procedures vector;
        # the output is the diagnoses vector alone.
        input_series.append(np.append(diagnoses_one_hot, procedures_one_hot))
        output_series.append(diagnoses_one_hot)
    if input_series and output_series:
        all_inputs.append(np.array(input_series))
        all_outputs.append(np.array(output_series))

In [34]:
# Shortest sequence (number of kept timesteps) that is retained.
min_len = 3

inputs = [series for series in all_inputs if len(series) >= min_len]
outputs = [series for series in all_outputs if len(series) >= min_len]

n_seq = len(inputs)

# First 80% of the sequences (in their existing order) train, the rest test.
split = int(n_seq * 0.8)
train_inputs, test_inputs = inputs[:split], inputs[split:n_seq]
train_outputs, test_outputs = outputs[:split], outputs[split:n_seq]

# Training Models

In [37]:
# global objects: notebook-wide registries, both keyed by model name
training_loss_map, model_map = {}, {}

In [45]:
######LSTM MODEL#####
from src.model.mimic_onehot import OneHotLSTMClassifier

LSTMArgs = namedtuple('LSTMArgs',
                      ['hidden_dim',
                       'n_epoch',
                       'lr',
                       'momentum',
                       'n_layers'])
args = LSTMArgs(
    hidden_dim=[1000, 100],
    n_epoch=10,
    lr=0.1,
    n_layers=1,
    momentum=0.9
)


def _next_step_loss(model, loss_fn, sequence, target):
    """Loss for predicting timestep t+1's target from the output at timestep t.

    The hidden state is re-initialised first so sequences do not leak into each
    other. ``logits[:-1]`` is paired with ``target[1:]``: the last output has
    no following target and the first target has no preceding output.
    """
    model.hidden = model.init_hidden()
    logits = model(sequence)
    # Build the labels with the logits' dtype/device rather than whatever
    # NumPy dtype the target array happens to carry.
    labels = torch.tensor(target[1:], dtype=logits.dtype, device=logits.device)
    return loss_fn(logits[:-1], labels)


def _mean_validation_loss(model, loss_fn, sequences, targets):
    """Mean next-step loss over the given sequences, without tracking gradients."""
    was_training = model.training
    model.eval()
    losses = []
    with torch.no_grad():
        for sequence, target in zip(sequences, targets):
            losses.append(_next_step_loss(model, loss_fn, sequence, target).item())
    # Restore whichever mode the model was in before evaluation.
    model.train(was_training)
    return np.mean(losses)


model_name = "onehotLSTM_top_{}_{}hd_{}".format(top_n, args.hidden_dim, args.n_epoch)

# Detach and close stream-based handlers left over from earlier runs of this
# cell (FileHandler is a StreamHandler subclass), so records are not written
# twice and old log files are not left open.
for stale_handler in [h for h in _LOGGER.handlers
                      if isinstance(h, logging.StreamHandler)]:
    _LOGGER.removeHandler(stale_handler)
    stale_handler.close()

os.makedirs('logs/MIMIC3/onehot', exist_ok=True)
fh = logging.FileHandler('logs/MIMIC3/onehot/{}.log'.format(model_name))
fh.setLevel(logging.DEBUG)
_LOGGER.addHandler(fh)

loss_fn = nn.BCEWithLogitsLoss()
model = OneHotLSTMClassifier(
    input_dim=len(inputs[0][0]),
    hidden_dims=args.hidden_dim,
    n_layers=args.n_layers,
    n_class=top_n)
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

#initial test loss
_LOGGER.info("Initial Validation Loss: {}".format(
    _mean_validation_loss(model, loss_fn, test_inputs, test_outputs)))

#training model
training_losses = []
for epoch in range(args.n_epoch):
    curr_losses = []
    for sequence, target in zip(train_inputs,
                                train_outputs):
        model.zero_grad()
        # Same next-step objective that the validation passes report.
        loss = _next_step_loss(model, loss_fn, sequence, target)
        loss.backward()
        optimizer.step()
        curr_losses.append(loss.item())
    mean_loss = np.mean(curr_losses)
    training_losses.append(mean_loss)
    _LOGGER.info("epoch {}: {}".format(epoch, mean_loss))

#saving model
# NOTE(review): this pickles the whole module object, so loading it later needs
# the same class importable under the same path.
os.makedirs('models/MIMIC3', exist_ok=True)
torch.save(model, 'models/MIMIC3/{}.pt'.format(model_name))

#final validation loss
_LOGGER.info("final validation Loss: {}".format(
    _mean_validation_loss(model, loss_fn, test_inputs, test_outputs)))


model_map[model_name] = model
training_loss_map[model_name] = training_losses

TypeError: 'int' object is not callable