In [5]:
import pandas as pd
import numpy as np
import os
import psycopg2
import sqlalchemy
import string
import spacy
from spacy.symbols import ORTH
#import scispacy
from collections import Counter
import re
from datetime import date, datetime, timedelta
import random
from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit
from spellchecker import SpellChecker
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Connection settings for the local MIMIC-III PostgreSQL database.
# SECURITY: credentials should not live in the notebook. Export MIMIC_DB_URL to
# supply the connection URL; the literal below is kept only as a fallback so
# existing setups keep working and should be dropped once the variable is in use.
dbschema='mimiciii'
db_url = os.environ.get('MIMIC_DB_URL',
                        'postgresql+psycopg2://aa5118:mimic@localhost:5432/mimic')
# The search_path option lets the queries below use unqualified table names
# (e.g. `admissions`) inside the `dbschema` schema.
cnx = sqlalchemy.create_engine(db_url,
                    connect_args={'options': '-csearch_path={}'.format(dbschema)})


In [None]:
# Pull every admission except NEWBORN ones, ordered by patient and admission time.
# The ORDER BY matters: the groupby(...).shift(-1) calls further down rely on rows
# being chronological within each subject_id. The LIMIT line is an SQL comment
# (disabled).
sql = """
  SELECT
      subject_id, hadm_id, ethnicity, diagnosis, admittime, dischtime, deathtime, admission_type
  FROM admissions
  WHERE admission_type NOT IN ('NEWBORN')
  ORDER BY subject_id, admittime
  --LIMIT 100;
"""

# Run the query through the `cnx` engine and load the result into a DataFrame,
# then show its (rows, columns) shape and the first 20 rows.
df_adm = pd.read_sql_query(sqlalchemy.text(sql), cnx)
print(df_adm.shape)
df_adm.head(20)

In [None]:
# For each admission, look up the same patient's following admission.
# shift(-1) inside a subject_id group pulls the next row's value up by one, so a
# patient's last admission ends up with a missing value in both new columns.
admissions_by_patient = df_adm.groupby('subject_id')
following_admittime = admissions_by_patient['admittime'].shift(-1)
following_admission_type = admissions_by_patient['admission_type'].shift(-1)

df_adm['next_admittime'] = following_admittime
df_adm['next_admission_type'] = following_admission_type
df_adm.head(n=20)

In [None]:
# Blank out the "next admission" wherever it is ELECTIVE: a planned return must not
# count as an unplanned readmission. The gaps created here are filled by the
# back-fill step that follows.
rows = df_adm['next_admission_type'] == 'ELECTIVE'
df_adm.loc[rows, 'next_admittime'] = pd.NaT
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
df_adm.loc[rows, 'next_admission_type'] = np.nan

In [None]:
# Sort by patient and admission date right before the fill, in case anything
# above changed the row order: the back-fill is order-dependent.
df_adm = df_adm.sort_values(['subject_id','admittime'])
# Within each patient, fill a missing "next admission" from the next later row
# that has one, so blanked elective entries take the following value in the
# group. GroupBy.bfill() replaces the deprecated
# groupby(...).fillna(method='bfill') spelling and yields the same result.
df_adm[['next_admittime','next_admission_type']] = (
    df_adm.groupby('subject_id')[['next_admittime','next_admission_type']].bfill()
)

In [None]:
# Gap, in fractional days, between this discharge and the recorded next admission.
# Rows without a next admission carry NaT and therefore produce NaN.
df_adm['days_next_admit'] = (
    df_adm['next_admittime']
    .sub(df_adm['dischtime'])
    .dt.total_seconds()
    .div(24*60*60)
)

In [None]:
# Row count, followed by a preview of the first 100 admissions.
print(df_adm.shape[0])
df_adm.head(n=100)

In [None]:
# Histogram of days_next_admit (60 bins, log-scaled counts), saved as a PDF.
matplotlib.rcParams['font.size'] = 18

ax = plt.gca()  # current axes: the same target the pyplot-level calls draw on
ax.hist(df_adm['days_next_admit'], bins=60)
ax.set_ylabel('Count')
ax.set_xlabel('Number of days since previous ICU stay')
ax.set_yscale('log')
plt.tight_layout()
plt.savefig('readmission-histogram.pdf')

In [None]:
# Binary target: 'Y' when the next admission starts fewer than 30 days after
# discharge, otherwise 'N'. NaN gaps compare False and are therefore labelled 'N'.
df_adm['30d_unplan_readmit'] = np.where(df_adm['days_next_admit'] < 30, 'Y', 'N')

In [None]:
# Preview the first 50 rows, now including the 30-day readmission label.
df_adm.head(n=50)

In [None]:
# Bar chart of how many admissions fall in each readmission class, saved as a PDF.
# Labels and heights are both taken from one value_counts() Series. Pairing
# .unique() (order of first appearance) with .value_counts() (descending count)
# can attach the 'Y' label to the 'N' bar and vice versa.
readmit_label_counts = df_adm['30d_unplan_readmit'].value_counts()
plt.bar(readmit_label_counts.index, height=readmit_label_counts.values)
plt.ylabel('Number of ICU stays')
plt.xlabel('Previous stay < 30 days ago')
plt.tight_layout()
plt.savefig('30d-readmission-bar.pdf')

In [None]:
# Share of admissions labelled as a 30-day unplanned readmission.
is_readmit = df_adm['30d_unplan_readmit'] == 'Y'
readmit_count = df_adm.loc[is_readmit, 'subject_id'].count()
labelled_total = df_adm['30d_unplan_readmit'].count()
ratio = readmit_count / labelled_total
ratio

In [None]:
# Bar chart of admission counts per ethnicity value.
df_adm['ethnicity'].value_counts().plot.bar()

In [None]:
# Distinct ethnicity labels, in sorted order.
sorted(df_adm['ethnicity'].unique())

In [None]:
# Collapse the fine-grained ethnicity labels into five coarse groups:
# asian / white / black / hispanic, with everything else becoming "other".
asian = dict.fromkeys(['ASIAN','ASIAN - ASIAN INDIAN','ASIAN - CAMBODIAN','ASIAN - CHINESE','ASIAN - FILIPINO',
 'ASIAN - JAPANESE',
 'ASIAN - KOREAN',
 'ASIAN - OTHER',
 'ASIAN - THAI',
 'ASIAN - VIETNAMESE',
 'MIDDLE EASTERN'], 'asian')
white = dict.fromkeys([ 'WHITE', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN', 'WHITE - OTHER EUROPEAN', 'WHITE - RUSSIAN'], 'white')
black = dict.fromkeys([ 'BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN','BLACK/HAITIAN'], 'black')
hispanic = dict.fromkeys([ 'HISPANIC OR LATINO', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)', 'HISPANIC/LATINO - COLOMBIAN',
 'HISPANIC/LATINO - CUBAN',
 'HISPANIC/LATINO - DOMINICAN',
 'HISPANIC/LATINO - GUATEMALAN',
 'HISPANIC/LATINO - HONDURAN',
 'HISPANIC/LATINO - MEXICAN',
 'HISPANIC/LATINO - PUERTO RICAN',
 'HISPANIC/LATINO - SALVADORAN',
 'PORTUGUESE',
 'SOUTH AMERICAN'], 'hispanic')

# One lookup table; the four groups share no keys, so merge order is irrelevant.
ethnicity_groups = {**asian, **white, **black, **hispanic}

# Recode ONLY the ethnicity column, in a single pass. Calling DataFrame.replace on
# the whole frame would also rewrite any other column holding an exact match
# (e.g. a diagnosis equal to 'WHITE') and scans every column four times.
df_adm['ethnicity'] = df_adm['ethnicity'].replace(ethnicity_groups)

# Anything not mapped above (including missing values) is bucketed as "other".
allowed_vals = ['asian', 'black', 'white', 'hispanic']
df_adm.loc[~df_adm['ethnicity'].isin(allowed_vals), 'ethnicity'] = "other"
sorted(df_adm.ethnicity.unique())

In [None]:
# Bar chart of admission counts per ethnicity value.
df_adm['ethnicity'].value_counts().plot.bar()

In [None]:
# Load the spaCy pipeline 'en_core_sci_md' once, as a notebook-level global;
# tokenise_text below uses only its tokenizer.
# NOTE(review): presumably the scispaCy model, which has to be installed
# separately from spaCy itself — confirm in the environment setup.
nlp = spacy.load('en_core_sci_md')

def tokenise_text(text):
    """Lower-case ``text``, tokenise it and return the tokens separated by single spaces.

    Tokenisation uses the tokenizer of the notebook-level ``nlp`` pipeline.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The token texts joined by exactly one space, with no leading or
        trailing whitespace.
    """
    token_texts = (str(token) for token in nlp.tokenizer(text.lower()))
    spaced = ' '.join(token_texts)
    # split()/join collapses any whitespace runs, in case a token text is itself
    # whitespace.
    return ' '.join(spaced.split())

In [None]:
# Coerce every diagnosis value to str so that tokenise_text can call .lower() on it.
# NOTE(review): str() turns missing values into the literal text 'None' / 'nan',
# which then gets tokenised like a real diagnosis — confirm this is intended.
df_adm['diagnosis'] = df_adm['diagnosis'].map(str)
print(df_adm['diagnosis'])

In [None]:
# Replace each diagnosis with its lower-cased, space-separated token string.
df_adm['diagnosis'] = df_adm['diagnosis'].map(tokenise_text)
df_adm.head(n=5)

In [None]:
# Keep only the identifier, feature and label columns that are exported.
export_columns = [
    'subject_id',
    'hadm_id',
    'ethnicity',
    'diagnosis',
    'admission_type',
    '30d_unplan_readmit',
]
df_adm_csv = df_adm[export_columns]

In [None]:
# Write the selected columns to CSV without the index column. Create the target
# directory first: to_csv raises OSError when '../data' does not exist yet.
csv_path = '../data/df_adm.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df_adm_csv.to_csv(csv_path, index=False)