<a href="https://colab.research.google.com/github/MedleyHealth/TypeAssist/blob/master/TypeAssist_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Important: Do not save the output from code cells in this notebook to GitHub (or any other public location). Access to the dataset is restricted and we must not leak any information about individual samples. If you have any doubts about what this means, message me first before committing.**

# Data Loading

### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Mount Google Drive where dataset is saved

In [None]:
# Mount Google Drive at /content/drive so the dataset path used when loading
# the CSV resolves inside this runtime.
from google.colab import drive
drive.mount('/content/drive')

### Load dataset from path in Google Drive (change path to your location)

In [None]:
# Location of the notes CSV on the mounted Drive; edit to match your own layout.
path = '/content/drive/My Drive/4 Archive/MIMIC/NOTEEVENTS.csv'

# Read the whole CSV into the notebook-level DataFrame used by later cells.
df = pd.read_csv(path)
# Preview the first five rows (do not commit this output).
df[:5]

# Exploratory Data Analysis

### Check the shape of the data

In [None]:
# df.shape is (rows, columns); unpack it once and report each dimension.
n_notes, n_columns = df.shape
print('Number of Notes    ', n_notes)
print('Number of Columns  ', n_columns)

### See how many null values are in each column

In [None]:
# Per-column count of missing entries: isnull() flags each cell, sum() totals each column.
df.isnull().sum()

### Explore the distribution of values in a column

In [None]:
def examine_col_values(col_name, data=None, max_rows=21):
  """Print the distinct values of a column with their counts, most common first.

  Args:
    col_name: Name of the column to summarise.
    data: DataFrame to read the column from. Defaults to the notebook-level
      ``df`` so existing ``examine_col_values('COL')`` calls keep working.
    max_rows: Maximum number of value rows to print before the output is
      truncated.

  Returns:
    None. The summary is written to stdout.
  """
  frame = df if data is None else data

  # value_counts copes with missing values and mixed types, whereas np.unique
  # raises TypeError on object columns that contain NaN next to strings.
  # dropna=False reports missing values as one row instead of hiding them.
  # The result is already ordered from most to least common.
  counts = frame[col_name].value_counts(dropna=False)

  print('Number of Values:', len(counts))

  # Width of the longest printed value, used to align the COUNT column.
  # default=0 keeps an empty column from raising ValueError.
  max_length = max((len(str(value)) for value in counts.index), default=0)
  print('Max Length:', max_length, '\n\n')

  width = max_length + 10
  print(str(col_name).ljust(width), 'COUNT\n')

  for value, count in counts.head(max_rows).items():
    print(str(value).ljust(width), count)

  # Only announce truncation when rows were actually left out.
  if len(counts) > max_rows:
    print('\n*** RESULTS TRUNCATED FOR BREVITY ***')

### CATEGORY column refers to the type of note

In [None]:
# Print the value/count table for CATEGORY (output is cut off after 21 rows).
examine_col_values('CATEGORY')

### DESCRIPTION column refers to a finer type of note

In [None]:
# Print the value/count table for DESCRIPTION (output is cut off after 21 rows).
examine_col_values('DESCRIPTION')

### SUBJECT_ID column refers to a specific patient

In [None]:
# Print the most frequent SUBJECT_ID values and their note counts.
# NOTE: this prints identifiers taken from the dataset; clear the cell output before committing.
examine_col_values('SUBJECT_ID')

### HADM_ID refers to a specific hospital admission

In [None]:
# Print the most frequent HADM_ID values and their note counts.
# NOTE: this prints identifiers taken from the dataset; clear the cell output before committing.
examine_col_values('HADM_ID')

### Change "i = ..." several times to see what actual notes look like

In [None]:
# Positional index of the note to display; edit and re-run to browse.
i = 0

# Fetch the row once, then show its category followed by the full note text.
selected_note = df.iloc[i]
print('CATEGORY:', selected_note['CATEGORY'], '\n\n')
print(selected_note['TEXT'])

### Calculate the number of characters in each note

In [None]:
# Character count of every note, in the same order as the rows of df.
note_lengths = list(map(len, df['TEXT']))

### Plot the entire distribution of note lengths

In [None]:
# Histogram of per-note character counts over the full range, using 1000 bins.
# density=False keeps raw note counts on the y-axis.
plt.hist(note_lengths, density=False, bins=1000)
plt.ylabel('Count')
plt.xlabel('Note Length')

### Zoom into the area with a higher concentration of notes (ignore the long tail)

In [None]:
# Same histogram with 10000 bins. The bins still span every note length;
# the axis limits below only crop the view, they do not filter the data.
plt.hist(note_lengths, density=False, bins=10000)
plt.ylabel('Count')
plt.xlabel('Note Length')
# Show the -3000..12000 length window and cap the y-axis at 12000 notes.
plt.xlim((-3000, 12000))
plt.ylim(0, 12000)

### Zoom into the area of notes with fewer than 100 characters (for quick model training)

In [None]:
# Same 10000-bin histogram over all note lengths; only the visible window changes.
plt.hist(note_lengths, density=False, bins=10000)
plt.ylabel('Count')
plt.xlabel('Note Length')
# Show the -10..110 length window and cap the y-axis at 12000 notes.
plt.xlim((-10, 110))
plt.ylim(0, 12000)

### Get a list of all unique characters for data preprocessing (we want to remove special characters)

In [None]:
# One set of distinct characters per note.
note_chars = [set(text) for text in df['TEXT']]

# Merge the per-note sets with a set comprehension so duplicates are dropped
# as they are seen, instead of first building a list of every character.
unique_chars = {char for chars in note_chars for char in chars}

# Display the combined character set.
unique_chars