# VisICU

This file visualizes Intensive Care Unit (ICU) data for a single patient located in UCI, Machine Learning Repositoy in the link: https://archive.ics.uci.edu/ml/datasets/ICU

The script below imports the data, detects outliers using several techniques and outputs an html file that visualizes this data.


### License

Copyright (C) 2018 Jacob Barhak
 
This file is part of the VisICU . VisICU is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

VisICU is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the GNU General Public License for more details.



In [1]:
import pandas
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.models import DatetimeTickFormatter, HoverTool, Range1d, ColumnDataSource, Legend
from bokeh.models.glyphs import MultiLine
from bokeh.layouts import column , row
from bokeh.palettes import Viridis256

from bokeh.io import push_notebook, show, output_notebook

import numpy
import sklearn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern, ConstantKernel
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.neighbors import LocalOutlierFactor, kneighbors_graph
output_notebook()

In [2]:
def MyDateParse(InputDateStr):
    " Convert the hour to full date properly when passing midnight"
    HourParse = pandas.datetime.strptime(InputDateStr, '%H:%M:%S')
    
    # Hours after 22:00 are assumed to start on 1993-08-16 After midnight it is 1993-08-17
    if HourParse.hour >= 22:
        DateOnly = '1993-08-16 '
    else:
        DateOnly = '1993-08-17 '
    DateParsed = pandas.datetime.strptime( DateOnly + InputDateStr, '%Y-%m-%d %H:%M:%S')
    return DateParsed

MonitorData=pandas.pandas.read_csv('Monitor-Data', sep = '\t', names = ['Time','Code','Data'], parse_dates = ['Time'], date_parser = MyDateParse,  dtype={'Code': object})
MonitorData.describe(include='all')

Unnamed: 0,Time,Code,Data
count,7931,7931.0,7931.0
unique,2595,14.0,
top,1993-08-16 23:21:42,20.0,
freq,8,1985.0,
first,1993-08-16 23:21:42,,
last,1993-08-17 11:32:14,,
mean,,,105.720968
std,,,621.786325
min,,,-12.0
25%,,,62.0


In [3]:
MonitorDataCode=pandas.pandas.read_csv('Monitor-Data-Codes-Edited.txt', sep = '\t', names = ['Code','Name','Units'] ,  dtype={'Code': object})
MonitorDataCode


Unnamed: 0,Code,Name,Units
0,1,Ventilation Mode,"0: Pressure-controlled , 1: Hand bagging"
1,4,Mainframe alarms suspended,
2,7,Trace 1 ECG -> Heart rate,(bpm)
3,19,Arterial pressure - Mean,(mm Hg)
4,20,Arterial pressure - Systolic,(mm Hg)
5,21,Arterial pressure - Diastolic,(mm Hg)
6,22,Arterial Heart rate,(bpm)
7,59,Arterial O2 saturation,(%)
8,76,Volume fraction of inspired oxygen (FiO2),(%)
9,80,Ventilator Data - Respiration rate,(breaths/min)


In [4]:
LabData=pandas.pandas.read_fwf('Lab-Data',widths = [22,14,8], names = ['Time','Code','Data'], parse_dates = ['Time'] , dtype={'Code': object}, keep_default_na=False, na_values=['_'])
#remove white characters aroung code
LabData['Code'] = LabData['Code'].str.strip()
LabData.describe(include='all')

Unnamed: 0,Time,Code,Data
count,81,81,81.0
unique,8,38,
top,1993-08-17 04:31:00,PCO2ART,
freq,31,5,
first,1993-08-16 18:13:00,,
last,1993-08-17 12:41:00,,
mean,,,50.79642
std,,,54.889145
min,,,0.78
25%,,,11.3


In [5]:
LabDataCode=pandas.pandas.read_csv('Domain-Description-Edited.txt', sep = '\t', names = ['Code','Name'] , dtype={'Code': object}, na_filter = False)
# Note that NA is interpreted as NaN so we remove interpretation of NaN here
LabDataCode

Unnamed: 0,Code,Name
0,FIB,Plasma fibrinogen (normal: 170-340 mg/dl) A p...
1,PLT,"Platelet count (normal: > 150,000/mm3). Low v..."
2,PT,Prothrombin time (normal control values are i...
3,PT1,Prothrombin time (normal control values are i...
4,PT2,Prothrombin time (normal control values are i...
5,PTT,Partial thromboplastin time (same nomenclature...
6,PTT1,Partial thromboplastin time (same nomenclature...
7,PTT2,Partial thromboplastin time (same nomenclature...
8,ALB,Serum albumin (normal: 3.5-5.5 g/dl)
9,ALT,Alanine aminotransferase (SGPT) (normal: 10-40...


In [6]:
import re
File = open('Flowsheet-Data')
Lines= File.readlines()
File.close()
Pattern = '\t(?![0-9]+\:)| {2,}'
SplitLines = [re.split(Pattern,Line) for Line in Lines]
NumberOfColumns = [len(SplitLine) for SplitLine in SplitLines]
SplitLines


[['8/16/93\t22:00', 'Terbutaline (mcg/kg/min)', '0.8\n'],
 ['8/16/93\t22:00', 'Pavulon(mg/hr)', '1.1\n'],
 ['8/16/93\t22:00', 'Versed (mg/hr)', '1.2\n'],
 ['8/16/93\t22:00', 'Fluid Out (cc/hr)', '12\n'],
 ['8/16/93\t22:00', 'Dopamine(mg/kg/min)', '12.5\n'],
 ['8/16/93\t22:00', 'Fluid In (cc)', '15.9\n'],
 ['8/16/93\t22:00', 'Fentanyl (mcg/hr)', '60\n'],
 ['8/16/93\t22:00', 'Temp', '38.3 C\n'],
 ['8/16/93\t22:15', 'Dopamine(mg/kg/min)', '15\n'],
 ['8/16/93\t23:00', 'Fluid Out (cc/hr)', '11\n'],
 ['8/17/93\t1:00', 'Dopamine(mg/kg/min)', '10\n'],
 ['8/17/93\t1:00', 'Fluid Out (cc/hr)', '16\n'],
 ['8/17/93\t1:30', 'Fentanyl (mcg/hr)', '70\n'],
 ['8/17/93\t2:00', 'Fluid Out (cc/hr)', '25\n'],
 ['8/17/93\t3:00', 'Fluid Out (cc/hr)', '17\n'],
 ['8/17/93\t4:00', 'Dopamine(mg/kg/min)', '15\n'],
 ['8/17/93\t4:00', 'Fluid Out (cc/hr)', '16\n'],
 ['8/17/93\t4:30', 'Dopamine(mg/kg/min)', '12.5\n'],
 ['8/17/93\t5:00', 'Fluid Out (cc/hr)', '22\n'],
 ['8/17/93\t5:15', 'Fluid In (cc)', '16.1\n'],
 ['8/

In [7]:
# The seperator is funny, either tab not followed by Number and colons or more than two spaces 
# also in one instance C temperature units were left in the file so ' C' was considered 
# white space - there was no other occurance in the file so it should do the trick
# This file was obviously prepared by hand and therefore problematic, this seperator should
# not be reused elsewhere
FlowsheetData=pandas.pandas.read_csv('Flowsheet-Data',engine='python', sep = '\t(?![0-9]+\:)| {2,}', names = ['Time','Name','Data'], parse_dates = ['Time'] , comment =' C')
FlowsheetData

Unnamed: 0,Time,Name,Data
0,1993-08-16 22:00:00,Terbutaline (mcg/kg/min),0.8
1,1993-08-16 22:00:00,Pavulon(mg/hr),1.1
2,1993-08-16 22:00:00,Versed (mg/hr),1.2
3,1993-08-16 22:00:00,Fluid Out (cc/hr),12.0
4,1993-08-16 22:00:00,Dopamine(mg/kg/min),12.5
5,1993-08-16 22:00:00,Fluid In (cc),15.9
6,1993-08-16 22:00:00,Fentanyl (mcg/hr),60.0
7,1993-08-16 22:00:00,Temp,38.3
8,1993-08-16 22:15:00,Dopamine(mg/kg/min),15.0
9,1993-08-16 23:00:00,Fluid Out (cc/hr),11.0


In [8]:
Merged = pandas.concat([MonitorData, LabData, FlowsheetData])
Merged.fillna('',inplace = True)
Merged

Unnamed: 0,Code,Data,Name,Time
0,21,52.0,,1993-08-16 23:21:42
1,1,0.0,,1993-08-16 23:21:42
2,76,50.0,,1993-08-16 23:21:42
3,85,9.0,,1993-08-16 23:21:42
4,80,16.0,,1993-08-16 23:21:42
5,84,18.0,,1993-08-16 23:21:42
6,83,43.0,,1993-08-16 23:21:42
7,81,72.0,,1993-08-16 23:21:42
8,19,66.0,,1993-08-16 23:22:02
9,20,93.0,,1993-08-16 23:22:02


In [9]:
MergedCode = pandas.merge(MonitorDataCode, LabDataCode, how ='outer', on = ['Code','Name'])
MergedCode.fillna('', inplace = True)
Merged

Unnamed: 0,Code,Data,Name,Time
0,21,52.0,,1993-08-16 23:21:42
1,1,0.0,,1993-08-16 23:21:42
2,76,50.0,,1993-08-16 23:21:42
3,85,9.0,,1993-08-16 23:21:42
4,80,16.0,,1993-08-16 23:21:42
5,84,18.0,,1993-08-16 23:21:42
6,83,43.0,,1993-08-16 23:21:42
7,81,72.0,,1993-08-16 23:21:42
8,19,66.0,,1993-08-16 23:22:02
9,20,93.0,,1993-08-16 23:22:02


In [10]:
ProcessedData = Merged.merge(MergedCode, on='Code', how = 'left',  suffixes =['Left','Right'])
ProcessedData['Units'].fillna('', inplace = True)
ProcessedData['NameLeft'].fillna('', inplace = True)
ProcessedData['NameRight'].fillna('', inplace = True)
ProcessedData['Name']=ProcessedData['NameLeft']+ProcessedData['NameRight']
ProcessedData.drop(['NameLeft','NameRight'],1, inplace = True)
ProcessedData.set_index('Time')
def CalculateSourceAndCleanData (Code,Units,Value):
    "Deduce Lab/Flowsheet/Monitor"
    # Monitor has both unit and Code
    # Lab has no unit
    # Flowsheet has no code
    if Code == '':
        RetSource = 'FlowSheet'
    elif Units == '':
        RetSource = 'Lab'
    else:
        RetSource = 'Monitor'
    if Value == 32000:
        CleanVal = None
    else:
        CleanVal= Value
    return RetSource,CleanVal

ProcessedData['Source'] = ProcessedData.apply (lambda Row: CalculateSourceAndCleanData(Row['Code'],Row['Units'],Row['Data'])[0] , axis=1)
ProcessedData['Data'] = ProcessedData.apply (lambda Row: CalculateSourceAndCleanData(Row['Code'],Row['Units'],Row['Data'])[1] , axis=1)
# remove null value points that were removed before at initial filtering and make is a copy
Data = ProcessedData.loc[ProcessedData['Data'].notnull()].copy()
Data


Unnamed: 0,Code,Data,Time,Units,Name,Source
0,21,52.0,1993-08-16 23:21:42,(mm Hg),Arterial pressure - Diastolic,Monitor
1,1,0.0,1993-08-16 23:21:42,"0: Pressure-controlled , 1: Hand bagging",Ventilation Mode,Monitor
2,76,50.0,1993-08-16 23:21:42,(%),Volume fraction of inspired oxygen (FiO2),Monitor
3,85,9.0,1993-08-16 23:21:42,(cm H2O),Ventilator Data - PEEP,Monitor
4,80,16.0,1993-08-16 23:21:42,(breaths/min),Ventilator Data - Respiration rate,Monitor
5,84,18.0,1993-08-16 23:21:42,(cm H2O),Ventilator Data - Mean Airway Pressure,Monitor
6,83,43.0,1993-08-16 23:21:42,(cm H2O),Ventilator Data - PIP,Monitor
7,81,72.0,1993-08-16 23:21:42,(cc/breath),Ventilator Data - Tidal volume,Monitor
8,19,66.0,1993-08-16 23:22:02,(mm Hg),Arterial pressure - Mean,Monitor
9,20,93.0,1993-08-16 23:22:02,(mm Hg),Arterial pressure - Systolic,Monitor


In [11]:
for (ColumnName,ResetValue) in [('GroupCount' , None),
                                ('Min' , None),
                                ('Max' , None),
                                ('PlotNumber' , 0),
                                ('DataFiltered' , None),
                                ('DataFiltered1' , None),
                                ('DataFiltered2' , None),
                                ('DataFiltered3' , None),
                                ('DataFiltered4' , None),
                                ('DataFiltered5' , None),
                                ('DataFiltered6' , None),
                                ('DataFiltered7' , None),
                                ('Votes' , 0),
                                ('Predicted' , None),
                                ('Sigma',  None),
                               ]:
        Data.insert(loc = len(Data.columns), column= ColumnName, value = ResetValue)

In [12]:
def FilterData(Times, Values, Method, Threshold):
    "Filter the Values over Time using the threshold using different techniques"
    Predicted, Auxilary, FilteredData =   None, None , None
    DataX=numpy.array(Times).reshape(-1, 1)
    DataY=numpy.array(Values).reshape(-1, 1)
    if Method in ['GP','GaussianProcess']:
        # filter using Gaussian Process
        Kernel = ConstantKernel() + Matern(length_scale=2, nu=3/2) + WhiteKernel(noise_level=1)        
        GaussianProcessObject = GaussianProcessRegressor(kernel = Kernel, alpha=1e-2, normalize_y = True,random_state=1)
        GaussianProcessObject.fit(DataX,DataY)
        (Predicted, Sigma) = GaussianProcessObject.predict(DataX, return_std=True)
        IndicesToFilter = numpy.abs(DataY - Predicted) > Sigma.reshape(-1, 1)*Threshold
        Auxilary = Sigma
        print 'GP'        
    elif Method in ['SVM','SupportVectorMachine']:
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        SupportVectorMachine = OneClassSVM(nu=Threshold)
        SupportVectorMachine.fit(NormDataY)
        Predictions = SupportVectorMachine.predict(NormDataY)
        IndicesToFilter = Predictions < 0
        print 'SVM', Threshold
    elif Method in ['DBSCAN']:
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        Epsilon, ProportionThreshold = Threshold
        Stop = 100
        NewPredictions = None
        while True:
            DBScanObject = DBSCAN(eps = Epsilon)
            Predictions = DBScanObject.fit_predict(DataXY)
            ClassSet = set(Predictions)
            Classes = list(ClassSet)
            NumberOfClasses = len(Classes)
            ClassProportions = [ float(sum(Predictions == Class))/len(Predictions) for Class in Classes]
            NonRelevantClasses = [Class for (ClassEnum,Class) in enumerate(Classes) if ClassProportions[ClassEnum] < ProportionThreshold]
            IndicesToFilter = numpy.isin(Predictions, numpy.array(NonRelevantClasses))
            print 'DBSCAN', NumberOfClasses, Epsilon, ClassProportions, NonRelevantClasses 
            if NumberOfClasses <= 3:
                # the first time 3 or less classes are predicted exit the loop
                break
            else:
                # otherwise loop with larger epsilon
                Epsilon = Epsilon*2
    elif Method in ['AGG', 'AgglomerativeClustering']:
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        NumberOfClusterToFind, AffinityToUse, LinkageToUse, ProportionThreshold = Threshold
        AGGObject = AgglomerativeClustering(n_clusters = NumberOfClusterToFind, affinity = AffinityToUse, linkage = LinkageToUse)
        Predictions = AGGObject.fit_predict(DataXY)
        ClassSet = set(Predictions)
        Classes = list(ClassSet)
        NumberOfClasses = len(Classes)
        ClassProportions = [ float(sum(Predictions == Class))/len(Predictions) for Class in Classes]
        NonRelevantClasses = [Class for (ClassEnum,Class) in enumerate(Classes) if ClassProportions[ClassEnum] < ProportionThreshold]
        print 'AGG', NumberOfClasses, AffinityToUse, LinkageToUse, ClassProportions, NonRelevantClasses 
        IndicesToFilter = numpy.isin(Predictions, numpy.array(NonRelevantClasses))
        
    elif Method in ['LOF', 'LocalOutlierFactor']:
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        NumberOfNeighbors, MetricToUse, ProportionThreshold = Threshold
        LOFObject = sklearn.neighbors.LocalOutlierFactor(n_neighbors=NumberOfNeighbors, metric=MetricToUse, contamination=ProportionThreshold)
        Predictions = LOFObject.fit_predict(DataXY)
        IndicesToFilter = numpy.isin(Predictions, numpy.array([-1]))
        print 'LOF', Threshold

    elif Method in ['KNN', 'Kneighbours']:
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        NumberOfNeighborsRequested, MetricToUse, ThresholdSTD = Threshold
        if NumberOfNeighborsRequested>=len(NormDataY):
            NumberOfNeighbors = len(NormDataY)//2
            print "Using only " + str(NumberOfNeighbors) + " points for neighbourhood which is half the total numebr of points"  
        else:
            NumberOfNeighbors = NumberOfNeighborsRequested
        NeighboursDistance = kneighbors_graph(X = DataXY, n_neighbors=NumberOfNeighbors, metric=MetricToUse, mode='distance')
        Neighbours = NeighboursDistance.toarray()
        # sum over each line knowing that there are the same numebr of elements there ignoring zeros - similar to mean
        Metric = numpy.apply_along_axis(sum, 1, Neighbours)
        MeanOfMetric = numpy.mean(Metric)
        StdOfMetric = numpy.std(Metric)
        print 'KNN', MeanOfMetric, StdOfMetric
        IndicesToFilter = numpy.abs(Metric-MeanOfMetric)  > StdOfMetric * ThresholdSTD
        Auxilary = Metric

    elif Method in ['WIN', 'LookBackWindows']:
        # Just look back at M neighbor samples and figure if this is above statistical threshold
        # Note that the first M samples are untouched this method
        NormDataY = (DataY-numpy.mean(DataY) )/ max(numpy.abs(DataY))
        DataXY = numpy.concatenate((DataX,NormDataY),axis=1)
        NumberOfNeighborsRequested, ThresholdSTD = Threshold
        if NumberOfNeighborsRequested>=len(NormDataY):
            NumberOfNeighbors = len(NormDataY)//2
            print "Using only " + str(NumberOfNeighbors) + " points for backwards neighbourhood which is half the total numebr of points"  
        else:
            NumberOfNeighbors = NumberOfNeighborsRequested
        WindowMean = numpy.copy(NormDataY).reshape(-1, 1)
        WindowSTD = numpy.zeros_like(NormDataY)
        for Location in range(NumberOfNeighbors+1,len(NormDataY)):
            # note that point is not included in the window - we look backwards
            WindowValues = NormDataY[(Location-NumberOfNeighbors-1):Location-1]
            # this mesn that the first few values before loaction reman the same in the mean vector
            WindowMean[Location] = numpy.mean(WindowValues)
            WindowSTD[Location] = numpy.std(WindowValues)
        IndicesToFilter = numpy.abs(NormDataY-WindowMean)  > WindowSTD * ThresholdSTD
        Auxilary = WindowSTD
        Predicted = WindowMean
        print 'WIN', Threshold
        
    else:
        raise ValueError, 'Interpolation method unknown'
    FilteredData = DataY.copy()
    FilteredData[IndicesToFilter] = None
    if Auxilary is None:
        Auxilary = [None]*len(FilteredData)
    if Predicted is None:
        Predicted = [None]*len(FilteredData)
    return Predicted, Auxilary, FilteredData

PlotGroups = Data.groupby(['Code','Name','Source'])

PlotsWithManyPoints = 0

TheGroups = PlotGroups.groups
for (Group,GroupIndexList) in TheGroups.items():
    GroupCount = len(GroupIndexList)

    Values = Data.loc[GroupIndexList,'Data'].astype(numpy.float)
    print 'Processing group: '+ str(Group) + ', n=%i' % Values.size
    Times = Data.loc[GroupIndexList,'Time']
    TimeSpans = Times-Times.min()
    TimesDiff=TimeSpans.astype(numpy.timedelta64)
    NormalizedTimes = TimesDiff / TimesDiff.max()    
    MinVal = min(Values)
    MaxVal = max(Values)
    Data.loc[GroupIndexList,'GroupCount']=GroupCount        
    Data.loc[GroupIndexList,'Min']=MinVal
    Data.loc[GroupIndexList,'Max']=MaxVal

    if GroupCount >= 10:
        (Predicted, Sigma, FilteredData1) = FilterData(NormalizedTimes, Values, 'GP', 2)
        (PredictedDummy, SigmaDummy, FilteredData2) = FilterData(NormalizedTimes, Values, 'SVM', 0.01)
        (PredictedDummy, SigmaDummy, FilteredData3) = FilterData(NormalizedTimes, Values, 'DBSCAN', (0.02,0.1))
        (PredictedDummy, SigmaDummy, FilteredData4) = FilterData(NormalizedTimes, Values, 'AGG', ( 3, 'manhattan', 'average' ,0.1))
        (PredictedDummy, SigmaDummy, FilteredData5) = FilterData(NormalizedTimes, Values, 'LOF', ( 10, 'manhattan', 0.02))
        (PredictedDummy, SigmaDummy, FilteredData6) = FilterData(NormalizedTimes, Values, 'KNN', ( 10, 'manhattan', 3))
        (PredictedDummy, SigmaDummy, FilteredData7) = FilterData(NormalizedTimes, Values, 'WIN', ( 10, 6))
        
        PlotsWithManyPoints = PlotsWithManyPoints + 1
        Data.loc[GroupIndexList,'PlotNumber'] = PlotsWithManyPoints
        Data.loc[GroupIndexList,'DataFiltered1'] = FilteredData1.tolist()
        Data.loc[GroupIndexList,'DataFiltered2'] = FilteredData2.tolist()
        Data.loc[GroupIndexList,'DataFiltered3'] = FilteredData3.tolist()
        Data.loc[GroupIndexList,'DataFiltered4'] = FilteredData4.tolist()
        Data.loc[GroupIndexList,'DataFiltered5'] = FilteredData5.tolist()
        Data.loc[GroupIndexList,'DataFiltered6'] = FilteredData6.tolist()
        Data.loc[GroupIndexList,'DataFiltered7'] = FilteredData7.tolist()
        
        FilterMatrix = numpy.column_stack( (FilteredData1 , FilteredData2  , FilteredData3 , FilteredData4 , FilteredData5 , FilteredData6 , FilteredData7) )
        Data.loc[GroupIndexList,'Votes'] = numpy.sum(numpy.isnan(FilterMatrix), axis = 1).tolist()

        Data.loc[GroupIndexList,'Predicted'] = Predicted.tolist()
        Data.loc[GroupIndexList,'Sigma'] = Sigma.tolist()

        

# Now handle smaller groups by splitting into sources
SecondaryPlotGroups = (Data.loc[Data.loc[:,'PlotNumber'] == 0].groupby(['Source']))

for (Group,GroupIndexList) in SecondaryPlotGroups.groups.items():
    PlotsWithManyPoints = PlotsWithManyPoints + 1
    Data.loc[GroupIndexList,'PlotNumber'] = PlotsWithManyPoints
    print Group



Processing group: ('PT2', "Prothrombin time (normal control values are  indicated by PT; patient's values are listed as  PT1 or PT2; in seconds).  Elevated values indicate  coagulation defects.", 'Lab'), n=2
Processing group: ('21', 'Arterial pressure - Diastolic ', 'Monitor'), n=1701
GP
SVM 0.01
DBSCAN 2 0.02 [0.9835390946502057, 0.01646090534979424] [-1]
AGG 3 manhattan average [0.9911816578483245, 0.0029394473838918285, 0.005878894767783657] [1, 2]
LOF (10, 'manhattan', 0.02)
KNN 0.0986886070184 0.220213077759
WIN (10, 6)
Processing group: ('7', 'Trace 1 ECG -> Heart rate ', 'Monitor'), n=627
GP
SVM 0.01
DBSCAN 5 0.02 [0.7719298245614035, 0.04784688995215311, 0.049441786283891544, 0.10845295055821372, 0.022328548644338118] [1, 2, -1]
DBSCAN 3 0.04 [0.8724082934609251, 0.11164274322169059, 0.01594896331738437] [-1]
AGG 3 manhattan average [0.529505582137161, 0.004784688995215311, 0.46570972886762363] [1]
LOF (10, 'manhattan', 0.02)
KNN 0.0882754007493 0.212201126455
WIN (10, 6)
Proce

In [13]:
def CalculateColorsByStrings(InputStrings, Method):
    "Calculates color by applying function on string"
    NumericValues = []
    for InputString in InputStrings:
        if Method == 'Value':
            NumericValues.append(float(InputString))
        else:
            Ordinal = [ord(Char) for Char in InputString]
            Sum = sum(Ordinal)
            if Method == 'Sum':
                NumericValues.append(Sum)
            elif Method == 'Avg':
                NumericValues.append(float(Sum)/len(InputString))
            else:
                raise ValueError, "Method unknown"
    # now figure out the first color
    Min = min(NumericValues)
    Max = max(NumericValues)
    # Scale the numeric Value to the closest value
    Range = Max-Min
    # If there is no difference, there are no colors issues
    if Range==0:
        Range=1
    ScaledNumericValues = [ int((Entry-Min)*255.0/Range) for Entry in NumericValues]
    ColorsToReturn = [ Viridis256[Entry] for Entry in ScaledNumericValues]
    return ColorsToReturn

def PopulatePlot(Plot,RelevantData, PlotType):
    if PlotType == 'Raw':
        Plot.circle('Time' ,'Data', source = RelevantData, line_color='LineColor', fill_color='FillColor', line_width = 3, size = 8  )
    if PlotType == 'Clean':
        Plot.circle('Time' ,'Data', source = RelevantData, line_color='VoteColor', fill_color='FillColor', line_width = 3, size = 3  )
    if PlotType == 'Processing':
        LegendItem1 = Plot.circle('Time' ,'Data', source = RelevantData, color='red' , size = 5, legend = dict(value='Original')  )
        LegendItem2 = Plot.circle('Time' ,'DataFiltered1', source = RelevantData, color='blue' , size = 5, legend = dict(value='Filtered GP') )
        LegendItem3 = Plot.circle('Time' ,'DataFiltered2', source = RelevantData, color='green' , size = 5, legend = dict(value='Filtered SVM') )
        LegendItem4 = Plot.circle('Time' ,'DataFiltered3', source = RelevantData, color='cyan' , size = 5, legend = dict(value='Filtered DBSCAN') )
        LegendItem5 = Plot.circle('Time' ,'DataFiltered4', source = RelevantData, color='purple' , size = 5, legend = dict(value='Filtered AGG') )
        LegendItem6 = Plot.circle('Time' ,'DataFiltered5', source = RelevantData, color='pink' , size = 5, legend = dict(value='Filtered LOF') )
        LegendItem7 = Plot.circle('Time' ,'DataFiltered6', source = RelevantData, color='yellow' , size = 5, legend = dict(value='Filtered KNN') )
        LegendItem8 = Plot.circle('Time' ,'DataFiltered7', source = RelevantData, color='lightgreen' , size = 5, legend = dict(value='Filtered WIN') )
        LegendItem10 = Plot.circle('Time' ,'Predicted', source = RelevantData, color='black' , size = 2 , legend = dict(value='GP Prediction') )


        ErrorBarX = [ [TimeEntry,TimeEntry] for TimeEntry in RelevantData['Time'] ]
        Predicted = list(RelevantData['Predicted'])
        Sigma = list(RelevantData['Sigma'])
        ErrorBarY = [ [float(PredictedEntry-Sigma[Enum]),float(PredictedEntry+Sigma[Enum]) ] for (Enum,PredictedEntry) in enumerate(Predicted)]
        LegendItem11 = Plot.multi_line(xs = ErrorBarX, ys = ErrorBarY, line_width=0.25, line_color = 'grey', legend = dict(value='GP STD Span') )

        
        Plot.legend.location = "bottom_left"
        Plot.legend.click_policy="hide"
    return Plot


StartTime = Data['Time'].min() 
EndTime = Data['Time'].max() 

HalfTimeSpan = (EndTime - StartTime)/2 
MidTime = StartTime + HalfTimeSpan
PlotStartTime = MidTime-HalfTimeSpan*1.1
PlotEndTime = MidTime+HalfTimeSpan*1.1

TimeRange = Range1d(PlotStartTime , PlotEndTime)

PlotArray = []
for PlotNumber in range(1,PlotsWithManyPoints+1):
    # Extract data
    RelevantData = (Data.loc[Data['PlotNumber']==PlotNumber]).copy()
    TitleX = 'Time'
    if PlotNumber > PlotsWithManyPoints-3:
        Title = 'Other ' + RelevantData['Source'].iloc[0]
        TitleY = 'Other ' + RelevantData['Source'].iloc[0] 
    else:
        Title = RelevantData['Source'].iloc[0] + ' : ' + RelevantData['Name'].iloc[0]
        TitleY = RelevantData['Name'].iloc[0] + ' ' + RelevantData['Units'].iloc[0]
    RelevantData['FillColor'] = CalculateColorsByStrings(list(RelevantData['Name']),'Sum')
    RelevantData['LineColor'] = CalculateColorsByStrings(list(RelevantData['Name']),'Avg')
    RelevantData['VoteColor'] = CalculateColorsByStrings(list(RelevantData['Votes']),'Value')

        
    MyHover1 = HoverTool(
        tooltips=[
            ( 'Measure', '@Name{%.50s}' ),
            ( 'Value', '@Data' ),  
            ( 'Units', '@Units{%s}' ),
            ( 'Votes', '@Votes' ),
            ( 'Time', '@Time{%T}' ),
        ],
        formatters={
            'Name' : 'printf',   
            'Time' : 'datetime', 
            'Data' : 'numeral',   
            'Votes' : 'numeral',   
            'Units' : 'printf',   
        },
    )    
    
    Plot1 = figure(title = Title, x_axis_label = TitleX, y_axis_label = TitleY, tools=[MyHover1,'save'], x_range= TimeRange)
    Plot1.xaxis.formatter = DatetimeTickFormatter()
    if RelevantData.iloc[0]['Sigma']!=None:
        PopulatePlot(Plot1, RelevantData, 'Clean')
        Plot2 = figure(title = 'Processing of: ' + Title, x_axis_label = TitleX, y_axis_label = TitleY, tools='save , pan, box_zoom, reset', x_range= TimeRange )
        Plot2.xaxis.formatter = DatetimeTickFormatter()
        PopulatePlot(Plot2, RelevantData, 'Processing')
        RowPlots = row([Plot1,Plot2]) 
    else:
        PopulatePlot(Plot1, RelevantData, 'Raw')
        RowPlots = row([Plot1]) 
    PlotArray.append(RowPlots)

ColumnPlots = column(PlotArray)    
    
Html = file_html(ColumnPlots, CDN, 'Visualizing Intensive Care Unit Data')
BodyTag = '<body>'
BodyStart = Html.index(BodyTag) + len(BodyTag)
TextToAdd = ''
TextToAdd += '<p>'
TextToAdd += '<b>Visualizing Intensive Care Unit Data</b><br>'
TextToAdd += 'The interactive plots below visualize the Intensive Care Unit data of a single patient from <a href="https://archive.ics.uci.edu/ml/datasets/ICU"> UCI Machine Learning Repository</a>.'
TextToAdd += 'The data is shown along a similar time line and provides visual cues for some measurements with many points by applying various filtering techniques. Details about this visualization can be found in the MODSIM paper titled: "Visualization and Pre-Processing of Intensive Care Unit Data Using Python Data Science Tools". '
TextToAdd += 'The code that generated these plots can be found  <a href="https://github.com/Jacob-Barhak/VisICU"> here</a>. '
TextToAdd += '</p>'
RevisedHTML = Html[:BodyStart] + TextToAdd + Html[BodyStart:]

OutFileName = 'VisualICU.html'
OutFile = open(OutFileName,'w')
OutFile.write(RevisedHTML)
OutFile.close()

# The line below was commented since some hardware may not be abe to present the plots in the notebook
# Uncommetn the line to attempt to display the ICU data visualization in the notebook
#show(ColumnPlots)

print "Done! - Please use a web browser to open the file " + OutFileName


Done! - Please use a web browser to open the file VisualICU.html
