In [2]:
import yaml
import json
import pandas as pd
from functools import reduce
import numpy as np

def get_config():
    """Read ``config.yaml`` from the working directory and return its parsed content.

    The file is parsed with ``yaml.safe_load``; whatever object that call
    produces is returned unchanged.
    """
    with open("config.yaml", 'r') as config_file:
        return yaml.safe_load(config_file)

# Resolve the dataset location from the notebook configuration.
config = get_config()
filepath = config['datapath_as2']
if not filepath.endswith('.json'):
    # ValueError is still an Exception, but now says what went wrong.
    raise ValueError("Expected a path to a .json file, got: {!r}".format(filepath))

# Context manager guarantees the file handle is closed once the JSON is parsed.
with open(filepath) as f:
    data = json.load(f)

# NOTE(review): the parsed object is passed straight to pd.read_json, which
# presumably means the file stores a JSON-encoded string of records -- confirm
# against the raw file.
df = pd.read_json(data, orient='records')
origin_df = df.copy()  # independent copy of the frame as loaded, used by later cells
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          136 non-null    float64
 1   time        136 non-null    object 
 2   recordtype  136 non-null    int64  
 3   glucose     54 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 4.4+ KB


In [3]:
# Convert glucose to a nullable integer column: values that cannot be parsed
# as numbers become <NA>, the rest are floored. The dtype predicate makes the
# guard effective for both plain and nullable integer columns, so re-running
# the cell does not redo the conversion.
if not pd.api.types.is_integer_dtype(df['glucose']):
    df['glucose'] = np.floor(pd.to_numeric(df['glucose'], errors='coerce')).astype('Int64')

# Parse timestamps unless the column already holds datetimes; unparseable
# entries become NaT.
if not pd.api.types.is_datetime64_any_dtype(df['time']):
    df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Order rows chronologically and rebuild a 0..n-1 index; `time` ends up as the
# first column.
df = df.set_index("time").sort_index().reset_index("time")

df

Unnamed: 0,time,ID,recordtype,glucose
0,2019-04-25 00:08:00,2.845000e+03,1,109
1,2019-04-25 00:14:00,1.614305e+19,0,
2,2019-04-25 00:29:00,1.614305e+19,0,
3,2019-04-25 00:44:00,1.614305e+19,0,
4,2019-04-25 00:50:00,2.850000e+03,1,
...,...,...,...,...
131,2019-04-25 23:02:00,1.614305e+19,0,
132,2019-04-25 23:18:00,1.614305e+19,0,
133,2019-04-25 23:31:00,3.062000e+03,1,111
134,2019-04-25 23:33:00,1.614305e+19,0,


In [4]:
# 56 unique IDs out of 136
# NOTE(review): ID was parsed as float64 and some values are around 1.6e19,
# which is beyond the range where float64 represents integers exactly, so
# distinct raw IDs may compare equal here -- TODO confirm against the raw file.
print(len(df['ID'].unique().tolist()), "unique patient ids out of", df['ID'].shape[0])

# let's see which patients have multiple rows
unique_ids = df['ID'].value_counts()
repeated_ids = unique_ids[unique_ids > 1]
res = repeated_ids.index.tolist()
print("Patients with multiple records:")
# value_counts() already holds the per-ID row count, so no re-filtering of df is needed
for idPat, n_records in repeated_ids.items():
    print("Patient", idPat, "got", n_records, "records")
print("\n")
# share of missing glucose values, reported as a fraction of all rows
print("Percentage of missing glucose values:", 1-df['glucose'].count()/float(df['glucose'].shape[0]))

recordTypeValues = df['recordtype'].unique().tolist()
print(len(recordTypeValues), "different types of records:", recordTypeValues)

# check time range: df was sorted by time, so the first and last rows bound it.
# Positional access does not depend on what the index labels happen to be.
timeStart = df['time'].iloc[0]
timeEnd = df['time'].iloc[-1]
print("Time range from", timeStart, "to", timeEnd, ":", timeEnd-timeStart)

# check for a specific pattern between glucose value and recordtype
# there is a pattern - for defined glucose values and empty values (whitespace) recordtype equals 1
# otherwise (NaN) - recordtype equals 0
# but for us NaN and empty values are both undefined values
origin_df = origin_df.fillna(np.nan)
# `time` is moved into the index below; the membership test keeps the cell re-runnable.
if 'time' in origin_df.columns:
    if not pd.api.types.is_datetime64_any_dtype(origin_df['time']):
        origin_df['time'] = pd.to_datetime(origin_df['time'], errors='coerce')
    origin_df = origin_df.set_index("time").sort_index()
# Raw string avoids the invalid "\s" escape; whitespace-only entries are
# replaced with 0 so that they stay distinguishable from real NaN values.
origin_df['glucose'] = origin_df['glucose'].replace(r'\s+', 0, regex=True)

# there are no rows with defined glucose value AND with recordtype 0
origin_df_droppedNan = origin_df.dropna(subset=['glucose'])
print(len(origin_df_droppedNan[origin_df_droppedNan['recordtype'] != 1]))

# there are no rows with undefined glucose value AND with recordtype 1
# Select the rows whose glucose is NaN. Calling isna() on the whole frame would
# give a boolean mask whose 'recordtype' column is never != 0, so the check
# could not fail.
origin_df_onlyNan = origin_df[origin_df['glucose'].isna()]
print(len(origin_df_onlyNan[origin_df_onlyNan['recordtype'] != 0]))

# for df (where both empty and null values equal NaN) all rows with defined glucose level have recordtype 1
# (but some NaNs have recordtype 1 too)

56 unique patient ids out of 136
Patients with multiple records:
Patient 1.6143047311250231e+19 got 50 records
Patient 1.6143047311250233e+19 got 32 records


Percentage of missing glucose values: 0.6176470588235294
2 different types of records: [1, 0]
Time range from 2019-04-25 00:08:00 to 2019-04-25 23:48:00 : 0 days 23:40:00
0
0


In [5]:
# Make sure every timestamp was parsed before using the time index for interpolation.
check_nan_dates = df['time'].isna().values.any()
print("There are NaN dates:", check_nan_dates)

# Interpolation method 'time' is used because it weights by the datetime index (which we have).
# The samples are not equally spaced in time, so plain 'linear' interpolation is not suitable here.

# The interpolation function seems to treat pd.Int64Dtype() like a generic
# object dtype, so glucose is converted to float64 first; zeros are turned
# back into NaN so they get interpolated as well.
origin_df['glucose'] = (
    pd.to_numeric(origin_df['glucose'], errors='coerce')
    .astype('float64')
    .replace(0, np.nan)
)
origin_df['interpolated_glucose_value'] = origin_df[['glucose']].interpolate(method='time')
origin_df.head()

There are NaN dates: False


Unnamed: 0_level_0,ID,recordtype,glucose,interpolated_glucose_value
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-25 00:08:00,2845.0,1,109.0,109.0
2019-04-25 00:14:00,1.614305e+19,0,,109.202899
2019-04-25 00:29:00,1.614305e+19,0,,109.710145
2019-04-25 00:44:00,1.614305e+19,0,,110.217391
2019-04-25 00:50:00,2850.0,1,,110.42029


In [6]:
# Bokeh imports for the plotting cells (output_notebook was previously imported twice).
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.plotting import ColumnDataSource, figure, show

# Configure Bokeh so that show() renders inside notebook cells.
output_notebook()


In [7]:
# A line graph is chosen: it is a decent way to represent the data over the time period
# (the X axis is a datetime axis), and the interpolated part can be told apart by colour.
# `width`/`height` replace `plot_width`/`plot_height`, which were deprecated in
# Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(
    width=800,
    height=400,
    title='Comparison of glucose_origin and glucose_interpolated values',
    x_axis_type="datetime",
    x_axis_label='date',
    y_axis_label='glucose',
)

# Measured values (NaN where nothing was recorded) in opaque blue ...
p.line(origin_df.index, origin_df['glucose'], color='blue', legend_label="glucose_origin", line_width=2, alpha=1)
# ... and the interpolated series on top in semi-transparent red.
p.line(origin_df.index, origin_df['interpolated_glucose_value'], color='red', legend_label="glucose_interpolated", line_width=2, alpha=0.5)

show(p)

In [13]:
from bokeh.io import show
from bokeh.models import CheckboxGroup, Dropdown
from bokeh.plotting import curdoc
from bokeh.layouts import column

# Number of rows in the rolling-mean window; used for the initial curve and on every refresh.
ROLLING_WINDOW = 10

# Data source backing the interpolated-glucose line.
source1 = ColumnDataSource(
    data=dict(
        x=list(origin_df.index.values),
        y=list(origin_df['interpolated_glucose_value']),
    )
)

origin_df['10_rolling_avg'] = origin_df['interpolated_glucose_value'].rolling(ROLLING_WINDOW).mean()

# Data source backing the rolling-mean line.
source_avg = ColumnDataSource(
    data=dict(
        x=list(origin_df.index.values),
        y=list(origin_df['10_rolling_avg']),
    )
)

# `width`/`height` replace `plot_width`/`plot_height`, which were deprecated in
# Bokeh 2.4 and removed in Bokeh 3.0.
pl = figure(
    width=1600,
    height=800,
    title='glucose values interpolation',
    x_axis_type="datetime",
    x_axis_label='date',
    y_axis_label='interpolated_glucose',
)

line1 = pl.line(source=source1, color='red', legend_label="glucose_interpolated", line_width=2, alpha=0.5)
line_avg = pl.line(source=source_avg, color='blue', legend_label="glucose_mean", line_width=2, alpha=0.8)
line_avg.visible = False  # hidden until the checkbox is ticked

ds1 = line1.data_source
ds2 = line_avg.data_source


def my_button_handler(new):
    """Re-interpolate glucose with the selected method and refresh both lines.

    `new` is the event object delivered by the dropdown; its `item` attribute
    is used as the pandas interpolation method name. `order=2` is passed for
    every method. The `interpolated_glucose_value` column of `origin_df` is
    overwritten with the new result.
    """
    interpolated = origin_df['glucose'].interpolate(method=new.item, order=2)
    origin_df['interpolated_glucose_value'] = interpolated
    # Hand plain arrays to both data sources so they are updated the same way.
    ds1.data['y'] = interpolated.values
    ds2.data['y'] = interpolated.rolling(ROLLING_WINDOW).mean().values


menu = [("Time_method", "time"), ("Linear_method", "linear"), ("Polynomial_method", "polynomial"), ("Cubicspline_method", 'cubicspline')]

dropdown = Dropdown(label="Choose interpolation method", button_type="warning", menu=menu)
dropdown.on_click(my_button_handler)

checkBoxLabels = ["Draw rolling mean"]
checkbox = CheckboxGroup(labels=checkBoxLabels, active=[])


def update(attr, old, new):
    """Show the rolling-mean line only while the checkbox has an active entry."""
    line_avg.visible = bool(checkbox.active)


checkbox.on_change('active', update)

# NOTE(review): Python callbacks registered with on_click/on_change only run
# when this document is served by a Bokeh server (e.g. `bokeh serve`) --
# confirm how this cell is meant to be launched.
curdoc().add_root(column(dropdown, checkbox, pl))
