In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.utils import shuffle
from sklearn import svm
from sklearn.model_selection import cross_val_score
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

### Below section reads all the required data and generates a combined Data-Time stamp

In [2]:
InsulinData_Patient1 = pd.read_csv("dataset/InsulinData_Patient1.csv")
InsulinData_Patient1["combinedStamp"] = pd.to_datetime(InsulinData_Patient1["Date"] + ' - ' + InsulinData_Patient1["Time"])

In [3]:
CGMData_Patient1 = pd.read_csv("dataset/CGMData_Patient1.csv")
CGMData_Patient1["combinedStamp"] = pd.to_datetime(CGMData_Patient1["Date"] + ' - ' + CGMData_Patient1["Time"])

In [4]:
InsulinData_Patient2 = pd.read_csv("dataset/InsulinData_Patient2.csv")
InsulinData_Patient2["combinedStamp"] = pd.to_datetime(InsulinData_Patient2["Date"] + ' - ' + InsulinData_Patient2["Time"])

In [5]:
CGMData_Patient2 = pd.read_csv("dataset/CGMData_Patient2.csv")
CGMData_Patient2["combinedStamp"] = pd.to_datetime(CGMData_Patient2["Date"] + ' - ' + CGMData_Patient2["Time"])

### Below function extracts Meal Data as follows:
Start of a meal can be obtained from InsulinData_Patient#.csv. Search column Y for a non NAN non zero value. This time indicates the start of a meal (tm). 

There can be three conditions:
1. There is no meal from time tm to time tm+2hrs. Then use this stretch as meal data.
2. There is a meal at some time tp in between tp>tm and tp< tm+2hrs. Ignore the meal data at time tm and consider the meal at time tp instead.
3. There is a meal at time tm+2hrs, then consider the stretch from tm+1hr 30min to tm+4hrs as meal data.

In [6]:
def extractMealData(cgm, insulin):
    meal = insulin.sort_values(by = "combinedStamp")

    meal = meal[~pd.isnull(meal["BWZ Carb Input (grams)"]) & meal["BWZ Carb Input (grams)"] != 0.0]

    lst1 = []
    diff = 0
    meal.reset_index(drop = True, inplace = True)
    for index, item in enumerate(meal["combinedStamp"]):
        if index < len(meal["combinedStamp"]) - 1:
            diff = (meal["combinedStamp"][index + 1] - item).seconds
            if diff > 2 * 60 * 60:
                lst1.append(item)
            if diff < 2 * 60 * 60:
                lst1.append(meal["combinedStamp"][index + 1])
            if diff == 2 * 60 * 60:
                lst1.append(meal["combinedStamp"][index + 1])
    lst1 = list(set(lst1))

    lst2 = []
    cgm = cgm.set_index(pd.DatetimeIndex(cgm["combinedStamp"]))
    for _, item in enumerate(lst1):
        date = item.date().strftime("%m/%d/%Y")
        endTime = (pd.to_datetime(item + timedelta(minutes = 120))).strftime('%H:%M:%S')
        startTime = (pd.to_datetime(item - timedelta(minutes = 30))).strftime('%H:%M:%S')
        temp = cgm.loc[cgm["Date"] == date].between_time(startTime, endTime)["Sensor Glucose (mg/dL)"]
        lst2.append(temp.values.tolist())

    mealData = pd.DataFrame(lst2)
    return mealData

### Below function extracts Meal Data as follows:
No meal data comprises 2 hrs of raw data that does not have meal intake.

Start of no meal is at time tm+2hrs where tm is the start of some meal. We need to obtain a 2 hr stretch of no meal time. So you need to find all 2 hr stretches in a day that have no meal and do not fall within 2 hrs of the start of a meal.

In [7]:
def extractNoMealData(cgm, insulin):
    nomeal = insulin.sort_values(by = "combinedStamp")

    nomeal = nomeal[~pd.isnull(nomeal["BWZ Carb Input (grams)"]) & nomeal["BWZ Carb Input (grams)"] != 0.0]

    lst1 = []
    val = 0
    nomeal.reset_index(drop = True, inplace = True)
    for index, item in enumerate(nomeal["combinedStamp"]):
        if index < len(nomeal["combinedStamp"]) - 1:
            val = (nomeal["combinedStamp"][index + 1] - item).seconds
            if val >= 4 * 60 * 60:
                lst1.append(item + pd.Timedelta(hours=2))
    lst1 = list(set(lst1))

    lst2 = []
    cgm = cgm.set_index(pd.DatetimeIndex(cgm["combinedStamp"]))
    for _, item in enumerate(lst1):
        date = item.date().strftime("%m/%d/%Y")
        endTime = (pd.to_datetime(item + timedelta(minutes = 120))).strftime('%H:%M:%S')
        startTime = (pd.to_datetime(item)).strftime('%H:%M:%S')
        temp = cgm.loc[cgm["Date"] == date].between_time(startTime, endTime)["Sensor Glucose (mg/dL)"]
        lst2.append(temp.values.tolist())

    nomealData = pd.DataFrame(lst2)
    return nomealData

### Below function cleans the data by:
1. Deleting all observations with more than or equal to 50% missing values
2. Trims the data to a common shape.

In [8]:
def cleanData(data):
    _, columns = data.shape
    thresh = columns * 0.5
    data = data.dropna(thresh = thresh)
    data = data.iloc[:, 0:24]
    data.reset_index(drop = True, inplace = True)
    return data

### Final Dataset is created below by:
1. Labeling data as meal or no-meal instances (meal=1 and no-meal=0)
2. Merging all the data while shuffling
3. Replacing remaining NaN values with column Mode

In [9]:
mealData_Patient1 = extractMealData(CGMData_Patient1, InsulinData_Patient1)
mealData_Patient1 = cleanData(mealData_Patient1)
mealData_Patient1["lbl"] = 1

nomealData_Patient1 = extractNoMealData(CGMData_Patient1, InsulinData_Patient1)
nomealData_Patient1 = cleanData(nomealData_Patient1)
nomealData_Patient1["lbl"] = 0

In [10]:
mealData_Patient2 = extractMealData(CGMData_Patient2, InsulinData_Patient2)
mealData_Patient2 = cleanData(mealData_Patient2)
mealData_Patient2["lbl"] = 1

nomealData_Patient2 = extractNoMealData(CGMData_Patient2, InsulinData_Patient2)
nomealData_Patient2 = cleanData(nomealData_Patient2)
nomealData_Patient2["lbl"] = 0

In [11]:
finalData = shuffle(pd.concat([nomealData_Patient1, mealData_Patient1, nomealData_Patient2, mealData_Patient2]), random_state = 5)
finalData = finalData.reset_index(drop = True)

for column in finalData.columns:
    finalData[column].fillna(finalData[column].mode()[0], inplace = True)

### Feature Extraction and Selection

In [12]:
# Extraction

transform = list()
transform.append(('mms', MinMaxScaler()))
transform.append(('ss', StandardScaler()))
transform.append(('rs', RobustScaler()))
transform.append(('qt', QuantileTransformer(n_quantiles = 100, output_distribution = "normal")))
transform.append(('kbd', KBinsDiscretizer(n_bins = 10, encode = "ordinal", strategy = "uniform")))
transform.append(('pca', PCA(n_components = 7)))
transform.append(('svd', TruncatedSVD(n_components = 7)))

feature_union = FeatureUnion(transform)


## Selection
# Selecting top 15 features

rfe = RFE(estimator = RandomForestClassifier(max_depth = 10, random_state = 5), n_features_to_select = 15)

### Training the created pipeline and reporting cross validation results
Model used -> Random Forest Classifier

In [13]:
y = finalData["lbl"]
X = finalData.drop(columns = "lbl")

model = RandomForestClassifier(max_depth = 10, random_state = 5)


steps = list()
steps.append(('fu', feature_union))
steps.append(('rfe', rfe))
steps.append(('m', model))
pipeline = Pipeline(steps = steps)

scores = cross_val_score(pipeline, X, y, cv = 5)
    
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.66 accuracy with a standard deviation of 0.03
