In [33]:
import pandas as pd
import numpy as np
from datetime import timedelta
from scipy.fftpack import fft, ifft,rfft
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, RepeatedKFold
from joblib import dump, load
from sklearn import svm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Read Data From the CSV Files

In [3]:
patient_insulin_data_dataframe = pd.read_csv('InsulinData.csv', 
                                             low_memory=False, 
                                             usecols=['Date','Time','BWZ Carb Input (grams)'])

In [4]:
patient_CGM_data_dataframe = pd.read_csv('CGMData.csv',
                                         low_memory=False,
                                         usecols=['Date','Time','Sensor Glucose (mg/dL)'])

In [5]:
patient_insulin_data_dataframe['date_time_stamp'] = pd.to_datetime(patient_insulin_data_dataframe['Date'] + 
                                                                   ' ' + patient_insulin_data_dataframe['Time'])

In [6]:
patient_CGM_data_dataframe['date_time_stamp'] = pd.to_datetime(patient_CGM_data_dataframe['Date'] + 
                                                               ' ' + patient_CGM_data_dataframe['Time'])

In [7]:
patient2_insulin_data_dataframe_1 = pd.read_csv('Insulin_patient2.csv',
                                                low_memory=False,
                                                usecols=['Date','Time','BWZ Carb Input (grams)'])

In [8]:
patient2_CGM_data_dataframe_1 = pd.read_csv('CGM_patient2.csv',
                                            low_memory=False,
                                            usecols=['Date','Time','Sensor Glucose (mg/dL)'])

In [9]:
patient2_insulin_data_dataframe_1['date_time_stamp'] = pd.to_datetime(patient2_insulin_data_dataframe_1['Date'] + 
                                                                      ' ' + patient2_insulin_data_dataframe_1['Time'])

In [10]:
patient2_CGM_data_dataframe_1['date_time_stamp'] = pd.to_datetime(patient2_CGM_data_dataframe_1['Date'] + 
                                                                ' ' + patient2_CGM_data_dataframe_1['Time'])

Meal data extraction

In [11]:
def createProcessedMealDataFrame(patient_insulin_data_dataframe, patient_CGM_data_dataframe, dateidentifier):
    
    patient_insulin_dataframe = patient_insulin_data_dataframe.copy()
    patient_insulin_dataframe = patient_insulin_dataframe.set_index('date_time_stamp')
    
#     find_timestamp_data_with_2hours_30_minutes_dataframe = patient_insulin_dataframe.sort_values(by='date_time_stamp', ascending=True).dropna().reset_index()
    find_timestamp_data_with_2hours_30_minutes_dataframe = patient_insulin_dataframe.sort_values(by='date_time_stamp', ascending=True).dropna()
    find_timestamp_data_with_2hours_30_minutes_dataframe['BWZ Carb Input (grams)'].replace(0.0,np.nan,inplace=True)
#     find_timestamp_data_with_2hours_30_minutes_dataframe = find_timestamp_data_with_2hours_30_minutes_dataframe.dropna()
    find_timestamp_data_with_2hours_30_minutes_dataframe = find_timestamp_data_with_2hours_30_minutes_dataframe.dropna().reset_index()
    find_timestamp_data_with_2hours_30_minutes_dataframe = find_timestamp_data_with_2hours_30_minutes_dataframe.reset_index().drop(columns='index')
    
    list_of_valid_timestamps = []
    value = 0
    for index,i in enumerate(find_timestamp_data_with_2hours_30_minutes_dataframe['date_time_stamp']):
        try:
            value = ((find_timestamp_data_with_2hours_30_minutes_dataframe['date_time_stamp'][index + 1] - i).seconds) / 60.0
            if value >= 120:
                list_of_valid_timestamps.append(i)
        except KeyError:
            break
    
    list1 = []
    if dateidentifier == 1:
        for index,i in enumerate(list_of_valid_timestamps):
            start_time_stamp = pd.to_datetime(i - timedelta(minutes=30))
            end_time_stamp = pd.to_datetime(i + timedelta(minutes=120))
            get_date = i.date().strftime('%#m/%#d/%Y')
            list1.append(patient_CGM_data_dataframe.loc[patient_CGM_data_dataframe['Date']==get_date].set_index('date_time_stamp').between_time(start_time=start_time_stamp.strftime('%#H:%#M:%#S'),end_time=end_time_stamp.strftime('%#H:%#M:%#S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)
    else:
        for index,i in enumerate(list_of_valid_timestamps):
            start_time_stamp = pd.to_datetime(i - timedelta(minutes=30))
            end_time_stamp = pd.to_datetime(i + timedelta(minutes=120))
            get_date = i.date().strftime('%Y-%m-%d')
            list1.append(patient_CGM_data_dataframe.loc[patient_CGM_data_dataframe['Date'] == get_date].set_index('date_time_stamp').between_time(start_time = start_time_stamp.strftime('%H:%M:%S'),end_time = end_time_stamp.strftime('%H:%M:%S'))['Sensor Glucose (mg/dL)'].values.tolist())
        return pd.DataFrame(list1)

In [12]:
meal_data = createProcessedMealDataFrame(patient_insulin_data_dataframe, patient_CGM_data_dataframe, 1)

In [13]:
meal_data1 = createProcessedMealDataFrame(patient2_insulin_data_dataframe_1, patient2_CGM_data_dataframe_1, 2)

In [14]:
meal_data = meal_data.iloc[:,0:30]

In [15]:
meal_data1 = meal_data1.iloc[:,0:30]

No-Meal data extraction

In [18]:
def createnomealdata(patient_insulin_data_dataframe,patient_CGM_data_dataframe):
    patient_insulin_no_meal_dataframe = patient_insulin_data_dataframe.copy()
    temporary_testing_dataframe = patient_insulin_no_meal_dataframe.sort_values(by='date_time_stamp',ascending=True).replace(0.0,np.nan).dropna().copy()
    temporary_testing_dataframe = temporary_testing_dataframe.reset_index().drop(columns='index')
    list_of_valid_timestamps_for_no_meal_data = []
    for index,i in enumerate(temporary_testing_dataframe['date_time_stamp']):
        try:
            value=(temporary_testing_dataframe['date_time_stamp'][index+1]-i).seconds//3600
            if value >=4:
                list_of_valid_timestamps_for_no_meal_data.append(i)
        except KeyError:
            break
    dataset = []
    for index, i in enumerate(list_of_valid_timestamps_for_no_meal_data):
        iteration_dataset = 1
        try:
            all_nomeal = patient_CGM_data_dataframe.loc[(patient_CGM_data_dataframe['date_time_stamp']>=list_of_valid_timestamps_for_no_meal_data[index]+pd.Timedelta(hours=2))&(patient_CGM_data_dataframe['date_time_stamp']<list_of_valid_timestamps_for_no_meal_data[index+1])]['Sensor Glucose (mg/dL)']
            total_length = len(all_nomeal)
            length_of_24_dataset = total_length//24
            while (iteration_dataset <= length_of_24_dataset):
                if iteration_dataset == 1:
                    dataset.append(all_nomeal.iloc[:24].values.tolist())
                    iteration_dataset+=1
                else:
                    dataset.append(all_nomeal.iloc[(iteration_dataset-1)*24:(iteration_dataset)*24].values.tolist())
                    iteration_dataset+=1
        except IndexError:
            break
    return pd.DataFrame(dataset)

In [19]:
no_meal_data = createnomealdata(patient_insulin_data_dataframe, patient_CGM_data_dataframe)

In [20]:
no_meal_data1 = createnomealdata(patient2_insulin_data_dataframe_1, patient2_CGM_data_dataframe_1)

Create Feature matrix from extracted meal data.

In [21]:
def createmealfeaturematrix(meal_data):
    index=meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>6).dropna().index
    meal_data_cleaned=meal_data.drop(meal_data.index[index]).reset_index().drop(columns='index')
    meal_data_cleaned=meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    meal_data_cleaned=meal_data_cleaned.drop(meal_data.index[index_to_drop_again]).reset_index().drop(columns='index')
    meal_data_cleaned['tau_time']=abs(meal_data_cleaned.iloc[:,0:30].idxmin(axis=1)-meal_data_cleaned.iloc[:,0:30].idxmax(axis=1))*5
    meal_data_cleaned['difference_in_glucose_normalized']=(meal_data_cleaned.iloc[:,0:30].max(axis=1)-meal_data_cleaned.iloc[:,0:25].min(axis=1))/(meal_data_cleaned.iloc[:,0:30].max(axis=1))
    meal_data_cleaned=meal_data_cleaned.dropna().reset_index().drop(columns='index')
    power_first_max=[]
    index_first_max=[]
    power_second_max=[]
    index_second_max=[]
    for i in range(len(meal_data_cleaned)):
        array=abs(rfft(meal_data_cleaned.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(meal_data_cleaned.iloc[:,0:30].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    meal_feature_matrix=pd.DataFrame()
    meal_feature_matrix['tau_time']=meal_data_cleaned['tau_time']
    meal_feature_matrix['difference_in_glucose_normalized']=meal_data_cleaned['difference_in_glucose_normalized']
    meal_feature_matrix['power_first_max']=power_first_max
    meal_feature_matrix['power_second_max']=power_second_max
    meal_feature_matrix['index_first_max']=index_first_max
    meal_feature_matrix['index_second_max']=index_second_max
    tm=meal_data_cleaned.iloc[:,22:25].idxmin(axis=1)
    maximum=meal_data_cleaned.iloc[:,5:19].idxmax(axis=1)
    list1=[]
    second_differential_data=[]
    standard_deviation=[]
    for i in range(len(meal_data_cleaned)):
        list1.append(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(meal_data_cleaned.iloc[:,maximum[i]:tm[i]].iloc[i].tolist())).max())
        standard_deviation.append(np.std(meal_data_cleaned.iloc[i]))
    meal_feature_matrix['1stDifferential']=list1
    meal_feature_matrix['2ndDifferential']=second_differential_data
    return meal_feature_matrix

In [22]:
meal_feature_matrix = createmealfeaturematrix(meal_data)

In [23]:
meal_feature_matrix1 = createmealfeaturematrix(meal_data1)

In [24]:
meal_feature_matrix = pd.concat([meal_feature_matrix,meal_feature_matrix1]).reset_index().drop(columns='index')

Just like Meal feature matrix, Create Feature matrix from extracted non meal data. (logic is same.) 

In [25]:
def createnomealfeaturematrix(non_meal_data):
    index_to_remove_non_meal=non_meal_data.isna().sum(axis=1).replace(0,np.nan).dropna().where(lambda x:x>5).dropna().index
    non_meal_data_cleaned=non_meal_data.drop(non_meal_data.index[index_to_remove_non_meal]).reset_index().drop(columns='index')
    non_meal_data_cleaned=non_meal_data_cleaned.interpolate(method='linear',axis=1)
    index_to_drop_again=non_meal_data_cleaned.isna().sum(axis=1).replace(0,np.nan).dropna().index
    non_meal_data_cleaned=non_meal_data_cleaned.drop(non_meal_data_cleaned.index[index_to_drop_again]).reset_index().drop(columns='index')
    non_meal_feature_matrix=pd.DataFrame()
    
    non_meal_data_cleaned['tau_time']=abs(non_meal_data_cleaned.iloc[:,0:24].idxmin(axis=1)-non_meal_data_cleaned.iloc[:,0:24].idxmax(axis=1))*5
    non_meal_data_cleaned['difference_in_glucose_normalized']=(non_meal_data_cleaned.iloc[:,0:24].max(axis=1)-non_meal_data_cleaned.iloc[:,0:24].min(axis=1))/(non_meal_data_cleaned.iloc[:,0:24].max(axis=1))
    power_first_max,index_first_max,power_second_max,index_second_max=[],[],[],[]
    for i in range(len(non_meal_data_cleaned)):
        array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array=abs(rfft(non_meal_data_cleaned.iloc[:,0:24].iloc[i].values.tolist())).tolist()
        sorted_array.sort()
        power_first_max.append(sorted_array[-2])
        power_second_max.append(sorted_array[-3])
        index_first_max.append(array.index(sorted_array[-2]))
        index_second_max.append(array.index(sorted_array[-3]))
    non_meal_feature_matrix['tau_time']=non_meal_data_cleaned['tau_time']
    non_meal_feature_matrix['difference_in_glucose_normalized']=non_meal_data_cleaned['difference_in_glucose_normalized']
    non_meal_feature_matrix['power_first_max']=power_first_max
    non_meal_feature_matrix['power_second_max']=power_second_max
    non_meal_feature_matrix['index_first_max']=index_first_max
    non_meal_feature_matrix['index_second_max']=index_second_max
    first_differential_data=[]
    second_differential_data=[]
    for i in range(len(non_meal_data_cleaned)):
        first_differential_data.append(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist()).max())
        second_differential_data.append(np.diff(np.diff(non_meal_data_cleaned.iloc[:,0:24].iloc[i].tolist())).max())
    non_meal_feature_matrix['1stDifferential']=first_differential_data
    non_meal_feature_matrix['2ndDifferential']=second_differential_data
    return non_meal_feature_matrix


In [26]:
non_meal_feature_matrix = createnomealfeaturematrix(no_meal_data)

In [27]:
non_meal_feature_matrix1 = createnomealfeaturematrix(no_meal_data1)

In [28]:
non_meal_feature_matrix=pd.concat([non_meal_feature_matrix,non_meal_feature_matrix1]).reset_index().drop(columns='index')

Subject the data onto Decision Tree Classifier to predict 0 / 1 label using KFold Cross Validation.

In [29]:
meal_feature_matrix['label'] = 1
non_meal_feature_matrix['label'] = 0
total_data = pd.concat([meal_feature_matrix, non_meal_feature_matrix]).reset_index().drop(columns='index')
dataset = shuffle(total_data, random_state = 1).reset_index().drop(columns='index')
kfold = KFold(n_splits=10,shuffle=False)
principaldata = dataset.drop(columns = 'label')
scores_rf = []
model = DecisionTreeClassifier(criterion="entropy")
for train_index, test_index in kfold.split(principaldata):
    X_train,X_test,y_train,y_test = principaldata.loc[train_index], principaldata.loc[test_index], dataset.label.loc[train_index], dataset.label.loc[test_index]
    model.fit(X_train,y_train)
    scores_rf.append(model.score(X_test,y_test))

In [30]:
print('Prediction score is',np.mean(scores_rf)*100)

Prediction score is 87.06216360087538


In [31]:
y_pred = model.predict(X_test)

In [35]:
print('Precision: %.3f' % precision_score(y_test, y_pred))

Precision: 0.800


In [36]:
print('Recall: %.3f' % recall_score(y_test, y_pred))

Recall: 0.759


In [37]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.893


In [38]:
print('F1 Score: %.3f' % f1_score(y_test, y_pred))

F1 Score: 0.779


In [39]:
classifier=DecisionTreeClassifier(criterion='entropy')

In [40]:
X, y= principaldata, dataset['label']

In [41]:
classifier.fit(X,y)

DecisionTreeClassifier(criterion='entropy')

In [42]:
dump(classifier, 'DecisionTreeClassifier.pickle')

['DecisionTreeClassifier.pickle']