In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import timedelta
from sklearn.cluster import DBSCAN, KMeans
from scipy.fftpack import fft, ifft,rfft
import math
from sklearn import metrics
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


filename = 'CGMData.csv'
columns= ['Date','Time','Sensor Glucose (mg/dL)']
cgm = pd.read_csv(filename,usecols=columns)
cgm['date time stamp'] = pd.to_datetime(cgm['Date']+ ' ' +cgm['Time'])

filename = 'InsulinData.csv'
columns= ['Date','Time','BWZ Carb Input (grams)']
insulin = pd.read_csv(filename,usecols=columns)
insulin['date time stamp'] = pd.to_datetime(insulin['Date']+ ' ' +insulin['Time'])

not_null_glucose= cgm['Sensor Glucose (mg/dL)'].notna()
cgm =  cgm.loc[not_null_glucose][['date time stamp', 'Sensor Glucose (mg/dL)']] 
cgm = cgm.sort_values(by = 'date time stamp', ascending = True).reset_index().drop(columns= 'index')

insulin['BWZ Carb Input (grams)'].replace(0.0,np.nan,inplace=True)
not_null_carb = insulin['BWZ Carb Input (grams)'].notna() 
insulin = insulin.loc[not_null_carb][['date time stamp', 'BWZ Carb Input (grams)']]
insulin = insulin.sort_values(by = 'date time stamp', ascending = True).reset_index().drop(columns= 'index')


min_carb = insulin['BWZ Carb Input (grams)'].min()
max_carb = insulin['BWZ Carb Input (grams)'].max()
total_bins = math.ceil((max_carb - min_carb) / 20)

bins_array = []
bin_matrix = []

for i in range(len(insulin)):
    insulin['BWZ Carb Input (grams)'][i]
    bin_no = math.floor(float((insulin['BWZ Carb Input (grams)'][i] - min_carb)/20))
    if bin_no == total_bins:
        bin_no = bin_no - 1
    bins_array.append(bin_no)
insulin['bins']= bins_array

def getMealData(insulin,cgm):
    bin_matrix = []
    meal_matrix = pd.DataFrame()
    meals = []
    for i in range(0, len(insulin)-1):
        time_diff_seconds = (insulin.iloc[i + 1]['date time stamp'] - insulin.iloc[i]['date time stamp']).total_seconds()
        if(time_diff_seconds > 7200):
            meals.append(True)
        else:
            meals.append(False)
        
    meals.append(True)
    meal_data = insulin[meals]
    #print(cgm)
    # print(meal_data)
    for i in range(len(meal_data)):
        start_time = meal_data.iloc[i]['date time stamp'] - datetime.timedelta(minutes=30)
        end_time = meal_data.iloc[i]['date time stamp'] + datetime.timedelta(minutes=120)
        between_time = (cgm['date time stamp'] >= start_time) & (cgm['date time stamp'] < end_time)
        bin = meal_data.iloc[i]['bins']
        meal_glucose = cgm[between_time]
        
        if len(meal_glucose.index) == 30:
            meal_glucose = meal_glucose.T
            meal_glucose.drop('date time stamp', inplace=True)
            
            meal_glucose.reset_index(drop=True, inplace=True)
            meal_glucose.columns = list(range(1, 31))
            
            meal_matrix = meal_matrix.append(meal_glucose, ignore_index=True)
            bin_matrix.append(bin)

    meal_matrix = meal_matrix.apply(pd.to_numeric)
    bin_matrix = np.array(bin_matrix)
    return meal_matrix,bin_matrix
meal_data, bin_matrix=getMealData(insulin,cgm) 

def getmealfeaturematrix(meal_data):
    first_power=[]
    second_power=[]
    first_power_index=[]
    second_power_index=[]
    third_power=[]
    third_power_index=[]
    first_diff=[]
    second_diff=[]
    mean_cgm =[]
    std_div =[]
    new_meal_data = meal_data.copy()
    start_tau = new_meal_data.iloc[:,22:25].idxmin(axis=1)
    end_tau = new_meal_data.iloc[:,5:19].idxmax(axis=1)
    new_meal_data['tau'] = (start_tau-end_tau)*5

    start_normalized_glucose = new_meal_data.iloc[:,5:19].max(axis=1)
    end_normalized_glucose = new_meal_data.iloc[:,22:25].min(axis=1)
    new_meal_data['normalized_glucose_diff'] = (start_normalized_glucose - end_normalized_glucose)/ end_normalized_glucose
    new_meal_data['half_normalized_glucose_diff'] = (start_normalized_glucose - end_normalized_glucose)/ (end_normalized_glucose*2)    
    for i in range(len(new_meal_data)):
        meal_data_row =new_meal_data.iloc[:,0:30].iloc[i].values.tolist()
        power_fft_row = abs(rfft(meal_data_row)).tolist()
        fft_meal_data_row = abs(rfft(meal_data_row)).tolist() 
        power_fft_row.sort(reverse = True)
        first_power.append(power_fft_row[1])
        second_power.append(power_fft_row[2])
        third_power.append(power_fft_row[3])
        first_power_index.append(fft_meal_data_row.index(power_fft_row[1]))
        second_power_index.append(fft_meal_data_row.index(power_fft_row[2]))
        third_power_index.append(fft_meal_data_row.index(power_fft_row[3]))
    tm_time=new_meal_data.iloc[:,22:27].idxmin(axis=1)
    glucose_max_time=new_meal_data.iloc[:,4:18].idxmax(axis=1)

    for i in range(len(new_meal_data)):
        diff_value = new_meal_data.iloc[:,glucose_max_time[i]:tm_time[i]].iloc[i].tolist()
        first_diff.append(np.diff(diff_value).max())
        second_diff.append(np.diff(np.diff(diff_value)).max()) 
        mean_cgm.append((new_meal_data.iloc[i].sum(axis=0))/30)
        std_div.append(np.std(new_meal_data.iloc[i]))
    meal_matrix=pd.DataFrame()
    meal_matrix['first_diff']=first_diff
    meal_matrix['second_diff']=second_diff
    meal_matrix['first_power']=first_power
    meal_matrix['second_power']=second_power
    meal_matrix['third_power']=third_power
    meal_matrix['first_power_index']=first_power_index
    meal_matrix['second_power_index']=second_power_index
    meal_matrix['third_power_index']=third_power_index
    meal_matrix['tau']=new_meal_data['tau']
    meal_matrix['normalized_glucose_diff']=new_meal_data['normalized_glucose_diff']
    meal_matrix['half_normalized_glucose_diff']=new_meal_data['half_normalized_glucose_diff']
    meal_matrix['mean_cgm'] = mean_cgm
    meal_matrix['std_div'] = std_div
    return meal_matrix


def get_entropy(feature_matrix):
    # print(feature_matrix)
    total = feature_matrix.sum()
    bins = feature_matrix.shape[0]
    entropy = 0
    cluster_sum = 0
    entropies = []

    for i in range(bins):
        cluster_sum = np.sum(feature_matrix[i])
        if cluster_sum != 0:
            for j in range(bins):
                if feature_matrix[i,j] == 0:
                    continue
                column_total = feature_matrix[i,j] / cluster_sum
                entropy = -1 * column_total * np.log2(column_total)
                entropy = entropy + entropy
            entropies.append((cluster_sum / total) * entropy)
    return np.sum(entropies)

def get_purity(feature_matrix):
    feature_matrix_sum = feature_matrix.sum()
    bins = feature_matrix.shape[0]
    cluster_sum = 0
    purity = 0
    maximum = 0
    purities = []

    for i in range(bins):
        maximum = np.max(feature_matrix[i])
        cluster_sum = np.sum(feature_matrix[i])
        if cluster_sum == 0:
            continue
        purity = maximum / cluster_sum
        purities.append((cluster_sum / feature_matrix_sum) * purity)
    return np.sum(purities)





meal_matrix=getmealfeaturematrix(meal_data)


scaler = StandardScaler()
scaled_features = scaler.fit_transform(meal_matrix)
# print(scaled_features)
bin_matrix = np.array(bin_matrix)


model1 = KMeans(n_clusters=total_bins, random_state=0).fit(meal_matrix)
label_kmeans = model1.labels_
cluster_matrix = np.zeros((total_bins,total_bins))
for i, idx in enumerate(bin_matrix):
        if(i!= len(bin_matrix)):
            row = label_kmeans[i]
            col = idx
            cluster_matrix[row][col] += 1
sse1 = model1.inertia_
print(cluster_matrix)
entropy1 = get_entropy(cluster_matrix)
purity1 = get_purity(cluster_matrix)



model2 = DBSCAN(eps=50, min_samples=total_bins, metric="euclidean").fit(meal_matrix)
label_dbscan = model2.labels_
no_of_labels = np.unique(label_dbscan)
dbscan_clusters = len(no_of_labels)
cluster_matrix2 = np.zeros((total_bins,total_bins))
for i, idx in enumerate(bin_matrix):
        if(i!= len(bin_matrix)):
            row = label_dbscan[i]
            col = idx
            cluster_matrix2[row][col] += 1

sse2 = 0
cluster_size = max(label_dbscan)
print(cluster_matrix2)
for i in range(cluster_size + 1):
    diff_values = scaled_features[label_dbscan == i] - scaled_features[label_dbscan == i].mean(axis=0) 
    sse2 = np.sum(diff_values ** 2)
entropy2 = get_entropy(cluster_matrix2)
purity2 = get_purity(cluster_matrix2)

# print(sse1)
# print(entropy1)
# print(purity1)

# print(sse2)
# print(entropy2)
# print(purity2)

output = pd.DataFrame(
        [
            [
                sse1,
                sse2,
                entropy1,
                entropy2,
                purity1,
                purity2,
            ]
        ],
        columns=[
            "SSE for KMeans",
            "SSE for DBSCAN",
            "Entropy for KMeans",
            "Entropy for DBSCAN",
            "Purity for KMeans",
            "Purity for DBSCAN",
        ],
    )
output = output.fillna(0)
output.to_csv("Results.csv", index=False, header=None)



[[30. 35. 31. 14.  9.  0.  0.]
 [ 6.  7. 10.  3.  2.  0.  0.]
 [24. 24. 12.  4.  1.  1.  0.]
 [10. 15. 13. 12.  4.  0.  0.]
 [21. 20. 19.  6.  4.  0.  0.]
 [37. 28. 17. 15.  5.  1.  0.]
 [ 3.  6.  7.  1.  3.  0.  0.]]
[[  3.   0.   4.   0.   0.   0.   0.]
 [  7.   3.   1.   1.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.]
 [121. 132. 104.  54.  28.   2.   0.]]
