## Correlation Analysis with Gaps Data

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import scipy
import glob

In [2]:
# create time series of wanted time horizon which is defined by the time step and
# and since the hypoglycemic event should be analysed for correaltion only class 1
# is investigated which is the last value of the time series

def create_dataset(X, y, timeclass, time_steps=1, step=1):
    Xglc, Xba, Xbo, Xmacc = [], [], [], []
    if (len(X) < time_steps):
        print('too short')
    else:
        for i in range(0, len(X) - time_steps, step):
            label = y.iloc[i+time_steps-1]
            if (label == timeclass):
                glc = X['glucose'].iloc[i:(i + time_steps)].values.astype(int)
                ba = X['basal'].iloc[i:(i + time_steps)].values
                bo = X['bolus'].iloc[i:(i + time_steps)].values
                macc = X['macc'].iloc[i:(i + time_steps)].values
                Xglc.append(glc)
                Xba.append(ba)
                Xbo.append(bo)
                Xmacc.append(macc)
    return np.array(Xglc), np.array(Xba), np.array(Xbo), np.array(Xmacc)

In [3]:
#help from https://stackoverflow.com/questions/49859182/understanding-level-0-and-group-keys
def Correaltion_Analysis(dataframes, timeclass, method = 'pearson', time_steps = 0):

    correlations = []
    for df in dataframes: 
        Xg, Xba, Xbo, Xm = create_dataset(df[['glucose', 'basal', 'bolus', 'macc']], df['Class'], timeclass, time_steps=time_steps, step=1)
        for i in range (0, Xg.shape[0]):
            data = {'glucose': Xg[i], 'basal': Xba[i], 'bolus': Xbo[i], 'macc':  Xm[i]}
            df = pd.DataFrame.from_dict(data)
            if (method == 'pearson'):
                correlations.append(df.corr(method = 'pearson'))
            else:
                correlations.append(df.corr(method = 'spearman'))

    correlation_matrix = pd.concat(correlations).groupby(level=0)

    mean_corr = correlation_matrix.mean()
    std_corr = correlation_matrix.std()
    max_corr = correlation_matrix.max()
    min_corr = correlation_matrix.min()

    print('Chosen Method is', method, 'and chosen sequence length are', time_steps*5, 'minutes, and considered class is', timeclass, '.')
    print("Mean Correaltion Matrix computed: ", mean_corr)
    print("STD Correaltion Matrix computed: ", std_corr)
    print("Max Correaltion Matrix computed: ", max_corr)
    print("Min Correaltion Matrix computed: ", min_corr)

    return correlation_matrix # correlation #correlation, mean_corr, std_corr, max_corr, min_corr

## Population 

In [10]:
frames = []
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/Users/beyzacinar/Desktop/MA/CODE/GAPS_DATA/TRAIN/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
classes = [1,2,3,4,5,6,7,8,9]
times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
for i in range(0,len(times)):
    Correaltion_Analysis(frames, classes[i], method = 'pearson', time_steps = times[i])

Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.385770  1.000000 -0.500000  0.206245
bolus   -0.269187 -0.500000  1.000000 -0.030218
glucose  1.000000  0.385770 -0.269187 -0.101611
macc    -0.101611  0.206245 -0.030218  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.782785  0.000000       NaN  0.743303
bolus    0.680352       NaN  0.000000  0.735532
glucose  0.000000  0.782785  0.680352  0.755303
macc     0.755303  0.743303  0.735532  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus  macc
basal    0.987829  1.000000 -0.500000   1.0
bolus    0.997731 -0.500000  1.000000   1.0
glucose  1.000000  0.987829  0.997731   1.0
macc     1.000000  1.000000  1.000000   1.0
Min Correaltion Matrix computed:           glucose     basal  bolus      macc
basal       -1.0  1.000

In [11]:
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
classes = [1,2,3,4,5,6,7,8,9]
times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
for i in range(0,len(times)):
    Correaltion_Analysis(frames, classes[i], method = 'spearman', time_steps = times[i])

Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.386619  1.000000 -0.500000  0.218660
bolus   -0.291865 -0.500000  1.000000 -0.026243
glucose  1.000000  0.386619 -0.291865 -0.101609
macc    -0.101609  0.218660 -0.026243  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.775349  0.000000  0.000000  0.708666
bolus    0.722649  0.000000  0.000000  0.695641
glucose  0.000000  0.775349  0.722649  0.753008
macc     0.753008  0.708666  0.695641  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus  macc
basal    0.866025  1.000000 -0.500000   1.0
bolus    0.866025 -0.500000  1.000000   1.0
glucose  1.000000  0.866025  0.866025   1.0
macc     1.000000  1.000000  1.000000   1.0
Min Correaltion Matrix computed:           glucose     basal  bolus      macc
basal       -1.0  1.00

## Individual 

In [12]:
frames = []
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/Users/beyzacinar/Desktop/MA/CODE/GAPS_DATA/TRAIN/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
    print(subject_ID)
    classes = [1,2,3,4,5,6,7,8,9]
    times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
    for i in range(0,len(times)):
        print('Correaltion Analysis for subject: ' + subject_ID)
        Correaltion_Analysis(frames, classes[i], method = 'pearson', time_steps = times[i])

540
Correaltion Analysis for subject: 540
Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.560939  1.000000       NaN  0.152343
bolus   -0.008175       NaN  1.000000 -0.824007
glucose  1.000000  0.560939 -0.008175 -0.045099
macc    -0.045099  0.152343 -0.824007  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.670204  0.000000       NaN  0.729344
bolus    0.957381       NaN  0.000000  0.109655
glucose  0.000000  0.670204  0.957381  0.792844
macc     0.792844  0.729344  0.109655  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.986241  1.000000       NaN  0.992956
bolus    0.866025       NaN  1.000000 -0.746470
glucose  1.000000  0.986241  0.866025  1.000000
macc     1.000000  0.992956 -0.746470  1.000000
Min Correaltion Matrix computed:       

In [4]:
frames = [] ## only mean
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/Users/beyzacinar/Desktop/MA/CODE/GAPS_DATA/TRAIN/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
    print(subject_ID)
    classes = [1,2,3,4,5,6,7,8,9]
    times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
    for i in range(0,len(times)):
        print('Correaltion Analysis for subject: ' + subject_ID)
        Correaltion_Analysis(frames, classes[i], method = 'pearson', time_steps = times[i])

540
Correaltion Analysis for subject: 540
Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.560939  1.000000       NaN  0.152343
bolus   -0.008175       NaN  1.000000 -0.824007
glucose  1.000000  0.560939 -0.008175 -0.045099
macc    -0.045099  0.152343 -0.824007  1.000000
Correaltion Analysis for subject: 540
Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 2 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.572161  1.000000 -0.500000 -0.032594
bolus    0.042028 -0.500000  1.000000 -0.189069
glucose  1.000000  0.572161  0.042028 -0.033397
macc    -0.033397 -0.032594 -0.189069  1.000000
Correaltion Analysis for subject: 540
Chosen Method is pearson and chosen sequence length are 30 minutes, and considered class is 3 .
Mean Correaltion Matrix computed:           

In [13]:
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
    print(subject_ID)
    classes = [1,2,3,4,5,6,7,8,9]
    times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
    for i in range(0,len(times)):
        print('Correaltion Analysis for subject: ' + subject_ID)
        Correaltion_Analysis(frames, classes[i], method = 'spearman', time_steps = times[i])

540
Correaltion Analysis for subject: 540
Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.445420  1.000000 -0.500000  0.194719
bolus   -0.269398 -0.500000  1.000000 -0.074231
glucose  1.000000  0.445420 -0.269398 -0.090502
macc    -0.090502  0.194719 -0.074231  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.741556  0.000000       NaN  0.703392
bolus    0.747760       NaN  0.000000  0.708294
glucose  0.000000  0.741556  0.747760  0.758793
macc     0.758793  0.703392  0.708294  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus  macc
basal    0.866025  1.000000 -0.500000   1.0
bolus    0.866025 -0.500000  1.000000   1.0
glucose  1.000000  0.866025  0.866025   1.0
macc     1.000000  1.000000  1.000000   1.0
Min Correaltion Matrix computed:           glucose     bas

In [5]:
for subject_ID in subject_IDs: ## only mean
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)
    print(subject_ID)
    classes = [1,2,3,4,5,6,7,8,9]
    times = [3, 3, 6, 12, 24, 48, 72, 144, 288] #15,30,60,120,240, 480, 12h, 24h, 48h
    for i in range(0,len(times)):
        print('Correaltion Analysis for subject: ' + subject_ID)
        Correaltion_Analysis(frames, classes[i], method = 'spearman', time_steps = times[i])

540
Correaltion Analysis for subject: 540
Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.445420  1.000000 -0.500000  0.194719
bolus   -0.269398 -0.500000  1.000000 -0.074231
glucose  1.000000  0.445420 -0.269398 -0.090502
macc    -0.090502  0.194719 -0.074231  1.000000
Correaltion Analysis for subject: 540
Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 2 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.424311  1.000000 -0.187500 -0.038480
bolus   -0.076490 -0.187500  1.000000 -0.277128
glucose  1.000000  0.424311 -0.076490 -0.037651
macc    -0.037651 -0.038480 -0.277128  1.000000
Correaltion Analysis for subject: 540
Chosen Method is spearman and chosen sequence length are 30 minutes, and considered class is 3 .
Mean Correaltion Matrix computed:        