## Correlation Analysis

## Create the sequences of 24 hours and only take class 1 

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import scipy
import glob

In [242]:
# create time series of wanted time horizon which is defined by the time step and
# and since the hypoglycemic event should be analysed for correaltion only class 1
# is investigated which is the last value of the time series

def create_dataset(X, y, time_steps=1, step=1):
    Xglc, Xba, Xbo, Xmacc = [], [], [], []
    if (len(X) < time_steps):
        print('too short')
    else:
        for i in range(0, len(X) - time_steps, step):
            label = y.iloc[i+time_steps-1]
            if (label == 1):
                glc = X['glucose'].iloc[i:(i + time_steps)].values.astype(int)
                ba = X['basal'].iloc[i:(i + time_steps)].values
                bo = X['bolus'].iloc[i:(i + time_steps)].values
                macc = X['macc'].iloc[i:(i + time_steps)].values
                Xglc.append(glc)
                Xba.append(ba)
                Xbo.append(bo)
                Xmacc.append(macc)
    return np.array(Xglc), np.array(Xba), np.array(Xbo), np.array(Xmacc)

In [243]:
#help from https://stackoverflow.com/questions/49859182/understanding-level-0-and-group-keys
def Correaltion_Analysis(dataframes, method = 'pearson', time_steps = 0):

    correlations = []
    for df in dataframes: 
        Xg, Xba, Xbo, Xm = create_dataset(df[['glucose', 'basal', 'bolus', 'macc']], df['Class'], time_steps=time_steps, step=1)
        for i in range (0, Xg.shape[0]):
            data = {'glucose': Xg[i], 'basal': Xba[i], 'bolus': Xbo[i], 'macc':  Xm[i]}
            df = pd.DataFrame.from_dict(data)
            if (method == 'pearson'):
                correlations.append(df.corr(method = 'pearson'))
            else:
                correlations.append(df.corr(method = 'spearman'))
            correlations.append(df.corr(method = 'pearson'))

    #correlation_matrix = df[['glucose', 'basal', 'bolus', 'macc']].corr()
    #correlations.append(correlation_matrix)

    correlation_matrix = pd.concat(correlations).groupby(level=0)

    mean_corr = correlation_matrix.mean()
    std_corr = correlation_matrix.std()
    max_corr = correlation_matrix.max()
    min_corr = correlation_matrix.min()

    print('Chosen Method is', method, 'and chosen sequence length are', time_steps*5, 'minutes.')
    print("Mean Correaltion Matrix computed: ", mean_corr)
    print("STD Correaltion Matrix computed: ", std_corr)
    print("Max Correaltion Matrix computed: ", max_corr)
    print("Min Correaltion Matrix computed: ", min_corr)

    return correlation_matrix # correlation #correlation, mean_corr, std_corr, max_corr, min_corr

## Population 2018

In [181]:
frames = []
subject_IDs = ['559', '563', '570', '575', '588', '591']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Train/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)

times = [12, 36, 144, 288, 576]
for time in times:
    Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
    Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.338138  1.000000 -0.051900  0.008939
bolus   -0.110830 -0.051900  1.000000 -0.048298
glucose  1.000000  0.338138 -0.110830  0.002369
macc     0.002369  0.008939 -0.048298  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.536473  0.000000  0.257227  0.273594
bolus    0.374984  0.257227  0.000000  0.293662
glucose  0.000000  0.536473  0.374984  0.241501
macc     0.241501  0.273594  0.293662  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.954864  1.000000  0.356753  0.884624
bolus    0.811104  0.356753  1.000000  0.637965
glucose  1.000000  0.954864  0.811104  0.959247
macc     0.959247  0.884624  0.637965  1.000000
Min Correaltion Matrix computed:            glucose     basal     bolus      macc
basal   -0.931225  1.000000

## Population for 2020

In [182]:
frames = []
subject_IDs = ['540', '544', '552', '567', '584', '96']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Train/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)

times = [12, 36, 144, 288, 576]
for time in times:
    Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
    Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.333859  1.000000  0.154116  0.001057
bolus    0.142781  0.154116  1.000000 -0.043444
glucose  1.000000  0.333859  0.142781 -0.041334
macc    -0.041334  0.001057 -0.043444  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.625575  0.000000  0.558394  0.508073
bolus    0.487822  0.558394  0.000000  0.315153
glucose  0.000000  0.625575  0.487822  0.511684
macc     0.511684  0.508073  0.315153  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.960675  1.000000  0.832749  0.954653
bolus    0.913589  0.832749  1.000000  0.561130
glucose  1.000000  0.960675  0.913589  0.954528
macc     0.954528  0.954653  0.561130  1.000000
Min Correaltion Matrix computed:            glucose  basal     bolus      macc
basal   -1.000000    1.0 -1.00

## Correlation Individual 2018

In [183]:
frames = []
subject_IDs = ['559', '563', '570', '575', '588', '591']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Train/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
    frames.append(df)
    print('Correlation for Subject:', subject_ID)
    times = [12, 36, 144, 288, 576]
    for time in times:  
        Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
        Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Correlation for Subject: 559
Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.284247  1.000000    NaN -0.067482
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.284247    NaN  0.023170
macc     0.023170 -0.067482    NaN  1.000000
STD Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.485427  0.000000    NaN  0.140691
bolus         NaN       NaN    NaN       NaN
glucose  0.000000  0.485427    NaN  0.143250
macc     0.143250  0.140691    NaN  0.000000
Max Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.883719  1.000000    NaN  0.144150
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.883719    NaN  0.200383
macc     0.200383  0.144150    NaN  1.000000
Min Correaltion Matrix computed:            glucose     basal  bolus      macc
basal   -0.614487  1.000000    NaN -0.433389
b

## Correaltion individual 2020

In [184]:
frames = []
subject_IDs = ['540', '544', '552', '567', '584', '96']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Train'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files = glob.glob(path + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files:
        df = pd.read_csv(file)
    frames.append(df)
    print('Correlation for Subject:', subject_ID)
    times = [12, 36, 144, 288, 576]
    for time in times:
        Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
        Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Correlation for Subject: 540
Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus   -0.185369    NaN  1.000000  0.115688
glucose  1.000000    NaN -0.185369  0.045279
macc     0.045279    NaN  0.115688  1.000000
STD Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus    0.241407    NaN  0.000000  0.092448
glucose  0.000000    NaN  0.241407  0.368587
macc     0.368587    NaN  0.092448  0.000000
Max Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus    0.223767    NaN  1.000000  0.220735
glucose  1.000000    NaN  0.223767  0.743934
macc     0.743934    NaN  0.220735  1.000000
Min Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
b

Chosen Method is spearman and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus   -0.210966    NaN  1.000000  0.153151
glucose  1.000000    NaN -0.210966 -0.012099
macc    -0.012099    NaN  0.153151  1.000000
STD Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus    0.223256    NaN  0.000000  0.178183
glucose  0.000000    NaN  0.223256  0.366645
macc     0.366645    NaN  0.178183  0.000000
Max Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus    0.223767    NaN  1.000000  0.515903
glucose  1.000000    NaN  0.223767  0.743934
macc     0.743934    NaN  0.515903  1.000000
Min Correaltion Matrix computed:            glucose  basal     bolus      macc
basal         NaN    NaN       NaN       NaN
bolus   -0.480514    NaN  1.0

## Correlation population 2020 train and test

In [185]:
frames1, frames2 = [], []
subject_IDs = ['540', '544', '552', '567', '584', '596']
path1 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Train/'
path2 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Test/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files1 = glob.glob(path1 + subject_ID + "/*.csv")
    csv_files2 = glob.glob(path2 + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files1:
        df1 = pd.read_csv(file)
        frames1.append(df1)

    for file in csv_files2:
        df2 = pd.read_csv(file)
        frames2.append(df2)


frames = frames1 + frames2

times = [12, 36, 144, 288, 576]
for time in times:
    Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
    Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.319435  1.000000  0.159649  0.022920
bolus    0.059759  0.159649  1.000000  0.033308
glucose  1.000000  0.319435  0.059759 -0.030646
macc    -0.030646  0.022920  0.033308  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.630910  0.000000  0.518185  0.510814
bolus    0.447722  0.518185  0.000000  0.263149
glucose  0.000000  0.630910  0.447722  0.505718
macc     0.505718  0.510814  0.263149  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.960675  1.000000  0.832749  0.954653
bolus    0.913589  0.832749  1.000000  0.561130
glucose  1.000000  0.960675  0.913589  0.954528
macc     0.954528  0.954653  0.561130  1.000000
Min Correaltion Matrix computed:            glucose  basal     bolus      macc
basal   -1.000000    1.0 -1.00

## Individual

In [186]:
frames1, frames2 = [], []
subject_IDs = ['540', '544', '552', '567', '584', '596']
path1 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Train/'
path2 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Test/'

## go over each file for each subject since the folder is ordered as 2018/Train or Test/then distriubuted over the subject ids
for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files1 = glob.glob(path1 + subject_ID + "/*.csv")
    csv_files2 = glob.glob(path2 + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files1:
        df1 = pd.read_csv(file)
        frames1.append(df1)

    for file in csv_files2:
        df2 = pd.read_csv(file)
        frames2.append(df2)


    frames = frames1 + frames2

    print('Correlation for Subject:', subject_ID)
    times = [12, 36, 144, 288, 576]
    for time in times:
        Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
        Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Correlation for Subject: 540
Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.363604  1.000000  0.157290  0.021903
bolus    0.306709  0.157290  1.000000 -0.161870
glucose  1.000000  0.363604  0.306709 -0.123790
macc    -0.123790  0.021903 -0.161870  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.608782  0.000000  0.595762  0.513551
bolus    0.479404  0.595762  0.000000  0.266775
glucose  0.000000  0.608782  0.479404  0.520033
macc     0.520033  0.513551  0.266775  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.936841  1.000000  0.832749  0.954653
bolus    0.913589  0.832749  1.000000  0.342464
glucose  1.000000  0.936841  0.913589  0.954528
macc     0.954528  0.954653  0.342464  1.000000
Min Correaltion Matrix computed:            glucose     basal     bolus      mac

## Correlation population 2018 train and test

In [187]:
frames1, frames2 = [], []
subject_IDs = ['559', '563', '570', '575', '588', '591']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Train/'
path2 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Test/'

for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files1 = glob.glob(path1 + subject_ID + "/*.csv")
    csv_files2 = glob.glob(path2 + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files1:
        df1 = pd.read_csv(file)
        frames1.append(df1)

    for file in csv_files2:
        df2 = pd.read_csv(file)
        frames2.append(df2)


frames = frames1 + frames2

times = [12, 36, 144, 288, 576]
for time in times:
    Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
    Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.491084  1.000000  0.228327 -0.016516
bolus   -0.199517  0.228327  1.000000 -0.124803
glucose  1.000000  0.491084 -0.199517  0.078186
macc     0.078186 -0.016516 -0.124803  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.484239  0.000000  0.150174  0.165431
bolus    0.271648  0.150174  0.000000  0.195219
glucose  0.000000  0.484239  0.271648  0.302886
macc     0.302886  0.165431  0.195219  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.950996  1.000000  0.301511  0.379980
bolus    0.619943  0.301511  1.000000  0.508402
glucose  1.000000  0.950996  0.619943  0.921535
macc     0.921535  0.379980  0.508402  1.000000
Min Correaltion Matrix computed:            glucose     basal     bolus      macc
basal   -0.901400  1.000000

## Individual

In [188]:
frames1, frames2 = [], []
subject_IDs = ['559', '563', '570', '575', '588', '591']
path = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Train/'
path2 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Test/'

for subject_ID in subject_IDs:
# store all paths of csv files in one folder
    csv_files1 = glob.glob(path1 + subject_ID + "/*.csv")
    csv_files2 = glob.glob(path2 + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files1:
        df1 = pd.read_csv(file)
        frames1.append(df1)

    for file in csv_files2:
        df2 = pd.read_csv(file)
        frames2.append(df2)


    frames = frames1 + frames2

    print('Correlation for Subject:', subject_ID)
    times = [12, 36, 144, 288, 576]
    for time in times:
        Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
        Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Correlation for Subject: 559
Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.779687  1.000000    NaN -0.055125
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.779687    NaN -0.031716
macc    -0.031716 -0.055125    NaN  1.000000
STD Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.145675  0.000000    NaN  0.186588
bolus         NaN       NaN    NaN       NaN
glucose  0.000000  0.145675    NaN  0.287546
macc     0.287546  0.186588    NaN  0.000000
Max Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.950996  1.000000    NaN  0.379980
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.950996    NaN  0.593851
macc     0.593851  0.379980    NaN  1.000000
Min Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.341560  1.000000    NaN -0.414039
b

ValueError: No objects to concatenate

## 2018 and 2020

In [244]:
frames1, frames2 = [], []
subject_IDs1 = ['559', '563', '570', '575', '588', '591']
path1 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2018/Train/'

subject_IDs2 = ['540', '544', '552', '567', '584', '96']
path2 = '/Users/beyzacinar/Desktop/MA/CODE/DATA/2020/Train/'

for subject_ID in subject_IDs1:
# store all paths of csv files in one folder
    csv_files1 = glob.glob(path1 + subject_ID + "/*.csv")

# create list of the csv files
    
    for file in csv_files1:
        df1 = pd.read_csv(file)
        frames1.append(df1)


for subject_ID in subject_IDs2:
# store all paths of csv files in one folder
    csv_files2 = glob.glob(path2 + subject_ID + "/*.csv")

    for file in csv_files2:
        df2 = pd.read_csv(file)
        frames2.append(df2)

frames = frames1 + frames2

    
times = [12, 36, 144, 288, 576]
for time in times:
    Correaltion_Analysis(frames, method = 'pearson', time_steps = time) 
    Correaltion_Analysis(frames, method = 'spearman', time_steps = time)

Chosen Method is pearson and chosen sequence length are 60 minutes.
Mean Correaltion Matrix computed:            glucose     basal     bolus     macc
basal    0.336100  1.000000  0.029852  0.00504
bolus   -0.018475  0.029852  1.000000 -0.04732
glucose  1.000000  0.336100 -0.018475 -0.01514
macc    -0.015140  0.005040 -0.047320  1.00000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.580504  0.000000  0.414930  0.406742
bolus    0.436679  0.414930  0.000000  0.297820
glucose  0.000000  0.580504  0.436679  0.374537
macc     0.374537  0.406742  0.297820  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.960675  1.000000  0.832749  0.954653
bolus    0.913589  0.832749  1.000000  0.637965
glucose  1.000000  0.960675  0.913589  0.959247
macc     0.959247  0.954653  0.637965  1.000000
Min Correaltion Matrix computed:            glucose  basal     bolus      macc
basal   -1.000000    1.0 -1.000000 