## Correlation Analysis with Interpolated Data

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import scipy
import glob

In [2]:
# Create time series windows of the wanted time horizon, which is defined by
# time_steps. Since the hypoglycemic event should be analysed for correlation,
# only windows whose last value belongs to the requested class are kept.

def create_dataset(X, y, timeclass, time_steps=1, step=1):
    """Collect the sliding windows of ``X`` whose last row is labelled ``timeclass``.

    A window starting at row ``i`` spans rows ``i`` to ``i + time_steps - 1``.
    It is kept only when ``y.iloc[i + time_steps - 1]`` (the label of the
    window's last row) equals ``timeclass``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide the columns 'glucose', 'basal', 'bolus' and 'macc'.
    y : pandas.Series
        Class labels, read by position (``iloc``) in the same row order as ``X``.
    timeclass
        Label value the last row of a window must carry for it to be kept.
    time_steps : int, default 1
        Window length in rows.
    step : int, default 1
        Distance in rows between consecutive window starts.

    Returns
    -------
    tuple of four numpy.ndarray
        The glucose, basal, bolus and macc windows, in that order, one entry
        per kept window. Glucose values are cast to ``int``. All four arrays
        are empty when ``X`` has fewer than ``time_steps`` rows (in which case
        'too short' is printed) or when no window matches.
    """
    Xglc, Xba, Xbo, Xmacc = [], [], [], []
    n_rows = len(X)
    if n_rows < time_steps:
        print('too short')
    elif n_rows > 0:
        # The last full window starts at n_rows - time_steps, so the stop bound
        # has to be one past it. Without the "+ 1" the window that ends on the
        # final row of X would never be examined.
        for start in range(0, n_rows - time_steps + 1, step):
            stop = start + time_steps
            if y.iloc[stop - 1] != timeclass:
                continue
            Xglc.append(X['glucose'].iloc[start:stop].values.astype(int))
            Xba.append(X['basal'].iloc[start:stop].values)
            Xbo.append(X['bolus'].iloc[start:stop].values)
            Xmacc.append(X['macc'].iloc[start:stop].values)
    return np.array(Xglc), np.array(Xba), np.array(Xbo), np.array(Xmacc)

In [3]:
#help from https://stackoverflow.com/questions/49859182/understanding-level-0-and-group-keys
def Correaltion_Analysis(dataframes, timeclass, method = 'pearson', time_steps = 0):
    """Pool per-window correlation matrices over several dataframes and print summary statistics.

    For every dataframe, the windows selected by ``create_dataset`` for
    ``timeclass`` are each turned into a correlation matrix of the columns
    glucose, basal, bolus and macc. All matrices are stacked and grouped by
    their row label, so mean/std/max/min are taken element-wise across windows.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Each frame must provide the columns 'glucose', 'basal', 'bolus',
        'macc' and 'Class'.
    timeclass
        Class label passed on to ``create_dataset`` to select windows.
    method : str, default 'pearson'
        'pearson' selects Pearson correlation; any other value selects
        Spearman. The value is printed as given.
    time_steps : int, default 0
        Window length in rows, passed on to ``create_dataset``. The printed
        summary reports it as ``time_steps * 5`` minutes.

    Returns
    -------
    pandas DataFrameGroupBy
        The stacked correlation matrices grouped by row label (the statistics
        themselves are only printed).

    Raises
    ------
    ValueError
        If no window was selected in any dataframe, so there is nothing to
        aggregate.
    """
    # Anything other than 'pearson' falls back to Spearman, as before.
    corr_method = 'pearson' if method == 'pearson' else 'spearman'
    columns = ['glucose', 'basal', 'bolus', 'macc']

    correlations = []
    for frame in dataframes:
        Xg, Xba, Xbo, Xm = create_dataset(frame[columns], frame['Class'], timeclass, time_steps=time_steps, step=1)
        # A separate name for the per-window frame keeps the loop variable intact.
        for glc, ba, bo, macc in zip(Xg, Xba, Xbo, Xm):
            window = pd.DataFrame({'glucose': glc, 'basal': ba, 'bolus': bo, 'macc': macc})
            correlations.append(window.corr(method = corr_method))

    if not correlations:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            'No window of {} time steps ends in class {}; nothing to correlate.'.format(time_steps, timeclass)
        )

    # Row labels of the stacked matrices are the variable names, so grouping on
    # index level 0 aggregates the same matrix row across all windows.
    correlation_matrix = pd.concat(correlations).groupby(level=0)

    mean_corr = correlation_matrix.mean()
    std_corr = correlation_matrix.std()
    max_corr = correlation_matrix.max()
    min_corr = correlation_matrix.min()

    print('Chosen Method is', method, 'and chosen sequence length are', time_steps*5, 'minutes, and considered class is', timeclass, '.')
    print("Mean Correaltion Matrix computed: ", mean_corr)
    print("STD Correaltion Matrix computed: ", std_corr)
    print("Max Correaltion Matrix computed: ", max_corr)
    print("Min Correaltion Matrix computed: ", min_corr)

    return correlation_matrix

In [4]:
#help from https://stackoverflow.com/questions/49859182/understanding-level-0-and-group-keys
def Correaltion_Analysis_Individual(df, timeclass, method = 'pearson', time_steps = 0):
    """Run the windowed correlation analysis on a single dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'glucose', 'basal', 'bolus', 'macc' and
        'Class'.
    timeclass
        Class label used to select windows.
    method : str, default 'pearson'
        'pearson' selects Pearson correlation; any other value selects
        Spearman.
    time_steps : int, default 0
        Window length in rows.

    Returns
    -------
    pandas DataFrameGroupBy
        The per-window correlation matrices of ``df``, stacked and grouped by
        row label. Mean, std, max and min of the groups are printed.

    Raises
    ------
    ValueError
        If no window of ``df`` matches ``timeclass``.
    """
    # The single-frame analysis is the population analysis over a one-element
    # list, so delegate instead of keeping a second copy of the same body.
    return Correaltion_Analysis([df], timeclass, method = method, time_steps = time_steps)

## PLAN 

- Analyses needed:
    - Population: all subjects pooled, including training data and testing data
    - Individual: each subject separately, including training and testing data for 2018 and 2020
    - In total, 2 different analyses

In [5]:
# Population-level Pearson analysis over every CSV file in the training folder.
path = '/Users/beyzacinar/Desktop/MA/CODE/NEW_DATA/TRAIN/'

# collect the paths of all csv files in the folder
csv_files = glob.glob(path + "/*.csv")

# load every csv file into its own dataframe
frames = [pd.read_csv(csv_file) for csv_file in csv_files]

classes = [1,2,3,4,5,6,7,8,9]
# Window lengths in samples, one per class. At 5 minutes per sample (the factor
# used in the analysis printout) these are 15, 15, 30, 60, 120, 240 min, 6 h,
# 12 h and 24 h.
# NOTE(review): the previous comment listed 15,30,60,120,240,480 min, 12h, 24h,
# 48h, which does not match these values -- confirm which horizons are intended.
times = [3, 3, 6, 12, 24, 48, 72, 144, 288]
for timeclass, steps in zip(classes, times):
    Correaltion_Analysis(frames, timeclass, method = 'pearson', time_steps = steps)

Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.385770  1.000000 -0.500000  0.254456
bolus   -0.269187 -0.500000  1.000000  0.020996
glucose  1.000000  0.385770 -0.269187 -0.054826
macc    -0.054826  0.254456  0.020996  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.782785  0.000000       NaN  0.744416
bolus    0.680352       NaN  0.000000  0.744133
glucose  0.000000  0.782785  0.680352  0.792652
macc     0.792652  0.744416  0.744133  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus  macc
basal    0.987829  1.000000 -0.500000   1.0
bolus    0.997731 -0.500000  1.000000   1.0
glucose  1.000000  0.987829  0.997731   1.0
macc     1.000000  1.000000  1.000000   1.0
Min Correaltion Matrix computed:           glucose     basal  bolus      macc
basal       -1.0  1.000

## Population Correlations Spearman

In [6]:
# Population-level Spearman analysis on the frames loaded in the previous cell.
classes = [1,2,3,4,5,6,7,8,9]
# Window lengths in samples, one per class. At 5 minutes per sample (the factor
# used in the analysis printout) these are 15, 15, 30, 60, 120, 240 min, 6 h,
# 12 h and 24 h.
# NOTE(review): the previous comment listed 15,30,60,120,240,480 min, 12h, 24h,
# 48h, which does not match these values -- confirm which horizons are intended.
times = [3, 3, 6, 12, 24, 48, 72, 144, 288]
for timeclass, steps in zip(classes, times):
    Correaltion_Analysis(frames, timeclass, method = 'spearman', time_steps = steps)

Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.386619  1.000000 -0.500000  0.264960
bolus   -0.291865 -0.500000  1.000000  0.024744
glucose  1.000000  0.386619 -0.291865 -0.054562
macc    -0.054562  0.264960  0.024744  1.000000
STD Correaltion Matrix computed:            glucose     basal     bolus      macc
basal    0.777830  0.000000       NaN  0.717382
bolus    0.727042       NaN  0.000000  0.711845
glucose  0.000000  0.777830  0.727042  0.792342
macc     0.792342  0.717382  0.711845  0.000000
Max Correaltion Matrix computed:            glucose     basal     bolus  macc
basal    0.866025  1.000000 -0.500000   1.0
bolus    0.866025 -0.500000  1.000000   1.0
glucose  1.000000  0.866025  0.866025   1.0
macc     1.000000  1.000000  1.000000   1.0
Min Correaltion Matrix computed:           glucose     basal  bolus      macc
basal       -1.0  1.00

## Individual Correlations Pearson

In [7]:
# Per-subject Pearson analysis: every CSV file is analysed on its own.
# NOTE(review): this resets the population list and nothing below refills it,
# so re-running a population cell after this one would see no frames.
frames = []
#subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596',]
path = '/Users/beyzacinar/Desktop/MA/CODE/NEW_DATA/TRAIN/'

# collect the paths of all csv files in the folder
csv_files = glob.glob(path + "/*.csv")

classes = [1,2,3,4,5,6,7,8,9]
# Window lengths in samples, one per class. At 5 minutes per sample (the factor
# used in the analysis printout) these are 15, 15, 30, 60, 120, 240 min, 6 h,
# 12 h and 24 h.
# NOTE(review): the previous comment listed 15,30,60,120,240,480 min, 12h, 24h,
# 48h, which does not match these values -- confirm which horizons are intended.
times = [3, 3, 6, 12, 24, 48, 72, 144, 288]

for file in csv_files:
    df = pd.read_csv(file)
    # np.unique sorts, so this is the smallest Subject_ID string in the file
    subject_id = np.unique(df['Subject_ID'].astype(str).to_numpy())[0]
    print(subject_id)
    for timeclass, steps in zip(classes, times):
        print('Correaltion Analysis for subject: ' + subject_id)
        Correaltion_Analysis_Individual(df, timeclass, method = 'pearson', time_steps = steps)

552
Correaltion Analysis for subject: 552
Chosen Method is pearson and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.926927  1.000000    NaN  0.900354
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.926927    NaN -0.102515
macc    -0.102515  0.900354    NaN  1.000000
STD Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.086128  0.000000    NaN  0.048547
bolus         NaN       NaN    NaN       NaN
glucose  0.000000  0.086128    NaN  0.846490
macc     0.846490  0.048547    NaN  0.000000
Max Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.987829  1.000000    NaN  0.934682
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.987829    NaN  1.000000
macc     1.000000  0.934682    NaN  1.000000
Min Correaltion Matrix computed:            glucose     basal  bolus      macc
basal

## Individual Correlations Spearman

In [8]:
# Per-subject Spearman analysis over the csv_files collected in the previous cell.
classes = [1,2,3,4,5,6,7,8,9]
# Window lengths in samples, one per class. At 5 minutes per sample (the factor
# used in the analysis printout) these are 15, 15, 30, 60, 120, 240 min, 6 h,
# 12 h and 24 h.
# NOTE(review): the previous comment listed 15,30,60,120,240,480 min, 12h, 24h,
# 48h, which does not match these values -- confirm which horizons are intended.
times = [3, 3, 6, 12, 24, 48, 72, 144, 288]

for file in csv_files:
    df = pd.read_csv(file)
    # np.unique sorts, so this is the smallest Subject_ID string in the file
    subject_id = np.unique(df['Subject_ID'].astype(str).to_numpy())[0]
    print(subject_id)
    for timeclass, steps in zip(classes, times):
        print('Correaltion Analysis for subject: ' + subject_id)
        Correaltion_Analysis_Individual(df, timeclass, method = 'spearman', time_steps = steps)

552
Correaltion Analysis for subject: 552
Chosen Method is spearman and chosen sequence length are 15 minutes, and considered class is 1 .
Mean Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.866025  1.000000    NaN  0.866025
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.866025    NaN -0.112027
macc    -0.112027  0.866025    NaN  1.000000
STD Correaltion Matrix computed:            glucose  basal  bolus      macc
basal    0.000000    0.0    NaN  0.000000
bolus         NaN    NaN    NaN       NaN
glucose  0.000000    0.0    NaN  0.836251
macc     0.836251    0.0    NaN  0.000000
Max Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.866025  1.000000    NaN  0.866025
bolus         NaN       NaN    NaN       NaN
glucose  1.000000  0.866025    NaN  1.000000
macc     1.000000  0.866025    NaN  1.000000
Min Correaltion Matrix computed:            glucose     basal  bolus      macc
basal    0.866025  