# Correlation Analysis and Pairwise Plots

In [1]:
# imports 
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import scipy
import glob

import seaborn as sns

In [2]:
def Correaltion_Analysis_indi(df, classes, subject_ID, method = 'pearson'):
    """Print and return the correlation table for one subject and one class.

    Used for the individual models, which is why the subject id is part of
    the printed header.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise with ``DataFrame.corr``.
    classes : object
        Label of the class under consideration; only used in the printed header.
    subject_ID : object
        Identifier of the subject; only used in the printed header.
    method : str, default 'pearson'
        Correlation method, either 'pearson' or 'spearman' (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        The correlation matrix of ``df``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously every
        value other than 'pearson' silently fell back to Spearman while the
        printed header still showed the value that was passed in.
    """
    supported_methods = ('pearson', 'spearman')
    chosen = method.lower() if isinstance(method, str) else method
    if chosen not in supported_methods:
        raise ValueError(
            "method must be one of %s, got %r" % (supported_methods, method)
        )

    correlations = df.corr(method = chosen)

    # The header reports the method that was actually applied.
    print('Chosen Method is', chosen, "for class", classes, "and subject", subject_ID, '.')
    print(correlations)
    return correlations 

def Correaltion_Analysis_pop(df, classes, method = 'pearson'):
    """Print and return the correlation table for one class of the population.

    Used for the population models, so no subject id appears in the header.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise with ``DataFrame.corr``.
    classes : object
        Label of the class under consideration; only used in the printed header.
    method : str, default 'pearson'
        Correlation method, either 'pearson' or 'spearman' (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        The correlation matrix of ``df``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously every
        value other than 'pearson' silently fell back to Spearman while the
        printed header still showed the value that was passed in.
    """
    supported_methods = ('pearson', 'spearman')
    chosen = method.lower() if isinstance(method, str) else method
    if chosen not in supported_methods:
        raise ValueError(
            "method must be one of %s, got %r" % (supported_methods, method)
        )

    correlations = df.corr(method = chosen)

    # The header reports the method that was actually applied.
    print('Chosen Method is', chosen, "for class", classes, '.')
    print(correlations)
    return correlations 

## Individual Correlation Results for Pearson and Spearman

In [7]:
# Per-subject Pearson correlations, one table per (subject, class) pair.

# subject ids, iterated one person at a time
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/GAPS_DATA/TRAIN/'

# class labels, iterated one class at a time
classes = [0,1,2,3,4,5,6,7,8,9]

# the data folder holds one sub-folder of csv files per subject id
for subject_ID in subject_IDs:
    # sorted: glob returns files in arbitrary order, and the concatenation
    # order should not depend on the filesystem
    csv_files = sorted(glob.glob(path + subject_ID + "/*.csv"))
    if not csv_files:
        # fail with a readable message instead of pandas' "No objects to concatenate"
        raise FileNotFoundError(
            "No CSV files found for subject %s under %s" % (subject_ID, path)
        )
    # all dataframes of the same subject are concatenated
    df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    # missing values in macc were saved as -1; turn them into NaN so they can be dropped
    df["macc"] = df["macc"].replace(-1, np.nan)
    # NOTE: dropna() removes rows with a NaN in ANY column, not only in macc
    df = df.dropna()
    # Correaltion_Analysis_indi() is called per class on the chosen columns with the Pearson method
    for i in classes:
        df_corr = df.loc[df["Class"] == i, ["glucose", "basal", "bolus", "macc"]]
        Correaltion_Analysis_indi(df_corr, i, subject_ID, method = 'pearson')

Chosen Method is pearson for class 0 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.048508  0.056369 -0.226349
basal    0.048508  1.000000 -0.019755  0.006372
bolus    0.056369 -0.019755  1.000000 -0.032977
macc    -0.226349  0.006372 -0.032977  1.000000
Chosen Method is pearson for class 1 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.034898  0.011249  0.016087
basal    0.034898  1.000000 -0.109426 -0.197908
bolus    0.011249 -0.109426  1.000000  0.104866
macc     0.016087 -0.197908  0.104866  1.000000
Chosen Method is pearson for class 2 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.093616 -0.054852 -0.010059
basal    0.093616  1.000000 -0.019609 -0.043839
bolus   -0.054852 -0.019609  1.000000  0.119683
macc    -0.010059 -0.043839  0.119683  1.000000
Chosen Method is pearson for class 3 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.0

In [8]:
# Per-subject Spearman correlations, one table per (subject, class) pair.

# subject ids, iterated one person at a time
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/GAPS_DATA/TRAIN/'

# class labels, iterated one class at a time
classes = [0,1,2,3,4,5,6,7,8,9]

# the data folder holds one sub-folder of csv files per subject id
for subject_ID in subject_IDs:
    # sorted: glob returns files in arbitrary order, and the concatenation
    # order should not depend on the filesystem
    csv_files = sorted(glob.glob(path + subject_ID + "/*.csv"))
    if not csv_files:
        # fail with a readable message instead of pandas' "No objects to concatenate"
        raise FileNotFoundError(
            "No CSV files found for subject %s under %s" % (subject_ID, path)
        )
    # all dataframes of the same subject are concatenated
    df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    # missing values in macc were saved as -1; turn them into NaN so they can be dropped
    df["macc"] = df["macc"].replace(-1, np.nan)
    # NOTE: dropna() removes rows with a NaN in ANY column, not only in macc
    df = df.dropna()
    # Correaltion_Analysis_indi() is called per class on the chosen columns with the Spearman method
    for i in classes:
        df_corr = df.loc[df["Class"] == i, ["glucose", "basal", "bolus", "macc"]]
        Correaltion_Analysis_indi(df_corr, i, subject_ID, method = 'spearman')

Chosen Method is spearman for class 0 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.014602  0.068484 -0.133367
basal    0.014602  1.000000 -0.007625  0.067310
bolus    0.068484 -0.007625  1.000000 -0.031357
macc    -0.133367  0.067310 -0.031357  1.000000
Chosen Method is spearman for class 1 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.056671  0.069783  0.081102
basal    0.056671  1.000000 -0.130401 -0.183764
bolus    0.069783 -0.130401  1.000000  0.137834
macc     0.081102 -0.183764  0.137834  1.000000
Chosen Method is spearman for class 2 and subject 540 .
          glucose     basal     bolus      macc
glucose  1.000000  0.113835 -0.024850  0.107014
basal    0.113835  1.000000  0.038267  0.062480
bolus   -0.024850  0.038267  1.000000  0.209288
macc     0.107014  0.062480  0.209288  1.000000
Chosen Method is spearman for class 3 and subject 540 .
          glucose     basal     bolus      macc
glucose 

## Population Correlation Results for Pearson and Spearman

In [12]:
# Population-level correlations: all subjects are pooled, then one Pearson and
# one Spearman table is printed per class.

# list collecting the cleaned dataframe of every subject
frames = []
# subject ids, iterated one person at a time
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/GAPS_DATA/TRAIN/'

# class labels, iterated one class at a time
classes = [0,1,2,3,4,5,6,7,8,9]

# the data folder holds one sub-folder of csv files per subject id
for subject_ID in subject_IDs:
    # sorted: glob returns files in arbitrary order, and the concatenation
    # order should not depend on the filesystem
    csv_files = sorted(glob.glob(path + subject_ID + "/*.csv"))
    if not csv_files:
        # fail with a readable message instead of pandas' "No objects to concatenate"
        raise FileNotFoundError(
            "No CSV files found for subject %s under %s" % (subject_ID, path)
        )
    # all dataframes of one subject are concatenated
    df_plot = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    # missing values in macc were saved as -1; turn them into NaN so they can be dropped
    df_plot["macc"] = df_plot["macc"].replace(-1, np.nan)
    # NOTE: dropna() removes rows with a NaN in ANY column, not only in macc
    df_plot = df_plot.dropna()
    # each subject's dataframe is stored in frames
    frames.append(df_plot)
# all dataframes of all subjects are concatenated
df_plot_list = pd.concat(frames, ignore_index=True)

# one working copy is enough; the loop below only reads from it
df = df_plot_list.copy()
# Correaltion_Analysis_pop() is called per class on the chosen columns, first with Pearson, then with Spearman
for i in classes:
    df_corr = df.loc[df["Class"] == i, ["glucose", "basal", "bolus", "macc"]]
    Correaltion_Analysis_pop(df_corr, i, method = 'pearson')
    Correaltion_Analysis_pop(df_corr, i, method = 'spearman')

Chosen Method is pearson for class 0 .
          glucose     basal     bolus      macc
glucose  1.000000  0.058903  0.046786 -0.062635
basal    0.058903  1.000000  0.020536 -0.082659
bolus    0.046786  0.020536  1.000000 -0.034987
macc    -0.062635 -0.082659 -0.034987  1.000000
Chosen Method is spearman for class 0 .
          glucose     basal     bolus      macc
glucose  1.000000  0.042385  0.052118 -0.024941
basal    0.042385  1.000000  0.026518 -0.042937
bolus    0.052118  0.026518  1.000000  0.003660
macc    -0.024941 -0.042937  0.003660  1.000000
Chosen Method is pearson for class 1 .
          glucose     basal     bolus      macc
glucose  1.000000  0.088447 -0.030201 -0.033254
basal    0.088447  1.000000  0.016948 -0.281876
bolus   -0.030201  0.016948  1.000000 -0.041782
macc    -0.033254 -0.281876 -0.041782  1.000000
Chosen Method is spearman for class 1 .
          glucose     basal     bolus      macc
glucose  1.000000  0.067402 -0.003406 -0.059112
basal    0.067402  1.00000

## Pairwise-Plots

In [None]:
# Pairwise plots: all subjects are pooled, then one seaborn pair plot per class
# is drawn and written to Correlations/Corr_<class>.png.

# list collecting the cleaned dataframe of every subject
frames = []
# subject ids, iterated one person at a time
subject_IDs = ['540', '544', '552', '559', '563', '567','570', '575', '584', '588', '591', '596']
path = '/GAPS_DATA/TRAIN/'

# class labels, iterated one class at a time
classes = [0,1,2,3,4,5,6,7,8,9]


# the data folder holds one sub-folder of csv files per subject id
for subject_ID in subject_IDs:
    # sorted: glob returns files in arbitrary order, and the concatenation
    # order should not depend on the filesystem
    csv_files = sorted(glob.glob(path + subject_ID + "/*.csv"))
    if not csv_files:
        # fail with a readable message instead of pandas' "No objects to concatenate"
        raise FileNotFoundError(
            "No CSV files found for subject %s under %s" % (subject_ID, path)
        )
    # all dataframes of one subject are concatenated
    df_plot = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
    # missing values in macc were saved as -1; turn them into NaN so they can be dropped
    df_plot["macc"] = df_plot["macc"].replace(-1, np.nan)
    # NOTE: dropna() removes rows with a NaN in ANY column, not only in macc
    df_plot = df_plot.dropna()
    # each subject's dataframe is stored in frames
    frames.append(df_plot)
# all dataframes of all subjects are concatenated
df_plot_list = pd.concat(frames, ignore_index=True)

# savefig does not create missing folders, so make sure the target exists
os.makedirs("Correlations", exist_ok=True)

# one pair plot per class
for i in classes:
    print('The class is class:', i)
    # boolean-mask selection already yields a new frame, no full copy needed
    df = df_plot_list.loc[df_plot_list["Class"] == i]
    # the pairwise plots are plotted with chosen variables
    pairplots = sns.pairplot(df[['glucose', 'basal', 'bolus', 'macc']])
    file_name = "Correlations/Corr_%i.png" % (i)
    # the plots are saved for each class
    pairplots.savefig(file_name)