In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it

In [None]:
# Root folder of the DREAMT dataset. When the DREAMT_DATASET_DIR environment variable is set
# it overrides the default below, so the notebook can be run on another machine without
# editing this cell.
dataset_folder = os.environ.get(
    'DREAMT_DATASET_DIR',
    r'C:\Users\mpola\OneDrive\Desktop\Career\Proje\DREAMT\dataset',
)

# os.path.join uses the separator of the host OS instead of a hard-coded backslash
clean_data_folder = os.path.join(dataset_folder, 'clean_data')

# One row per participant (read from df_participants.csv)
df_participants = pd.read_csv(os.path.join(clean_data_folder, 'df_participants.csv'))

# Source of the 'Sleep Disorders' names used to build sd_list
df_sleep_disorders = pd.read_csv(os.path.join(clean_data_folder, 'df_sleep_disorders.csv'))

# Source of the 'Medical History' names used to build mh_list
df_medical_history = pd.read_csv(os.path.join(clean_data_folder, 'df_medical_history.csv'))

# Aggregated measurements (read from df_aggregate.csv)
df_stats = pd.read_csv(os.path.join(clean_data_folder, 'df_aggregate.csv'))

In [None]:
# Shared palettes and category orders, defined once so that field-based colouring stays
# consistent across all graphs.

# Gender: display order first, then one colour per level in that same order
gender_order = ['M', 'F']
gender_palette = dict(zip(gender_order, ['darkblue', 'coral']))

# Yes/No (1/0) flags. The keys are the strings '1' and '0', not integers.
# NOTE(review): presumably this matches how seaborn labels the categorical levels - confirm.
ny_order = ['1', '0']
ny_palette = dict(zip(ny_order, ['red', 'limegreen']))

In [None]:
# Keep the medical-history and sleep-disorder condition names in separate lists, so that
# each graph can choose which group of columns to include.

# Distinct condition names, in order of first appearance in each lookup table
mh_list = pd.unique(df_medical_history['Medical History']).tolist()
sd_list = pd.unique(df_sleep_disorders['Sleep Disorders']).tolist()

# Every condition: medical history first, then sleep disorders
conditions_list = [*mh_list, *sd_list]

Bar Charts
---

Bar Chart for Sleep Disorders by Occurrence in Participants
-

In [None]:
# Per-gender totals of every sleep disorder column, so the bars can be compared by gender
sd_sums = df_participants.groupby('Gender')[sd_list].sum().reset_index()

# Reshaping it into 'long' format (one row per gender/disorder pair) to work with seaborn
plot_data = sd_sums.melt(id_vars='Gender', var_name='column', value_name='sum')

# Sorting the data in descending order for better comparison
plot_data = plot_data.sort_values(by='sum', ascending=False)

# The bars follow the order in which the disorders first appear in the sorted data.
# Capture that order once and use it for BOTH the bars and the tick labels; labelling the
# ticks in sd_list order (as before) attaches the wrong name to a bar whenever the sorted
# order differs from sd_list.
bar_order = plot_data['column'].drop_duplicates().tolist()

# Creating the bar plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=plot_data, x='column', y='sum', hue='Gender', order=bar_order,
            palette=gender_palette, hue_order=gender_order, ax=ax)

# Adding the y-value labels to the bars
for container in ax.containers:
    ax.bar_label(container)

# Adding the title and axis labels
ax.set_title('Sleep Disorders by Occurence in Participants')
ax.set_xlabel('Sleep Disorders')
ax.set_ylabel('Count of Participants')

# Wrapping (one word per line) and rotating the labels so that they stay readable and
# don't overlap
wrapped_labels = [label.replace(' ', '\n') for label in bar_order]

ax.set_xticks(range(len(wrapped_labels)))
ax.set_xticklabels(wrapped_labels, rotation=45)

plt.show()



Bar Chart for Medical Disorder History by Occurrence in Participants
-

In [None]:
# Per-gender totals of every medical history column, so the bars can be compared by gender
mh_sums = df_participants.groupby('Gender')[mh_list].sum().reset_index()

# Reshaping it into 'long' format (one row per gender/condition pair) to work with seaborn
plot_data = mh_sums.melt(id_vars='Gender', var_name='column', value_name='sum')

# Sorting the data in descending order for better comparison
plot_data = plot_data.sort_values(by='sum', ascending=False)

# The bars follow the order in which the conditions first appear in the sorted data.
# Capture that order once and use it for BOTH the bars and the tick labels; labelling the
# ticks in mh_list order (as before) attaches the wrong name to a bar whenever the sorted
# order differs from mh_list.
bar_order = plot_data['column'].drop_duplicates().tolist()

# Creating the bar plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=plot_data, x='column', y='sum', hue='Gender', order=bar_order,
            palette=gender_palette, hue_order=gender_order, ax=ax)

# Adding the y-value labels to the bars
for container in ax.containers:
    ax.bar_label(container)

# Adding the title and axis labels
ax.set_title('Medical Disorder History by Occurence in Participants')
ax.set_xlabel('Medical Disorders')
ax.set_ylabel('Count of Participants')

# Wrapping (one word per line) and rotating the labels so that they stay readable and
# don't overlap
wrapped_labels = [label.replace(' ', '\n') for label in bar_order]

ax.set_xticks(range(len(wrapped_labels)))
ax.set_xticklabels(wrapped_labels, rotation=45)

plt.show()

Box Charts
---

Apnea-Hypopnea Index Distribution by Sleep Disorders 
-

In [None]:
# One AHI boxplot per sleep disorder column, laid out on a grid of subplots.
# The number of columns is derived with a ceiling division so that every disorder gets a
# panel even when len(sd_list) is not a multiple of the row count (plain floor division
# produced too few panels, or zero columns for short lists).
n_rows = 5
n_cols = max(1, -(-len(sd_list) // n_rows))

plt.figure(figsize=(15, 24))

# Create boxplots for each sleep disorder
for i, sd_col in enumerate(sd_list):
    plt.subplot(n_rows, n_cols, i + 1)

    # Original author's note: called like this, seaborn warns about passing palette without
    # assigning hue, but assigning hue to the same variable as the x axis raises another
    # error that prevents the graphs from rendering, so the call is left as it is.
    ax = sns.boxplot(data=df_participants, x=sd_col, y='AHI', legend=False, palette=ny_palette, hue_order=ny_order)

    # NOTE(review): assumes each column holds exactly the two levels 0 and 1, drawn in that
    # order - confirm against the data.
    ax.set_xticks(range(2))
    ax.set_xticklabels(['No', 'Yes'])

    plt.title(f'AHI for {sd_col}')
    plt.xlabel('')
    plt.ylabel('AHI')

plt.tight_layout()  # Adjust subplots to fit into the figure area
plt.show()

Apnea-Hypopnea Index Distribution by Medical Conditions
-

In [None]:
# One AHI boxplot per medical history column, laid out on a grid of subplots.
# The number of columns is derived with a ceiling division so that every condition gets a
# panel even when len(mh_list) is not a multiple of the row count (plain floor division
# produced too few panels, or zero columns for short lists).
n_rows = 4
n_cols = max(1, -(-len(mh_list) // n_rows))

plt.figure(figsize=(15, 24))

# Create boxplots for each medical history condition
for i, mh_col in enumerate(mh_list):
    plt.subplot(n_rows, n_cols, i + 1)

    # Original author's note: called like this, seaborn warns about passing palette without
    # assigning hue, but assigning hue to the same variable as the x axis raises another
    # error that prevents the graphs from rendering, so the call is left as it is.
    ax = sns.boxplot(data=df_participants, x=mh_col, y='AHI', legend=False, palette=ny_palette, hue_order=ny_order)

    # NOTE(review): assumes each column holds exactly the two levels 0 and 1, drawn in that
    # order - confirm against the data.
    ax.set_xticks(range(2))
    ax.set_xticklabels(['No', 'Yes'])

    plt.title(f'AHI for {mh_col}')
    plt.xlabel('')
    plt.ylabel('AHI')

plt.tight_layout()  # Adjust subplots to fit into the figure area
plt.show()

Correlation Heatmaps
---

In [None]:
# The dataset holds far too many measurements to show in a single readable heatmap, while
# splitting them over several heatmaps would either hide some potential connections or
# scatter the information across too many figures. The helper below therefore keeps only
# the columns that are strongly correlated with at least one other column.

##################################################
def corr_test(df, threshold):
    """Reduce a correlation matrix to its strongly correlated labels.

    Parameters
    ----------
    df : DataFrame
        Matrix looked up as ``df.loc[first, second]`` for every unordered pair of its
        column labels, so the column labels must also be present in the index.
    threshold : number
        A pair is kept when the absolute value of its entry is ``>= threshold``.

    Returns
    -------
    DataFrame
        ``df`` restricted, on both axes, to the sorted labels that take part in at
        least one kept pair.
    """
    strong_labels = {
        label
        for first, second in it.combinations(df.columns, 2)
        if abs(df.loc[first, second]) >= threshold
        for label in (first, second)
    }
    kept = sorted(strong_labels)
    return df.loc[kept, kept]
##################################################

In [None]:
# The heatmaps are split into groupings based on the sleep stages, since there is no healthy
# comparison to be made for measurements throughout the stages with the data we have.
# Creating a separate dataframe that tracks the difference of these statistics between
# stages might result in useful insights, but some of these stages last for such short
# durations that the sample size might not be enough for healthy observation (this research
# already only has 100 participants).

##################################################
def corr_heatmap(corr, threshold, title, fig_size = 16):
    """Draw an annotated heatmap of the strongly correlated part of ``corr``.

    Parameters
    ----------
    corr : DataFrame
        Correlation matrix; it is filtered with ``corr_test(corr, threshold)`` first.
    threshold : number
        Threshold forwarded to ``corr_test``.
    title : str
        Title of the figure.
    fig_size : number, default 16
        Width and height of the (square) figure.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as the current figure.

    Raises
    ------
    ValueError
        If no pair of columns reaches ``threshold``, so there is nothing to draw.
    """
    # Filter once and reuse the result (it was previously recomputed for every use)
    strong_corr = corr_test(corr, threshold)

    # Fail with a readable message, before opening a figure, instead of letting the
    # heatmap call fail on an empty matrix.
    if strong_corr.empty:
        raise ValueError(
            f'No pair of columns has an absolute correlation >= {threshold}; '
            f'nothing to plot for "{title}".'
        )

    # One word per line keeps long column names readable next to the cells
    wrapped_labels = [label.replace(' ', '\n') for label in strong_corr.columns]
    # Ticks are placed at +0.5 so each label sits in the middle of its cell
    tick_positions = np.arange(len(wrapped_labels)) + 0.5

    plt.figure(figsize = (fig_size, fig_size))
    sns.heatmap(strong_corr,
                annot= True,
                cmap = 'mako',
               )
    plt.title(title)

    plt.xticks(rotation = 90,
               ticks = tick_positions,
               labels= wrapped_labels)
    plt.yticks(rotation = 0,
               ticks = tick_positions,
               labels= wrapped_labels)
    return plt
##################################################

In [None]:
# Some of the columns in the participants dataframe store numerical data in string formats,
# and must be converted in order to use them in correlation heatmaps.
gender_map = {'M': 1, 'F': -1}

# assign() builds a NEW dataframe, so df_participants keeps its original 'M'/'F' and
# percentage-string columns. The earlier cells that group by 'Gender' keep working, and this
# cell can be re-run safely. (pd.DataFrame(df_participants) is not guaranteed to be an
# independent copy.)
participants_numeric = df_participants.assign(**{
    'Gender': df_participants['Gender'].map(gender_map),
    # e.g. a string ending in '%' becomes a fraction: strip the sign, parse, divide by 100
    'Mean SaO2': df_participants['Mean SaO2'].str.rstrip('%').astype('float') / 100.0,
})

# We drop the columns tracking the minimum and the maximum, as they complicate the graphs
# too much without revealing useful insight by themselves
min_max_cols = df_stats.columns[df_stats.columns.str.contains('Min|Max')]
df_temp_stats = df_stats.drop(columns=min_max_cols)

# Afterwards, we merge the participants dataframe with our aggregate information dataframe
# so that we can correlate the measured data with the medical conditions. 'SID' stays a
# regular column: the merge joins on it and the heatmap cells drop it by name. (The old
# `df_temp.set_index('SID')` call discarded its result and had no effect, so it is removed.)
df_temp = participants_numeric.merge(df_temp_stats, how='left', on='SID')

In [None]:
# Correlation heatmap for the rows recorded while awake (Sleep Stage 'W')
corr_W = df_temp[df_temp['Sleep Stage']=='W'].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for Awake Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_W, 0.9, title);

In [None]:
# Correlation heatmap for the rows recorded in the N1 stage of NREM sleep.
# Stored under its own name so it cannot be mistaken for the awake-stage matrix.
corr_N1 = df_temp[df_temp['Sleep Stage']=='N1'].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for N1 Sleep Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_N1, 0.8, title);

In [None]:
# Correlation heatmap for the rows recorded in the N2 stage of NREM sleep.
# Stored under its own name so it cannot be mistaken for the awake-stage matrix.
corr_N2 = df_temp[df_temp['Sleep Stage']=='N2'].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for N2 Sleep Data'

# Drawn on a larger (20 x 20) figure than the default.
# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_N2, 0.9, title, 20);

In [None]:
# Correlation heatmap for the rows recorded in the N3 stage of NREM sleep.
# Stored under its own name so it cannot be mistaken for the awake-stage matrix.
corr_N3 = df_temp[df_temp['Sleep Stage']=='N3'].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for N3 Sleep Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_N3, 0.9, title);

In [None]:
# Correlation heatmap for all NREM stages together: the rows of N1, N2 and N3 are pooled
# before computing the correlations.
# NOTE(review): presumably each participant contributes one row per stage here - confirm
# that pooling them is intended.

N_list = ['N1', 'N2', 'N3']

corr_N = df_temp[df_temp['Sleep Stage'].isin(N_list)].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for Non-REM Sleep Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_N, 0.85, title);

In [None]:
# Correlation heatmap for the rows recorded in the REM stage of sleep (Sleep Stage 'R')
corr_R = df_temp[df_temp['Sleep Stage']=='R'].drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for REM Sleep Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr_R, 0.9, title);

In [None]:
# Correlation heatmap over all rows at once, without segmenting into sleep stages

corr = df_temp.drop(['Sleep Stage', 'SID'], axis='columns').corr()
title = 'Correlation Heatmap for Total Data'

# The trailing semicolon stops the returned pyplot module from being echoed as cell output
corr_heatmap(corr, 0.85, title);