In [92]:
from config import project_config as config
from utils.sleep_wake_filter import filter_sleep_series
import pandas as pd
import numpy as np
from functools import reduce
from utils.data_utils import read_sleep_diaries
from sklearn.metrics import classification_report, cohen_kappa_score

In [2]:
# --- Configuration ---------------------------------------------------------
merged_sources_path = 'Results/merged_indicators'
label = 'AWS Sleep'
pred_col_name = 'pred_best_on_all'
models = [pred_col_name, 'Biobank Sleep']
sleep_diaries_path = 'data/Sleep diaries'
diaries_df = read_sleep_diaries(sleep_diaries_path, include_naps=False)

results = pd.DataFrame()  # Not used in this cell; kept in case other cells rely on the name

# --- Load per-subject predictions and annotate diary sleep periods ----------
# One frame per subject is collected and concatenated once after the loop,
# instead of re-copying an ever-growing frame on every iteration.
# Rows with a missing label or prediction are kept here; cells that compute
# metrics drop them as needed.
subject_frames = []
for subject_id in config['subject_ids']:  # `subject_id` rather than `id`, which shadows the builtin

    subject_diary = diaries_df[diaries_df['subject_id'] == subject_id]

    preds_df = pd.read_csv(f'{merged_sources_path}/sub_{subject_id:03d}.csv')
    preds_df['epoch_ts'] = pd.to_datetime(preds_df['epoch_ts'])
    preds_df.insert(0, 'subject_id', subject_id)

    # Mark the epochs between lights-off and lights-on as recorded in the sleep diary:
    # - `lights_off_time` is 1 on epochs whose timestamp equals a diary lights-off time
    # - `lights_on_time` is -1 on epochs whose timestamp equals a diary lights-on time
    # - their sum is +1 where a period starts and -1 where it ends (this assumes a
    #   start and an end never fall on the same epoch)
    # - the cumulative sum of that series is 1 from the lights-off epoch up to, but
    #   not including, the lights-on epoch, and 0 elsewhere
    # NOTE(review): `isin` needs exact timestamp equality. A diary time that does not
    # coincide with an epoch timestamp leaves the running sum unbalanced for all later
    # rows of that subject - confirm diary times are aligned to the epoch grid.
    lights_off_time = preds_df['epoch_ts'].isin(subject_diary['lights_off']).astype(int)
    lights_on_time = -preds_df['epoch_ts'].isin(subject_diary['lights_on']).astype(int)
    preds_df['lights_off_period'] = (lights_off_time + lights_on_time).cumsum()

    # Give each sleep episode a distinct id: the running count of lights-off events,
    # kept only on rows inside a lights-off period and set to 0 everywhere else.
    sleep_episode_counter = lights_off_time.cumsum()
    preds_df['sleep_episode_id'] = sleep_episode_counter.where(preds_df['lights_off_period'] == 1, 0)

    subject_frames.append(preds_df)

# The original per-file row index is preserved (no ignore_index), so index labels
# repeat across subjects exactly as before.
all_preds = pd.concat(subject_frames) if subject_frames else pd.DataFrame()


In [None]:
# Ad-hoc sanity check for subject 21: column sums over its cross-validation epochs,
# first for all of them and then only for those inside a lights-off period.
cv_rows_subject_21 = (all_preds['subject_id'] == 21) & (all_preds['is_cv_prediction'] == 1)
q = all_preds.loc[cv_rows_subject_21, ['subject_id', 'PSG Sleep', 'lights_off_period']]

print(q.sum())

inside_lights_off = q['lights_off_period'] == 1
q = q.loc[inside_lights_off]

print(q.sum())

In [98]:
# Per-subject agreement between PSG labels and the model predictions in
# `pred_col_name`, computed over CV epochs only.
# The prediction column is part of the dropna subset because it is passed to
# scikit-learn as y_pred below, and NaN values there would raise.
temp_df = all_preds.dropna(subset=['PSG Sleep', 'Biobank Sleep', pred_col_name])
temp_df = temp_df[temp_df['is_cv_prediction'] == 1]

subject_metrics_list = []
for subject_id in config['subject_ids']:
    subset_df = temp_df[temp_df['subject_id'] == subject_id]
    y_true = subset_df['PSG Sleep']
    y_pred = subset_df[pred_col_name]

    report = classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)

    # Macro-averaged precision / recall / f1-score; the 'support' count is not a metric.
    metrics_dict = {name: value for name, value in report['macro avg'].items() if name != 'support'}
    metrics_dict["Cohen's Kappa"] = cohen_kappa_score(y1=y_true, y2=y_pred)
    # Specificity is the recall of the class reported under the key '0.0'.
    # NOTE(review): that key exists only when the labels are floats and the value 0.0
    # occurs for this subject - confirm this holds for every subject.
    # The misspelled label "Specificitiy" is kept as-is so existing outputs and any
    # code that filters on it keep working.
    metrics_dict["Specificitiy"] = report['0.0']['recall']

    # Long format: one row per (subject, metric).
    subject_df = pd.DataFrame({'Metric': list(metrics_dict), 'Value': list(metrics_dict.values())})
    subject_df.insert(0, 'subject_id', subject_id)
    subject_metrics_list.append(subject_df)

subject_metrics_df = pd.concat(subject_metrics_list)

# Attach each participant's gender from the demographics spreadsheet.
demograph_df = pd.read_excel('data/SRCDRI001 PARTICIPANT DEMOGRAPHICS & AHI FEB22.xlsx')
demograph_df = demograph_df.rename(columns={'Participant no.': 'subject_id', 'M/F': 'gender'})
demograph_df = demograph_df[['subject_id', 'gender']]

# Inner join (the pandas default, spelled out here): subjects without a row in the
# demographics sheet are left out of the result.
subject_metrics_df = pd.merge(left=subject_metrics_df, right=demograph_df, on='subject_id', how='inner')

In [99]:
# Bias check: mean of every metric, computed separately for each gender.
subject_metrics_df.groupby(['Metric', 'gender'])[['Value']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Metric,gender,Unnamed: 2_level_1
Cohen's Kappa,F,0.47643
Cohen's Kappa,M,0.440576
Specificitiy,F,0.607817
Specificitiy,M,0.598516
f1-score,F,0.731711
f1-score,M,0.713335
precision,F,0.762396
precision,M,0.74137
recall,F,0.739298
recall,M,0.72941
