In [1]:
!pip3 install torch torchvision torchaudio --quiet
!pip install gpytorch --quiet
!pip install mne --quiet
!pip install pandas --quiet
!pip install pywavelets --quiet
!pip install edfio --quiet
!pip install tqdm --quiet

In [2]:
import os
import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

import mne
from mne.stats.parametric import _parametric_ci
from mne import create_info
from mne.io import RawArray

from KC_algorithm.model import score_KCs
from KC_algorithm.utils import EpochData
from KC_algorithm.plotting import KC_from_probas, plot_all_Kcs

from utils.plots import plot_Kcs_in_single_chart, plot_Kcs_in_single_chart_epoch_data, plot_kcs_per_minute_and_sleep_stages, KC_from_probas_epoch_data
from utils.io import import_event_and_stages_SHHS, get_total_recording_time, store_kcs_data, load_kcs_edf_files, load_invalid_kc_metadata, load_valid_kc_metadata
from utils.preprocessing import remove_steady_epochs, remove_standard_deviation_outliers
from utils.postprocessing import calculate_kcs_per_minute, filter_kcs_by_sleep_stages

Updated sk-learn import path


In [14]:
# Root folder of the local NSRR SHHS download. Set the SHHS_DATASET_DIR
# environment variable to run the notebook on another machine; the default is
# the original SageMaker location.
DATASET_DIR = os.environ.get("SHHS_DATASET_DIR", "/home/ec2-user/SageMaker/datasets/nsrr/shhs/")

datasets_dir = os.path.join(DATASET_DIR, 'datasets')

# Individual datasets with population & events data, plus the variable dictionary
population_dataset_enriched = os.path.join(datasets_dir, 'shhs1-dataset-0.20.0.csv')
events_dataset = os.path.join(datasets_dir, 'shhs-cvd-summary-dataset-0.20.0.csv')
variables_dataset = os.path.join(datasets_dir, 'shhs-data-dictionary-0.20.0-variables.csv')

# Load the datasets to make them globally available to the cells below
variables_df = pd.read_csv(variables_dataset)
events_df = pd.read_csv(events_dataset)
population_df_enriched = pd.read_csv(population_dataset_enriched)

In [15]:
# Preview the first five rows of the CVD summary table
events_df.head(n=5)

Unnamed: 0,nsrrid,pptid,vital,prev_mi,prev_mip,prev_stk,mi,mip,mi_fatal,stroke,...,ptca_date,cabg_date,chf_date,pptidr,visitnumber,afibprevalent,afibincident,gender,race,age_s1
0,200001,1,1,,,,,,,,...,,,,1,3,,,1,1,55
1,200002,2,0,,,,,,,,...,,,,2,3,,,1,1,78
2,200003,3,0,,,,,,,,...,,,,3,3,,,2,1,77
3,200004,4,1,,,,,,,,...,,,,4,3,,,1,1,48
4,200005,5,0,,,,,,,,...,,,,5,3,,,2,2,66


### Explore Events
- The following is partially copied from the official [SHHS Dataset Description](https://www.sleepdata.org/datasets/shhs/pages/04-dataset-introduction.md)
- shhs-cvd (CVD Outcomes): tracking of adjudicated heart health outcomes (e.g. stroke, heart attack) between baseline and 2008-2011 (varies by parent cohort); 5,802 rows (outcomes data were not provided for all subjects)
- shhs-cvd-events (CVD Outcome Events): event-level details for the tracking of heart health outcomes; 4,839 rows, each representing an individual event

Key outcomes for SHHS include the following incident or recurrent CVD events or diagnoses occurring subsequent to the first SHHS PSG:

- Hospitalized acute MI (HAMI)
- Coronary surgical intervention -- percutaneous transluminal coronary angioplasty (PTCA), coronary stent placement, coronary artery bypass grafting (CABG)
- Congestive Heart Failure (CHF)
- Coronary heart disease death
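- Any coronary heart disease (CHD) -- summary variable which includes HAMI, coronary surgical intervention, and coronary heart disease death (the 1st, 2nd, and 4th items above).
- Any cardiovascular disease (CVD) -- summary variable which includes HAMI, coronary surgical intervention, CHF, and coronary heart disease death (the 1st to 4th items above).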
- Angina pectoris (AP) -- at CHS and FHS only

#### What we want to track
Anything related to mortality, stroke, etc. This results in the following "labels":

- vital: Vital status at last contact
- prev_mi: Number of myocardial infarctions (MIs, i.e. heart attacks; affects the heart) *Prior* to Baseline
- prev_mip: Number of Procedures Related to Heart Attack *Prior* to Baseline
- prev_stk: Number of Strokes (affects the brain) *Prior* to Baseline
- mi_fatal: Fatal Heart Attack Since Baseline [True/False]
- stk_fatal: Fatal Stroke Since Baseline [True/False]
- chd_death: Fatal Coronary Heart Disease (CHD) Since Baseline [True/False]
- cvd_death: Fatal Cardiovascular Disease (CVD) Since Baseline [True/False]
- any_chd: Any Coronary Heart Disease (CHD) Since Baseline [True/False]
- any_cvd: Any Cardiovascular Disease (CVD) Since Baseline [True/False]
variables_df.head()  # preview only; the full data dictionary is too long to display

In [6]:
# Outcome columns recorded *prior* to baseline (history at enrolment).
prev_selected_columns = ["prev_mi", "prev_mip", "prev_stk", "prev_chf", "prev_revpro", "prev_ang"]

# Outcome columns recorded since baseline. Every name appears exactly once so
# that using this list as a DataFrame column selector cannot yield duplicated
# columns ('stroke' and 'stk_fatal' were previously listed twice).
selected_columns = [
    "vital", "mi", "mip", "mi_fatal", "stroke", "stk_fatal",
    "chd_death", "cvd_death", "angina", "revasc_proc", "ptca", "cabg",
    "chf", "any_chd", "any_cvd", "mi_death",
]

Unnamed: 0,folder,id,display_name,description,type,units,domain,labels,calculation,commonly_used,forms
0,Administrative/Interim,calldt,Interim Follow-up: Days from index date to call,,numeric,days from index date,,interim_shhs,,,
1,Administrative/Interim,cmplbp,Interim Follow-up: Completed blood pressure (BP),,choices,,complete2,interim_shhs,,,
2,Administrative/Interim,cmplcvd,Interim Follow-up: Completed cardiovascular di...,,choices,,complete4,interim_shhs,,,
3,Administrative/Interim,cmplmeds,Interim Follow-up: Completed medication form,,choices,,complete4,interim_shhs,,,
4,Administrative/Interim,cmplshq,Interim Follow-up: Completed sleep health ques...,,choices,,complete4,interim_shhs,,,


In [12]:
# Lookup tables keyed by variable id, built from the data-dictionary frame.
# Zipping whole columns avoids a row-by-row iterrows() loop and leaves no
# loop variables behind in the notebook namespace. As with the loop, a
# repeated id keeps the value from its last occurrence.
id_to_display_name = dict(zip(variables_df['id'], variables_df['display_name']))
id_to_description = dict(zip(variables_df['id'], variables_df['description']))

In [None]:
# Assemble a dictionary table for the events dataset: one row per events_df
# column, carrying the display name and description looked up by column id
# (None when the id is missing from the lookup tables).
event_column_ids = events_df.columns.to_list()
mapped_data = {
    'column_id': event_column_ids,
    'display_name': [id_to_display_name.get(cid) for cid in event_column_ids],
    'description': [id_to_description.get(cid) for cid in event_column_ids],
}

mapped_df = pd.DataFrame(mapped_data)
mapped_df.head()

In [16]:
# Show every column name of the CVD summary table as a plain Python list
events_df.columns.tolist()

['nsrrid',
 'pptid',
 'vital',
 'prev_mi',
 'prev_mip',
 'prev_stk',
 'mi',
 'mip',
 'mi_fatal',
 'stroke',
 'stk_fatal',
 'chd_death',
 'cvd_death',
 'angina',
 'revasc_proc',
 'ptca',
 'cabg',
 'chf',
 'prev_chf',
 'any_chd',
 'any_cvd',
 'prev_revpro',
 'mi_death',
 'prev_ang',
 'censdate',
 'mi_date',
 'mip_date',
 'stk_date',
 'chd_dthdt',
 'cvd_dthdt',
 'ang_date',
 'revpro_date',
 'ptca_date',
 'cabg_date',
 'chf_date',
 'pptidr',
 'visitnumber',
 'afibprevalent',
 'afibincident',
 'gender',
 'race',
 'age_s1']

In [None]:
# Persist the column dictionary next to the source CSVs. index=False keeps the
# default RangeIndex out of the file, so reading it back with a plain
# pd.read_csv(path) does not produce a spurious "Unnamed: 0" column.
mapped_df.to_csv(
    os.path.join(datasets_dir, 'shhs-cvd-summary-dataset-dictionary-0.20.0.csv'),
    index=False,
)