In [6]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [7]:
# Load data
data_file = '../../../data/original/UMD Sleep Apnea data.csv'
data = pd.read_csv(data_file)

# Define variables
# Categorical columns that get expanded into indicator (dummy) columns below.
categorical_variables = ["gender", "ethnicity", "allergies", "asthma", "gerd"]
# Columns (raw and derived) that make up the final dataset, in output order.
final_variables = [
    'ahi', 'rem', 'nrem', 'tonsilsize_3-4', 'gender_Male', 'age', 'bmi', 'zscore',
    'term', 'allergies_Yes', 'asthma_Yes', 'gerd_Yes', 'ethnicity_Hispanic',
    'ethnicity_White', 'ethnicity_Black', 'ethnicity_Asian', 'tst50co2', 'sleepefficiency',
    'arousali', 'oai', 'cai', 'tb90', 'lowsao2', 'peakc02', 'oahi', 'ohi',
]

# Set AHI threshold
ahi_thresh = 5

# Remove NAs from data: missing `term` values become 0.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection, which is a chained in-place update that newer pandas
# versions deprecate and that does not modify `data` under Copy-on-Write.
data["term"] = data["term"].fillna(0)

# Encode categorical data
# Each categorical column becomes "<column>_<level>" indicator columns.
# "ethnicity" keeps every level (final_variables picks its columns by name);
# the other variables drop their first level.
dummy_frames = []
for cat_label in categorical_variables:
    # Compare strings by value (==); `is` tests object identity and only
    # appeared to work because of string interning.
    keep_all_levels = cat_label == "ethnicity"
    dummy_frames.append(
        pd.get_dummies(
            data[cat_label],
            prefix=cat_label,
            drop_first=not keep_all_levels,
            # Explicit integer dtype so the indicators are numeric on every
            # pandas version (newer releases default to bool, which describe()
            # omits from its numeric summary).
            dtype=np.uint8,
        )
    )
# Attach all indicator columns in a single concat instead of once per loop pass.
data = pd.concat([data] + dummy_frames, axis=1)

# Rescale tonsilsize from 0-4 to 0-1 (0 for 0-2 and 1 for 3-4)
data['tonsilsize_3-4'] = data['tonsilsize'].replace(to_replace=[0, 1, 2, 3, 4], value=[0, 0, 0, 1, 1])

# Create final dataset
# .copy() makes umd_data independent of `data`, so later edits do not touch it.
umd_data = data[final_variables].copy()

In [8]:
# Summarise data
# Descriptive statistics with one row per variable (describe() output transposed).
summary = umd_data.describe().T
print(summary)

                    count       mean        std        min        25%  \
ahi                 456.0  11.858553  23.614481   0.000000   0.900000   
rem                 456.0  16.980702   6.601057   0.000000  13.400000   
nrem                456.0  72.708991  35.218159  22.800000  68.000000   
tonsilsize_3-4      456.0   0.478070   0.500067   0.000000   0.000000   
gender_Male         456.0   0.521930   0.500067   0.000000   0.000000   
age                 456.0  10.508655   4.759669   2.058864   5.823409   
bmi                 456.0  26.863047  10.561774  14.161674  17.922654   
zscore              456.0   1.590278   1.097054  -1.135029   0.933396   
term                456.0   0.098684   0.298565   0.000000   0.000000   
allergies_Yes       456.0   0.350877   0.477769   0.000000   0.000000   
asthma_Yes          456.0   0.287281   0.452991   0.000000   0.000000   
gerd_Yes            456.0   0.070175   0.255723   0.000000   0.000000   
ethnicity_Hispanic  456.0   0.563596   0.496484   0

In [9]:
# Create train and test split
# Boolean stratification target: True where AHI exceeds the configured threshold.
# Uses `ahi_thresh` (set when the variables are defined) instead of repeating
# the literal, so changing the threshold in one place also changes the split.
umd_ahi = pd.Series(umd_data['ahi'] > ahi_thresh, index=umd_data.index)

# Two stratified folds without shuffling, so the split is deterministic for a
# given row order.
skf = StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
n_samples = umd_data.shape[0]
# split() only needs the number of samples from its first argument, hence the
# zeros placeholder; each entry of kfold_index is [train_indices, test_indices].
kfold_index = [[train_idx, test_idx] for train_idx, test_idx in skf.split(np.zeros(n_samples), umd_ahi)]

# Use the second fold. split() yields positional indices, so select rows with
# .iloc; .loc would only be correct while the index is a default RangeIndex.
train_idx, test_idx = kfold_index[1]
umd_train = umd_data.iloc[train_idx].reset_index(drop=True)
umd_test = umd_data.iloc[test_idx].reset_index(drop=True)

In [10]:
# Save data
# Output locations for the full dataset and the train/test partitions.
umd_train_file = '../../../data/processed/umd_data_standard_train.csv'
umd_test_file = '../../../data/processed/umd_data_standard_test.csv'
umd_all_file = '../../../data/processed/umd_data_standard_all.csv'

# Write in the order all -> train -> test; index_label=False is passed
# unchanged to every to_csv call.
for frame, out_path in (
    (umd_data, umd_all_file),
    (umd_train, umd_train_file),
    (umd_test, umd_test_file),
):
    frame.to_csv(out_path, index_label=False)