In [7]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [8]:
# Get chat data
#
# Reads the CHAT baseline CSV and builds `chat_data`, a frame whose columns are
# renamed/recoded copies of the raw CHAT variables (one output column per line
# below, in the order they are written to disk later).

# Define columns to use
use_cols = ['QSAHI', 'NREMEPBP', 'NREMEPOP', 'SLPTIME', 'TONSIZE', 'male',
            'AGEYEAR_AT_MEAS', 'ANT5', 'ANT5A', 'MED6', 'MED2', 'MED1C4',
            'MED1G4', 'CHI3', 'REF4', 'REF5', 'NOVA_EST_PCTCO2G50', 'SLP_EFF',
            'AI_ALL', 'QSOAI', 'CAI', 'PCTSA90H', 'MINSAT', 'CO2PEAK', 'OAHI3',
            'HI', 'TIMEREMP', 'OSAS18', 'TOTALPSQ']
# Sleepiness-scale items SLS1 .. SLS10, summed into one score further down.
sls_cols = ['SLS' + str(i) for i in range(1, 11)]
use_cols = use_cols + sls_cols

# Load data
data_file = '../../../data/original/chat-baseline-dataset-0.7.0.csv'
data = pd.read_csv(data_file, usecols=use_cols)

# Create dataset similar to UMD
chat_data = pd.DataFrame()
chat_data['ahi'] = data['QSAHI'].copy()
# NOTE(review): 'rem' is copied from TIMEREMP as-is while 'nrem' is a ratio to
# SLPTIME, so the two look like they are on different scales (percent vs.
# fraction) - confirm this matches the UMD dataset.
chat_data['rem'] = data['TIMEREMP']
chat_data['nrem'] = (data['NREMEPBP'] + data['NREMEPOP'])/data['SLPTIME']
# Recode TONSIZE 1 -> 0 and 2 -> 1; any other value passes through unchanged.
chat_data['tonsilsize_3-4'] = data['TONSIZE'].replace(to_replace=[1,2], value=[0,1])
chat_data['gender_Male'] = data['male']
chat_data['age'] = data['AGEYEAR_AT_MEAS']
chat_data['bmi'] = data['ANT5']
chat_data['zscore'] = data['ANT5A']
# MED6 code 88 is mapped to 0 (presumably an "unknown" code - TODO confirm).
chat_data['term'] = data['MED6'].replace(to_replace=[88], value=[0])
chat_data['allergies_Yes'] = data['MED2']
chat_data['asthma_Yes'] = data['MED1C4']
chat_data['gerd_Yes'] = data['MED1G4']
# REF5 code 2 is mapped to 0; other values are kept.
chat_data['ethnicity_Hispanic'] = data['REF5'].replace(to_replace=[2], value=[0])
# One 0/1 indicator per REF4 code (5, 4, 2); rows with any other or missing
# REF4 value get 0 in all three columns.
chat_data['ethnicity_White'] = (data['REF4'] == 5).astype('int64')
chat_data['ethnicity_Black'] = (data['REF4'] == 4).astype('int64')
chat_data['ethnicity_Asian'] = (data['REF4'] == 2).astype('int64')
chat_data['tst50co2'] = data['NOVA_EST_PCTCO2G50']
chat_data['sleepefficiency'] = data['SLP_EFF']
chat_data['arousali'] = data['AI_ALL']
chat_data['oai'] = data['QSOAI']
chat_data['cai'] = data['CAI']
# PCTSA90H is treated as a percentage of SLPTIME, giving a value in SLPTIME's
# units. (The original also bound this to a stray, unused `tb90` name.)
chat_data['tb90'] = data['PCTSA90H']*data['SLPTIME']/100
chat_data['lowsao2'] = data['MINSAT']
chat_data['peakc02'] = data['CO2PEAK']
chat_data['oahi'] = data['OAHI3']
chat_data['ohi'] = data['HI']
chat_data['reference: osa18'] = data['OSAS18']
chat_data['reference: psq'] = data['TOTALPSQ']

# Calculate epworth sleepiness score
# Row-wise sum of the ten SLS items. skipna=False keeps the total NaN whenever
# any single item is missing (same as adding the columns one by one), and the
# float cast keeps the dtype the same as the original zero-initialised sum.
chat_data['reference: ess'] = data[sls_cols].sum(axis=1, skipna=False).astype(float)

In [9]:
# Deal with missing data

# Rows are only usable when the AHI and all three reference questionnaire
# scores are present.
required_cols = ['ahi', 'reference: osa18', 'reference: psq', 'reference: ess']

# Select columns to impute NaNs as zeros
impute_zeros_cols = ['tonsilsize_3-4', 'zscore', 'allergies_Yes',
                     'asthma_Yes', 'gerd_Yes', 'ethnicity_Hispanic', 'term']

# One non-mutating chain instead of `chat_data[col].fillna(0, inplace=True)`:
# the inplace call on a column selected from a filtered frame is chained
# assignment, which warns on current pandas and is a silent no-op under
# Copy-on-Write. Passing a dict to fillna() fills only the listed columns.
chat_data = (
    chat_data
    .dropna(subset=required_cols)                  # remove patients without AHI / reference data
    .fillna(dict.fromkeys(impute_zeros_cols, 0))   # zero-impute the selected columns
    .reset_index(drop=True)                        # reset index to 0..n-1
)

In [10]:
# Summarise data
# describe() statistics with one row per chat_data column (transposed view).
summary = chat_data.describe().T
print(summary)

                    count       mean        std        min        25%  \
ahi                 445.0   6.818809   5.676759   1.130000   2.690000   
rem                 445.0  18.408652   4.239112   2.720000  16.100000   
nrem                445.0   0.816319   0.042945   0.676678   0.790224   
tonsilsize_3-4      445.0   0.746067   0.435749   0.000000   0.000000   
gender_Male         445.0   0.483146   0.500278   0.000000   0.000000   
age                 445.0   6.566292   1.402455   5.000000   5.000000   
bmi                 445.0  18.956225   4.866325  12.490000  15.240000   
zscore              445.0   0.847348   1.288120  -3.460000  -0.040000   
term                445.0   0.143820   0.351302   0.000000   0.000000   
allergies_Yes       445.0   0.404494   0.491346   0.000000   0.000000   
asthma_Yes          445.0   0.242697   0.429196   0.000000   0.000000   
gerd_Yes            445.0   0.004494   0.066965   0.000000   0.000000   
ethnicity_Hispanic  445.0   0.074157   0.262322   0

In [11]:
# Create train and test split

# Stratification label: True where the AHI is above the threshold.
AHI_THRESHOLD = 5
chat_ahi = chat_data['ahi'] > AHI_THRESHOLD

# Two stratified folds without shuffling, so the same split is produced on
# every run.
skf = StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
n_samples = chat_data.shape[0]
# split() only uses X for its length, hence the placeholder zeros.
kfold_index = [[train_idx, test_idx]
               for train_idx, test_idx in skf.split(np.zeros(n_samples), chat_ahi)]

# split() yields positional row numbers, so select with .iloc: .loc would only
# be correct while chat_data happens to carry a fresh 0..n-1 index.
# The second fold's (train, test) pair is the one used.
chat_train = chat_data.iloc[kfold_index[1][0]].reset_index(drop=True)
chat_test = chat_data.iloc[kfold_index[1][1]].reset_index(drop=True)

In [12]:
# Save data
chat_train_file = '../../../data/processed/chat_data_standard_train.csv'
chat_test_file = '../../../data/processed/chat_data_standard_test.csv'
chat_all_file = '../../../data/processed/chat_data_standard_all.csv'

# Write the full set first, then the train and test halves. Every file is
# written with index_label=False (no header field for the index column).
for _frame, _path in (
    (chat_data, chat_all_file),
    (chat_train, chat_train_file),
    (chat_test, chat_test_file),
):
    _frame.to_csv(_path, index_label=False)