In [118]:
# Imports
import pandas as pd
import numpy as np
from getpass import getuser
import os

In [119]:
# Get chat data

# Define columns to use: the named study variables below, followed by the
# ten questionnaire items SLS1..SLS10 that are appended programmatically.
use_cols = [
    'QSAHI', 'NREMEPBP', 'NREMEPOP', 'SLPTIME', 'TONSIZE',
    'male', 'AGEYEAR_AT_MEAS', 'ANT5', 'ANT5A', 'MED6',
    'MED2', 'MED1C4', 'MED1G4', 'CHI3', 'REF4',
    'REF5', 'NOVA_EST_PCTCO2G50', 'SLP_EFF', 'AI_ALL', 'QSOAI',
    'CAI', 'PCTSA90H', 'MINSAT', 'CO2PEAK', 'OAHI3',
    'HI', 'TIMEREMP', 'OSAS18', 'TOTALPSQ',
]
use_cols.extend('SLS' + str(item_no) for item_no in range(1, 11))

# Load data
# The CSV lives under the current user's home directory; only the columns
# listed in use_cols are parsed.
data_file = '/Users/' + getuser() + '/Documents/Rhodes/RAIL/chat/datasets/chat-baseline-dataset-0.7.0.csv'
data = pd.read_csv(data_file, usecols=use_cols)

# Create dataset similar to UMD
# Every output column is either a straight copy of a source column or a
# small derivation of one; the dict below lists them in output order.

# Source series that feed more than one derived column.
ethnicity_code = data['REF4']

# PCTSA90H is divided by 100 and multiplied by SLPTIME.
tb90 = data['PCTSA90H'] * data['SLPTIME'] / 100

data_chat = pd.DataFrame({
    'ahi': data['QSAHI'],
    'rem': data['TIMEREMP'],
    'nrem': (data['NREMEPBP'] + data['NREMEPOP']) / data['SLPTIME'],
    # TONSIZE codes 1/2 are recoded to a 0/1 indicator.
    'tonsilsize_3-4': data['TONSIZE'].replace({1: 0, 2: 1}),
    'gender_Male': data['male'],
    'age': data['AGEYEAR_AT_MEAS'],
    'bmi': data['ANT5'],
    'zscore': data['ANT5A'],
    'term': data['MED6'],
    'allergies_Yes': data['MED2'],
    'asthma_Yes': data['MED1C4'],
    'gerd_Yes': data['MED1G4'],
    # REF5 code 2 is recoded to 0; other values pass through unchanged.
    'ethnicity_Hispanic': data['REF5'].replace({2: 0}),
    # One 0/1 flag per REF4 code of interest (5, 4 and 2); rows with any
    # other code, or a missing code, get 0 in all three flags.
    'ethnicity_White': (ethnicity_code == 5).astype('int64'),
    'ethnicity_Black': (ethnicity_code == 4).astype('int64'),
    'ethnicity_Asian': (ethnicity_code == 2).astype('int64'),
    'tst50co2': data['NOVA_EST_PCTCO2G50'],
    'sleepefficiency': data['SLP_EFF'],
    'arousali': data['AI_ALL'],
    'oai': data['QSOAI'],
    'cai': data['CAI'],
    'tb90': tb90,
    'lowsao2': data['MINSAT'],
    'peakc02': data['CO2PEAK'],
    'oahi': data['OAHI3'],
    'ohi': data['HI'],
    'reference: osa18': data['OSAS18'],
    'reference: psq': data['TOTALPSQ'],
})

# Calculate epworth sleepiness score
# The score is the row-wise total of the ten item responses SLS1..SLS10.
# (The previous loop re-assigned the running variable on every pass, so the
# stored "score" was just the SLS10 column.)
# skipna=False keeps the total as NaN whenever any item is missing, instead
# of silently reporting a partial sum as if it were a complete score.
ess_item_cols = ['SLS' + str(i) for i in range(1, 11)]
data_chat['reference: ess'] = data[ess_item_cols].sum(axis=1, skipna=False)

In [120]:
# Deal with missing data

# Remove patients without AHI or without any of the reference scores.
# A row is dropped if at least one of these columns is NaN, which is the
# same result as filtering on each column in turn.
required_cols = ['ahi', 'reference: osa18', 'reference: psq', 'reference: ess']
# .copy() makes data_chat an independent frame, so the assignment below
# modifies it directly rather than a view/slice of the unfiltered data.
data_chat = data_chat.dropna(subset=required_cols).copy()

# Select columns to impute NaNs as zeros
impute_zeros_cols = ['tonsilsize_3-4', 'zscore', 'allergies_Yes', 'asthma_Yes', 'gerd_Yes', 'ethnicity_Hispanic']
# Assign the filled columns back explicitly: calling fillna(inplace=True) on
# data_chat[col] acts on a temporary object and may leave the frame unchanged.
data_chat[impute_zeros_cols] = data_chat[impute_zeros_cols].fillna(0)

In [121]:
# Summarise data
# describe() yields one column per variable; transposing gives one row per
# variable with the statistics (count, mean, std, ...) as columns.
summary = data_chat.describe().T
print(summary)

                    count       mean        std        min        25%  \
ahi                 445.0   6.818809   5.676759   1.130000   2.690000   
rem                 445.0  18.408652   4.239112   2.720000  16.100000   
nrem                445.0   0.816319   0.042945   0.676678   0.790224   
tonsilsize_3-4      445.0   0.746067   0.435749   0.000000   0.000000   
gender_Male         445.0   0.483146   0.500278   0.000000   0.000000   
age                 445.0   6.566292   1.402455   5.000000   5.000000   
bmi                 445.0  18.956225   4.866325  12.490000  15.240000   
zscore              445.0   0.847348   1.288120  -3.460000  -0.040000   
term                444.0   0.936937   8.317926   0.000000   0.000000   
allergies_Yes       445.0   0.404494   0.491346   0.000000   0.000000   
asthma_Yes          445.0   0.242697   0.429196   0.000000   0.000000   
gerd_Yes            445.0   0.004494   0.066965   0.000000   0.000000   
ethnicity_Hispanic  445.0   0.074157   0.262322   0

In [122]:
# Save data
output_path_base = '/Users/' + getuser() + '/Documents/Rhodes/RAIL/apnea_dev/data'

# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there, without a separate
# existence check that could race with another process creating it.
os.makedirs(output_path_base, exist_ok=True)

output_path = os.path.join(output_path_base, 'chat_data_standard.csv')
# index_label=False: per the pandas docs the header row then carries no
# field for the index name (the form suggested for importing into R).
data_chat.to_csv(output_path, index_label=False)