Number of Participants and Test Days According to Train Test Split File

In [1]:
import sys
sys.path.append('..')
import pandas as pd
import data_utils

MRT = [1, 2, 3]
include_participants = {}

for mrt in MRT:

    tts_file = pd.read_csv(data_utils.train_test_split_path(mrt, 'valid_first_alarms_no_con.csv'), index_col=0)
    include_participants[mrt] = tts_file.columns.astype(int).tolist()
    n_participants = len(tts_file.columns)
    n_days = tts_file.notna().sum(axis=0)
    p25, median, p75 = n_days.quantile([0.25, 0.5, 0.75])
    print(f'MRT {mrt}: Median={median}, 25th percentile={p25}, 75th percentile={p75}, Number of participants={n_participants}')

MRT 1: Median=6.0, 25th percentile=3.0, 75th percentile=17.5, Number of participants=46
MRT 2: Median=7.5, 25th percentile=3.0, 75th percentile=14.0, Number of participants=48
MRT 3: Median=7.0, 25th percentile=3.0, 75th percentile=13.0, Number of participants=51


Total Number of Participants, EMA and EMI, Valid Ratio

In [2]:
import sys
sys.path.append('..')
import pandas as pd
import data_utils

characteristics = []
NON_CONDITIONAL_OBSERVATIONS = ['EMA_mood','EMA_disappointed','EMA_scared','EMA_worry',
'EMA_down','EMA_sad','EMA_confidence','EMA_stress','EMA_lonely',
'EMA_energetic','EMA_concentration','EMA_resilience','EMA_tired',
'EMA_satisfied', 'EMA_relaxed']
INTERACTIVE_NAMES = ['interactive1', 'interactive2', 'interactive3', 'interactive4',
                      'interactive5', 'interactive6', 'interactive7', 'interactive8']

for mrt in MRT:

    data_dir = data_utils.dataset_path(mrt, 'processed_csv_no_con')
    data_files = data_utils.get_data_files(data_dir, True)
    N = len(data_files)

    for k, data in enumerate(data_utils.read_data_files(data_dir, True)):

        participant = data_utils.determine_participant_id(data)
        if participant not in include_participants[mrt]:
            continue

        df = pd.DataFrame(index=[0])
        EMAs = data['Form'].str.contains('interactive-questions')

        df['MRT'] = mrt
        df['Participant'] = participant
        df['MRTSize'] = N
        df['EMAs'] = sum(EMAs)
        days = data.loc[EMAs, 'DayNr'].max()
        df['Days'] = days
        df['ExpectedEMAs'] = min(days, 10) * 8 + max(days-10, 0) * 6
        df['Valid'] = data[NON_CONDITIONAL_OBSERVATIONS].notna().any(axis=1).sum()
        df['ValidRatio'] = df['Valid'] / df['ExpectedEMAs']   #df['EMAs']
        df['TotalTasks'] = data[INTERACTIVE_NAMES].sum().sum()

        characteristics.append(df)

characteristics = pd.concat(characteristics, ignore_index=True)
characteristics = characteristics.set_index(['MRT', 'Participant'])

characteristics.groupby('MRT').quantile([0.25, 0.5, 0.75], numeric_only=True).T.loc[['MRTSize', 'Valid', 'ValidRatio', 'TotalTasks']]

MRT,1,1,1,2,2,2,3,3,3
Unnamed: 0_level_1,0.25,0.50,0.75,0.25,0.50,0.75,0.25,0.50,0.75
MRTSize,57.0,57.0,57.0,56.0,56.0,56.0,59.0,59.0,59.0
Valid,92.5,115.5,175.0,102.5,125.5,152.0,90.0,127.0,160.0
ValidRatio,0.353967,0.481537,0.688976,0.406822,0.496063,0.599409,0.357443,0.495968,0.63543
TotalTasks,46.25,61.5,88.0,47.0,65.0,93.25,34.0,61.0,87.0


KS-Test to Compare EMA distributions

In [3]:
import sys
sys.path.append('..')
from scipy import stats
import numpy as np
import data_utils


n_feat = len(NON_CONDITIONAL_OBSERVATIONS)
distributions = np.zeros((3, n_feat, 7))     # MRT x feature x value

for m, mrt in enumerate(MRT):

     data_dir = data_utils.unprocessed_csv_path(mrt)
     data_files = data_utils.get_data_files(data_dir, True)
     n_valid_EMAs = 0

     for i, data in enumerate(data_utils.read_data_files(data_dir, True)):
          distributions[m] += np.array([data[NON_CONDITIONAL_OBSERVATIONS]==x for x in np.arange(1,8)]).sum(axis=1).T
          n_valid_EMAs += (data['Form'] == 'interactive-questions').sum()

     distributions[m] /= distributions[m].sum(axis=1, keepdims=True)

ks_results = pd.DataFrame(index=NON_CONDITIONAL_OBSERVATIONS, columns=['MRT1 vs MRT2', 'MRT1 vs MRT3', 'MRT2 vs MRT3'])
cum_distributions = np.cumsum(distributions, axis=2)
for i, feat in enumerate(NON_CONDITIONAL_OBSERVATIONS):
     ks_results.loc[feat, 'MRT1 vs MRT2'] = stats.ks_2samp(cum_distributions[0, i], cum_distributions[1, i]).pvalue
     ks_results.loc[feat, 'MRT1 vs MRT3'] = stats.ks_2samp(cum_distributions[0, i], cum_distributions[2, i]).pvalue
     ks_results.loc[feat, 'MRT2 vs MRT3'] = stats.ks_2samp(cum_distributions[1, i], cum_distributions[2, i]).pvalue

ks_results

  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,MRT1 vs MRT2,MRT1 vs MRT3,MRT2 vs MRT3
EMA_mood,0.999961,0.999961,0.999961
EMA_disappointed,0.962704,0.999961,0.962704
EMA_scared,0.999961,0.999961,0.962704
EMA_worry,0.999961,0.999961,0.999961
EMA_down,0.999961,0.999961,0.999961
EMA_sad,0.999961,0.999961,0.962704
EMA_confidence,0.999961,0.999961,0.999961
EMA_stress,0.999961,0.999961,0.962704
EMA_lonely,0.999961,0.962704,0.999961
EMA_energetic,0.999961,0.999961,0.999961


Percentage of Stationary Items

In [6]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import data_utils


stationarity = []

for m, mrt in enumerate(MRT):

    data_dir = data_utils.unprocessed_csv_path(mrt)
    data_files = data_utils.get_data_files(data_dir, True)
    mrt_mean = []
    mrt_std = []
    pid = []
    mrt_stationarity = []
    for data in data_utils.read_data_files(data_dir, True):
        participant = data_utils.determine_participant_id(data)
        if participant not in include_participants[mrt]:
            continue
        pid.append(participant)
        stationary = np.zeros(len(NON_CONDITIONAL_OBSERVATIONS))
        for f, feat in enumerate(NON_CONDITIONAL_OBSERVATIONS):
            filled = data[feat].ffill().bfill()
            if filled.std() < 1e-6:
                stationary[f] = 1
            else:
                p = adfuller(data[feat].interpolate().bfill())[1]
                stationary[f] = 1*(p < 0.05)
        mrt_stationarity.append(pd.Series(stationary, index=NON_CONDITIONAL_OBSERVATIONS, name='stationary'))
    
    mrt_stationarity = pd.concat(mrt_stationarity, axis=1, keys=pid, names='participant').T
    stationarity.append(mrt_stationarity)

stationarity = pd.concat(stationarity, axis=0, keys=MRT, names=['MRT'])
print('Percentage of stationary features per MRT:')
(stationarity.sum(axis=1) >= 14).groupby(['MRT']).sum() / stationarity.groupby(['MRT']).size()

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


Percentage of stationary features per MRT:


MRT
1    0.826087
2    0.770833
3    0.803922
dtype: float64