# Simple Regression Models

Uses the cleaned data files:

1. '../data/crosscheck_daily_data_cleaned_w_sameday.csv'
2. '../data/studentlife_daily_data_cleaned_w_sameday_03192020.csv'

to run lasso regression with the hyperparameter $\lambda=0.1$. The $\lambda=0.1$ was determined through experimentation on the source (CrossCheck) data.

Requires the following code files in the repo:

1. '../src/util.py'
2. '.../src/regression_cv.py'

In [519]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from importlib import reload
import sys
from matplotlib.patches import Patch
import pingouin as pg

In [2]:
sys.path.insert(0, '../code/')

In [3]:
import util
import regression_cv

In [4]:
palette = sns.color_palette('tab10')[0:2]

### Open Data

In [5]:
crosscheck = pd.read_csv('../data/crosscheck_daily_data_cleaned_w_sameday.csv', index_col=0)

In [6]:
studentlife = pd.read_csv('../data/studentlife_daily_data_cleaned_w_sameday_03192020.csv')

## Prep for models

### Get features

In [7]:
# EMA cols
ema_cols_crosscheck = [i for i in crosscheck.columns if 'ema' in i]
ema_cols_studentlife = [i for i in studentlife.columns if 'ema' in i]

In [8]:
# Behavior cols
behavior_cols_crosscheck = [
    i for i in crosscheck.columns if i not in ['study_id', 'eureka_id', 'date'] + ema_cols_crosscheck
]

behavior_cols_studentlife = [
    i for i in studentlife.columns if i not in ['study_id', 'eureka_id', 'day'] + ema_cols_studentlife
]

behavior_cols = list(set(behavior_cols_crosscheck) & set(behavior_cols_studentlife))
behavior_cols.sort()

In [9]:
features = behavior_cols[:]

### Clean for model training

In [10]:
crosscheck_temp = crosscheck.copy()
crosscheck_temp[behavior_cols] = crosscheck_temp[behavior_cols].fillna(0) # Not using the columns with NAs. All 
                                                                          # ambient audio/light

features = [f for f in features if len(crosscheck_temp[f].unique()) > 1]

#### Transform StudentLife EMA to look like CrossCheck

In [11]:
studentlife_temp = studentlife[['study_id', 'day'] + 
    behavior_cols + ['ema_Stress_level', 'ema_Sleep_rate', 'ema_Behavior_calm', 'ema_Mood_sad', 'ema_Mood_sadornot']
].reset_index(drop=True).copy() # TEMP FILL

# Fill NA
non_sleep_loc_cols = [i for i in behavior_cols if ('loc' not in i) and ('sleep' not in i)]
studentlife_temp[non_sleep_loc_cols] = studentlife_temp[non_sleep_loc_cols].fillna(0)

# Fill sleep with average value for that individual
for s in studentlife_temp.study_id.unique():
    temp = studentlife_temp.loc[studentlife_temp.study_id == s, :]
    duration_mean = temp['sleep_duration'].mean()
    start_mean = temp['sleep_start'].mean()
    end_mean = temp['sleep_end'].mean()
    ind = (studentlife_temp.study_id == s) & pd.isnull(studentlife_temp['sleep_duration'])
    studentlife_temp.loc[ind, 'sleep_duration'] = duration_mean
    studentlife_temp.loc[ind, 'sleep_start'] = start_mean
    studentlife_temp.loc[ind, 'sleep_end'] = end_mean

# Drop days without location (14 total) and days still w/o sleep (all IDs with no sleep info)
studentlife_temp = studentlife_temp.dropna(subset=behavior_cols).reset_index()

In [12]:
# Need to map all of them from 0-3

# Stress [1]A little stressed, [2]Definitely stressed, [3]Stressed out, [4]Feeling good, [5]Feeling great, 
studentlife_temp['ema_STRESSED'] = studentlife_temp['ema_Stress_level'].map({
    5:0, 4:1, 1:2, 2:3, 3:4
})
# Map from 0 - 3
minimum = studentlife_temp['ema_STRESSED'].min()
maximum = studentlife_temp['ema_STRESSED'].max()
studentlife_temp['ema_STRESSED'] =  3 * (studentlife_temp['ema_STRESSED'] - minimum) / (maximum - minimum)

# Sleeping [1]Very good, [2]Fairly good, [3]Fairly bad, [4]Very bad, 
# Map from 0 - 3
studentlife_temp['ema_SLEEPING'] = 4 - studentlife_temp['ema_Sleep_rate'].copy()

In [16]:
targets = ['ema_SLEEPING', 'ema_STRESSED']
studentlife_temp['data'] = 'sl'
crosscheck_temp['data']= 'cc'

### Base models CV

In [17]:
reload(regression_cv)

<module 'regression_cv' from '../code/regression_cv.py'>

In [18]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import  LinearRegression, Ridge, Lasso
from sklearn.svm import SVC
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [20]:
from multiprocessing import Pool, cpu_count

In [21]:
crosscheck_temp['data'] = 'cc'
crosscheck_temp['day'] = pd.to_datetime(crosscheck_temp['date']).dt.tz_localize('US/Eastern')
studentlife_temp['data'] = 'sl'

In [22]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
reload(regression_cv)

<module 'regression_cv' from '../code/regression_cv.py'>

#### Histplot

## LOSO entire population

In [None]:
# By study ID
args = []

# models_list = ['lr', 'ridge', 'lasso', 'rf', 'gbt', 'sv']
models_list = ['lasso']

for data in ['sl', 'cc', 'both']:
    for target in targets:
        for m in models_list:
            if data == 'sl':
                train = None
            else:
                train = crosscheck_temp.copy()
            args.append(
                (train, studentlife_temp, data, features, [target], 'day', {'lasso': {'alpha': [0.1]}}, \
                [m], 5, False)
            )

In [None]:
reload(regression_cv)

In [None]:
pool = Pool(cpu_count())
loso_res_dfs = pool.map(regression_cv.run_cv, args)
pool.close()

In [None]:
loso_res_df = pd.concat(loso_res_dfs).reset_index(drop=True)

In [319]:
y_true_cc = []
y_pred_cc = []
study_id_cc = []
target_cc = []
model_type_cc = []
params_cc = []

y_true_sl = []
y_pred_sl = []
study_id_sl = []
target_sl = []
model_type_sl = []
params_sl = []

y_true_both = []
y_pred_both = []
study_id_both = []
target_both = []
target_both = []
model_type_both = []
params_both = []

for ind in loso_res_df.index:
    if loso_res_df.loc[ind, 'data'] == 'cc':
        y_true_cc += eval(loso_res_df.loc[ind, 'y_true'])
        y_pred_cc += eval(loso_res_df.loc[ind, 'y_pred'])
        study_id_cc += [loso_res_df.loc[ind, 'fold']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        target_cc += [loso_res_df.loc[ind, 'target']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        model_type_cc += [loso_res_df.loc[ind, 'model_type']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        params_cc += [loso_res_df.loc[ind, 'params']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
    elif loso_res_df.loc[ind, 'data'] == 'sl':
        y_true_sl += eval(loso_res_df.loc[ind, 'y_true'])
        y_pred_sl += eval(loso_res_df.loc[ind, 'y_pred'])
        study_id_sl += [loso_res_df.loc[ind, 'fold']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        target_sl += [loso_res_df.loc[ind, 'target']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        model_type_sl += [loso_res_df.loc[ind, 'model_type']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        params_sl += [loso_res_df.loc[ind, 'params']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
    elif loso_res_df.loc[ind, 'data'] == 'both':
        y_true_both += eval(loso_res_df.loc[ind, 'y_true'])
        y_pred_both += eval(loso_res_df.loc[ind, 'y_pred'])
        study_id_both += [loso_res_df.loc[ind, 'fold']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        target_both += [loso_res_df.loc[ind, 'target']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        model_type_both += [loso_res_df.loc[ind, 'model_type']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))
        params_both += [loso_res_df.loc[ind, 'params']] \
            * len(eval(loso_res_df.loc[ind, 'y_true']))

In [320]:
df_overall_err_cc = pd.DataFrame({
    'study_id': study_id_cc,
    'y_true': y_true_cc,
    'y_pred_cc': y_pred_cc,
    'target': target_cc,
    'model_type': model_type_cc,
    'params': params_cc
})

df_overall_err_sl = pd.DataFrame({
    'study_id': study_id_sl,
    'y_true': y_true_sl,
    'y_pred_sl': y_pred_sl,
    'target': target_sl,
    'model_type': model_type_sl,
    'params': params_sl
})

df_overall_err_both = pd.DataFrame({
    'study_id': study_id_both,
    'y_true': y_true_both,
    'y_pred_both': y_pred_both,
    'target': target_both,
    'model_type': model_type_both,
    'params': params_both
})

df_overall_err = pd.concat(
    [df_overall_err_cc, df_overall_err_sl[['y_pred_sl']], df_overall_err_both[['y_pred_both']]], axis=1)
df_overall_err['cc_err'] = (df_overall_err['y_true'] - df_overall_err['y_pred_cc'])**2
df_overall_err['sl_err'] = (df_overall_err['y_true'] - df_overall_err['y_pred_sl'])**2
df_overall_err['both_err'] = (df_overall_err['y_true'] - df_overall_err['y_pred_both'])**2

In [321]:
target_list = []
model_type_list = []
params_list = []
r2_list = []
mae_list = []
data_list = []

for t in df_overall_err.target.unique():
    for m in df_overall_err.model_type.unique():
        for p in df_overall_err.params.unique():
            temp = df_overall_err.loc[
                (df_overall_err.target == t) & (df_overall_err.model_type == m) &
                (df_overall_err.params == p), :
            ]
            if temp.shape[0] > 0:
                target_list += [t, t, t]
                model_type_list += [m, m, m]
                params_list += [p, p, p]
                data_list += ['sl', 'cc', 'both']
                r2_list += [
                    r2_score(temp['y_true'], temp['y_pred_sl']), 
                    r2_score(temp['y_true'], temp['y_pred_cc']),
                    r2_score(temp['y_true'], temp['y_pred_both'])
                ]
                mae_list += [
                    mean_absolute_error(temp['y_true'], temp['y_pred_sl']), 
                    mean_absolute_error(temp['y_true'], temp['y_pred_cc']),
                    mean_absolute_error(temp['y_true'], temp['y_pred_both'])
                ]

overall_res_df = pd.DataFrame({
    'target': target_list,
    'model_type': model_type_list,
    'params': params_list,
    'data': data_list,
    'r2': r2_list,
    'mae': mae_list,
})            

In [322]:
overall_res_df_max = overall_res_df.loc[
    overall_res_df.groupby(['target', 'data'])['r2'].idxmax(), :
]

In [323]:
df_overall_err_best_sleep_both = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[0], :]
)

df_overall_err_best_sleep_cc = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[1], :]
)

df_overall_err_best_sleep_sl = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[2], :]
)

df_overall_err_best_sleep = pd.concat([
    df_overall_err_best_sleep_both[['both_err']],
    df_overall_err_best_sleep_cc[['cc_err']],
    df_overall_err_best_sleep_sl[['sl_err']]
], axis=1)

test_sleep_res_overall_cc = util.paired_test(df_overall_err_best_sleep, 'sl_err', 'cc_err')[0]
test_sleep_res_overall_both = util.paired_test(df_overall_err_best_sleep, 'sl_err', 'both_err')[0]

In [324]:
df_overall_err_best_stress_both = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[3], :]
)

df_overall_err_best_stress_cc = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[4], :]
)

df_overall_err_best_stress_sl = pd.merge(
    left=df_overall_err, right=overall_res_df.iloc[[5], :]
)

df_overall_err_best_stress = pd.concat([
    df_overall_err_best_stress_both[['both_err']],
    df_overall_err_best_stress_cc[['cc_err']],
    df_overall_err_best_stress_sl[['sl_err']]
], axis=1)

test_stress_res_overall_cc = util.paired_test(df_overall_err_best_stress, 'sl_err', 'cc_err')[0]
test_stress_res_overall_both = util.paired_test(df_overall_err_best_stress, 'sl_err', 'both_err')[0]

In [325]:
test_sleep_res_overall_cc['target'] = ['ema_SLEEPING']
test_sleep_res_overall_cc['data'] = ['cc']
test_sleep_res_overall_both['target'] = ['ema_SLEEPING']
test_sleep_res_overall_both['data'] = ['both']
test_stress_res_overall_cc['target'] = ['ema_STRESSED']
test_stress_res_overall_cc['data'] = ['cc']
test_stress_res_overall_both['target'] = ['ema_STRESSED']
test_stress_res_overall_both['data'] = ['both']

test_res_overall = pd.concat([
    test_sleep_res_overall_cc, test_sleep_res_overall_both, 
    test_stress_res_overall_cc, test_stress_res_overall_both
])

test_res_overall = pd.merge(
    left=test_res_overall,
    right=overall_res_df_max[['target', 'data', 'r2', 'mae']],
    on=['target', 'data'],
    how='outer'
)

In [364]:
test_res_overall['Training Data'] = test_res_overall['data'].map({
    'cc': 'Source',
    'sl': 'Target',
    'both': 'Source + Target'
})

test_res_overall['EMA'] = test_res_overall['target'].map({
    'ema_SLEEPING': 'Sleep',
    'ema_STRESSED': 'Stress'
})

test_res_overall_pivot = pd.pivot_table(
    data=test_res_overall,
    index=['EMA'],
    columns=['Training Data'],
    values=['r2', 'mae', 'W-val', 'p-val', 'RBC']
)

test_res_overall_pivot = test_res_overall_pivot[[
    ('r2', 'Target'), ('r2', 'Source'), ('mae', 'Target'), ('mae', 'Source'), 
    ('W-val', 'Source'), ('p-val', 'Source'), ('RBC', 'Source'),
    ('r2', 'Source + Target'), ('mae', 'Source + Target'),
    ('W-val', 'Source + Target'), ('p-val', 'Source + Target'), ('RBC', 'Source + Target'),
    
]]

test_res_overall_pivot.round(2).to_clipboard(excel=True, sep='\t')

In [1162]:
print(util.paired_test(df_overall_err_best_sleep, 'sl_err', 'cc_err')[1])

ShapiroResult(statistic=0.6711632013320923, pvalue=2.195865324322281e-32)


In [1163]:
print(util.paired_test(df_overall_err_best_sleep, 'sl_err', 'both_err')[1])

ShapiroResult(statistic=0.6429898738861084, pvalue=2.0230786750109438e-33)


In [1164]:
print(util.paired_test(df_overall_err_best_stress, 'sl_err', 'cc_err')[1])

ShapiroResult(statistic=0.9131038188934326, pvalue=2.4868180431569797e-12)


In [1165]:
print(util.paired_test(df_overall_err_best_stress, 'sl_err', 'both_err')[1])

ShapiroResult(statistic=0.9537782669067383, pvalue=2.932135423350246e-08)


In [365]:
test_res_overall_pivot

Unnamed: 0_level_0,r2,r2,mae,mae,W-val,p-val,RBC,r2,mae,W-val,p-val,RBC
Training Data,Target,Source,Target,Source,Source,Source,Source,Source + Target,Source + Target,Source + Target,Source + Target,Source + Target
EMA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Sleep,-0.007897,-0.004585,0.656596,0.660054,96087.0,0.052493,0.076587,-0.007019,0.660339,96579.0,0.041119,0.082099
Stress,-0.021449,-1.953651,0.570344,1.05359,7179.0,1.0,-0.696307,-0.836885,0.795267,8606.0,1.0,-0.635941


## Lasso alpha=0.1

In [1189]:
args = []

for target in targets:
    for model_type in ['lasso']:
        for params in regression_cv.get_param_combinations({'alpha': [0.1]}):
            print(model_type, params)
            for s in studentlife_temp.study_id.unique():
                # Go through each study ID
                val = studentlife_temp.loc[studentlife_temp.study_id == s, 
                                               features + [target, 'study_id', 'day', 'data']]
                if val.dropna().shape[0] < 30:
                    continue
                sl_wo_s = studentlife_temp.loc[
                    studentlife_temp.study_id != s, 
                        features + [target, 'study_id', 'day', 'data']
                ].dropna()
                for data in ['sl', 'cc']:
                    if data == 'sl':
                        d2 = sl_wo_s.copy()
                    else:
                        d2 = crosscheck_temp.copy()
                    for c in list(d2.study_id.unique()):
                        train_data = d2.loc[d2.study_id == c, :]
                            
                        if train_data.shape[0] < 30:
                            continue

                        # Get train data 
                        model = regression_cv.get_model(model_type=model_type, params=params)
                        args.append([model, train_data, model_type, features, target, val, False])

lasso {'alpha': 0.1}
lasso {'alpha': 0.1}


In [1190]:
pool = Pool(cpu_count())
output = pool.map(regression_cv.train_validate_model, args)
pool.close()

In [1191]:
study_id_list = []
data_list = []
cc_id_list = []
target_list = []
model_type_list = []
params_list = []
r2_list = []
mae_list = []
pearson_corr_list = []
pearson_p_list = []
skipped_corr_list = []
skipped_p_list = []
y_true_list = []
y_pred_list = []
coef_list = []

curr = 0
for target in targets:
    for model_type in ['lasso']:
        for params in regression_cv.get_param_combinations({'alpha': [0.1]}):
            print(model_type, params)
            for s in studentlife_temp.study_id.unique():
                # Go through each study ID
                val = studentlife_temp.loc[studentlife_temp.study_id == s, 
                                               features + [target, 'study_id', 'day', 'data']]
                if val.dropna().shape[0] < 30:
                    continue
                sl_wo_s = studentlife_temp.loc[
                    studentlife_temp.study_id != s, 
                        features + [target, 'study_id', 'day', 'data']
                ].dropna()
                for data in ['sl', 'cc']:
                    if data == 'sl':
                        d2 = sl_wo_s.copy()
                    else:
                        d2 = crosscheck_temp.copy()
                    for c in list(d2.study_id.unique()):
                        train_data = d2.loc[d2.study_id == c, :]
                            
                        if train_data.shape[0] < 30:
                            continue

                        model, r2, mae, pearson_corr, pearson_p, skipped_corr, skipped_p, y_true, y_pred, _ = \
                            output[curr]

                        coef_list.append(list(model.coef_))
                        pearson_corr_list.append(pearson_corr)
                        pearson_p_list.append(pearson_p)
                        skipped_corr_list.append(skipped_corr)
                        skipped_p_list.append(skipped_p)
                        y_true_list.append(str(list(y_true)))
                        y_pred_list.append(str(list(y_pred)))

                        study_id_list.append(s)
                        data_list.append(data)
                        cc_id_list.append(c)
                        target_list.append(target)
                        model_type_list.append(model_type)
                        params_list.append(str(params))
                        r2_list.append(r2)
                        mae_list.append(mae)

                        curr += 1

lasso {'alpha': 0.1}
lasso {'alpha': 0.1}


In [1192]:
individual_lasso_res_df = pd.DataFrame({
    'study_id': study_id_list,
    'data': data_list,
    'cc_id': cc_id_list,
    'target': target_list,
    'model_type': model_type_list,
    'params': params_list,
    'r2': r2_list,
    'mae': mae_list,
    'pearson_corr': pearson_corr_list,
    'pearson_p': pearson_p_list,
    'skipped_corr': skipped_corr_list,
    'skipped_p': skipped_p_list,
    'y_true': y_true_list,
    'y_pred': y_pred_list
})

In [1193]:
individual_lasso_res_df[features] = coef_list

In [1285]:
individual_lasso_res_df_max =  individual_lasso_res_df.loc[individual_lasso_res_df.groupby(
    ['target', 'study_id', 'data'])['r2'].idxmax(), :]

In [1286]:
individual_lasso_res_df_max_both = individual_lasso_res_df.loc[individual_lasso_res_df.groupby(
    ['target', 'study_id'])['r2'].idxmax(), :]

In [1287]:
y_true_cc = []
y_pred_cc = []
study_id_cc = []
target_cc = []

y_true_sl = []
y_pred_sl = []
study_id_sl = []
target_sl = []

y_true_both = []
y_pred_both = []
study_id_both = []
target_both = []


for ind in individual_lasso_res_df_max.index:
    if individual_lasso_res_df_max.loc[ind, 'data'] == 'cc':
        y_true_cc += eval(individual_lasso_res_df_max.loc[ind, 'y_true'])
        y_pred_cc += eval(individual_lasso_res_df_max.loc[ind, 'y_pred'])
        study_id_cc += [individual_lasso_res_df_max.loc[ind, 'study_id']] \
            * len(eval(individual_lasso_res_df_max.loc[ind, 'y_true']))
        target_cc += [individual_lasso_res_df_max.loc[ind, 'target']] \
            * len(eval(individual_lasso_res_df_max.loc[ind, 'y_true']))
    else:
        y_true_sl += eval(individual_lasso_res_df_max.loc[ind, 'y_true'])
        y_pred_sl += eval(individual_lasso_res_df_max.loc[ind, 'y_pred'])
        study_id_sl += [individual_lasso_res_df_max.loc[ind, 'study_id']] \
            * len(eval(individual_lasso_res_df_max.loc[ind, 'y_true']))
        target_sl += [individual_lasso_res_df_max.loc[ind, 'target']] \
            * len(eval(individual_lasso_res_df_max.loc[ind, 'y_true']))
        
for ind in individual_lasso_res_df_max_both.index:
    y_true_both += eval(individual_lasso_res_df_max_both.loc[ind, 'y_true'])
    y_pred_both += eval(individual_lasso_res_df_max_both.loc[ind, 'y_pred'])
    study_id_both += [individual_lasso_res_df_max_both.loc[ind, 'study_id']] \
        * len(eval(individual_lasso_res_df_max_both.loc[ind, 'y_true']))
    target_both += [individual_lasso_res_df_max_both.loc[ind, 'target']] \
        * len(eval(individual_lasso_res_df_max_both.loc[ind, 'y_true']))

In [1288]:
df_err_cc = pd.DataFrame({
    'study_id': study_id_cc,
    'y_true': y_true_cc,
    'y_pred_cc': y_pred_cc,
    'target': target_cc
})

df_err_sl = pd.DataFrame({
    'study_id': study_id_sl,
    'y_true': y_true_sl,
    'y_pred_sl': y_pred_sl,
    'target_sl': target_sl
})

df_err_both = pd.DataFrame({
    'study_id': study_id_both,
    'y_true': y_true_both,
    'y_pred_both': y_pred_both,
    'target_both': target_both
})

df_err = pd.concat([df_err_cc, df_err_sl[['y_pred_sl']], df_err_both[['y_pred_both']]], axis=1)
df_err['cc_err'] = (df_err['y_true'] - df_err['y_pred_cc'])**2
df_err['sl_err'] = (df_err['y_true'] - df_err['y_pred_sl'])**2
df_err['both_err'] = (df_err['y_true'] - df_err['y_pred_both'])**2

In [1289]:
reload(util)

<module 'util' from '../code/util.py'>

In [1290]:
df_err_sleep = df_err.loc[df_err.target == 'ema_SLEEPING', :]
df_err_stress = df_err.loc[df_err.target != 'ema_SLEEPING', :]

# SLEEP
test_sleep_res_cc = util.paired_test(df_err_sleep, 'sl_err', 'cc_err')[0]
test_sleep_res_cc['r2'] = [r2_score(df_err_sleep['y_true'], df_err_sleep['y_pred_cc'])]
test_sleep_res_cc['mae'] = [mean_absolute_error(df_err_sleep['y_true'], df_err_sleep['y_pred_cc'])]
test_sleep_res_cc['data'] = ['cc']
test_sleep_res_cc['target'] = ['ema_SLEEPING']

test_sleep_res_both = util.paired_test(df_err_sleep, 'sl_err', 'both_err')[0]
test_sleep_res_both['r2'] = [r2_score(df_err_sleep['y_true'], df_err_sleep['y_pred_both'])]
test_sleep_res_both['mae'] = [mean_absolute_error(df_err_sleep['y_true'], df_err_sleep['y_pred_both'])]
test_sleep_res_both['data'] = ['both']
test_sleep_res_both['target'] = ['ema_SLEEPING']

test_sleep_res_sl = test_sleep_res_both.copy()
test_sleep_res_sl[['W-val', 'tail', 'p-val', 'RBC', 'CLES']] = None
test_sleep_res_sl['r2'] = [r2_score(df_err_sleep['y_true'], df_err_sleep['y_pred_sl'])]
test_sleep_res_sl['mae'] = [mean_absolute_error(df_err_sleep['y_true'], df_err_sleep['y_pred_sl'])]
test_sleep_res_sl['data'] = ['sl']
test_sleep_res_sl['target'] = ['ema_SLEEPING']

# STRESS
test_stress_res_cc = util.paired_test(df_err_stress, 'sl_err', 'cc_err')[0]
test_stress_res_cc['r2'] = [r2_score(df_err_stress['y_true'], df_err_stress['y_pred_cc'])]
test_stress_res_cc['mae'] = [mean_absolute_error(df_err_stress['y_true'], df_err_stress['y_pred_cc'])]
test_stress_res_cc['data'] = ['cc']
test_stress_res_cc['target'] = ['ema_STRESSED']

test_stress_res_both = util.paired_test(df_err_stress, 'sl_err', 'both_err')[0]
test_stress_res_both['r2'] = [r2_score(df_err_stress['y_true'], df_err_stress['y_pred_both'])]
test_stress_res_both['mae'] = [mean_absolute_error(df_err_stress['y_true'], df_err_stress['y_pred_both'])]
test_stress_res_both['data'] = ['both']
test_stress_res_both['target'] = ['ema_STRESSED']


test_stress_res_sl = test_stress_res_both.copy()
test_stress_res_sl[['W-val', 'tail', 'p-val', 'RBC', 'CLES']] = None
test_stress_res_sl['r2'] = [r2_score(df_err_stress['y_true'], df_err_stress['y_pred_sl'])]
test_stress_res_sl['mae'] = [mean_absolute_error(df_err_stress['y_true'], df_err_stress['y_pred_sl'])]
test_stress_res_sl['data'] = ['sl']
test_stress_res_sl['target'] = ['ema_STRESSED']

In [1291]:
test_res_individual = pd.concat([
    test_sleep_res_cc, test_sleep_res_both, test_sleep_res_sl,
    test_stress_res_cc, test_stress_res_both, test_stress_res_sl
]).reset_index(drop=True)

test_res_individual[['W-val', 'p-val', 'RBC']] = test_res_individual[['W-val', 'p-val', 'RBC']].astype(float)

In [1292]:
test_res_individual['Training Data'] = test_res_individual['data'].map({
    'cc': 'Source',
    'sl': 'Target',
    'both': 'Source + Target'
})

test_res_individual['EMA'] = test_res_individual['target'].map({
    'ema_SLEEPING': 'Sleep',
    'ema_STRESSED': 'Stress'
})

test_res_individual_pivot = pd.pivot_table(
    data=test_res_individual,
    index=['EMA'],
    columns=['Training Data'],
    values=['r2', 'mae', 'W-val', 'p-val', 'RBC']
)

test_res_individual_pivot = test_res_individual_pivot[[
    ('r2', 'Target'), ('r2', 'Source'), ('mae', 'Target'), ('mae', 'Source'), 
    ('W-val', 'Source'), ('p-val', 'Source'), ('RBC', 'Source'),
    ('r2', 'Source + Target'), ('mae', 'Source + Target'),
    ('W-val', 'Source + Target'), ('p-val', 'Source + Target'), ('RBC', 'Source + Target'),
    
]]

test_res_individual_pivot.round(2).to_clipboard(excel=True, sep='\t')

In [1293]:
print(util.paired_test(df_err_sleep, 'sl_err', 'cc_err')[1])

ShapiroResult(statistic=0.882167637348175, pvalue=6.796237081770717e-21)


In [1294]:
print(util.paired_test(df_err_sleep, 'sl_err', 'both_err')[1])

ShapiroResult(statistic=0.8429273962974548, pvalue=7.625872616839018e-24)


In [1295]:
print(util.paired_test(df_err_stress, 'sl_err', 'both_err')[1])

ShapiroResult(statistic=0.7376714944839478, pvalue=1.02215774580686e-21)


In [1296]:
print(util.paired_test(df_err_stress, 'sl_err', 'both_err')[1])

ShapiroResult(statistic=0.7376714944839478, pvalue=1.02215774580686e-21)


In [1297]:
test_res_individual_pivot

Unnamed: 0_level_0,r2,r2,mae,mae,W-val,p-val,RBC,r2,mae,W-val,p-val,RBC
Training Data,Target,Source,Target,Source,Source,Source,Source,Source + Target,Source + Target,Source + Target,Source + Target,Source + Target
EMA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Sleep,0.114907,0.183892,0.624696,0.566964,103373.0,0.000405,0.158221,0.192209,0.56798,72197.0,8.8e-05,0.195453
Stress,0.178603,0.215598,0.492426,0.483447,26096.0,0.057269,0.103938,0.223017,0.481823,16646.0,0.061681,0.113817
