# Imports

In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Centimetre-to-inch factor: matplotlib figure sizes are given in inches,
# so a size in centimetres is written as `<size_in_cm> * cm`.
cm = 1 / 2.54

# Load Data

In [None]:
# Read the three tables used below from the local 'data' directory.
# Going by the file names: df is the full table, df_filt the filtered one
# and df_imp the imputed one.
df, df_filt, df_imp = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in ('pre_and_manifest.csv',
                      'filtered_pre_and_manifest.csv',
                      'imputed_pre_and_manifest.csv')
)

# Dropped variables missing %

In [None]:
# Fraction of missing values per variable in the unfiltered table, most incomplete first.
missing_post = (df.isnull().sum() / df.shape[0]).sort_values(ascending=False)

# Export the variables that are absent from the filtered table.
# The mask is built from missing_post's own index: missing_post has been re-ordered by
# sort_values, so a boolean array computed in df.columns order would be applied
# positionally and select the wrong variables.
# NOTE(review): the exported values are fractions in [0, 1] although the column is
# labelled 'missing (%)' -- confirm which scale the supplementary table should use.
(
    missing_post[~missing_post.index.isin(df_filt.columns)]
    .reset_index()
    .rename(columns={'index': 'variable', 0: 'missing (%)'})
    .to_csv(os.path.join('supplementary_material', 'dropped_variables.csv'),
            index=False, float_format='%.3f')
)

# Imputation

<b> Find best models </b>

In [None]:
def best_models(scores):
    """
    Select, for every variable, the imputation model with the highest mean score
    (r2/f1) over the folds and return the per-fold results of those winning models.

    :param scores: Performance of the imputation models. Must have the columns
                   'variable', 'model' and 'score', and exactly eight columns in total;
                   the output columns are relabelled positionally as
                   variable, missing, model, problem, fold, score, mae, rmse.
    :return: DataFrame with all rows of `scores` that belong to a winning
             (variable, model) pair. Variables are ordered by their best mean score
             (highest first); rows of one variable keep their original order.
             An empty input yields an empty frame with the same columns.
    """
    columns = ['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse']
    dtypes = {'variable': str, 'missing': int, 'model': str, 'problem': str, 'fold': int,
              'score': float, 'mae': float, 'rmse': float}

    # Mean score per variable and model. Only 'score' is averaged: averaging every
    # column would also hit the non-numeric 'problem' column, which raises a
    # TypeError on pandas >= 2.0.
    means = scores.groupby(['variable', 'model'])['score'].mean().reset_index()
    if means.empty:
        return pd.DataFrame(columns=columns).astype(dtypes)

    # Model per variable with the highest mean score, best variables first
    best = means.loc[means.groupby('variable')['score'].idxmax(), :]
    best = best.sort_values('score', ascending=False)

    # Collect the fold-level rows of every winning pair. Both keys are compared
    # exactly, so a variable name that happens to equal a model name cannot match.
    parts = [scores[(scores['variable'] == variable) & (scores['model'] == model)]
             for variable, model in zip(best['variable'], best['model'])]

    out = pd.concat(parts, ignore_index=True)
    out.columns = columns
    return out.astype(dtypes)

In [None]:
# Imputation round 1: raw train/test results and the per-variable best models.
# Rows containing any NaN are discarded before the best models are selected.
train_results1 = pd.read_csv(os.path.join('tables', 'impute_train_1.csv'))
test_results1 = pd.read_csv(os.path.join('tables', 'impute_test_1.csv'))
train_best1 = best_models(train_results1.dropna())
test_best1 = best_models(test_results1.dropna()).assign(round=1)

# Imputation round 2: same processing on the second set of result tables.
train_results2 = pd.read_csv(os.path.join('tables', 'impute_train_2.csv'))
test_results2 = pd.read_csv(os.path.join('tables', 'impute_test_2.csv'))
train_best2 = best_models(train_results2.dropna())
test_best2 = best_models(test_results2.dropna()).assign(round=2)

<b> Best models based on average performance of all folds </b>

In [None]:
# Round 1: test score of each variable's best model, one panel per problem type
fig, axs = plt.subplots(1, 3, figsize=(22.5*cm, 8.5*cm))
p_dict = {'reg': 'Regression', 'ordinal': 'Ordinal', 'class': 'Classification'}
ps = ['reg', 'ordinal', 'class']

# Collapse the folds: one row per variable holding the mean of its fold metrics
average_test_best1 = test_best1.groupby(['variable', 'missing', 'model', 'problem', 'round']).mean().reset_index()

for ax, problem in zip(axs, ps):
    # Sorting by model name (descending) fixes the order of the categories on the x axis
    subset = average_test_best1[average_test_best1['problem'] == problem].sort_values(by='model', ascending=False)
    sns.swarmplot(x='model', y='score', hue='model', ax=ax, dodge=False, data=subset).set_title(p_dict[problem])

    # Tick labels: model name plus the number of variables for which it was the best model.
    # Tick positions are pinned first so the custom labels cannot drift from the categories.
    models = subset['model'].unique()
    ax.set_xticks(list(range(len(models))))
    ax.set_xticklabels(["{}\n(n={})".format(model, (subset['model'] == model).sum()) for model in models])

    # hue only colours the swarms; drop the redundant legend if one was drawn
    # (get_legend() returns None when the plot has no legend).
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Regression is scored with R2, the other problem types with F1
    ax.set_ylabel("R2 score" if problem == 'reg' else "F1 score")
plt.tight_layout()
# Not saved here: 'figures/figure3.pdf' is written by the round 2 cell.
# plt.savefig('figures/figure3.pdf', dpi=300)

In [None]:
# Round 2: test score of each variable's best model, one panel per problem type
fig, axs = plt.subplots(1, 3, figsize=(22.5*cm, 8.5*cm))
p_dict = {'reg': 'Regression', 'ordinal': 'Ordinal', 'class': 'Classification'}
ps = ['reg', 'ordinal', 'class']

# Collapse the folds: one row per variable holding the mean of its fold metrics
average_test_best2 = test_best2.groupby(['variable', 'missing', 'model', 'problem', 'round']).mean().reset_index()

for ax, problem in zip(axs, ps):
    # Sorting by model name (descending) fixes the order of the categories on the x axis
    subset = average_test_best2[average_test_best2['problem'] == problem].sort_values(by='model', ascending=False)
    sns.swarmplot(x='model', y='score', hue='model', ax=ax, dodge=False, data=subset).set_title(p_dict[problem])

    # Tick labels: model name plus the number of variables for which it was the best model.
    # Tick positions are pinned first so the custom labels cannot drift from the categories.
    models = subset['model'].unique()
    ax.set_xticks(list(range(len(models))))
    ax.set_xticklabels(["{}\n(n={})".format(model, (subset['model'] == model).sum()) for model in models])

    # hue only colours the swarms; drop the redundant legend if one was drawn
    # (get_legend() returns None when the plot has no legend).
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Regression is scored with R2, the other problem types with F1
    ax.set_ylabel("R2 score" if problem == 'reg' else "F1 score")
plt.tight_layout()
plt.savefig('figures/figure3.pdf', dpi=300)

# Completeness

In [None]:
def form(v):
    """
    Return the name of the form that variable `v` belongs to.

    :param v: Variable (column) name
    :return: Form name, e.g. 'Motor' or 'PBA-S'; the string 'null' when `v` is not
             listed under any form

    The forms are checked in the order of the table below and the first one that
    lists `v` wins.
    """
    form_variables = (
        ('Motor', (
            'motscore', 'miscore',
            'ocularh', 'ocularv', 'sacinith', 'sacinitv', 'sacvelh', 'sacvelv', 'dysarth', 'tongue', 'fingtapr',
            'fingtapl', 'prosupr', 'prosupl', 'luria', 'rigarmr', 'rigarml', 'brady', 'dysttrnk', 'dystrue',
            'dystlue', 'dystrle', 'dystlle', 'chorface', 'chorbol', 'chortrnk', 'chorrue', 'chorlue', 'chorrle',
            'chorlle', 'gait', 'tandem', 'retropls', 'diagconf',
        )),
        ('TFC', (
            'tfcscore', 'occupatn', 'finances', 'chores', 'adl', 'carelevl',
        )),
        ('Function', (
            'fascore', 'fiscore',
            'emplusl', 'emplany', 'volunt', 'fafinan', 'grocery', 'cash', 'supchild', 'drive', 'housewrk', 'laundry',
            'prepmeal', 'telephon', 'ownmeds', 'feedself', 'dress', 'bathe', 'pubtrans', 'walknbr', 'walkfall',
            'walkhelp', 'comb', 'trnchair', 'bed', 'toilet', 'carehome', 'indepscl',
        )),
        ('PBA-S', (
            'depscore', 'irascore', 'psyscore', 'aptscore', 'exfscore', 'pbas1sv', 'pbas1fr', 'pbas1wo', 'pbas2sv',
            'pbas2fr', 'pbas2wo', 'pbas3sv', 'pbas3fr', 'pbas3wo', 'pbas4sv', 'pbas4fr', 'pbas4wo', 'pbas5sv',
            'pbas5fr', 'pbas5wo', 'pbas6sv', 'pbas6fr', 'pbas6wo', 'pbas7sv', 'pbas7fr', 'pbas7wo', 'pbas8sv',
            'pbas8fr', 'pbas8wo', 'pbas9sv', 'pbas9fr', 'pbas9wo', 'pbas10sv', 'pbas10sm__1', 'pbas10sm__2',
            'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5', 'pbas10fr', 'pbas10wo', 'pbas11sv', 'pbas11fr', 'pbas11wo',
            'pbainfo', 'pbahshd',
        )),
        ('MHx', (
            'hxalcab', 'hxtobab', 'hxtobcpd', 'hxtobyos', 'hxpacky', 'hxdrugab', 'hxmar', 'hxmarfrq', 'hxher',
            'hxherfrq', 'hxcoc', 'hxcocfrq', 'hxclb', 'hxclbfrq', 'hxamp', 'hxampfrq', 'hxrit', 'hxritfrq', 'hxhal',
            'hxhalfrq', 'hxinh', 'hxinhfrq', 'hxopi', 'hxopifrq', 'hxpak', 'hxpakfrq', 'hxbar', 'hxbarfrq', 'hxtrq',
            'hxtrqfrq',
        )),
        ('Var Items I', (
            'height', 'weight', 'bmi', 'hdcat',
            'alcab', 'alcunits', 'tobab', 'tobcpd', 'tobyos', 'packy', 'cafab', 'cafpd', 'drugab', 'mar', 'marfrq',
            'her', 'herfrq', 'coc', 'cocfrq', 'clb', 'clbfrq', 'amp', 'ampfrq', 'rit', 'ritfrq', 'hal', 'halfrq',
            'inh', 'inhfrq', 'opi', 'opifrq', 'pak', 'pakfrq', 'bar', 'barfrq', 'trq', 'trqfrq',
        )),
        ('Var Items II', (
            'updsc', 'maristat', 'res', 'isced', 'jobclas', 'jobpaid',
            'rdcwk', 'rdcwkd', 'rdcwkhw',  # Baseline
            'emplnrsn', 'emplnrd', 'ssdb', 'rtrnwk', 'rtrddur',
        )),
        ('Cognitive', (
            'gen1', 'gen2', 'gen3', 'gen4', 'gen5', 'gen6', 'sdmt', 'sdmt1', 'sdmt2', 'sdmtnd', 'verfct', 'verfctd',
            'verfct5', 'verfct6', 'verfct7', 'verfctnd', 'scnt', 'scnt1', 'scnt2', 'scnt3', 'scntnd', 'swrt', 'swrt1',
            'swrt2', 'swrt3', 'swrtnd', 'sit', 'sit1', 'sit2', 'sit3', 'trl', 'trla1', 'trla2', 'trla3', 'trlb1',
            'trlb2', 'trlb3', 'verflt', 'verflt05', 'verflt06', 'verflt07',
        )),
        ('MMSE', (
            'mmsetotal',
        )),
        ('Physiotherapy', (
            'tug', 'tug1', 'scst', 'scst1',
        )),
        ('WPAI-SHP', (
            'wpaiscr1', 'wpaiscr2', 'wpaiscr3', 'wpaiscr4',
        )),
        ('SF-12', (
            'scoring', 'pf', 'rp', 'bp', 'gh', 'vt', 'sf', 're', 'mh', 'pcs', 'mcs',
        )),
        ('HADS-SIS', (
            'anxscore', 'hads_depscore', 'irrscore', 'outscore', 'inwscore',
        )),
        ('Profile', (
            'region', 'sex', 'race', 'handed', 'hxsid', 'dssage', 'dsplace', 'dsend', 'caghigh', 'caglow',
            'momhd', 'momagesx', 'dadhd', 'dadagesx', 'fhx',
            'ccmtr', 'ccmtrage', 'sxsubj', 'sxsubjm', 'sxs_m', 'sxs_c', 'sxs_p', 'sxs_o', 'sxfam',
            'sxfamm', 'sxf_m', 'sxf_c', 'sxf_p', 'sxf_o', 'hddiagn', 'sxest', 'sxrater', 'sxestcfd', 'sxreas', 'sxgs',
            'sxraterm', 'sxr_m', 'sxr_c', 'sxr_p', 'sxr_o', 'ccdep', 'ccdepage', 'ccirb', 'ccirbage',
            'ccvab', 'ccvabage', 'ccapt', 'ccaptage', 'ccpob', 'ccpobage', 'ccpsy', 'ccpsyage', 'ccpsyfh', 'cccog',
            'cccogage', 'xgwas', 'xbsp', 'xpheno', 'xmorpho', 'ximage',
        )),
        ('C-SSRS', (
            'sid1', 'sid2', 'sid3', 'sid4', 'sid5', 'int1', 'int2', 'int3', 'int4', 'int5', 'int6', 'sbh1', 'sbh1n',
            'sbh2', 'sbh3', 'sbh3n', 'sbh4', 'sbh4n', 'sbh5', 'sbh6', 'sbh7',
        )),
    )

    for form_name, variables in form_variables:
        if v in variables:
            return form_name
    # Variable is not part of any known form
    return 'null'

In [None]:
def _missing_by_form(data):
    """
    Aggregate the missing values of `data` per form.

    :return: DataFrame indexed by form name with the columns 'sum' (number of missing
             cells over all variables of the form) and 'count' (number of variables)
    """
    missing = data.isnull().sum().reset_index()
    missing.columns = ['variable', 'missing']

    def form_of(variable):
        # Names unknown to form() are retried with the text before the first underscore
        # (presumably suffixed/derived columns -- TODO confirm against the data).
        group = form(variable)
        return group if group != 'null' else form(variable.split('_')[0])

    missing['group'] = missing['variable'].apply(form_of).astype(str)
    return missing.groupby('group')['missing'].aggregate(['sum', 'count'])


def _percent_complete(stats, group, n_rows):
    """Percentage of non-missing cells of form `group`; 0.0 if the form is not in `stats`."""
    if group not in stats.index:
        return 0.0
    n_cells = n_rows * stats.loc[group, 'count']
    return 100 * (1 - (stats.loc[group, 'sum'] / n_cells))


def completeness(preprocessed_data, original_data):
    """
    Plot the completeness (percentage of non-missing cells) per form for the
    pre-processed and the original data as grouped bars and save the figure to
    'figures/figure2.pdf'.

    :param preprocessed_data: DataFrame after pre-processing
    :param original_data: DataFrame before pre-processing

    Variables that cannot be assigned to one of the plotted forms are ignored.
    A form without any variable in a data set gets a bar of height zero. For the
    original data that bar is drawn in the 'Original' slot and colour, so the
    legend handle stays correct.
    """
    rows = preprocessed_data.shape[0]
    rows2 = original_data.shape[0]

    preprocessed_stats = _missing_by_form(preprocessed_data)
    original_stats = _missing_by_form(original_data)

    cm = 1/2.54  # figure sizes are in inches
    plt.figure(figsize=(17*cm, 10*cm))
    w = .3  # bar width
    plt.xticks(rotation=90)
    forms = ['Profile', 'MHx', 'Var Items I', 'Var Items II', 'Motor', 'Function', 'TFC',
             'Cognitive', 'MMSE', 'Physiotherapy', 'PBA-S', 'SF-12', 'HADS-SIS', 'WPAI-SHP', 'C-SSRS']

    # Two adjacent bars per form: pre-processed at i, original at i + w
    for i, g in enumerate(forms):
        p1 = plt.bar(x=i,
                     height=_percent_complete(preprocessed_stats, g, rows),
                     width=w,
                     color='C0',
                     label='Pre-Processed',
                    )
        p2 = plt.bar(x=i+w,
                     height=_percent_complete(original_stats, g, rows2),
                     width=w,
                     color='C1',
                     label='Original',
                    )
    # The bars of the last form serve as legend handles
    plt.legend([p1, p2], ['Pre-Processed', 'Original'], ncol=2,
               bbox_to_anchor=(0., 1.02, 1., .102), loc=8, borderaxespad=0.)
    # Centre each tick between the two bars of its form
    plt.xticks([i + w*.5 for i in range(len(forms))], forms)

    plt.ylim(0, 105)

    plt.ylabel('Completeness %')
    plt.xlabel('Form')
    plt.margins(0.01, 0.01)

    plt.tight_layout()
    plt.savefig('figures/figure2.pdf', dpi=300)

In [None]:
# Compare the per-form completeness of the filtered table with the full table.
# Copies are passed so the loaded frames cannot be modified by the call.
completeness(preprocessed_data=df_filt.copy(), original_data=df.copy())