# Imports

In [None]:
from os.path import join

import matplotlib.pyplot as plt
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns

from random import randint

import seaborn as sns

# Load data

In [None]:
# Both raw tables are tab-separated exports stored under data/original.
raw_dir = join('data', 'original')
read_opts = dict(sep='\t', low_memory=False)
enroll = pd.read_csv(join(raw_dir, 'enroll.csv'), **read_opts)
profile = pd.read_csv(join(raw_dir, 'profile.csv'), **read_opts)

In [None]:
df = enroll.merge(profile, on='subjid', how='left')  # Add profile table

# 5 is the "unknown" category of dsplace; a 9999 here means "not applicable" rather than missing.
# The result is assigned back explicitly: replace(..., inplace=True) on a selected column is
# chained assignment and is not guaranteed to write through to `df`.
df['dsplace'] = df['dsplace'].replace(9999, 5)

# Replace strings and integers that indicate missing values
df = df.replace(r'(^\s*$|MISSING|WRONG|^9999$|^9999\.0$|^9998$|^9998\.0$|^9997$|^9997\.0$|^9996$|^9996\.0$)', np.nan,
                regex=True)
df = df.replace([9999, 9998, 9997, 9996], np.nan)

# Censored values ('<18', '>28', '>70') are mapped to the nearest integer outside the bound
df['age'] = df['age'].replace('<18', 17).astype(int)
df['caglow'] = df['caglow'].replace('>28', 29).astype(int)
df['caghigh'] = df['caghigh'].replace('>70', 71).astype(int)

print(df.shape)
df.head()

In [None]:
# Check which variables contain strings (object dtype).
# Boolean indexing keeps only the True entries without the replace/dropna round trip.
is_object = df.dtypes == object
is_object[is_object]

In [None]:
# Basic size of the data set: subjects, visits, and visits per subject
visits_by_subject = df.groupby('subjid')
print(df.shape)
print('Patients:', df['subjid'].nunique())
print('Visits  :', len(df))
print(visits_by_subject['seq'].count().agg(['mean', 'std']).round(2))
print('Max visit:', visits_by_subject.size().max())

# Select patients

In [None]:
cag_in_range = df['caghigh'].between(36, 59)                    # CAG 36-59 (both bounds included)
onset_as_adult = (df['hddiagn'] >= 21) | (df['sxrater'] >= 21)  # AAO >=21
df = df.loc[cag_in_range & onset_as_adult]

visits_by_subject = df.groupby('subjid')
print(df.shape)
print('Patients:', df['subjid'].nunique())
print('Visits  :', len(df))
print(visits_by_subject['seq'].count().agg(['mean', 'std']).round(2))
df.head()

In [None]:
# Histogram of age with roughly one bin per two distinct age values
n_bins = df['age'].nunique() // 2
ax = df['age'].plot.hist(rot=0, bins=n_bins)
ax.set_xticks(range(20, 95, 5))
ax.set_xlabel('age')
ax.set_ylabel('counts')

In [None]:
# Number of visits per caghigh value, ordered by repeat length
caghigh_counts = df['caghigh'].value_counts().sort_index()
ax = caghigh_counts.plot.bar(rot=0)
ax.set_xlabel('caghigh')
ax.set_ylabel('counts')

In [None]:
# Number of visits per caglow value, ordered by repeat length
caglow_counts = df['caglow'].value_counts().sort_index()
ax = caglow_counts.plot.bar(rot=0)
ax.set_xlabel('caglow')
ax.set_ylabel('counts')

# Infer hdcat using backward fill + forward fill

**Example**

In [None]:
# Blank out hdcat at the first visit of one subject to demonstrate the fill.
# NOTE(review): this overwrites a real value in `df`; the fill below restores it only if a
# later visit of the same subject carries the same category - confirm this is acceptable.
demo_subject = df['subjid'] == 'R002593663'
df.loc[demo_subject & (df['seq'] == 1), 'hdcat'] = np.nan
df.loc[demo_subject, ['subjid', 'seq', 'hdcat']]

In [None]:
# Back-fill, then forward-fill, both *within* the subject's own visits.
# Doing both fills inside transform() keeps the forward fill from crossing subject
# boundaries (a trailing .fillna on the ungrouped result would), and avoids the
# deprecated fillna(method=...) form.
(df.loc[df['subjid'] == 'R002593663']
   .groupby('subjid')['hdcat']
   .transform(lambda visits: visits.bfill().ffill()))

**Replace**

In [None]:
# Fill missing hdcat per subject: first from the next known visit (bfill), then from the
# previous one (ffill). Both fills run inside the group so that a subject whose trailing
# visits are missing never inherits the category of the preceding subject in the frame.
df['hdcat'] = df.groupby('subjid')['hdcat'].transform(lambda visits: visits.bfill().ffill())
df.loc[df['subjid'] == 'R002593663', ['subjid', 'seq', 'hdcat']]

**Count the number of patients who are premanifest, manifest, or both**

In [None]:
# Number of visits per hdcat value; dropna=False also counts visits where hdcat is still missing.
df['hdcat'].value_counts(dropna=False)

In [None]:
# Subjects whose hdcat takes more than one distinct value across their visits
categories_per_subject = df.groupby('subjid')['hdcat'].nunique()
pre_and_manifest_subjects = categories_per_subject[categories_per_subject > 1].index

# Visits of subjects that stay in a single category (2 or 3) throughout
single_category = ~df['subjid'].isin(pre_and_manifest_subjects)
premanifest_subjects = df.loc[single_category & (df['hdcat'] == 2)]
manifest_subjects = df.loc[single_category & (df['hdcat'] == 3)]

# The two frames above hold one row per visit, so count unique subject ids to make all
# three numbers comparable (pre_and_manifest_subjects is already one entry per subject).
print('premanifest: {}\nmanifest: {}\npre and manifest: {}'.format(premanifest_subjects['subjid'].nunique(),
                                                                   manifest_subjects['subjid'].nunique(),
                                                                   len(pre_and_manifest_subjects),
                                                                   )
     )

# Save subset

In [None]:
# Persist the selected subset; the DataFrame index is written as the first CSV column
# because index=False is not passed.
df.to_csv(join('data', 'pre_and_manifest.csv'))

# Missing before feature engineering

In [None]:
def pieplot(df_in):
    """Draw a small donut chart with the percentage of missing cells in ``df_in``.

    :param df_in: DataFrame whose share of null cells is plotted.

    The percentage is written in the centre of the donut and the figure is saved
    to 'test.png' (overwritten on every call).
    """
    plt.figure(figsize=(1.5,1.5))
    # Use the frame that was passed in, not the notebook-global `df`
    n_cells = df_in.shape[0] * df_in.shape[1]
    # An empty frame has no cells; report 0% instead of dividing by zero
    mis = round(df_in.isnull().sum().sum() / n_cells * 100, 2) if n_cells else 0.0
    x = [mis, 100-mis]
    plt.pie(x, startangle=90, pctdistance=0.85, colors = ['white', 'mediumseagreen'],
            wedgeprops={"edgecolor":"w", 'linewidth':.5})
    # Percentage label, then a white circle on top of the pie to turn it into a donut
    plt.text(-0.48,-0.05, str(mis) + '%', fontsize=10)
    centre_circle = plt.Circle((0,0),0.70,fc='white', edgecolor='w', linewidth=.5)
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)
    plt.tight_layout()
    plt.savefig('test.png', dpi=900)

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

In [None]:
# Share of null cells in the whole table, in percent
missing_pct = round(df.isnull().sum().sum() / df.size * 100, 2)
print('Missing:', str(missing_pct) + '%')

# List of features which will be dropped

In [None]:
# Features earmarked for removal (see the section heading). This cell only defines the list;
# the drops themselves are performed group by group in later cells.
# NOTE(review): `cols` is reassigned further down, so nothing should rely on this value later.
cols = ['dssage', 'dsplace', 'dsend',
        'momagesx', 'dadagesx',
        'sxs_m', 'sxs_c', 'sxs_p', 'sxs_o',
        'sxf_m', 'sxf_c', 'sxf_p', 'sxf_o',
        'sxr_m', 'sxr_c', 'sxr_p', 'sxr_o',
         #'ccpsyfh',
        'opifrq', 'barfrq', 'ritfrq', 'trqfrq', 'herfrq', 'inhfrq',
        'xgwas', 'xbsp', 'xpheno', 'xmorpho', 'ximage',
        'updmed', 'updmh', 'updhdh', 'mvsrc', 'mvrsn', 'crlvl', 'dpdy',
        'attmpt1dy', 'attmpt2dy', 'sxgsdy',
        'rdcwkd', 'rdcwkhw',
        'miscore', 'fiscore',
        'gen1', 'gen2', 'gen3', 'gen4', 'gen5', 'gen6',
        'pbas1wo', 'pbas2wo', 'pbas3wo', 'pbas4wo', 'pbas5wo', 'pbas6wo',
        'pbas7wo', 'pbas8wo', 'pbas9wo', 'pbas10wo', 'pbas11wo',
        'wpaiscr1', 'wpaiscr2', 'wpaiscr3', 'wpaiscr4',
        'attmpt11', 'attmpt12', 'attmpt21', 'attmpt22', 'attmpt3dy', 'attmpt31', 'attmpt32'
       ]

In [None]:
# Number of entries in the drop list
len(cols)

### Dropping Useless Features

In [None]:
update_flags = ['updmed', 'updmh', 'updhdh']           # Update to medication, comorbid, clinical characteristics and or onset of HD?
missed_visit_cols = ['mvsrc', 'mvrsn', 'crlvl', 'dpdy']  # Features associated with missed visit
df = df.drop(columns=update_flags + missed_visit_cols)

In [None]:
# attmpt1dy / attmpt2dy: date of most recent attempts (1 and 2)
# sxgsdy: day of data entry of Profile information
date_cols = ['attmpt1dy', 'attmpt2dy', 'sxgsdy']
df = df.drop(columns=date_cols)

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

# Outliers (beyond μ ± 4×std)

<b> Get stats needed to find outliers </b>

In [None]:
# Keep only columns with more than two distinct values (this removes constant AND binary
# columns), then drop identifier, string and categorical columns before summarising.
excluded_cols = ['subjid', 'visit', 'region', 'seq', 'handed', 'diagconf', 'race', 'pbas2sv', 'pbas2fr',
                 'pbas2wo', 'pbas9sv', 'pbas9fr', 'pbas9wo', 'pbas10sv', 'pbas10fr', 'pbas10wo']
varied = df.loc[:, df.nunique() > 2]
stats = varied.drop(columns=excluded_cols).describe()
stats

<b> Find variables which are considered outliers </b>

In [None]:
# A variable has outliers when its max or min lies more than 4 standard deviations from the mean
upper_fence = stats.loc['mean'] + (4 * stats.loc['std'])
lower_fence = stats.loc['mean'] - (4 * stats.loc['std'])
has_outlier = (stats.loc['max'] > upper_fence) | (stats.loc['min'] < lower_fence)

outlier_stats = stats.loc[:, has_outlier]
cols = outlier_stats.columns

outlier_stats

<b> Sort outliers by the difference of max and min value (nicer plot)</b>

In [None]:
# Order the outlier variables by their value range (max - min), ignoring NaNs
outlier_values = df[cols].values.astype(float)
value_range = np.nanmax(outlier_values, axis=0) - np.nanmin(outlier_values, axis=0)
pairs = list(zip(cols, value_range))
sort_pairs = sorted(pairs, key=lambda p: p[1])
sort_cols = [name for name, spread in sort_pairs]
sort_cols[:5]

<b> Remove nan values </b>

In [None]:
# One array of non-missing values per outlier variable (columns may differ in length)
x = df[sort_cols].values
mask = ~np.isnan(x)
filtered_data = [x[mask[:, j], j] for j in range(x.shape[1])]
print('Outlier variables:', len(filtered_data))

<b> Plot outliers </b>

In [None]:
vals = 8  # number of variables drawn per subplot row

rows = int(np.ceil(len(filtered_data)/vals))
# squeeze=False makes plt.subplots return a 2-D Axes array even for a single row,
# so the indexing below also works when there are at most `vals` outlier variables.
fig, axs = plt.subplots(rows, figsize=(12,18), squeeze=False)
axs = axs[:, 0]

for r, i in enumerate(range(0, len(filtered_data), vals)):
    x = filtered_data[i:i+vals]
    names = sort_cols[i:i+vals]

    axs[r].boxplot(x, labels=names)

    # Red ticks mark mean +/- 4 standard deviations for every variable in this row
    s = stats[names]
    positions = range(1, s.shape[1]+1)
    axs[r].scatter(positions, s.loc['mean'] + (4*s.loc['std']), c='red', marker='_', s=500)
    axs[r].scatter(positions, s.loc['mean'] - (4*s.loc['std']), c='red', marker='_', s=500)

# plt.tight_layout()
# fig.savefig('figures/outliers.png', dpi=1200)

<b> Interesting outliers (PDS4) </b>

In [None]:
# Outliers found in PDS4 were corrected in PDS5

cols = ['trlb1', 'trlb2', 'trlb3', 'alcunits', 'tug1'] # sit, trla, scst, scnt, swrt, verfct
fig, axs = plt.subplots(1, len(cols), figsize=(15,5))
# One box plot per variable, each on its own axis so the scales stay independent
for ax, c in zip(axs, cols):
    ax.boxplot(df[c].dropna().values, labels=[c])

## trlb outlier (PDS4)

In [None]:
# Summary statistics of trlb2, used to inspect the outlier named in the heading.
print(df.trlb2.describe())
# The three corrections below are disabled; they are kept as a record of how the
# trlb values were capped / blanked.
# df.loc[(df['trlb1'] < 240) & (df['trlb2'] > 25), 'trlb2'] = 25
# df.loc[(df.trlb2 > 25), ['trlb1', 'trlb2', 'trlb3']] = np.nan
# df.loc[(df.trlb3 == 240), ['trlb3']] = np.nan

### Remove alcunit (PDS4)

In [None]:
# Remove alcunits outlier
# idx = df[df['alcunits'] == 875].index.values[0]
# df.loc[idx,'alcunits'] = np.nan

# Profile

In [None]:
# Columns handled in the Profile section; the next cell removes the dropped ones from this list.
cols = ['region', 'sex', 'race', 'handed', 'hxsid', 'dssage', 'dsplace', 'dsend', 'caghigh', 'caglow',
        'momhd', 'momagesx', 'dadhd', 'dadagesx', 'fhx',
        'ccmtr', 'ccmtrage', 'sxsubj', 'sxsubjm', 'sxs_m', 'sxs_c', 'sxs_p', 'sxs_o', 'sxfam',
        'sxfamm', 'sxf_m', 'sxf_c', 'sxf_p', 'sxf_o', 'hddiagn', 'sxest', 'sxrater', 'sxestcfd', 'sxreas', 'sxgs',
        'sxraterm', 'sxr_m', 'sxr_c', 'sxr_p', 'sxr_o', 'ccdep', 'ccdepage', 'ccirb', 'ccirbage',
        'ccvab', 'ccvabage', 'ccapt', 'ccaptage', 'ccpob', 'ccpobage', 'ccpsy', 'ccpsyage', 'ccpsyfh', 'cccog',
        'cccogage', 'xgwas', 'xbsp', 'xpheno', 'xmorpho', 'ximage'
       ]

#### Drop

In [None]:
# Profile columns to remove from the table
drop_cols = ['dssage', 'dsplace', 'dsend', # Age of Death, Place of death, Cause of death
             'sxs_m', 'sxs_c', 'sxs_p', 'sxs_o',
             'sxf_m', 'sxf_c', 'sxf_p', 'sxf_o',
             'sxr_m', 'sxr_c', 'sxr_p', 'sxr_o',
             'xgwas', 'xbsp', 'xpheno', 'xmorpho', 'ximage']

df = df.drop(columns=drop_cols)

# Keep the profile column list in sync with the table
dropped = set(drop_cols)
cols = [name for name in cols if name not in dropped]

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

### Change race to fit order

In [None]:
# The race codes are not contiguous; remap the listed codes and leave all other
# values (including missing ones) untouched.
race_dict = {8: 4, 16: 5, 15: 0}
df['race'] = df['race'].apply(lambda code: race_dict.get(code, code))

# Replace nan HDDIAGN with sxrater if missing

In [None]:
# Last row of every subject in the current frame order
# (assumes rows are ordered by visit within a subject - TODO confirm).
test = df.groupby('subjid').tail(1)
test.head()

In [None]:
# Per hdcat category: subjects without hddiagn / subjects in that category
for label, category in (('Pre-Manifest', 2), ('Manifest', 3)):
    in_category = test['hdcat'] == category
    print(label + ' missing AAO:', (in_category & test['hddiagn'].isnull()).sum(), '/', in_category.sum())

In [None]:
# Keep only the rows whose hddiagn is missing
test = test[test['hddiagn'].isnull()]

In [None]:
# For rows without hddiagn: how many have an estimate, split by the sxestcfd confidence code
has_sxest = test['sxest'] == 1
has_sxrater = test['sxrater'].notnull()
confidence_levels = (
    ('  - High confidence:', test['sxestcfd'] == 1),
    ('  - Low confidence:', test['sxestcfd'] == 2),
    ('  - NaN confidence:', test['sxestcfd'].isnull()),
)

print('Missing AAO in premanifest and manifest (hddiagn):', test.shape[0])
for title, available in (('Available AAO estimates (sxest):', has_sxest),
                         ('Available AAO estimates (sxrater):', has_sxrater)):
    print(title, available.sum())
    for label, at_level in confidence_levels:
        print(label, (available & at_level).sum())

In [None]:
# Correlations at seq == 1 for rows whose estimate confidence code (sxestcfd) is 1.
# The sxrater row/column is relabelled so it can be merged into one table later.
# (The former `.name` assignment was discarded by rename() and has been removed.)
high_conf_rows = (df['seq'] == 1) & (df['sxestcfd'] == 1)
high_conf = (df.loc[high_conf_rows, ['caghigh', 'hddiagn', 'sxrater']]
               .corr()
               .rename(index={'sxrater': 'sxrater_high'}, columns={'sxrater': 'sxrater_high'}))

In [None]:
# Correlations at seq == 1 for rows whose estimate confidence code (sxestcfd) is 2.
# The sxrater row/column is relabelled so it can be merged into one table later.
# (The former `.name` assignment was discarded by rename() and has been removed.)
low_conf_rows = (df['seq'] == 1) & (df['sxestcfd'] == 2)
low_conf = (df.loc[low_conf_rows, ['caghigh', 'hddiagn', 'sxrater']]
              .corr()
              .rename(index={'sxrater': 'sxrater_low'}, columns={'sxrater': 'sxrater_low'}))

In [None]:
# Correlations at seq == 1 for rows without an estimate confidence code (sxestcfd missing).
# The sxrater row/column is relabelled so it can be merged into one table later.
# (The former `.name` assignment was discarded by rename() and has been removed.)
nan_conf_rows = (df['seq'] == 1) & (df['sxestcfd'].isnull())
nan_conf = (df.loc[nan_conf_rows, ['caghigh', 'hddiagn', 'sxrater']]
              .corr()
              .rename(index={'sxrater': 'sxrater_nan'}, columns={'sxrater': 'sxrater_nan'}))

In [None]:
# Correlation table over all seq == 1 rows, extended with one extra row and column
# per confidence subset.
corrs = df.loc[df['seq']==1,['caghigh', 'hddiagn', 'sxrater']].corr()
for label, table in (('sxrater_high', high_conf),
                     ('sxrater_low', low_conf),
                     ('sxrater_nan', nan_conf)):
    corrs[label] = table[label]
    corrs.loc[label] = table.loc[label]

In [None]:
from scipy.stats import pearsonr

cols = ['caghigh', 'hddiagn']
rows = cols + ['sxrater', 'sxrater_high', 'sxrater_low', 'sxrater_nan']
corr = np.empty((len(rows), len(cols)))
pvalues = corr.copy()
anno = corr.copy().astype(str)

target_p = 0.01

# Row selections: every row uses seq == 1; the sxrater_* rows additionally restrict
# to one sxestcfd confidence code.
baseline = df['seq'] == 1
confidence_masks = {
    'sxrater_high': baseline & (df['sxestcfd'] == 1),
    'sxrater_low': baseline & (df['sxestcfd'] == 2),
    'sxrater_nan': baseline & (df['sxestcfd'].isnull()),
}

for r, r_name in enumerate(rows):
    for c, c_name in enumerate(cols):
        if r_name in confidence_masks:
            pair = df.loc[confidence_masks[r_name], [c_name, 'sxrater']]
        else:
            pair = df.loc[baseline, [r_name, c_name]]
        # Pearson needs complete pairs, so rows with a missing value in either column are dropped
        r_val, c_val = pair.dropna().values.T
        corr[r, c], pvalues[r, c] = pearsonr(r_val, c_val)
        anno[r, c] = '{:.3f}'.format(corr[r, c])

cm = 1/2.54  # centimetres to inches
fig, ax = plt.subplots(1, figsize=(8.5*cm, 8.5*cm))
sns.heatmap(corr, annot=anno, fmt="", vmin=-1, xticklabels=cols, yticklabels=rows,
            cbar_kws={'label': 'Pearson Correlation'},
            annot_kws={"fontsize":8},
            ax=ax,
           )
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
# The colour bar is the last axes added to the figure
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=8)
plt.tight_layout()
plt.savefig(join('figures', 'figure1.pdf'), dpi=300)

<b> Create a new feature indicating whether the AAO was estimated </b>

In [None]:
# Flag (1/0) marking rows whose hddiagn is missing at this point, i.e. rows for which
# any value present afterwards comes from the estimate.
df['hddiagn_est'] = (df['hddiagn'].isnull()).astype(int)

<b> Replace missing AAO with the estimated AAO </b>

In [None]:
# Where hddiagn is missing, take the value of sxrater from the same row
aao_missing = df['hddiagn'].isnull()
df.loc[aao_missing, 'hddiagn'] = df.loc[aao_missing, 'sxrater']

<b> Drop rows that still have no AAO and remove the estimate column (sxrater) </b>

In [None]:
first_rows = df.groupby('subjid').head(1)
print('Still missing hddiagn patients:', first_rows['hddiagn'].isnull().sum())

# Discard rows that still have no hddiagn, then remove the estimate column itself
rows_without_aao = df.index[df['hddiagn'].isnull()]
df = df.drop(index=rows_without_aao)
df = df.drop(columns=['sxrater'])
df.head()

<b> Check whether AAO is still missing </b>

In [None]:
# Sanity check after the drop: all three counts are expected to be 0
first_rows = df.groupby('subjid').head(1)
last_rows = df.groupby('subjid').tail(1)
last_missing = last_rows['hddiagn'].isnull()

print('Still missing hddiagn patients:', first_rows['hddiagn'].isnull().sum())
print('Premanifest missing:', (last_missing & (last_rows['hdcat'] == 2)).sum())
print('Manifest missing:', (last_missing & (last_rows['hdcat'] == 3)).sum())

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

### Parent HD

In [None]:
# One row per subject, indexed by the original row label of the subject's first row.
# NOTE(review): GroupBy.first() returns the first NON-NULL value of every column, so a
# row may combine values from several visits - confirm this is intended.
first_visit = (df.reset_index()
                 .groupby('subjid')
                 .first()
                 .reset_index()
                 .set_index('index'))
first_visit.head()

In [None]:
# Availability of the parental HD indicators per subject
dad_missing = first_visit['dadhd'].isnull()
mom_missing = first_visit['momhd'].isnull()
print('Both dad and mom hd missing:', (dad_missing & mom_missing).sum() )
print('Either dad or mom hd missing:', (dad_missing ^ mom_missing).sum() )
print('Dad and mom hd known:', (~dad_missing & ~mom_missing).sum())
print('Total patients', first_visit.shape[0])

### Parentagesx

In [None]:
# Availability of the parental onset ages per subject
dad_age_missing = first_visit['dadagesx'].isnull()
mom_age_missing = first_visit['momagesx'].isnull()
print('Both dad and mom agesx missing:', (dad_age_missing & mom_age_missing).sum() )
print('Either dad or mom agesx missing:', (dad_age_missing ^ mom_age_missing).sum() )
print('Both dad and mom agesx known:', (~dad_age_missing & ~mom_age_missing).sum())

In [None]:
# Subjects for whom both momagesx and dadagesx are available
both_known = first_visit[['dadagesx', 'momagesx']].notnull().all(axis=1)
print('Both dad and mom agesx known:', both_known.sum())
first_visit.loc[both_known, ['dadagesx', 'momagesx']]

In [None]:
# Combine dadagesx and momagesx into parentagesx: the available value, or the larger
# one when both are available. Rows with neither value get no entry.
parent_ages = df[['dadagesx', 'momagesx']]
at_least_one = parent_ages.isnull().sum(axis=1) <= 1
par = parent_ages[at_least_one].astype(float).max(axis=1, skipna=True)

df = df.drop(columns=['dadagesx', 'momagesx'])
known_par = par.dropna()
df.loc[known_par.index, 'parentagesx'] = known_par

In [None]:
# Per subject (first row): is a missing parent onset age explained by unaffected parents?
test = df.groupby('subjid').head(1)
no_parent_aao = test['parentagesx'].isnull()
parent_affected = (test['momhd'] == 1) | (test['dadhd'] == 1)
print('Still missing parent AAO:', (no_parent_aao & parent_affected).sum())
print('Missing but parents had no AAO:', (no_parent_aao & ~parent_affected).sum())

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

# Transforming numerical values in manifest (from profile)
(age - feature).replace(nan, 0)

- hddiagn: Patient AAO --> years away from AAO
- parentagesx: Parent AAO --> years away from parent AAO (0 if momhd & dadhd == 0, knn if momhd == 1 or dadhd== 1)

- sxsubj: Symptoms first noted by participant (0 if sxsubjm==nan, knn if sxsubjm!=nan)
- sxfam:  Symptoms first noted by family (0 if sxfamm==nan, knn if sxfamm!=nan)

- ccmtrage: At what age did motor symptoms begin (0 if ccmtr!=1, knn if ccmtr==1)
- ccdepage: At what age did the depression begin (0 if ccdep!=1, knn if ccdep==1)
- ccirbage: At what age did the irritability begin (0 if ccirb!=1, knn if ccirb==1)
- ccvabage: At what age did the violent or aggressive behaviour begin (0 if ccvab!=1, knn if ccvab==1)
- ccaptage: At what age did the apathy begin (0 if ccapt!=1, knn if ccapt==1)
- ccpobage: At what age did the perseverative obsessive behaviour begin (0 if ccpob!=1, knn if ccpob==1)
- ccpsyage: At what age did the psychosis begin (0 if ccpsy!=1, knn if ccpsy==1)
- cccogage: At what age did  cognitive impairment first start to have an impact on daily life (0 if cccog!=1, knn if cccog==1)
- rtrddur:  Age at retirement (0 if emplnrsn!=2 (not retired), knn if emplnrsn==2)

In [None]:
# Age-type features listed in the section text above, used for the before/after box plots.
# Some NaNs mean "not applicable" (e.g. not retired), others mean "unknown" (left for knn imputation).
cols = ['hddiagn', 'parentagesx', 'ccmtrage', 'sxsubj', 'sxfam', 'ccdepage', 'ccirbage', 'ccvabage', 'ccaptage',
        'ccpobage', 'ccpsyage', 'cccogage', 'rtrddur'
       ]

In [None]:
# Box plots of the raw age features; only rows with a value in every column are used
fig, ax = plt.subplots(figsize=(15,5))
complete_rows = df[cols].dropna()
ax.boxplot(complete_rows.values, labels=cols);

<b> Formula </b>

In [None]:
def age_to_distance(x):
    """Shift a non-negative age difference up by one; leave other values unchanged.

    Differences >= 0 become x + 1, so a transformed value is never 0.
    Negative values and NaN (for which ``x >= 0`` is false) are returned as they are.
    """
    if x >= 0:
        return x + 1
    return x

#### hddiagn (AAO)

In [None]:
c = 'hddiagn'
df.loc[:, c] = (df['age'] - df[c]).apply(age_to_distance)

# Replace pre-manifest subjects AAO with 0 (no onset happened yet).
# A subject counts as pre-manifest when hdcat == 2 on their last row.
last_rows = df.groupby('subjid').tail(1)
pre_subj = last_rows.loc[last_rows['hdcat'] == 2, 'subjid'].values
is_pre = df['subjid'].isin(pre_subj)
df.loc[is_pre, c] = df.loc[is_pre, c].replace(np.nan, 0)

#### parentagesx

In [None]:
c = 'parentagesx'
no_affected_parent = (df['momhd'] != 1) & (df['dadhd'] != 1)
age_unknown = df[c].isnull()
n_unknown = age_unknown.sum()
print('No age to be estimated:', no_affected_parent.sum(), '/', n_unknown)
print('To be imputed:', (~no_affected_parent & age_unknown).sum(), '/', n_unknown)
print('Known ages:', (~age_unknown).sum(), '/', df.shape[0])

# Transform parentagesx into a distance from the current age
df.loc[:, c] = (df['age'] - df[c]).apply(age_to_distance)

# Set parentagesx to 0 if no age can be estimated (neither parent flagged with HD)
df.loc[no_affected_parent, c] = 0

#### sxsubj

In [None]:
c = 'sxsubj'
companion_missing = df['sxsubjm'].isnull()
print('No age to be estimated:', (companion_missing & df[c].isnull()).sum() )
print('Knn to be estimated:', (~companion_missing & df[c].isnull()).sum())
print('Known ages:', df[c].notnull().sum(), '/', df.shape[0])

# Transform sxsubj into a distance from the current age
df.loc[:, c] = (df['age'] - df[c]).apply(age_to_distance)

# Set sxsubj to 0 if no age can be estimated (sxsubjm missing as well)
df.loc[companion_missing & df[c].isnull(), c] = 0

print(df[c].isnull().sum())

#### sxfam

In [None]:
c = 'sxfam'
companion_missing = df['sxfamm'].isnull()
print('No age to be estimated:', (companion_missing & df[c].isnull()).sum() )
print('Knn to be estimated:', (~companion_missing & df[c].isnull()).sum())
print('Known ages:', df[c].notnull().sum(), '/', df.shape[0])

# Transform sxfam into a distance from the current age
df.loc[:, c] = (df['age'] - df[c]).apply(age_to_distance)

# Set sxfam to 0 if no age can be estimated (sxfamm missing as well)
df.loc[companion_missing & df[c].isnull(), c] = 0

print(df[c].isnull().sum())

#### ccmtrage

In [None]:
number, boolean = 'ccmtrage', 'ccmtr'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccmtrage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccmtrage to 0 if no age can be estimated (ccmtr is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccdepage

In [None]:
number, boolean = 'ccdepage', 'ccdep'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccdepage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccdepage to 0 if no age can be estimated (ccdep is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccirbage

In [None]:
number, boolean = 'ccirbage', 'ccirb'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccirbage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccirbage to 0 if no age can be estimated (ccirb is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccvabage

In [None]:
number, boolean = 'ccvabage', 'ccvab'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccvabage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccvabage to 0 if no age can be estimated (ccvab is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccaptage

In [None]:
number, boolean = 'ccaptage', 'ccapt'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccaptage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccaptage to 0 if no age can be estimated (ccapt is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccpobage

In [None]:
number, boolean = 'ccpobage', 'ccpob'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccpobage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccpobage to 0 if no age can be estimated (ccpob is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### ccpsyage

In [None]:
number, boolean = 'ccpsyage', 'ccpsy'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform ccpsyage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set ccpsyage to 0 if no age can be estimated (ccpsy is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

#### cccogage

In [None]:
number, boolean = 'cccogage', 'cccog'
symptom_present = df[boolean] == 1
print('No age to be estimated:', (~symptom_present & df[number].isnull()).sum())
print('Knn to be estimated:', (symptom_present & df[number].isnull()).sum())
print('Known ages:', df[number].notnull().sum(), '/', df.shape[0])

# Transform cccogage into a distance from the current age
df.loc[:, number] = (df['age'] - df[number]).apply(age_to_distance)

# Set cccogage to 0 if no age can be estimated (cccog is not 1)
df.loc[~symptom_present & df[number].isnull(), number] = 0

print(df[number].isnull().sum())

In [None]:
# Box plots of the transformed features; zeros are treated as missing for this plot and
# only rows with a value in every column are used
fig, ax = plt.subplots(figsize=(15,5))
complete_rows = df[cols].replace(0, np.nan).dropna()
ax.boxplot(complete_rows.values, labels=cols);

In [None]:
# pieplot only reads the frame, so passing it directly avoids duplicating the whole table
pieplot(df)

### Weight, Height and BMI

<b> Missing height, weight and BMI </b>

In [None]:
# Number of missing values in height, weight and bmi (the columns fixed in this section)
df[['height', 'weight', 'bmi']].isnull().sum()

In [None]:
# Breakdown of which of height / weight / bmi are missing together
no_height = df['height'].isnull()
no_weight = df['weight'].isnull()
no_bmi = df['bmi'].isnull()

print('Missing weight, height or bmi:', (no_height | no_weight | no_bmi).sum() )

print('Missing weight and Height', (no_height & no_weight).sum())

print('Missing only weight:', (~no_height & no_weight).sum())
print('Missing only height:', (~no_weight & no_height).sum())

print('Missing only bmi:', (~no_height & ~no_weight & no_bmi).sum())

<b> Plot difference of a feature between visits </b>

In [None]:
def dif_between_visits(df_in, label):
    """Plot the distribution of visit-to-visit changes of ``label``.

    :param df_in: DataFrame with the columns 'subjid', 'seq' and ``label``
    :param label: Name of the column whose change between visits is plotted

    Rows where ``label`` is missing are ignored. The remaining rows are ordered by
    subject and visit sequence, and each value is compared with the previous value of
    the same subject. The first row of every subject contributes a difference of 0.
    """
    observed = df_in[~df_in[label].isnull()].sort_values(['subjid', 'seq'])
    # diff() inside each subject gives NaN for the subject's first row; that row
    # counts as "no change"
    stats = observed.groupby('subjid')[label].diff().fillna(0).tolist()

    plt.xlabel(label + ' differences between visits')
    plt.ylabel('density')
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so the
    # figure stays unchanged - consider histplot/displot when upgrading.
    sns.distplot(stats)

<b> Function to fill in height/weight </b>

In [None]:
def _carry_latest_value(subjects, values, order, t):
    """Run one directional pass of ``replace_with_latest`` over positional arrays.

    :param subjects: Array with the subject id of every row
    :param values: Float array with the column values; modified in place
    :param order: Iterable of row positions in the order they should be visited
    :param t: The maximum difference allowed between consecutive accepted values
    """
    subj = ''
    x = np.nan
    for pos in order:
        current = values[pos]
        if subjects[pos] != subj or np.isnan(x):
            # New subject, or no usable value seen yet for this subject:
            # the current value becomes the latest value.
            subj = subjects[pos]
            x = current
        elif np.isnan(current) or abs(x - current) > t:
            # Missing, or further than t away from the latest value: overwrite it.
            values[pos] = x
        else:
            # Plausible value: it becomes the new latest value.
            x = current


def replace_with_latest(df_in, col, t=np.inf):
    """
    Fill missing (and, for a finite t, implausible) values of a column per subject.

    A backward pass (last row to first row) walks through each subject's rows and keeps
    track of the latest accepted value (x). A row's value is replaced by x when it is
    missing or differs more than the threshold (t) from x; otherwise the row's value
    becomes the new x. The same procedure is then repeated as a forward pass.

    Subjects are recognised by consecutive rows sharing the same 'subjid', so the rows of
    one subject have to be adjacent and in visit order.

    :param df_in: The dataframe to work on; it is modified in place
    :param col: The numeric column in df_in to alter
    :param t: The maximum difference allowed between visits; with the default
              (np.inf) only missing values are replaced
    :return: df_in (the same object that was passed in)
    """
    # Work on plain arrays by position: this avoids building a Series per row with
    # iterrows() and does not depend on the index labels being unique.
    subjects = df_in['subjid'].to_numpy()
    values = df_in[col].to_numpy(dtype=float, copy=True)
    original = values.copy()
    n_rows = len(values)

    # Backward pass, then forward pass on the result of the backward pass
    _carry_latest_value(subjects, values, tqdm(range(n_rows - 1, -1, -1)), t)
    _carry_latest_value(subjects, values, tqdm(range(n_rows)), t)

    # Write back only the cells that actually changed (NaN == NaN counts as unchanged)
    changed = ~((values == original) | (np.isnan(values) & np.isnan(original)))
    if changed.any():
        df_in.iloc[np.flatnonzero(changed), df_in.columns.get_loc(col)] = values[changed]
    return df_in

# Weight

In [None]:
sns.boxplot(df['weight'].dropna())

In [None]:
# Change of weight between visits
dif_between_visits(df, 'weight')

<b> Missing

In [None]:
df['weight'].isnull().sum()

<b>Replace (t=np.inf, i.e. only replace NaNs)</b>

In [None]:
df = replace_with_latest(df.copy(), 'weight')

<b> New missing

In [None]:
df['weight'].isnull().sum()

In [None]:
sns.boxplot(df['weight'].dropna())

In [None]:
# Change of weight between visits
dif_between_visits(df, 'weight')

# height

In [None]:
sns.boxplot(df['height'].dropna())

In [None]:
# Change of height between visits per subject
dif_between_visits(df, 'height')

<b> Example

In [None]:
df[df['subjid'] == 'R715006254'][['subjid', 'seq','height']]

<b> Missing

In [None]:
df['height'].isnull().sum()

<b>Replace height (t=1 cm, i.e. height can't change by more than 1 cm between visits)</b>

In [None]:
df = replace_with_latest(df.copy(), 'height', 1)

<b> New missing

In [None]:
df['height'].isnull().sum()

<b> Same example

In [None]:
df[df['subjid'] == 'R715006254'][['subjid', 'seq','height']]

In [None]:
dif_between_visits(df, 'height')

# Recalculate BMI

In [None]:
df['bmi'] = (df['weight'] / (df['height'] / 100)**2).round(1)
df['bmi'].isnull().sum()

In [None]:
mis_patients = df[df['bmi'].isnull()]['subjid'].unique()
print(len(mis_patients))

<b> Drop patients with missing BMI

In [None]:
print(df.shape)
df = df[~df['subjid'].isin(mis_patients)]
print(df.shape)

In [None]:
pieplot(df)

In [None]:
df[['weight', 'height', 'bmi']].boxplot()

#### Dummies in profile

In [None]:
# Encode sex as 0 for the raw value 'm' and 1 for any other recorded value.
# - na_action='ignore' keeps a missing sex missing instead of silently coding it as 1.
# - An already encoded 0 is accepted as well, so re-running this cell does not turn
#   every 0 into 1 (0 != 'm').
df['sex'] = df['sex'].map(lambda val: 0 if val in ('m', 0) else 1, na_action='ignore')

In [None]:
# Skip sex, caghigh, caglow, fhx
cat_cols = ['region', 'race', 'handed', 'hxsid', 'momhd', 'dadhd', 'ccmtr', 'sxsubjm', 'sxfamm', 'sxraterm', 'ccpsyfh',
            'ccdep', 'ccirb', 'ccvab', 'ccapt', 'ccpob', 'ccpsy', 'cccog', 'sxestcfd', 'sxreas', 'sxgs', 'sxest'
           ]

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df.copy())

# Infer variables from previous visit

### Section Medical History (MHx)

In [None]:
cols = ['hxalcab', 'hxtobab', 'hxtobcpd', 'hxtobyos', 'hxpacky', 'hxdrugab', 'hxmar', 'hxmarfrq', 'hxher',
        'hxherfrq', 'hxcoc', 'hxcocfrq', 'hxclb', 'hxclbfrq', 'hxamp', 'hxampfrq', 'hxrit', 'hxritfrq', 'hxhal',
        'hxhalfrq', 'hxinh', 'hxinhfrq', 'hxopi', 'hxopifrq', 'hxpak', 'hxpakfrq', 'hxbar', 'hxbarfrq', 'hxtrq',
        'hxtrqfrq'
       ]
df.head(5)[cols]

In [None]:
print('All values after baseline are missing:', df.loc[(df['seq'] > 1)][cols].isnull().all().all())

In [None]:
df.loc[df['seq'] == 1, ['subjid'] + cols].head()

#### Infer mhx features from first visit

In [None]:
# Copy each subject's baseline (seq == 1) medical-history values onto all of that subject's
# visits: drop the original columns and merge the baseline rows back in on subjid.
# NOTE(review): how='right' keeps only subjects that have a seq == 1 row, and the merge
# gives df a fresh index — confirm both are intended.
df = df.drop(cols, axis=1).merge(df.loc[df['seq'] == 1, ['subjid'] + cols], on='subjid', how='right')
df[['subjid', 'seq'] + cols].head()

#### Set 'hxtobcpd', 'hxtobyos', 'hxpacky' to 0 if hxtobab is unknown or 0

In [None]:
print(df.loc[(df['hxtobab']) != 1, ['hxtobcpd', 'hxtobyos', 'hxpacky']].isnull().all().all())
df.loc[(df['hxtobab']) != 1, ['hxtobcpd', 'hxtobyos', 'hxpacky']] = 0
print(df[['hxtobcpd', 'hxtobyos', 'hxpacky']].isnull().sum())
df.loc[(df['hxtobab']) != 1, ['hxtobcpd', 'hxtobyos', 'hxpacky']].head()

#### Set the specific drug abuse flags to 0 if hxdrugab == 0

In [None]:
cols = ['hxmar', 'hxher', 'hxcoc', 'hxclb', 'hxamp','hxrit', 'hxhal', 'hxinh', 'hxopi', 'hxpak', 'hxbar', 'hxtrq']

df.loc[df['hxdrugab']==0,cols] = 0

df.loc[df['hxdrugab']==0,cols].head()

print(df[cols].isnull().sum())

In [None]:
cols = ['hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq', 'hxritfrq', 'hxhalfrq', 'hxinhfrq', 'hxopifrq',
        'hxpakfrq', 'hxbarfrq', 'hxtrqfrq']

df.loc[((df['hxdrugab']==0) | (df['hxdrugab'].isnull()) ),cols] = 0

df.loc[((df['hxdrugab']==0) | (df['hxdrugab'].isnull()) ),cols].head()

print(df[cols].isnull().sum())

#### Change category features to dummy variables

In [None]:
cat_cols = ['hxalcab', 'hxtobab', 'hxdrugab', 'hxmar', 'hxher',  'hxcoc', 'hxclb', 'hxamp', 'hxrit', 'hxhal', 'hxinh',
            'hxopi',  'hxpak',  'hxbar', 'hxtrq']

others = ['hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq', 'hxritfrq', 'hxhalfrq', 'hxinhfrq', 'hxopifrq',
          'hxpakfrq', 'hxbarfrq', 'hxtrqfrq']

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df.copy())

### Section General Variable Items I (Baseline & Follow-Up)

In [None]:
# Also includes, height, weight and bmi, which are handled seperatly
cols = ['alcab', 'alcunits', 'tobab', 'tobcpd', 'tobyos', 'packy', 'cafab', 'cafpd', 'drugab', 'mar', 'marfrq',
        'her', 'herfrq', 'coc', 'cocfrq', 'clb', 'clbfrq', 'amp', 'ampfrq', 'rit', 'ritfrq', 'hal', 'halfrq',
        'inh', 'inhfrq', 'opi', 'opifrq', 'pak', 'pakfrq', 'bar', 'barfrq', 'trq', 'trqfrq'
       ]

df[['subjid', 'seq'] + cols].head(5)

In [None]:
# Set cafab to 1 if cafpd != nan and cafab == 0 or nan
df.loc[(~df['cafpd'].isnull()), 'cafab'] = 1

#### alcunits are 0 when alcab is zero or nan, use KNN if alcab == 1

In [None]:
df.loc[df['seq']==1,'alcab'].isnull().sum()

In [None]:
df.loc[((df['alcab'] == 0) | (df['alcab'].isnull())) & (df['alcunits'].isnull()), 'alcunits'] = 0

In [None]:
print(df['alcunits'].isnull().sum())

#### tobcpd, tobyos, packy are 0 when tobab is zero or nan, use ML if tobab == 1

In [None]:
# When tobab is 0 or missing, fill the missing tobcpd / tobyos / packy values with 0.
# Values that are already recorded are left untouched.
no_tobacco = df['tobab'].isnull() | (df['tobab'] == 0)
for tobacco_col in ('tobcpd', 'tobyos', 'packy'):
    df.loc[no_tobacco & df[tobacco_col].isnull(), tobacco_col] = 0

In [None]:
print(df['tobcpd'].isnull().sum())
print(df['tobyos'].isnull().sum())
print(df['packy'].isnull().sum())

#### If does not abuse caffeine then number of coffees is always zero


In [None]:
df.loc[df['cafab']==0,'cafpd'] = 0

#### Set drugs to 0 if drugab = 0

In [None]:
d_cols = ['mar', 'her', 'coc', 'clb', 'amp', 'rit', 'hal', 'inh', 'opi', 'pak', 'bar', 'trq']
print('All drugs are nan if drugab = 0:', df.loc[df['drugab'] == 0, d_cols].isnull().all().all())

# Set drugs to 0 if drugab = 0
df.loc[df['drugab'] == 0, d_cols] = 0
print('All drugs are 0 if drugab = 0:', (df.loc[df['drugab'] == 0, d_cols] == 0).all().all())

In [None]:
cols = ['marfrq', 'herfrq', 'cocfrq', 'clbfrq', 'ampfrq', 'ritfrq', 'halfrq', 'inhfrq', 'opifrq',
        'pakfrq', 'barfrq', 'trqfrq']

df.loc[((df['drugab']==0) | (df['drugab'].isnull()) ),cols] = 0
print(df[cols].isnull().sum())

#### make dummies of categorical features

In [None]:
# dummy rest
cat_cols =  ['alcab', 'tobab', 'cafab', 'cafpd', 'drugab', 'mar', 'her', 'coc', 'clb', 'amp', 'rit', 'hal', 'inh',
             'opi', 'pak', 'bar', 'trq']

others = ['marfrq', 'herfrq', 'cocfrq', 'clbfrq', 'ampfrq', 'ritfrq', 'halfrq',
          'inhfrq', 'opifrq', 'pakfrq', 'barfrq', 'trqfrq']

df = df.drop(['opifrq', 'barfrq', 'ritfrq', 'trqfrq', 'herfrq', 'inhfrq', 'pakfrq'], axis=1)

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df)

### Section General Variable Items II (Baseline & Follow-up)

In [None]:
cols = ['updsc', 'hdcat', 'maristat', 'res', 'isced', 'jobclas', 'jobpaid',
        'rdcwk', 'rdcwkd', 'rdcwkhw', # Baseline
        'emplnrsn', 'emplnrd', 'ssdb', 'rtrnwk', 'rtrddur']
df[['subjid', 'seq'] + cols].head(5)

##### One instance says it is not updated but it is

In [None]:
# Visits flagged as not updated (updsc == 0) that still carry at least one value in the
# remaining section columns (cols[2:]).
# `axis` is passed by keyword: the positional form any(1) raises a TypeError on pandas >= 2.0.
not_updated = df[df['updsc'] == 0]
not_updated.loc[not_updated[cols[2:]].notnull().any(axis=1), ['subjid', 'seq']]

In [None]:
df[df['subjid'] == 'R012716790'][cols]

In [None]:
df.loc[(df['subjid'] == 'R012716790') & (df['seq'] == 2), 'updsc'] = 1
df[df['subjid'] == 'R012716790'][cols]

##### Some updsc are null while all values are 0

In [None]:
cond = (df['updsc'].isnull()) & (df['seq'] != 1)
df[cond][['subjid', 'seq'] + cols]

In [None]:
# Set to updsc to 0 if values miss
df.loc[(df['updsc'].isnull()) & (df['seq'] != 1),'updsc'] = 0
df[cond][cols]

#### All first visits are None

In [None]:
print('Updsc only misses on first visit:', df.loc[df['seq']==1,'updsc'].isnull().sum() == df['updsc'].isnull().sum())
print('If no update all features are nan:', df[df['updsc'] == 0][cols[2:]].isnull().all().all())
print('If visit updates (updsc) is true, at least one feature is not nan:', (~df[df['updsc'] == 1][cols[2:]].isnull()).any().any())

#### 3 cases someone is not employed but does get paid

In [None]:
df.loc[(df['jobclas'] == 4) & (~df['jobpaid'].isnull())][['subjid', 'seq'] + cols]

In [None]:
# Set to nan
df.loc[(df['jobclas'] == 4) & (~df['jobpaid'].isnull()), ['jobpaid']] = np.nan
df.loc[(df['jobclas'] == 4), 'jobpaid'].isnull().all()

#### Replace current values using latest visit (forward fill)

In [None]:
cols = ['maristat', 'res', 'isced', 'jobclas', 'jobpaid', 
        'rdcwk', 'rdcwkd', 'rdcwkhw',
        'emplnrsn', 'emplnrd', 'ssdb', 'rtrnwk', 'rtrddur']
df[['subjid', 'seq', 'updsc', 'hdcat'] + cols].head(10)

In [None]:
# Set the remaining missing updsc values to 1.
# The result is assigned back instead of using fillna(inplace=True) on the column selection:
# with pandas Copy-on-Write the in-place call works on a temporary and leaves df unchanged.
df['updsc'] = df['updsc'].fillna(1)

In [None]:
# Forward fill every column in cols within each subject, in the current row order of df.
# GroupBy.ffill() replaces groupby().fillna(method='ffill'), which is deprecated in pandas 2.x.
df.loc[:, cols] = df.groupby('subjid')[cols].ffill()

In [None]:
df[['subjid', 'seq', 'updsc', 'hdcat'] + cols].head(10)

### rtrddur

In [None]:
# Keep rtrddur only on visits where emplnrsn equals 2; every other visit
# (including visits where emplnrsn is missing) is set to NaN.
# NOTE(review): the following cells treat emplnrsn == 2 as "retired" and rtrddur as a
# retirement age — confirm against the data dictionary.
df.loc[df['emplnrsn'] != 2, 'rtrddur'] = np.nan

In [None]:
# Keep largest rtrddur per patient
df['rtrddur'] = df.groupby('subjid')['rtrddur'].transform('max')

In [None]:
print('Max retirement ages per patient:', (df.groupby('subjid').rtrddur.nunique()).max())

In [None]:
# Subject ids of every visit with emplnrsn == 2 (one entry per visit, so ids can repeat)
retired_rows = df[df['emplnrsn'] == 2]
retired_subj = retired_rows.subjid.values

# Largest rtrddur per subject, computed on the emplnrsn == 2 visits only
retired_max = retired_rows.groupby('subjid').rtrddur.max()
print('Missing while retired:', retired_max.isnull().sum(),
      '/', retired_max.shape[0])

# Same statistic for the subjects that never have emplnrsn == 2
other_max = df[~df['subjid'].isin(retired_subj)].groupby('subjid').rtrddur.max()
print('Missing while not retired:', other_max.isnull().sum(),
      '/', other_max.shape[0])

print(df.subjid.nunique())

In [None]:
# Subjects in retired_subj: rtrddur becomes age - rtrddur, with 1 added to every
# non-negative difference. Negative and missing differences are kept as they are.
is_retired = df['subjid'].isin(retired_subj)
age_gap = df.loc[is_retired, 'age'] - df.loc[is_retired, 'rtrddur']
df.loc[is_retired, 'rtrddur'] = age_gap.mask(age_gap >= 0, age_gap + 1)

# All other subjects get 0
df.loc[~is_retired, 'rtrddur'] = 0

In [None]:
# Original retirement age stats
(df.age - df.rtrddur).describe()

In [None]:
# Retirement duration
plt.boxplot(df.groupby('subjid').rtrddur.max().dropna())

#### Drop rdcwkd and rdcwkhw (cannot be imputed logically)

In [None]:
df = df.drop(['rdcwkd', 'rdcwkhw'], axis=1)

#### Change categorical features to dummies

In [None]:
cat_cols = ['maristat', 'res', 'isced', 'jobclas', 'jobpaid', 
            'rdcwk', 'emplnrsn', 'emplnrd', 'ssdb', 'rtrnwk']

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df)

# Motscore

In [None]:
df = df.drop('miscore', axis=1)

In [None]:
pieplot(df)

# Fascore

In [None]:
df = df.drop('fiscore', axis=1)

In [None]:
pieplot(df)

# Form UHDRS Total Functional Capacity (TFC)

In [None]:
# All cols are categorical

cols = ['occupatn', 'finances', 'chores', 'adl', 'carelevl']

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
print(df['tfcscore'].isnull().sum(), 'visits miss the tfcscore')
print('When tfcscore misses then all features are missing:',
      df.loc[(df['tfcscore'].isnull()), cols].isnull().all().all())
print('When tfcscore misses at least', df.loc[(df['tfcscore'].isnull()), cols].isnull().sum(1).min(),
      'feature has to miss')
print('When tfcscore misses a max of', df.loc[(df['tfcscore'].isnull()), cols].isnull().sum(1).max(),
      'features are missing')

### Cognitive Assessments (Cognitive)

In [None]:
cols = ['gen1', 'gen2', 'gen3', 'gen4', 'gen5', 'gen6', 'sdmt', 'sdmt1', 'sdmt2', 'sdmtnd', 'verfct', 'verfctd',
        'verfct5', 'verfct6', 'verfct7', 'verfctnd', 'scnt', 'scnt1', 'scnt2', 'scnt3','scntnd', 'swrt', 'swrt1',
        'swrt2', 'swrt3', 'swrtnd', 'sit', 'sit1', 'sit2', 'sit3', 'trl', 'trla1', 'trla2', 'trla3', 'trlb1',
        'trlb2', 'trlb3', 'verflt', 'verflt05', 'verflt06', 'verflt07'
       ]

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

#### gen1 is nan, but associated values are filled in

In [None]:
df[(df['gen1'].isnull()) & (~df['gen2'].isnull())][cols[:6]]

In [None]:
df.loc[(df['gen1'].isnull()) & (~df['gen2'].isnull()), 'gen1'] = 1

#### sdmt == 0 but associated values are filled in

In [None]:
df.loc[(df['sdmt']==0) & 
       ( (~df['sdmt1'].isnull()) | (~df['sdmt2'].isnull()) )][['sdmt', 'sdmt1', 'sdmt2', 'sdmtnd']]

In [None]:
df.loc[(df['sdmt']==0) & 
       ( (~df['sdmt1'].isnull()) | (~df['sdmt2'].isnull()) ), 'sdmt'] = 1

#### verfct == 0 but associated values are filled in

In [None]:
df.loc[(df['verfct']==0) & (~df['verfctd'].isnull()), ['verfct', 'verfctd']]

In [None]:
df.loc[(df['verfct']==0) & (~df['verfctd'].isnull()), 'verfct'] = 1

#### Drop

In [None]:
df = df.drop(['gen1', 'gen2', 'gen3', 'gen4', 'gen5', 'gen6'], axis=1)

#### Make dummies

In [None]:
cat_cols = ['sdmt', 'sdmtnd', 'verfct', 'verfctd', 'verfctnd', 'scnt', 'scntnd', 'swrt', 'swrtnd', 'sit',
            'trl', 'verflt']


dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df.copy())

### Mini Mental State Examination (MMSE)

In [None]:
# >>> Knn <<< or remove feature or remove rows
df['mmsetotal'].isnull().sum()

In [None]:
df['mmsetotal'].isnull().sum() / df.shape[0]

In [None]:
pieplot(df)

### Physiotherapy Outcomes Measures (Physiotherapy)

In [None]:
cols = ['tug', 'tug1', 'scst', 'scst1']

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
df.tug1.isnull().sum() / df.shape[0]

In [None]:
df.scst1.isnull().sum() / df.shape[0]

#### One scst is 0, however the time is filled in as zero

In [None]:
df.loc[ (df['scst'] == 0) & (~df['scst1'].isnull()), ['scst', 'scst1'] ]

In [None]:
df.loc[ (df['scst'] == 0) & (~df['scst1'].isnull()), ['scst1'] ] = np.nan

#### Make scst and tug dummies

In [None]:
cat_cols = ['tug', 'scst']

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df)

### Problem Behaviours Assessment (PBA‐s)

In [None]:
cols = ['depscore', 'irascore', 'psyscore', 'aptscore', 'exfscore', 'pbas1sv', 'pbas1fr', 'pbas1wo', 'pbas2sv',
        'pbas2fr', 'pbas2wo', 'pbas3sv', 'pbas3fr', 'pbas3wo', 'pbas4sv', 'pbas4fr', 'pbas4wo', 'pbas5sv',
        'pbas5fr', 'pbas5wo', 'pbas6sv', 'pbas6fr', 'pbas6wo', 'pbas7sv', 'pbas7fr', 'pbas7wo', 'pbas8sv',
        'pbas8fr', 'pbas8wo', 'pbas9sv', 'pbas9fr', 'pbas9wo', 'pbas10sv', 'pbas10sm__1', 'pbas10sm__2',
        'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5', 'pbas10fr', 'pbas10wo', 'pbas11sv', 'pbas11fr', 'pbas11wo',
        'pbainfo', 'pbahshd'
       ]

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
print((~df['exfscore'].isnull()).sum())
(df['exfscore'] == ((df['pbas7sv'] * df['pbas7fr']) + (df['pbas8sv'] * df['pbas8fr']) +
                    (df['pbas11sv'] * df['pbas11fr']))).sum()

In [None]:
cols = ['depscore', 'irascore', 'psyscore', 'aptscore', 'exfscore', 'pbas1sv', 'pbas1fr',  'pbas2sv',
        'pbas2fr',  'pbas3sv', 'pbas3fr',  'pbas4sv', 'pbas4fr','pbas5sv',
        'pbas5fr', 'pbas6sv', 'pbas6fr', 'pbas7sv', 'pbas7fr', 'pbas8sv',
        'pbas8fr', 'pbas9sv', 'pbas9fr', 'pbas10sv', 'pbas10fr', 'pbas11sv', 'pbas11fr']

plt.figure(figsize=(15,5))
sns.heatmap(df[cols].corr().iloc[:5,5:], annot=True, square=True)
plt.xticks(rotation=45)
pass

#### Set pbas10sm to 0 if missing

In [None]:
df.loc[:,['pbas10sm__1', 'pbas10sm__2', 'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5']].isnull().sum()

In [None]:
# Replace nan with 0's
df.loc[:,['pbas10sm__1', 'pbas10sm__2', 'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5']
      ] = df[['pbas10sm__1', 'pbas10sm__2', 'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5']].replace(np.nan, 0)

df.loc[:,['pbas10sm__1', 'pbas10sm__2', 'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5']].head()

#### Worst

In [None]:
cols = ['pbas1wo', 'pbas2wo', 'pbas3wo', 'pbas4wo', 'pbas5wo', 'pbas6wo', 'pbas7wo', 'pbas8wo', 'pbas9wo',
        'pbas10wo', 'pbas11wo']

# Total number of missing values in cols for each of the visit numbers 1 through 5
for visit_nr in range(1, 6):
    print( df.loc[df['seq'] == visit_nr, cols].isnull().sum().sum() )

In [None]:
df = df.drop(['pbas1wo', 'pbas2wo', 'pbas3wo', 'pbas4wo', 'pbas5wo', 'pbas6wo',
              'pbas7wo', 'pbas8wo', 'pbas9wo', 'pbas10wo', 'pbas11wo'], axis=1)

#### Set dummies

In [None]:
# NOTE(review): cat_cols is not used in this cell, and the '...wo' columns it lists were
# already dropped from df in the previous cell. Only the columns in `others` are encoded below.
cat_cols = ['pbas1sv', 'pbas1fr', 'pbas1wo', 'pbas2sv', 'pbas2fr', 'pbas2wo', 'pbas3sv', 'pbas3fr', 'pbas3wo',
            'pbas4sv', 'pbas4fr', 'pbas4wo', 'pbas5sv', 'pbas5fr', 'pbas5wo', 'pbas6sv', 'pbas6fr', 'pbas6wo',
            'pbas7sv', 'pbas7fr', 'pbas7wo', 'pbas8sv', 'pbas8fr', 'pbas8wo', 'pbas9sv', 'pbas9fr', 'pbas9wo',
            'pbas10sv', 'pbas10fr', 'pbas10sm__1', 'pbas10sm__2', 'pbas10sm__3', 'pbas10sm__4', 'pbas10sm__5',
            'pbas10wo', 'pbas11sv', 'pbas11fr', 'pbas11wo', 'pbainfo', 'pbahshd'
       ]

# The two columns that are actually one-hot encoded
others = ['pbainfo', 'pbahshd']

# One indicator column per category; the original columns are replaced by the indicators
dummies = pd.get_dummies(df[others].astype('category'), dtype=int)
dummies.head()
df = df.drop(others, axis=1).join(dummies)
df.head()

In [None]:
pieplot(df)

### Short Form Health Survey – 12v2 (SF‐12)

In [None]:
cols = ['scoring', 'pf', 'rp', 'bp', 'gh', 'vt', 'sf', 're', 'mh', 'pcs', 'mcs']

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
# Compare the visits where every column in cols[1:] is missing with the 'scoring' column
score = 'scoring'
cond = df[cols[1:]].isnull().all(axis=1)
n_all_missing = cond.sum()

print('Length condition:', n_all_missing)
print('No missing', score, ':', df[score].isnull().sum())
print('Condition accossiated with', score, ':', df.loc[cond, score].isnull().sum(), '/', n_all_missing)

# Visits where all of those columns are missing although 'scoring' itself is filled in
df.loc[cond & df[score].notnull(), cols].head()

#### Dummies

In [None]:
cat_cols = ['scoring']

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)

In [None]:
pieplot(df)

### Hospital Anxiety and Depression Scale Snaith Irritability Scale (HADS‐SIS)

In [None]:
# associated with barfrq / trqfrq (drop/knn)
cols = ['anxscore', 'hads_depscore', 'irrscore', 'outscore', 'inwscore']

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
pieplot(df)

### Work Productivity and Activity Impairment‐Specific Health Problem Questionnaire (WPAI‐SHP)

In [None]:
# Many missing, drop all, maybe keep wpairscr4 (Knn)
cols = ['wpaiscr1', 'wpaiscr2', 'wpaiscr3', 'wpaiscr4']

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

In [None]:
# Drop all four WPAI-SHP scores, wpaiscr4 included.
# NOTE(review): the note in the cell above considers keeping wpaiscr4 for KNN imputation,
# but this line removes it as well — confirm that this is intended.
df = df.drop(['wpaiscr1', 'wpaiscr2', 'wpaiscr3', 'wpaiscr4'], axis=1)

In [None]:
pieplot(df)

### C‐SSRS (Baseline / Follow up)

In [None]:
# baseline does not include sbh7
# Follow up does not include 'attmpt1dy', 'attmpt11', 'attmpt3dy', 'attmpt31', 'attmpt32'
# Drop 'attmpt1dy', 'attmpt11', 'attmpt12', 'attmpt2dy', 'attmpt21', 'attmpt22', 'attmpt3dy', 'attmpt31', 'attmpt32'

cols = ['sid1', 'sid2', 'sid3', 'sid4', 'sid5', 'int1', 'int2', 'int3', 'int4', 'int5', 'int6', 'sbh1', 'sbh1n',
        'sbh2', 'sbh3', 'sbh3n', 'sbh4', 'sbh4n', 'sbh5', 'sbh6', 'sbh7'
       ]

# Number of visits with at least one missing value in cols, and whether every column has a gap.
# axis=1 is passed by keyword: DataFrame.any() no longer accepts it positionally on pandas >= 2.0.
print(df[cols].isnull().any(axis=1).sum(), 'visits where any of the features miss')
print('All features have at least one missing:', df[cols].isnull().any().all())

#### Set sbh1n to 0 if sbh1 != 1

In [None]:
df.loc[(df['sbh1']!=1) & (df['sbh1n'].isnull()),'sbh1n'] = 0

#### Set sbh3n to 0 if sbh3 != 1

In [None]:
df.loc[(df['sbh3']!=1) & (df['sbh3n'].isnull()),'sbh3n'] = 0

#### Set sbh4n to 0 if sbh4 != 1

In [None]:
df.loc[(df['sbh4']!=1) & (df['sbh4n'].isnull()),'sbh4n'] = 0

#### Dropping useless columns

In [None]:
# Drop the attempt detail columns listed below.
# 'attmpt1dy' and 'attmpt2dy' are commented out of the list, so they stay in df.
df = df.drop(['attmpt11', 'attmpt12', 'attmpt21', 'attmpt22', 'attmpt3dy', 'attmpt31',
              'attmpt32'], axis=1) # not dropped: 'attmpt1dy', 'attmpt2dy'

#### Make categorical cols

In [None]:
cat_cols = ['sid1', 'sid2', 'sid3', 'sid4', 'sid5', 'int1', 'int2', 'int3', 'int4', 'int5', 'int6', 'sbh1',
            'sbh2', 'sbh3', 'sbh4', 'sbh5', 'sbh6', 'sbh7']

dummies = pd.get_dummies(df[cat_cols].astype('category'), dtype=int)
dummies.head()
df = df.drop(cat_cols, axis=1).join(dummies)
df.head()

In [None]:
df.shape

In [None]:
pieplot(df)

# Capscore

In [None]:
# capscore = age * (caghigh - L) / K
# NOTE(review): this looks like a CAG-Age-Product (CAP) style score with constants
# L = 30 and K = 6.27 — confirm the source of these constants.
L = 30
K = 6.27
df['capscore'] = df['age'] * (df['caghigh'] - L) / K
# Box plot of the resulting score over all visits
plt.boxplot(df['capscore'])

# Missing feature

In [None]:
# Columns that still contain at least one missing value: shape of that sub-frame and its names
has_nan = df.isnull().sum() > 0
incomplete = df.loc[:, has_nan]
print(incomplete.shape)
print(incomplete.columns)

# Missing

In [None]:
df['subjid'].nunique()

In [None]:
print('Columns with nan values:', df.isnull().any().sum())

In [None]:
print(df.isnull().sum().replace(0, np.nan).dropna().sort_values() / df.shape[0])

# Save filtered Dataset

In [None]:
df.to_csv(join('data', 'filtered_pre_and_manifest.csv'), index=False)