In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict, Counter
warnings.filterwarnings("ignore")

## Load dataset

In [2]:
# UDS: baseline (first) visit, features already subset to ~90 columns.
uds = pd.read_csv("../data/investigator_uds_baseline.csv")
print("UDS shape:  ", uds.shape)

# MRI: read the raw export, then drop every column named in
# `mri_features_removed`; the remaining columns keep their original order.
mri_raw = pd.read_csv('../data/investigator_mri_nacc57.csv')
mri_features_removed = [
    'NACCDICO', 'NACCNIFT', 'NACCMRFI', 'NACCNMRI', 'NACCMNUM',
    'NACCMRSA', 'MRIMANU', 'MRIMODL', 'NACCMRIA', 'NACCMRDY',
    'MRIT1', 'MRIT2', 'MRIDTI', 'MRIDWI', 'MRIFLAIR',
    'MRIOTHER', 'MRIFIELD', 'NACCMVOL', 'NACCADC',
]
# errors="ignore": a listed name that is absent from the file is simply skipped.
mri = mri_raw.drop(columns=mri_features_removed, errors="ignore")
print("MRI shape:  ", mri.shape)

# CSF
csf = pd.read_csv("../data/investigator_fcsf_nacc57.csv")
print("CSF shape:  ", csf.shape)

# Feature dictionaries for the UDS and MRI datasets.
uds_dict = pd.read_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv")
mri_dict = pd.read_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv")

UDS shape:   (45100, 94)
MRI shape:   (11273, 172)
CSF shape:   (3017, 23)


## Check missing rate
Check handbook for missing data encoding
- MRI missing is all due to data not available / not applicable
- But UDS missing values might be due to multiple reasons:
 - EDUC: 99 (Unknown)
 - CDF: -4 (Not available)
 - GDS:  9 (Did not answer); -4 (Not available); 88 (Could not be calculated)
 - FAS, NPI: 8 (Not applicable), 9 (Unknown), -4 (Not available)
 - NEURO: 95/995 (Physical problem), 96/996 (Cognitive/behavior problem), 97/997 (Other problem), 98/998 (Verbal Refusal), 99 (Unknown), 88 (Score not calculated; missing at least one MMSE item), -4 (Not available)

In [3]:
def mask_uds_na(df, feature_dict=None):
    """Return a copy of a UDS frame with missing-value codes replaced by NaN.

    Which codes are blanked depends on the variable's ``Category`` in the
    feature dictionary (``EDUC`` is handled by name):

    - ``EDUC``: 99
    - CDF: -4
    - GDS: -4 and 9, except ``NACCGDS`` which uses -4 and 88
    - FAS, NPI: 8, 9 and -4
    - NEURO: 88, 95-99 and -4, except ``TRAILA``/``TRAILB`` which use
      995-998 and -4

    Parameters
    ----------
    df : pandas.DataFrame
        UDS data. Must contain ``EDUC`` and every variable the dictionary
        lists under the categories above.
    feature_dict : pandas.DataFrame, optional
        Dictionary with ``VariableName`` and ``Category`` columns. When
        omitted, the notebook-level ``uds_dict`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is absent from ``df``.
    """
    if feature_dict is None:
        # Fall back to the dictionary loaded at the top of the notebook.
        feature_dict = uds_dict

    # Work on a copy of the *argument* (not the global `uds`) so the function
    # cleans whatever frame it is given and re-running the cell is idempotent.
    cleaned = df.copy()

    def _variables(category):
        """Names of the variables the dictionary files under `category`."""
        in_category = feature_dict['Category'] == category
        return feature_dict.loc[in_category, 'VariableName']

    def _blank(column, codes):
        """Replace every occurrence of `codes` in `column` with NaN."""
        cleaned[column] = cleaned[column].mask(cleaned[column].isin(codes), np.nan)

    _blank('EDUC', [99])

    for var in _variables('CDF'):
        _blank(var, [-4])

    for var in _variables('GDS'):
        # The total score NACCGDS uses 88 where the individual items use 9.
        _blank(var, [-4, 88] if var == 'NACCGDS' else [-4, 9])

    for category in ('FAS', 'NPI'):
        for var in _variables(category):
            _blank(var, [8, 9, -4])

    for var in _variables('NEURO'):
        if var in ('TRAILA', 'TRAILB'):
            # Only the three-digit codes are blanked here; two-digit values
            # are kept (presumably valid scores -- confirm against handbook).
            _blank(var, [995, 996, 997, 998, -4])
        else:
            _blank(var, [88, 95, 96, 97, 98, 99, -4])

    return cleaned

def mask_mri_na(df):
    """Return a copy of `df` in which MRI ROI missing-value codes are NaN.

    Any cell that compares equal to one of the sentinel values below (see
    Handbook) is replaced; all other cells are carried over unchanged. The
    input frame is not modified.
    """
    sentinel_codes = (
        8.8888, 88.8888, 888.8888, 8888.888, 8888.8888,
        9.999, 99.9999, 999.9999, 9999.999, 9999.9999,
    )

    # Collect hits for every sentinel into one boolean frame, then blank once.
    is_sentinel = df == sentinel_codes[0]
    for sentinel in sentinel_codes[1:]:
        is_sentinel = is_sentinel | (df == sentinel)

    return df.mask(is_sentinel, np.nan)

def merge_missing_rate(df, df_dict):
    """Attach each variable's fraction of missing values to its dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame whose columns are the variables to summarise.
    df_dict : pandas.DataFrame
        Feature dictionary with a ``VariableName`` column.

    Returns
    -------
    pandas.DataFrame
        A new dictionary frame with exactly one ``MissingRates`` column (placed
        last), holding ``df[var].isna().mean()`` for every variable found in
        ``df`` and NaN for dictionary rows with no matching column.
    """
    missing_rates = df.isna().mean(axis=0).to_frame().reset_index()
    missing_rates.columns = ['VariableName', 'MissingRates']

    # A dictionary that already carries MissingRates (e.g. reloaded from a CSV
    # saved by an earlier run) must be refreshed rather than returned as-is,
    # otherwise the rates go stale. Dropping the old column first also keeps
    # the merge from producing MissingRates_x / MissingRates_y.
    without_rates = df_dict.drop(columns='MissingRates', errors='ignore')
    return without_rates.merge(missing_rates, on='VariableName', how='left')


In [4]:
# Stage 1: recode missing-value sentinels as NaN in both datasets.
uds = mask_uds_na(uds)
mri = mask_mri_na(mri)

# Stage 2: add the per-variable missing rate to each feature dictionary.
uds_dict = merge_missing_rate(uds, uds_dict)
mri_dict = merge_missing_rate(mri, mri_dict)

# Stage 3: persist the dictionaries. NOTE: these are the same paths the
# dictionaries were loaded from, so the files on disk are overwritten.
uds_dict.to_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv", index=False)
mri_dict.to_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv", index=False)

# Missing Values Imputation
TO BE CONTINUED