# Feature Dictionary

This notebook creates a feature mapping dictionary for the UDS and MRI tables. Note that the UDS table contains 1,000+ features, but we pre-selected ~90 of them and focus only on those. In addition to the default feature descriptions from the handbooks, we include the feature categories for future reference.

Below are the reference links for the feature handbooks:
- UDS: https://files.alz.washington.edu/documentation/rdd-np.pdf
- MRI: https://files.alz.washington.edu/documentation/rdd-imaging.pdf
- CSF: https://files.alz.washington.edu/documentation/biomarker-ee2-csf-ded.pdf

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")

## UDS Feature Mapping

In [2]:
# Full investigator table; currently not loaded:
# uds = pd.read_csv("../data/investigator_nacc57.csv")

# Baseline UDS extract; its columns define the features of interest below.
uds_sub = pd.read_csv(
    "../data/investigator_uds_baseline.csv",
)

# UDS feature dictionary (one row per variable, keyed by 'VariableName').
data_dictionary = pd.read_csv(
    "../data/data_dictionary/uds3-rdd.csv",
    encoding='unicode_escape',
)

In [3]:
# Pre-defined variable categories: each list holds the UDS column names that
# belong to one category of the cleaned feature dictionary.
demo_var = ['NACCID','NACCADC','NACCAGE','NACCVNUM','EDUC','SEX','VISITDAY','VISITMO','VISITYR','NACCAPOE','NACCUDSD','NACCALZP']
cds_var = ['MEMORY','ORIENT','JUDGMENT','COMMUN','HOMEHOBB','PERSCARE','CDRSUM','CDRGLOB','COMPORT','CDRLANG']
gds_var = ['SATIS','DROPACT','EMPTY','BORED','SPIRITS','AFRAID','HAPPY','HELPLESS','STAYHOME',
            'MEMPROB','WONDRFUL','WRTHLESS','ENERGY','HOPELESS','BETTER','NACCGDS']
fas_var = ['BILLS','TAXES','SHOPPING','GAMES','STOVE','MEALPREP','EVENTS','PAYATTN','REMDATES','TRAVEL']
npi_var = ['DEL','DELSEV','HALL','HALLSEV','AGIT','AGITSEV','DEPD','DEPDSEV','ANX','ANXSEV','ELAT',
            'ELATSEV','APA','APASEV','DISN','DISNSEV','IRR','IRRSEV','MOT','MOTSEV','NITE','NITESEV','APP','APPSEV']

beh_data = cds_var + gds_var + fas_var + npi_var

# Every remaining column of the baseline table (except 'datetime') is treated
# as a neuropsych feature. sorted() makes the order reproducible between runs;
# a bare list(set(...)) does not guarantee any particular order.
neuropsych_feat = sorted(set(uds_sub.columns) - set(demo_var + beh_data) - {'datetime'})
## Neuropsych_feat was previously loaded from neuropsych_var.csv; the leftover columns are used instead
# neuropsych_feat = pd.read_csv('neuropsych_var.csv') # List of features we want to investigate in this category
# neuropsych_feat = neuropsych_feat['Neuropsych Features'].values

# UDS features of interest
uds_feat = np.concatenate((demo_var, beh_data, neuropsych_feat))


##################################
## Start parsing feature maps
##################################

# NOTE(review): the label paired with cds_var is 'cdf' (written as 'CDF' in the
# output file). It looks like a typo, but it is kept unchanged because
# consumers of the CSV may filter on it -- confirm before renaming.
category_frames = []
for var, var_type in zip([demo_var, cds_var, gds_var, fas_var, npi_var, neuropsych_feat],
                         ['demo', 'cdf', 'gds', 'fas', 'npi', 'neuro']):
    category = var_type.upper()
    df_temp = data_dictionary[data_dictionary['VariableName'].isin(var)].reset_index(drop=True).copy()
    df_temp['Category'] = category

    missing = set(var) - set(df_temp['VariableName'])
    if missing:
        print("Warning: {} (Type: {}) is not in the variable dictionary!".format(missing,  var_type))
        # Add one placeholder row per missing variable. Only the name and the
        # category are known; concat fills every other column with NaN, so this
        # does not depend on how many columns the dictionary CSV has.
        placeholder = pd.DataFrame({'VariableName': sorted(missing), 'Category': category})
        df_temp = pd.concat([df_temp, placeholder], axis=0, ignore_index=True)
    category_frames.append(df_temp)

# Concatenate once instead of growing the frame inside the loop.
uds_var_dict = pd.concat(category_frames, axis=0)
uds_var_dict = uds_var_dict[['Category', 'VariableName', 'VariableType', 'ShortDescriptor',
                             'DataType', 'AllowableCodes']].reset_index(drop=True)
uds_var_dict.to_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv", index=False)



## MRI feature dictionary

In [4]:
# MRI data dictionary export; parsed row by row in the next cell.
mri_dict = pd.read_csv(
    "../data/data_dictionary/mri-dictionary.csv",
    encoding='unicode_escape',
)

# MRI table; its columns decide which dictionary entries are kept.
mri_raw = pd.read_csv(
    '../data/investigator_mri_nacc57.csv',
)

In [5]:
#############################################################################
## Parse Variable name, category, ShortDescriptor, data_type, and DataSource
#############################################################################
# Row index at which the per-variable detail blocks start in this export of
# mri-dictionary.csv; rows before it are read as the summary tables.
# This is specific to the current file -- re-check it if the CSV is regenerated.
DETAIL_START_ROW = 228

# Raw string: "\." inside a normal string literal is an invalid escape sequence.
section_pattern = re.compile(r"Section .*[:\.](.*)")

section_frames = []
first_row = 0
# Text after the first ':' in the very first cell; not used further in this cell.
data_type = mri_dict.iloc[0, 0].split(':')[1].strip()

# NOTE(review): the rows between two "Section" rows are labelled with the
# category parsed from the header found at row i, i.e. the header that FOLLOWS
# them. Confirm this matches the layout of the CSV.
for i in range(1, DETAIL_START_ROW):
    if "Section" in str(mri_dict.iloc[i, 0]):
        category = section_pattern.findall(mri_dict.iloc[i, 0])[0].strip()
        df_temp = mri_dict.iloc[first_row+1:i, 1:].copy()
        df_temp['Category'] = category
        section_frames.append(df_temp)
        first_row = i
# Trailing block after the last "Section" row.
# NOTE(review): this slice starts AT the header row and stops before the last
# row visited by the loop (DETAIL_START_ROW - 1); kept as-is -- confirm that
# row carries no variable.
df_temp = mri_dict.iloc[first_row:i, 1:].copy()
df_temp['Category'] = category
section_frames.append(df_temp)

mri_dict_master = pd.concat(section_frames, axis=0)
mri_dict_master.columns = ["VariableName", "ShortDescriptor", "DataType", "DataSource", "Category"]
# Drop the repeated table-header rows and any row with a missing field.
mri_dict_master = mri_dict_master[mri_dict_master['VariableName'] != "Variable name"].dropna().reset_index(drop=True)
mri_dict_master = mri_dict_master[["VariableName", "Category", "ShortDescriptor", "DataType", "DataSource"]]

########################################
## Parse missing code and allowable code
########################################
n_rows = mri_dict.shape[0]


def _read_code_block(start):
    """Collect the code text that starts at row `start` plus its continuation rows.

    A continuation row is a following row whose label cell (column 1) is empty.
    'Ð' is replaced by ' - ' in every line (presumably a mis-decoded dash).
    Returns the joined text and the index of the first row after the block.
    """
    lines = [mri_dict.iloc[start, 2].replace('Ð', ' - ')]
    row = start + 1
    # Bounds check first: the block may run to the very end of the file.
    while row < n_rows and pd.isna(mri_dict.iloc[row, 1]):
        text = mri_dict.iloc[row, 2]
        if pd.notna(text):  # skip fully blank rows instead of failing on them
            lines.append(text.replace('Ð', ' - '))
        row += 1
    return "\n".join(lines), row


mri_feature_dict = defaultdict(dict)
i = DETAIL_START_ROW + 1
var_name = mri_dict.iloc[DETAIL_START_ROW, 2]
allowable_code, MissingCodes = None, None

while i < n_rows:
    row_label = mri_dict.iloc[i, 1]
    if row_label == "Variable name":
        # A new variable starts: store what was collected for the previous one.
        mri_feature_dict[var_name]['MissingCodes'] = MissingCodes
        mri_feature_dict[var_name]['AllowableCodes'] = allowable_code
        var_name = mri_dict.iloc[i, 2]
        allowable_code, MissingCodes = None, None
        i += 1
    elif row_label == "Missing codes":
        MissingCodes, i = _read_code_block(i)
    elif row_label == "Allowable codes":
        allowable_code, i = _read_code_block(i)
    else:
        i += 1
# Store the final variable as well; no "Variable name" row follows it, so the
# loop above never writes its codes.
mri_feature_dict[var_name]['MissingCodes'] = MissingCodes
mri_feature_dict[var_name]['AllowableCodes'] = allowable_code

mri_feature_df = pd.DataFrame.from_dict(mri_feature_dict, orient='index').reset_index()
mri_feature_df.columns = ['VariableName', 'MissingCodes', 'AllowableCodes']

########################################
## Combine two parser
########################################
mri_dict_master = mri_dict_master.merge(mri_feature_df, on = 'VariableName', how='left')

################################################################################
## Subset Mri features to selected ones, We do not need these variables
################################################################################
mri_features_removed = ['NACCDICO','NACCNIFT','NACCMRFI','NACCNMRI','NACCMNUM','NACCMRSA','MRIMANU',
                    'MRIMODL','NACCMRIA', 'NACCMRDY', 'MRIT1', 'MRIT2', 'MRIDTI','MRIDWI', 'MRIFLAIR',
                    'MRIOTHER', 'MRIFIELD', 'NACCMVOL', 'NACCADC']
mri_sub = mri_raw.loc[:, ~mri_raw.columns.isin(mri_features_removed)]

# Not all features are in the MRI data dictionary
missing_features = sorted(set(mri_sub.columns) - set(mri_dict_master['VariableName']))
print("Features not in dictionary:  ",  missing_features)
mri_dict_master = mri_dict_master[mri_dict_master['VariableName'].isin(mri_sub.columns)].reset_index(drop=True)
if missing_features:
    # Placeholder rows carry only the name; concat fills every other column
    # with NaN, independent of the column order of mri_dict_master.
    placeholder = pd.DataFrame({'VariableName': missing_features})
    mri_dict_master = pd.concat([mri_dict_master, placeholder], axis=0, ignore_index=True)

mri_dict_master.to_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv", index = False)

Features not in dictionary:   ['FRONTCSF', 'FRONTGRY', 'FRONTWHT', 'NACCID', 'NACCVNUM', 'OCCIPCSF', 'OCCIPGRY', 'OCCIPWHT', 'PARCSF', 'PARGRY', 'PARWHT', 'TEMPCSF', 'TEMPGRY', 'TEMPWHT']
