# 2019-02-03_Campbell_26

| Info   |                      |
|--------|----------------------|
| Author | L. D. Nicolas May    |
| Date   | 2019-02-23           |
| Note   | This is a re-creation of a dataset that was built using R in order to get data munging experience with Python |

## Load Requirements

### Load useful globals and functions

In [1]:
from config import *
from madc_helpers import *

### Load packages

In [2]:
# Package(s) for REDCap API
import certifi

# Package(s) for munging data
import pandas as pd
import numpy as np
import re
from collections import OrderedDict

## Get Data

### Define data fields and forms

#### UDS 3

In [3]:
#---------#---------#---------#---------#---------#---------#---------#---------

# FIELDS

# Form Header
fields_u3_hd_raw = [
    'ptid'
    , 'form_date'
]
# Form C2 - IVP
fields_u3_c2_raw = [
    'mocatots'
    , 'trailb_c2'
    , 'trailbrr_c2'
    , 'trailbli_c2'
    , 'trailb_c2z'
]
# Form D1 - IVP
fields_u3_d1_raw = [
    'normcog'    # NL
    , 'mciamem'  # MCI
    , 'mciaplus' # MCI
    , 'mcinon1'  # MCI
    , 'mcinon2'  # MCI
    , 'impnomci' # Impaired not MCI
    , 'alzdis'   # AD etio
    , 'alzdisif'
    , 'lbdis'    # LBD etio
    , 'lbdif'
    , 'msa'      # MSA etio
    , 'msaif'
    , 'psp'      # PSP etio
    , 'pspif'
    , 'ftldmo'   # FTLD motor etio
    , 'ftldmoif'
    , 'ftldnos'  # FTLD NOS etio
    , 'ftldnoif'
]
# Combine fields, prefixing C2 and D1 with "fu_" and "tele_"
fields_u3_raw = \
    fields_u3_hd_raw + \
    fields_u3_c2_raw + \
    ['fu_' + f for f in fields_u3_c2_raw] + \
    ['tele_' + f for f in fields_u3_c2_raw] + \
    fields_u3_d1_raw + \
    ['fu_' + f for f in fields_u3_d1_raw] + \
    ['tele_' + f for f in fields_u3_d1_raw]

fields_u3 = ','.join(fields_u3_raw)
# print(fields_u3)


# FORMS

forms_u3_raw = [
    # Lichtenberg Financial Decision Making Self Efficacy Form
    'financial_decision_making_self_efficacy_form'
    # Lichtenberg Financial Decision Making Screening Scale
    , 'lfdss_for_professionals'
]

forms_u3 = ','.join(forms_u3_raw)
# print(forms_u3)

In [4]:
certifi.where()

'/Users/ldmay/anaconda3/lib/python3.7/site-packages/certifi/cacert.pem'

In [5]:
# Get JSON
json_u3 = rc_get_api_data(uri    = REDCAP_API_URI,
                          token  = REDCAP_API_TOKEN_UDS3n,
                          fields = fields_u3,
                          forms  = forms_u3,
                          secure = False)

In [6]:
# Parse JSON as pandas df
df_u3 = pd.read_json(json_u3, convert_dates=['form_date'])

Writing to then reading from CSV makes datatype inference more successful. Why, `pandas`? Why?

In [7]:
# Write to CSV
df_u3.to_csv('py_df_u3.csv')

# Read CSV
df_u3 = pd.read_csv('py_df_u3.csv', sep=',')

In [8]:
# df_u3.head(n=10)

In [9]:
# df_u3.dtypes

In [10]:
# Reorder `df_u3` columns as they're originally listed

# fields_u3_raw[1:20]

orig_cols = df_u3.columns.tolist()
# cols

ordered_cols = list(filter(lambda f: f in orig_cols, fields_u3_raw))
# ordered_cols
remaining_cols = list(filter(lambda f: f not in ordered_cols, orig_cols))
# remaining_cols
reordered_cols = ordered_cols + remaining_cols

df_u3 = df_u3[reordered_cols]

## Process Data

### Clean Data

#### UDS 3

Clean out DDE records (`--1`, `--2`)

In [11]:
print(df_u3.shape)

(593, 213)


In [12]:
# df_u3['ptid'].str.match(r'^UM\d{8}$')
df_u3_cln = df_u3.loc[df_u3['ptid'].str.match(r'^UM\d{8}$')]

In [13]:
print(df_u3_cln.shape)

(503, 213)


Clean out records missing `form_date`s

In [14]:
# df_u3['form_date'].notnull()
df_u3_cln = df_u3_cln.loc[df_u3['form_date'].notnull()]

In [15]:
print(df_u3_cln.shape)

(477, 213)


### Mutate Data

#### UDS 3

Coalese IVP/FVP/TVP fields (`x`, `fu_x`, `tele_x`).

To coalese IVP/FVP/TVP fields, we first have to create a smarter function that sniffs out the IVP/FVP/TVP fields. So given column names `['foo', 'bar', 'baz', 'fu_bar', 'tele_baz', 'fu_qux', 'tele_qux']`, we get `['foo', 'bar', 'baz', 'qux']` back.

In [16]:
# `coalesce_ift_cols` from `madc_helpers` import
df_u3_cln_mut = coalesce_ift_cols(df_u3_cln)

In [17]:
# Write to then read from CSV
df_u3_cln_mut.to_csv('df_u3_cln_mut.csv')
df_u3_cln_mut = pd.read_csv('df_u3_cln_mut.csv')

Simplify diagnosis and etiology fields. (Such a pain-in-the-ass with Python.)

In [18]:
df_u3_cln_mut['uds_dx_der'] = None

In [19]:
# FTLD
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['ftldnos'] == 1.0) & 
         (df_u3_cln_mut['ftldnoif'] == 1.0), 
         'FTLD', df_u3_cln_mut['uds_dx_der'])

In [20]:
# FTLD
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['ftldmo'] == 1.0) & 
         (df_u3_cln_mut['ftldmoif'] == 1.0), 
         'FTLD', df_u3_cln_mut['uds_dx_der'])

In [21]:
# PSP
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['psp'] == 1.0) & 
         (df_u3_cln_mut['pspif'] == 1.0), 
         'PSP', df_u3_cln_mut['uds_dx_der'])

In [22]:
# MSA
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['msa'] == 1.0) & 
         (df_u3_cln_mut['msaif'] == 1.0), 
         'MSA', df_u3_cln_mut['uds_dx_der'])

In [23]:
# LBD
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['lbdis'] == 1.0) & 
         (df_u3_cln_mut['lbdif'] == 1.0), 
         'LBD', df_u3_cln_mut['uds_dx_der'])

In [24]:
# AD
df_u3_cln_mut['uds_dx_der'] = \
np.where((df_u3_cln_mut['alzdis'] == 1.0) & 
         (df_u3_cln_mut['alzdisif'] == 1.0), 
         'AD', df_u3_cln_mut['uds_dx_der'])

In [25]:
# Impaired no MCI
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['impnomci'] == 1.0, 
         'Impaired no MCI', df_u3_cln_mut['uds_dx_der'])

In [26]:
# MCI
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['mcinon2'] == 1.0, 
         'MCI', df_u3_cln_mut['uds_dx_der'])

In [27]:
# MCI
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['mcinon1'] == 1.0, 
         'MCI', df_u3_cln_mut['uds_dx_der'])

In [28]:
# MCI
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['mciaplus'] == 1.0, 
         'MCI', df_u3_cln_mut['uds_dx_der'])

In [29]:
# MCI
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['mciamem'] == 1.0, 
         'MCI', df_u3_cln_mut['uds_dx_der'])

In [30]:
# NL
df_u3_cln_mut['uds_dx_der'] = \
np.where(df_u3_cln_mut['normcog'] == 1.0, 
         'NL', df_u3_cln_mut['uds_dx_der'])

In [31]:
# df_u3_cln_mut['uds_dx_der'].tolist()

In [32]:
df_u3_cln_mut.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477 entries, 0 to 476
Columns: 174 entries, Unnamed: 0 to uds_dx_der
dtypes: float64(155), int64(2), object(17)
memory usage: 648.5+ KB
