# Convert Coded Variables
Using `variable-types.json` and `variable-maps.json`, convert the coded survey data.

In [14]:
import json
import pandas as pd
import numpy as np

In [6]:
# Load the variable-type groups. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform's
# default locale encoding.
with open('variable-types.json', 'r', encoding='utf-8') as fp:
    var_types = json.load(fp)

In [7]:
# Load the per-variable code-to-label maps. The encoding is pinned to UTF-8
# (the JSON interchange encoding) so the result does not depend on the
# platform's default locale encoding.
with open('variable-maps.json', 'r', encoding='utf-8') as fp:
    var_maps = json.load(fp)

In [8]:
# Read the whitespace-delimited survey file. `delim_whitespace=True` is
# deprecated in recent pandas; `sep=r"\s+"` is the documented equivalent.
df = pd.read_csv("06693-0001-Data.tsv", sep=r"\s+")

In [9]:
# Dimensions of the loaded survey table as (rows, columns).
df.shape

(8098, 2954)

In [10]:
# Names of the variable-type groups loaded from variable-types.json.
var_types.keys()

dict_keys(['yes_or_no', 'checked', 'history', 'inap_dk_na', 'maybe_numerical', 'unsure'])

# Converting a history variable
I leave `INAP` and `DK` as categories, because these codes can be informative in the context of the question. For example, if the question concerns the respondent's spouse, an `INAP` indicates a respondent without a spouse.

In [11]:
# First variable listed under the 'history' type; used as the worked example below.
var_types['history'][0]

'V339'

In [12]:
# Code-to-label map for V339; keys are strings because the map was loaded from JSON.
var_maps['V339']

{'1': 'PAST MONTH',
 '2': 'PAST SIX MONTHS',
 '3': 'PAST YEAR',
 '4': 'MORE THAN A YEAR AGO',
 '0': 'INAP',
 '8': 'DK',
 '9': 'NA'}

In [17]:
# Distribution of the raw codes in V339 before conversion.
df['V339'].value_counts()

0    7520
4     496
2      30
3      28
1      20
9       4
Name: V339, dtype: int64

In [18]:
def convert_history_var(var, var_map, verbose=True):
    """Translate a coded survey column into its text labels.

    Parameters
    ----------
    var : pandas.Series
        Column of coded values to convert.
    var_map : dict
        Code-to-label mapping. Keys are converted with ``int()`` (JSON object
        keys arrive as strings). The dict itself is not modified.
    verbose : bool, default True
        Print the integer-keyed map before and after the 'NA' substitution.
        Pass False to keep the output quiet when converting many columns.

    Returns
    -------
    pandas.Series
        Result of ``var.map(...)``: each code is replaced by its label and the
        code labelled 'NA' is replaced by ``np.nan``. Codes that do not appear
        in ``var_map`` also come out as NaN, because that is how
        ``Series.map`` treats keys missing from a dict mapper.

    Raises
    ------
    ValueError
        If a key of ``var_map`` cannot be converted to ``int``.
    """
    # get rid of str keys
    int_map = {int(key): value for key, value in var_map.items()}
    if verbose:
        print(int_map)

    # for now just replace 'NA' with np.nan
    new_map = {
        key: (np.nan if value == 'NA' else value)
        for key, value in int_map.items()
    }
    if verbose:
        print(new_map)

    # Errors are intentionally not caught here: printing the exception and
    # returning None only moved the failure to the first use of the result.
    return var.map(new_map)

In [20]:
# Convert V339 with its code-to-label map, then tabulate the resulting labels.
new_V339 = convert_history_var(df['V339'], var_maps['V339'])
# value_counts() drops NaN by default, so the codes mapped to NaN are not listed.
new_V339.value_counts()

{1: 'PAST MONTH', 2: 'PAST SIX MONTHS', 3: 'PAST YEAR', 4: 'MORE THAN A YEAR AGO', 0: 'INAP', 8: 'DK', 9: 'NA'}
{1: 'PAST MONTH', 2: 'PAST SIX MONTHS', 3: 'PAST YEAR', 4: 'MORE THAN A YEAR AGO', 0: 'INAP', 8: 'DK', 9: nan}


INAP                    7520
MORE THAN A YEAR AGO     496
PAST SIX MONTHS           30
PAST YEAR                 28
PAST MONTH                20
Name: V339, dtype: int64

In [21]:
# Row count of the converted series; compared with the original column in the next cell.
len(new_V339)

8098

In [22]:
# Row count of the original column, for comparison with the converted series.
len(df['V339'])

8098

In [24]:
# Number of missing values in the converted series (the 'NA' code is mapped to NaN).
sum(new_V339.isna())

4

# Converting a Yes/No variable
The conversion is the same as for a history variable, so the same logic carries over to any of the categorical variables: map each code to its label and convert `NA` to `np.nan`.

In [25]:
# First variable listed under the 'yes_or_no' type; used as the worked example below.
var_types['yes_or_no'][0]

'V109'

In [26]:
# Code-to-label map for V109; keys are strings because the map was loaded from JSON.
var_maps['V109']

{'1': 'YES', '5': 'NO', '0': 'INAP', '8': 'DK', '9': 'NA'}

In [27]:
# Distribution of the raw codes in V109 before conversion.
df['V109'].value_counts()

0    3980
5    3675
1     443
Name: V109, dtype: int64

In [28]:
def convert_categorical_var(var, var_map, verbose=True):
    """Translate a coded categorical survey column into its text labels.

    NOTE(review): the logic is the same as ``convert_history_var`` defined
    earlier in this notebook; consider keeping only one of the two.

    Parameters
    ----------
    var : pandas.Series
        Column of coded values to convert.
    var_map : dict
        Code-to-label mapping. Keys are converted with ``int()`` (JSON object
        keys arrive as strings). The dict itself is not modified.
    verbose : bool, default True
        Print the integer-keyed map before and after the 'NA' substitution.
        Pass False to keep the output quiet when converting many columns.

    Returns
    -------
    pandas.Series
        Result of ``var.map(...)``: each code is replaced by its label and the
        code labelled 'NA' is replaced by ``np.nan``. Codes that do not appear
        in ``var_map`` also come out as NaN, because that is how
        ``Series.map`` treats keys missing from a dict mapper.

    Raises
    ------
    ValueError
        If a key of ``var_map`` cannot be converted to ``int``.
    """
    # get rid of str keys
    int_map = {int(key): value for key, value in var_map.items()}
    if verbose:
        print(int_map)

    # for now just replace 'NA' with np.nan
    new_map = {
        key: (np.nan if value == 'NA' else value)
        for key, value in int_map.items()
    }
    if verbose:
        print(new_map)

    # Errors are intentionally not caught here: printing the exception and
    # returning None only moved the failure to the first use of the result.
    return var.map(new_map)

In [31]:
# Convert V109 with its code-to-label map, then tabulate the resulting labels.
new_V109 = convert_categorical_var(df['V109'], var_maps['V109'])
# value_counts() drops NaN by default, so any codes mapped to NaN are not listed.
new_V109.value_counts()

{1: 'YES', 5: 'NO', 0: 'INAP', 8: 'DK', 9: 'NA'}
{1: 'YES', 5: 'NO', 0: 'INAP', 8: 'DK', 9: nan}


INAP    3980
NO      3675
YES      443
Name: V109, dtype: int64

# Converting a checked variable

In [32]:
# First variable listed under the 'checked' type.
var_types['checked'][0]

'V601'

In [33]:
# Distribution of the raw codes in V601.
df['V601'].value_counts()

2    7576
1     522
Name: V601, dtype: int64