In [1]:
# Read the variable maps back in and work on cleaning up/sorting:
import json

with open('variables.json', 'r') as fp:
    var_dict = json.load(fp)

In [2]:
print(json.dumps(var_dict, indent=4))

{
    "V5": {
        "99": "NA"
    },
    "V6": {
        "99": "NA"
    },
    "V7": {
        "90": "-",
        "91": "-",
        "99": "NA"
    },
    "V8": {
        "99": "NA"
    },
    "V9": {
        "99": "NA"
    },
    "V10": {
        "90": "-",
        "91": "-",
        "99": "NA"
    },
    "V11": {
        "999": "NA"
    },
    "V12": {
        "99": "NA"
    },
    "V13": {
        "1": "MALE",
        "2": "FEMALE",
        "9": "NA"
    },
    "V14": {
        "1": "R HAS SPOUSE IN HHL",
        "2": "R HAS PARTNER IN HHL",
        "5": "ALL OTHERS",
        "9": "NA"
    },
    "V15": {
        "99": "NA"
    },
    "V101": {
        "1": "EXCELLENT",
        "2": "VERY GOOD",
        "3": "GOOD",
        "4": "FAIR",
        "5": "POOR",
        "8": "DK",
        "9": "NA"
    },
    "V102": {
        "1": "EXCELLENT",
        "2": "VERY GOOD",
        "3": "GOOD",
        "4": "FAIR",
        "5": "POOR",
        "8": "DK",
        "9": "NA"
    },
    "V103

In [3]:
import numpy as np

In [4]:
def try_to_classify_variables(variables_dict):
    na = set(['NA'])
    dk_na = set(['DK', 'NA'])
    inap_dk_na = set(['INAP', 'DK', 'NA'])
    yes_or_no = set(['YES', 'NO', 'INAP', 'DK', 'NA'])
    history = set(['PAST', 'AGO'])
    checked = set(['CHECKED'])
    number_words = set(['ONE', '95'])
    
    variable_type = {
        'yes_or_no': [],
        'checked': [],
        'history': [],
        'inap_dk_na': [],
        'maybe_numerical': [],
        'unsure': [],
    }
    
    new_variable_dict = {}

    for variable in variables_dict.keys():
        v_dict = variables_dict[variable]
        # remove any dashes from values
        reduced_dict = {int(key):value for key, value in v_dict.items() if value != "-"}
        
        # is the dictionary empty after reduction?
        if not reduced_dict:
            variable_type['maybe_numerical'].append(variable)
        else:
            value_words = {word for value in reduced_dict.values() for word in value.split()}
            print(variable, value_words)
            if value_words == na:
                variable_type['maybe_numerical'].append(variable)
            elif value_words == dk_na:
                variable_type['maybe_numerical'].append(variable)
            elif value_words == inap_dk_na:
                variable_type['inap_dk_na'].append(variable)
            elif value_words == yes_or_no:
                variable_type['yes_or_no'].append(variable)
            elif not checked.isdisjoint(value_words):
                # print("in checked")
                variable_type['checked'].append(variable)
            elif not history.isdisjoint(value_words):
                # print("in history")
                variable_type['history'].append(variable)
            elif not number_words.isdisjoint(value_words):
                variable_type['maybe_numerical'].append(variable)
            else:
                variable_type['unsure'].append(variable)
        
        new_variable_dict[variable] = reduced_dict
    
    return new_variable_dict, variable_type   
            

In [5]:
new_variable_dict, var_types = try_to_classify_variables(var_dict)

V5 {'NA'}
V6 {'NA'}
V7 {'NA'}
V8 {'NA'}
V9 {'NA'}
V10 {'NA'}
V11 {'NA'}
V12 {'NA'}
V13 {'MALE', 'NA', 'FEMALE'}
V14 {'HAS', 'ALL', 'R', 'NA', 'HHL', 'IN', 'OTHERS', 'SPOUSE', 'PARTNER'}
V15 {'NA'}
V101 {'FAIR', 'POOR', 'VERY', 'NA', 'DK', 'EXCELLENT', 'GOOD'}
V102 {'FAIR', 'POOR', 'VERY', 'NA', 'DK', 'EXCELLENT', 'GOOD'}
V103 {'WORSE', 'IF', 'NA', 'DK', 'VOL.', 'BETTER'}
V104 {'TIME', 'OF', 'RARELY', 'NA', 'DK', 'MOST', 'ALWAYS', 'SOMETIMES', 'NEVER', 'THE'}
V105 {'NA', 'DK'}
V106 {'REGULARLY', 'SELDOM', 'NA', 'DK', 'OCCASIONALLY', 'NEVER'}
V107 {'INAP,', 'ALL', 'R', 'MAIN', 'IW', 'SAMPLE', 'HHL', 'IN', 'MENTIONED', 'OTHERS', 'SPOUSE'}
V108 {'SEPARATED', '107', 'NA', 'DK', 'IN', 'DIVORCED', '1', 'NEVER', 'WIDOWED', 'MARRIED;'}
V109 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V110 {'HERE;', 'TIME', 'HAS', 'PLACES', 'INAP', 'RESIDENCE--PART', 'OF', '107', 'TWO', 'IN', 'CURRENTLY', 'SOME', 'ELSE', '1', 'H', 'LIVES', 'PLACE'}
V111 {'TOWN', 'OTHER', 'INAP', 'FORCES', 'OUT', 'ARMED', 'HOME/HOSPITAL', 

V2130 {'CHECKED', 'INAP', 'NOT', 'IF', 'A-J', 'NA', 'BLANK', 'CHECKED;', 'LEFT'}
in checked
V2131 {'NA', 'INAP', 'DK'}
V2132 {'THAN', 'A', 'INAP', 'MORE', 'NA', 'DK', 'MONTHS', '6', 'MONTH', 'YEAR'}
V2133 {'NA', 'INAP', 'DK'}
V2134 {'CHECKED', 'INAP', 'NOT', 'IF', 'A-J', 'NA', 'BLANK', 'CHECKED;', 'LEFT'}
in checked
V2135 {'NA', 'INAP', 'DK'}
V2136 {'THAN', 'A', 'INAP', 'MORE', 'NA', 'DK', 'MONTHS', '6', 'MONTH', 'YEAR'}
V2137 {'NA', 'INAP', 'DK'}
V2138 {'CHECKED', 'INAP', 'NOT', 'IF', 'A-J', 'NA', 'BLANK', 'CHECKED;', 'LEFT'}
in checked
V2139 {'NA', 'INAP', 'DK'}
V2140 {'THAN', 'A', 'INAP', 'MORE', 'NA', 'DK', 'MONTHS', '6', 'MONTH', 'YEAR'}
V2141 {'NA', 'INAP', 'DK'}
V2201 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V2202 {'CHECKED', 'INAP', 'NOT', 'IF', 'A-J', 'NA', 'BLANK', 'CHECKED;', 'LEFT'}
in checked
V2203 {'NA', 'INAP', 'DK'}
V2204 {'THAN', 'A', 'INAP', 'MORE', 'NA', 'DK', 'MONTHS', '6', 'MONTH', 'YEAR'}
V2205 {'NA', 'INAP', 'DK'}
V2206 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V2207 {'CHECKED'

V4307 {'NA', 'INAP', 'DK'}
V4308 {'NA', 'INAP', 'DK'}
V4309 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4310 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4311 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4312 {'THAN', 'A', 'INAP', 'AGO', 'MORE', 'PAST', 'NA', 'DK', 'SIX', 'MONTHS', 'MONTH', 'YEAR'}
in history
V4313 {'NA', 'INAP', 'DK'}
V4314 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4315 {'TWELVE', 'AND', 'LESS', 'INAP', 'WEEK', 'BETWEEN', 'MORE', 'TWO', 'NA', 'DK', 'FOUR', 'WEEKS', 'ONE', 'THAN'}
V4316 {'NORMAL', 'INAP', 'NOT', 'SELF', 'NA', 'DK'}
V4317 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4318 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4319 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4320 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4321 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4322 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4323 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4324 {'NO', 'YES', 'INAP', 'IF', 'NA', 'DK', 'VOL.'}
V4325 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V4326 {'INAP', 'BELIEFS/EXPERIENCES', 'IF', 'NA', 'DRINKING/DRUG', 'DK', 'USE', 'VOL.'}
V4327 {'NO', 'YES',

V6909 {'NO', 'YES', 'INAP', 'NA', 'DK'}
V6910 {'WORK', 'PAPE', 'OR', 'HOME', 'TOOK', 'TOGETHER;', 'MARRIED/OWN', 'BY', 'ADULTS', 'FAMILY', '(USUALLY', 'CHI', 'FARM', 'INAP', 'HOUSEKEEPING', 'R', 'WORKED', '(R);', 'KIDS);', 'NA', '(FAMILY)', 'ILL', 'HELP', 'OTHER', 'CHILDREN', 'MISC', 'TAKING', 'OF', 'DK', 'IN', 'HOME/FAM', 'RESP', 'YOUNGER', 'MYSELF', 'BUSINESS', 'ADU', 'KEEP', 'DONE', 'FOR', 'HELPED', 'OTH', 'N', 'ASSIST', 'INCAPACITATED', 'PARENT', 'FAM', 'ON', 'OUTSIDE', 'CARE', 'CHORES'}
V6911 {'WORK', 'PAPE', 'OR', 'HOME', 'TOOK', 'TOGETHER;', 'MARRIED/OWN', 'BY', 'ADULTS', 'FAMILY', '(USUALLY', 'CHI', 'FARM', 'INAP', 'HOUSEKEEPING', 'R', 'WORKED', '(R);', 'KIDS);', 'NA', '(FAMILY)', 'ILL', 'HELP', 'OTHER', 'CHILDREN', 'MISC', 'TAKING', 'OF', 'DK', 'IN', 'HOME/FAM', 'RESP', 'YOUNGER', 'MYSELF', 'BUSINESS', 'ADU', 'KEEP', 'DONE', 'FOR', 'HELPED', 'OTH', 'N', 'ASSIST', 'INCAPACITATED', 'PARENT', 'FAM', 'ON', 'OUTSIDE', 'CARE', 'CHORES'}
V6912 {'FAIR', 'INAP', 'POOR', 'IF', 'NA', 'DK

In [6]:
len(new_variable_dict)

2945

In [7]:
for key, value in var_types.items():
    print(key, len(value))

yes_or_no 601
checked 535
history 52
inap_dk_na 710
maybe_numerical 74
unsure 973


In [8]:
var_types['unsure']

['V13',
 'V14',
 'V101',
 'V102',
 'V103',
 'V104',
 'V106',
 'V107',
 'V108',
 'V110',
 'V111',
 'V113',
 'V114',
 'V201',
 'V202',
 'V203',
 'V204',
 'V205',
 'V206',
 'V207',
 'V208',
 'V209',
 'V210',
 'V211',
 'V212',
 'V213',
 'V214',
 'V215',
 'V216',
 'V217',
 'V218',
 'V219',
 'V220',
 'V221',
 'V222',
 'V223',
 'V224',
 'V225',
 'V226',
 'V227',
 'V228',
 'V229',
 'V230',
 'V231',
 'V232',
 'V233',
 'V234',
 'V235',
 'V236',
 'V237',
 'V238',
 'V239',
 'V245',
 'V246',
 'V247',
 'V248',
 'V249',
 'V250',
 'V301',
 'V302',
 'V304',
 'V305',
 'V306',
 'V307',
 'V308',
 'V310',
 'V312',
 'V313',
 'V314',
 'V320',
 'V326',
 'V336',
 'V337',
 'V338',
 'V345',
 'V347',
 'V349',
 'V401',
 'V402',
 'V403',
 'V404',
 'V405',
 'V406',
 'V407',
 'V418',
 'V419',
 'V420',
 'V431',
 'V433',
 'V435',
 'V501',
 'V502',
 'V503',
 'V504',
 'V505',
 'V506',
 'V507',
 'V508',
 'V509',
 'V510',
 'V511',
 'V522',
 'V523',
 'V524',
 'V534',
 'V536',
 'V538',
 'V607',
 'V608',
 'V628',
 'V637',
 'V

In [11]:
with open('variable-types.json', 'w') as fp:
    json.dump(var_types, fp)