## PUMS Recoding Example

This notebook loads the raw concatenated PUMS file for the entire USA, extracts relevant variables, recodes them according to the example coding scheme, and writes a CSV file containing the recoded data.

In [1]:
import os
import re
import sys
import math
import numpy as np
import pandas as pd

from collections import Counter
from src import coding_pums as coding

### User Parameters

In [2]:
# directory holding the raw concatenated PUMS input file; it is joined with
# INPUT_FILE_NAME to form the path that gets loaded, and the recoded CSV is
# written next to the input file
DATA_DIR = 'data'

# name of the concatenated PUMS file for the entire USA, inside DATA_DIR
INPUT_FILE_NAME = 'pums_usa_2019.csv'

# when True, recode_income multiplies each PINCP value by ADJINC * 1.0e-6
# (constant dollars) before binning; when False the unadjusted PINCP values
# are binned
USE_CONSTANT_DOLLARS = True

#### Load the input file

In [3]:
# full path to the input file, built from the user parameters; reused by the
# final cell to derive the output file name
INPUT_FILE = os.path.join(DATA_DIR, INPUT_FILE_NAME)
print('Loading file "{0}"...'.format(INPUT_FILE))
# read the entire CSV into memory, letting pandas infer the column dtypes
raw_df = pd.read_csv(INPUT_FILE)
# trailing expression: show the raw frame as the cell output
raw_df

Loading file "data/pums_usa_2019.csv"...


Unnamed: 0,ST,SERIALNO,SPORDER,AGEP,ADJINC,PWGTP,HISP,RAC1P,SCHL,HICOV,PINCP,SEX
0,1,2019HU1259536,4,10,1010145,185,1,2,7.0,1,,1
1,1,2019HU0915454,2,17,1010145,77,1,1,13.0,1,10000.0,2
2,1,2019HU0915504,1,46,1010145,23,2,1,19.0,1,94600.0,1
3,1,2019HU0915555,1,69,1010145,71,1,1,16.0,1,37100.0,2
4,1,2019HU0915555,2,72,1010145,53,1,1,21.0,1,8400.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3239548,56,2019HU1000576,2,56,1010145,134,1,1,18.0,1,0.0,2
3239549,56,2019HU1000576,3,17,1010145,141,1,1,14.0,1,0.0,1
3239550,56,2019HU1000791,1,66,1010145,24,1,1,16.0,1,13900.0,2
3239551,56,2019HU0999472,1,33,1010145,48,2,1,21.0,1,33700.0,2


#### Drop all individuals who are less than 18 years old

The example coding scheme uses a minimum age of 18 years.

In [4]:
# keep only the rows whose AGEP is at least 18 and renumber the index from zero
recoded_df = raw_df[raw_df['AGEP'].ge(18)].reset_index(drop=True)
recoded_df

Unnamed: 0,ST,SERIALNO,SPORDER,AGEP,ADJINC,PWGTP,HISP,RAC1P,SCHL,HICOV,PINCP,SEX
0,1,2019HU0915504,1,46,1010145,23,2,1,19.0,1,94600.0,1
1,1,2019HU0915555,1,69,1010145,71,1,1,16.0,1,37100.0,2
2,1,2019HU0915555,2,72,1010145,53,1,1,21.0,1,8400.0,1
3,1,2019HU0915664,1,72,1010145,25,1,2,17.0,1,26400.0,2
4,1,2019HU0915711,1,26,1010145,38,1,1,22.0,1,60000.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2599166,56,2019HU1000576,1,51,1010145,125,1,1,18.0,1,54000.0,1
2599167,56,2019HU1000576,2,56,1010145,134,1,1,18.0,1,0.0,2
2599168,56,2019HU1000791,1,66,1010145,24,1,1,16.0,1,13900.0,2
2599169,56,2019HU0999472,1,33,1010145,48,2,1,21.0,1,33700.0,2


## Recode Selected Variables

In [5]:
def sum_of_values(ctr):
    """
    Return the total of all counts held in a collections.Counter.

    An empty counter gives 0.
    """

    return sum(ctr.values())

#### Age

In [6]:
def recode_age(df):
    """
    Replace the PUMS AGEP column with a binned 'Age' column.

    Ages are mapped onto the bins of the coding scheme:

    class Age(Enum):
        AGE_18_29   = 0
        AGE_30_39   = 1
        AGE_40_49   = 2
        AGE_50_59   = 3
        AGE_60_69   = 4
        AGE_70_PLUS = 5

    Every AGEP value must lie in [18, 99]; anything else trips an assertion.
    Returns a new dataframe; the dataframe passed in is left unchanged.
    """

    # (lower bound, recoded value) pairs, ordered from the oldest bin down
    brackets = (
        (70, coding.Age.AGE_70_PLUS.value),
        (60, coding.Age.AGE_60_69.value),
        (50, coding.Age.AGE_50_59.value),
        (40, coding.Age.AGE_40_49.value),
        (30, coding.Age.AGE_30_39.value),
    )
    youngest_bin = coding.Age.AGE_18_29.value

    def to_bin(pums_age):
        # the PUMS age should be in the range [18,99]
        assert 18 <= pums_age <= 99
        for lower_bound, code in brackets:
            if pums_age >= lower_bound:
                return code
        return youngest_bin

    recoded_ages = [to_bin(pums_age) for pums_age in df['AGEP'].values]

    # swap the AGEP column for the new Age column
    df = df.drop(columns=['AGEP']).assign(Age=recoded_ages)

    # sanity checks: every code is a valid bin and every row was counted
    bin_counts = Counter(df['Age'].values)
    assert min(bin_counts) >= 0
    assert max(bin_counts) < coding.BIN_COUNTS[coding.Variables.AGE]
    assert len(df) == sum_of_values(bin_counts)
    return df

In [7]:
# Replace AGEP with the binned Age column. Not idempotent: recode_age drops
# AGEP, so running this cell a second time on the same frame raises KeyError.
recoded_df = recode_age(recoded_df)

#### Race / Ethnicity

In [8]:
def recode_race_eth(df):
    """
    Replace the PUMS RAC1P and HISP columns with a combined 'RaceEth' column.

    class RaceEth(Enum):
        NH_WHITE = 0  # Non-Hispanic White only
        NH_BLACK = 1  # Non-Hispanic Black only
        NH_ASIAN = 2  # Non-Hispanic Asian only
        HISPANIC = 3  # Hispanic
        OTHER    = 4  # Other

    Any HISP value other than 1 is coded as Hispanic regardless of RAC1P.
    Every RAC1P value must lie in [1, 9]; anything else trips an assertion.
    Returns a new dataframe; the dataframe passed in is left unchanged.
    """

    # RAC1P codes that have their own non-Hispanic category
    non_hispanic_codes = {
        1: coding.RaceEth.NH_WHITE.value,
        2: coding.RaceEth.NH_BLACK.value,
        6: coding.RaceEth.NH_ASIAN.value,
    }
    hispanic = coding.RaceEth.HISPANIC.value
    other = coding.RaceEth.OTHER.value

    recoded = []
    for pums_race, pums_hisp in zip(df['RAC1P'].values, df['HISP'].values):
        # pums RAC1P value >= 1 and <= 9
        assert 1 <= pums_race <= 9

        if pums_hisp != 1:
            recoded.append(hispanic)
        else:
            recoded.append(non_hispanic_codes.get(pums_race, other))

    # swap both source columns for the combined RaceEth column
    df = df.drop(columns=['RAC1P', 'HISP']).assign(RaceEth=recoded)

    # sanity checks: every code is a valid bin and every row was counted
    bin_counts = Counter(df['RaceEth'].values)
    assert min(bin_counts) >= 0
    assert max(bin_counts) < coding.BIN_COUNTS[coding.Variables.RACE_ETH]
    assert len(df) == sum_of_values(bin_counts)
    return df

In [9]:
# Replace RAC1P and HISP with the combined RaceEth column. Not idempotent:
# recode_race_eth drops both source columns, so running this cell a second
# time on the same frame raises KeyError.
recoded_df = recode_race_eth(recoded_df)

#### Education

In [10]:
def recode_education(df):
    """
    Replace the PUMS SCHL column with a binned 'Education' column.

    class Education(Enum):
        COLLEGE_GRAD = 0  # College grad (includes "Missing")
        SOME_COLLEGE = 1  # Some college
        HS_GRAD      = 2  # High school graduate
        NOT_HS_GRAD  = 3  # Not a HS graduate

    SCHL values are handled as follows:
      * missing (NaN / None)  -> COLLEGE_GRAD, as the coding scheme prescribes
      * the text code 'bb'    -> NOT_HS_GRAD
      * 20 and above          -> COLLEGE_GRAD (associate degree or higher)
      * 18 or 19              -> SOME_COLLEGE
      * 16 or 17              -> HS_GRAD (HS diploma or GED)
      * anything lower        -> NOT_HS_GRAD

    Numeric codes stored as text (object-dtype column) are converted with
    float() before binning; text that is neither 'bb' nor numeric raises
    ValueError.

    Returns a new dataframe; the dataframe passed in is left unchanged.
    """

    new_values = []
    for pums_educ in df['SCHL'].values:
        # Text codes must be handled before the missing-value test: np.isnan()
        # raises TypeError on a str, which made the 'bb' branch unreachable.
        if isinstance(pums_educ, str):
            if 'bb' == pums_educ:
                # less than 3 years old, code as not a HS graduate
                new_values.append(coding.Education.NOT_HS_GRAD.value)
                continue
            # numeric code stored as text
            pums_educ = float(pums_educ)

        if pd.isna(pums_educ):
            # missing
            recoded_educ = coding.Education.COLLEGE_GRAD.value
        elif pums_educ >= 20:
            # college graduate: associate degree or higher
            recoded_educ = coding.Education.COLLEGE_GRAD.value
        elif pums_educ >= 18:
            # some college
            recoded_educ = coding.Education.SOME_COLLEGE.value
        elif pums_educ >= 16:
            # HS diploma or GED
            recoded_educ = coding.Education.HS_GRAD.value
        else:
            # not a HS graduate
            recoded_educ = coding.Education.NOT_HS_GRAD.value

        new_values.append(recoded_educ)

    # drop the 'SCHL' column
    df = df.drop(columns=['SCHL'])
    # insert a new 'Education' column with the recoded values
    df = df.assign(**{'Education': new_values})

    # sanity checks: every code is a valid bin and every row was counted
    ctr = Counter(df['Education'].values)
    assert min(ctr.keys()) >= 0
    assert max(ctr.keys()) < coding.BIN_COUNTS[coding.Variables.EDUCATION]
    assert df.shape[0] == sum_of_values(ctr)
    return df

In [11]:
# Replace SCHL with the binned Education column. Not idempotent:
# recode_education drops SCHL, so running this cell a second time on the
# same frame raises KeyError.
recoded_df = recode_education(recoded_df)

#### Insurance

In [12]:
def recode_insurance(df):
    """
    Replace the PUMS HICOV column with a binary 'Insurance' column.

    class Insurance(Enum):
        YES     = 0  # Has health insurance
        NO      = 1  # Does not have health insurance, or missing

    Only a HICOV value of 1 is coded YES; NaN and every other value are
    coded NO. Returns a new dataframe; the dataframe passed in is left
    unchanged.
    """

    insured = coding.Insurance.YES.value
    uninsured = coding.Insurance.NO.value

    # the NaN test runs first for every value, then the comparison with 1
    recoded = [
        insured if (not np.isnan(pums_ins) and pums_ins == 1) else uninsured
        for pums_ins in df['HICOV'].values
    ]

    # swap the HICOV column for the new Insurance column
    df = df.drop(columns=['HICOV']).assign(Insurance=recoded)

    # sanity checks: every code is a valid bin and every row was counted
    bin_counts = Counter(df['Insurance'].values)
    assert min(bin_counts) >= 0
    assert max(bin_counts) < coding.BIN_COUNTS[coding.Variables.INSURANCE]
    assert len(df) == sum_of_values(bin_counts)
    return df

In [13]:
# Replace HICOV with the binary Insurance column. Not idempotent:
# recode_insurance drops HICOV, so running this cell a second time on the
# same frame raises KeyError.
recoded_df = recode_insurance(recoded_df)

#### Income

In [14]:
def recode_income(df, use_constant_dollars=None):
    """
    Replace the PUMS PINCP and ADJINC columns with a binned 'Income' column.

    class Income(Enum):
        INC_LT_25   = 0  # less than $25K
        INC_25_50   = 1  # $25K to $49,999
        INC_50_100  = 2  # $50K to $99,999
        INC_GT_100  = 3  # $100K or more
        INC_MISSING = 4  # Missing

    Parameters:
        df: dataframe containing the PINCP and ADJINC columns.
        use_constant_dollars: when True, each PINCP value is multiplied by
            ADJINC * 1.0e-6 before binning; when False the unadjusted value
            is binned. When None (the default) the notebook-level
            USE_CONSTANT_DOLLARS setting is used.

    PINCP values that are missing (NaN / None) or equal to the text code
    'bbbbbbb' are coded INC_MISSING, as is any income whose adjusted value
    is NaN (e.g. a missing ADJINC). Numeric values stored as text are
    converted with float() before binning.

    Returns a new dataframe; the dataframe passed in is left unchanged.
    """

    if use_constant_dollars is None:
        use_constant_dollars = USE_CONSTANT_DOLLARS

    values = df['PINCP'].values
    adjustments = df['ADJINC'].values

    new_values = []
    for i, val in enumerate(values):
        # Text codes must be handled before the missing-value test: np.isnan()
        # raises TypeError on a str, which made the 'bbbbbbb' branch
        # unreachable.
        if isinstance(val, str):
            if 'bbbbbbb' == val:
                # N/A (less than 15 years old)
                new_values.append(coding.Income.INC_MISSING.value)
                continue
            # numeric value stored as text
            val = float(val)

        if pd.isna(val):
            # missing
            new_values.append(coding.Income.INC_MISSING.value)
            continue

        # adjust for inflation to constant dollars, if desired
        adj_factor = float(adjustments[i]) * 1.0e-6 if use_constant_dollars else 1.0
        pums_income = val * adj_factor

        if pd.isna(pums_income):
            # A NaN adjustment factor would otherwise fail every comparison
            # below and be silently binned as $100K or more.
            recoded_inc = coding.Income.INC_MISSING.value
        elif pums_income < 25000.0:
            recoded_inc = coding.Income.INC_LT_25.value
        elif pums_income < 50000.0:
            recoded_inc = coding.Income.INC_25_50.value
        elif pums_income < 100000.0:
            recoded_inc = coding.Income.INC_50_100.value
        else:
            recoded_inc = coding.Income.INC_GT_100.value

        new_values.append(recoded_inc)

    df = df.drop(columns=['PINCP', 'ADJINC'])
    df = df.assign(**{'Income': new_values})

    # sanity checks: every code is a valid bin and every row was counted
    ctr = Counter(df['Income'].values)
    assert min(ctr.keys()) >= 0
    assert max(ctr.keys()) < coding.BIN_COUNTS[coding.Variables.INCOME]
    assert df.shape[0] == sum_of_values(ctr)
    return df

In [15]:
# Replace PINCP and ADJINC with the binned Income column. Not idempotent:
# recode_income drops both source columns, so running this cell a second
# time on the same frame raises KeyError.
recoded_df = recode_income(recoded_df)

#### Sex

In [16]:
def recode_sex(df):
    """
    Replace the PUMS SEX column with a recoded 'Sex' column.

    class Sex(Enum):
        MALE   = 0
        FEMALE = 1

    A SEX value of 1 is coded MALE and 2 is coded FEMALE; any other value
    trips an assertion. Returns a new dataframe; the dataframe passed in is
    left unchanged.
    """

    male = coding.Sex.MALE.value
    female = coding.Sex.FEMALE.value

    recoded = []
    for pums_sex in df['SEX'].values:
        # only the two PUMS codes are accepted
        assert pums_sex in (1, 2)
        recoded.append(male if pums_sex == 1 else female)

    # swap the SEX column for the new Sex column
    df = df.drop(columns=['SEX']).assign(Sex=recoded)

    # sanity checks: every code is a valid bin and every row was counted
    bin_counts = Counter(df['Sex'].values)
    assert min(bin_counts) >= 0
    assert max(bin_counts) < coding.BIN_COUNTS[coding.Variables.SEX]
    assert len(df) == sum_of_values(bin_counts)
    return df

In [17]:
# Replace SEX with the recoded Sex column. Not idempotent: recode_sex drops
# SEX, so running this cell a second time on the same frame raises KeyError.
recoded_df = recode_sex(recoded_df)

#### Create final dataframe

In [18]:
# order the columns alphabetically by label
final_df = recoded_df.sort_index(axis='columns')
final_df

Unnamed: 0,Age,Education,Income,Insurance,PWGTP,RaceEth,SERIALNO,SPORDER,ST,Sex
0,2,1,2,0,23,3,2019HU0915504,1,1,0
1,4,2,1,0,71,0,2019HU0915555,1,1,1
2,5,0,0,0,53,0,2019HU0915555,2,1,0
3,5,2,1,0,25,1,2019HU0915664,1,1,1
4,0,0,2,0,38,0,2019HU0915711,1,1,1
...,...,...,...,...,...,...,...,...,...,...
2599166,3,1,2,0,125,0,2019HU1000576,1,56,0
2599167,3,1,0,0,134,0,2019HU1000576,2,56,1
2599168,4,2,0,0,24,0,2019HU1000791,1,56,1
2599169,1,0,1,0,48,3,2019HU0999472,1,56,1


#### Write recoded PUMS data to disk

In [19]:
# write to disk
# split the input path into (path without extension, extension) so the output
# file lands next to the input with '_recoded' inserted before the extension
f,e = os.path.splitext(INPUT_FILE)
output_file = f + '_recoded' + e
# index=False: the row index is not written as a column
final_df.to_csv(output_file, index=False)
print('Wrote file "{0}".'.format(output_file))

Wrote file "data/pums_usa_2019_recoded.csv".
