In [3]:
from folktables import ACSDataSource, ACSIncome, generate_categories
import pandas as pd


In [4]:
# Human-readable labels for the categorical ACS codes used in this notebook,
# keyed first by column name and then by the numeric code stored in that column.
# Multi-line labels rely on implicit string concatenation, so every fragment
# except the last must end with a space; otherwise words run together.
ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or "
            "business, or of an individual, for wages, "
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt, "
            "or charitable organization"
        ),
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee",
        6.0: (
            "Self-employed in own not incorporated business, "
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business, "
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified; "
            "or American Indian or Alaska Native, "
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

# Texas data

In [7]:
# Load the Texas extract from CSV into the working frame `data`.
data = pd.read_csv('../data/TX_data.csv')
# Row count, shown as the cell output.
len(data)

135924

In [63]:

def get_state(state):
    """Fetch one state's ACS data, build the ACSIncome features and save them.

    Args:
        state: State identifier forwarded to ``data_source.get_data`` as the
            single entry of ``states`` and used in the output file name.

    Returns:
        The feature DataFrame returned by ``ACSIncome.df_to_pandas`` with an
        extra ``PINCP`` column holding each kept person's raw income.

    Raises:
        ValueError: If the filtered income column and the feature frame do not
            have the same number of rows.

    Note:
        ``data_source`` must already exist in the notebook namespace; it is
        not created in the cells visible here (presumably an
        ``ACSDataSource`` instance -- confirm before running on a fresh kernel).
    """
    # Local import so this cell does not depend on editing the import cell.
    from folktables import adult_filter

    data = data_source.get_data(states=[state], download=True)

    # Keep the raw numeric codes. `categories` expects a code -> label mapping,
    # not the list of feature names, and later cells rely on numeric OCCP/SEX.
    features, labels, groups = ACSIncome.df_to_pandas(data)

    # ACSIncome drops rows via `adult_filter` and returns a freshly indexed
    # frame (per the folktables implementation -- verify against the installed
    # version). Assigning `data['PINCP']` directly would align on the *raw*
    # index labels and attach other people's incomes, so apply the same filter
    # and realign positionally.
    income = adult_filter(data)['PINCP'].reset_index(drop=True)
    if len(income) != len(features):
        raise ValueError(
            'PINCP has {} rows after filtering but features has {}'.format(
                len(income), len(features)
            )
        )
    features['PINCP'] = income

    # NOTE(review): other cells read/write '../data/TX_data.csv'; confirm
    # whether this relative 'data/' directory is the intended target.
    features.to_csv(f'data/{state}_data.csv', index=False)
    print("state {} have data {}".format(state, len(data)))
    return features



In [8]:

# occupation code at https://usa.ipums.org/usa/volii/c2ssoccup.shtml
# N/A: 0
# Management, Business, Science, and Arts Occupations: 0010 - 0425
# Business Operations Specialists: 0500 - 0750
# Financial Specialists: 0800 - 0950
# Computer and Mathematical Occupations: 1000 - 1220
# Architecture and Engineering Occupations: 1300 - 1560
# Life, Physical, and Social Science Occupations: 1600 - 1965
# Community and Social Service Occupations: 2000 - 2060
# Legal Occupations: 2100 - 2160
# Education, Training, and Library Occupations: 2200 - 2550
# Arts, Design, Entertainment, Sports, and Media Occupations: 2600 - 2920
# Healthcare Practitioners and Technical Occupations: 3000 - 3540
# Healthcare Support Occupations: 3600 - 3655
# Protective Service Occupations: 3700 - 3955
# Food Preparation and Serving Occupations: 4000 - 4160
# Building and Grounds Cleaning and Maintenance Occupations: 4200 - 4290
# Personal Care and Service Occupations: 4300 - 4650
# Sales and Related Occupations: 4700 - 4965
# Office and Administrative Support Occupations: 5000 - 5940
# Farming, Fishing, and Forestry Occupations: 6000 - 6130
# Construction and Extraction Occupations: 6200 - 6765
# Extraction Workers: 6800 - 6940
# Installation, Maintenance, and Repair Workers: 7000 - 7640
# Production Occupations: 7700 - 8990
# Transportation and Material Moving Occupations: 9000 - 9750
# Military Specific Occupations: 9800 - 9920

# Lookup table for `categorize_occupation`:
# (lower bound, upper bound, is the upper bound inclusive?, bucket label).
# Lower bounds are always inclusive. Codes falling between two rows are unknown.
_OCCP_BUCKETS = (
    (10, 450, True, 'MBSA'),
    (500, 750, True, 'BOS'),
    (800, 1000, False, 'FS'),
    (1000, 1300, False, 'CMO'),
    (1300, 1600, False, 'AEO'),
    (1600, 2000, False, 'LPSSO'),
    (2000, 2100, False, 'CSSO'),
    (2100, 2200, False, 'LO'),
    (2200, 2600, False, 'ETLO'),
    (2600, 2920, True, 'ASESMO'),
    (3000, 3590, True, 'HPTO'),
    (3600, 3700, False, 'HSO'),
    (3700, 4000, False, 'PSO'),
    (4000, 4200, False, 'FPASO'),
    (4200, 4300, False, 'BGCMO'),
    (4300, 4700, False, 'PCSO'),
    (4700, 4965, True, 'SRO'),
    (5000, 5940, True, 'OASO'),
    (6000, 6130, True, 'FFFO'),
    (6200, 6800, False, 'CEO'),
    (6800, 7000, False, 'EW'),
    (7000, 7700, False, 'IMRW'),
    (7700, 8990, True, 'PO'),
    (9000, 9760, True, 'TMMO'),
    (9800, 9920, True, 'MSO'),
)


def categorize_occupation(code):
    """Map a numeric occupation code to its short bucket label.

    Args:
        code: Occupation code (int or float).

    Returns:
        'N/A' for code 0, otherwise the label of the range containing `code`.

    Raises:
        Exception: If `code` does not fall inside any known range.
    """
    if code == 0:
        return 'N/A'

    for low, high, closed, bucket in _OCCP_BUCKETS:
        below_top = (code <= high) if closed else (code < high)
        if low <= code and below_top:
            return bucket

    raise Exception('Unknown occupation code: {}'.format(code))

# Derive a coarse occupation bucket for every row from its OCCP code.
data['OCCP_bucket'] = data['OCCP'].map(categorize_occupation)

# Sanity check: which buckets occur, and what the first rows look like now.
print(data['OCCP_bucket'].unique())
print(data.head())

['HPTO' 'SRO' 'FPASO' 'BGCMO' 'CEO' 'CSSO' 'TMMO' 'MBSA' 'LPSSO' 'OASO'
 'MSO' 'PCSO' 'PSO' 'PO' 'ETLO' 'ASESMO' 'FFFO' 'HSO' 'IMRW' 'FS' 'LO'
 'BOS' 'CMO' 'EW' 'AEO']
   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P    PINCP  \
0  21.0  3.0  16.0  5.0  3500.0  48.0  17.0  10.0  2.0    2.0   3200.0   
1  20.0  1.0  16.0  5.0  4720.0  39.0  17.0  50.0  2.0    1.0      0.0   
2  31.0  1.0  17.0  5.0  4110.0  48.0  17.0  30.0  1.0    1.0      0.0   
3  39.0  1.0  21.0  1.0  4255.0  18.0  16.0  40.0  1.0    1.0  35000.0   
4  18.0  1.0  16.0  5.0  4055.0  48.0  17.0  20.0  1.0    6.0  10000.0   

  OCCP_bucket  
0        HPTO  
1         SRO  
2       FPASO  
3       BGCMO  
4       FPASO  


In [9]:
# Bucket AGEP into three ordered age groups.
bins = [0, 30, 60, float("inf")]
labels = ['0-30', '30-60', '60+']
# pd.cut bins are right-closed and left-open by default, so an age of exactly 0
# would fall outside the first bin and become NaN; include_lowest=True closes
# the first interval on the left so that '0-30' really starts at 0.
data['age_bucket'] = pd.cut(
    data['AGEP'], bins=bins, labels=labels, include_lowest=True
)
print(data['age_bucket'].unique())
print(data[:5])

['0-30', '30-60', '60+']
Categories (3, object): ['0-30' < '30-60' < '60+']
   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P    PINCP  \
0  21.0  3.0  16.0  5.0  3500.0  48.0  17.0  10.0  2.0    2.0   3200.0   
1  20.0  1.0  16.0  5.0  4720.0  39.0  17.0  50.0  2.0    1.0      0.0   
2  31.0  1.0  17.0  5.0  4110.0  48.0  17.0  30.0  1.0    1.0      0.0   
3  39.0  1.0  21.0  1.0  4255.0  18.0  16.0  40.0  1.0    1.0  35000.0   
4  18.0  1.0  16.0  5.0  4055.0  48.0  17.0  20.0  1.0    6.0  10000.0   

  OCCP_bucket age_bucket  
0        HPTO       0-30  
1         SRO       0-30  
2       FPASO      30-60  
3       BGCMO      30-60  
4       FPASO       0-30  


In [10]:
# Display the column index of `data` as the cell output.
data.columns

Index(['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX',
       'RAC1P', 'PINCP', 'OCCP_bucket', 'age_bucket'],
      dtype='object')

In [13]:
# Print the distinct values currently stored in the SEX column.
print(data['SEX'].unique())

[2. 1.]


In [14]:
# Recode SEX from its numeric codes to letters (1 -> 'M', 2 -> 'F').
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `data.SEX`: that chained in-place form acts
# on an intermediate Series and does not update `data` under pandas
# Copy-on-Write.
data['SEX'] = data['SEX'].replace({1: 'M', 2: 'F'})

In [15]:
# Write the frame back to CSV without the index column.
# NOTE(review): this is the same path the frame was loaded from earlier in the
# notebook, so the input file is overwritten -- confirm that is intended.
data.to_csv("../data/TX_data.csv", index=False)