In [11]:
from folktables import ACSDataSource, ACSIncome, generate_categories
import pandas as pd


In [12]:
ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ),
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee",
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

In [15]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')


In [22]:
print(data_source)

<folktables.acs.ACSDataSource object at 0x7fbc706590a0>


# eg: CA data

In [41]:

ca_data = data_source.get_data(states=["CA"], download=True)

ca_features, ca_labels, ca_groups = ACSIncome.df_to_pandas(ca_data, categories=ACSIncome.features, dummies=True)

print(ca_features.head())

   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P
0  30.0  6.0  14.0  1.0  9610.0   6.0  16.0  40.0  1.0    8.0
1  21.0  4.0  16.0  5.0  1970.0   6.0  17.0  20.0  1.0    1.0
2  65.0  2.0  22.0  5.0  2040.0   6.0  17.0   8.0  1.0    1.0
3  33.0  1.0  14.0  3.0  9610.0  36.0  16.0  40.0  1.0    1.0
4  18.0  2.0  19.0  5.0  1021.0   6.0  17.0  18.0  2.0    1.0


In [47]:
ca_features['PINCP'] = ca_data['PINCP']

In [48]:
ca_features.to_csv('data/ca_data.csv', index=False)

In [49]:
len(ca_data)

378817

In [50]:
ca_features.columns

Index(['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX',
       'RAC1P', 'PINCP'],
      dtype='object')

# eg, all data

In [25]:
# all_data = data_source.get_data(download=True)

In [26]:
all_data = data_source.get_data()

In [34]:
len(all_data)

3236107

In [27]:
X, Y, Z = ACSIncome.df_to_numpy(all_data)

In [30]:
print(X[:5])

[[1.800e+01 1.000e+00 1.800e+01 5.000e+00 4.720e+03 1.300e+01 1.700e+01
  2.100e+01 2.000e+00 2.000e+00]
 [5.300e+01 5.000e+00 1.700e+01 5.000e+00 3.605e+03 1.800e+01 1.600e+01
  4.000e+01 1.000e+00 1.000e+00]
 [4.100e+01 1.000e+00 1.600e+01 5.000e+00 7.330e+03 1.000e+00 1.700e+01
  4.000e+01 1.000e+00 1.000e+00]
 [1.800e+01 6.000e+00 1.800e+01 5.000e+00 2.722e+03 1.000e+00 1.700e+01
  2.000e+00 2.000e+00 1.000e+00]
 [2.100e+01 5.000e+00 1.900e+01 5.000e+00 3.870e+03 1.200e+01 1.700e+01
  5.000e+01 1.000e+00 1.000e+00]]


In [33]:
print(ACSIncome.features)

['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX', 'RAC1P']


In [35]:
all_features, all_labels, all_groups = ACSIncome.df_to_pandas(all_data, categories=ACSIncome.features, dummies=True)

# Texas data

In [63]:

def get_state(state):
    data = data_source.get_data(states=[state], download=True)
    features, labels, groups = ACSIncome.df_to_pandas(data, categories=ACSIncome.features, dummies=True)
    features['PINCP'] = data['PINCP']
    features.to_csv(f'data/{state}_data.csv', index=False)
    print("state {} have data {}".format(state, len(data)))
    return features



In [64]:
get_state('TX')

state TX have data 268100


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,21.0,3.0,16.0,5.0,3500.0,48.0,17.0,10.0,2.0,2.0,3200.0
1,20.0,1.0,16.0,5.0,4720.0,39.0,17.0,50.0,2.0,1.0,0.0
2,31.0,1.0,17.0,5.0,4110.0,48.0,17.0,30.0,1.0,1.0,0.0
3,39.0,1.0,21.0,1.0,4255.0,18.0,16.0,40.0,1.0,1.0,35000.0
4,18.0,1.0,16.0,5.0,4055.0,48.0,17.0,20.0,1.0,6.0,10000.0
...,...,...,...,...,...,...,...,...,...,...,...
135919,19.0,1.0,19.0,5.0,9620.0,48.0,2.0,40.0,1.0,1.0,80700.0
135920,24.0,1.0,19.0,5.0,4760.0,48.0,10.0,40.0,1.0,1.0,100000.0
135921,40.0,1.0,15.0,5.0,6260.0,48.0,12.0,40.0,1.0,1.0,0.0
135922,43.0,1.0,1.0,2.0,6330.0,17.0,0.0,48.0,1.0,1.0,40000.0


In [65]:
get_state('MN')

state MN have data 55783


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,36.0,1.0,17.0,5.0,6410.0,27.0,16.0,20.0,1.0,1.0,2000.0
1,21.0,1.0,19.0,5.0,4622.0,20.0,17.0,20.0,2.0,1.0,0.0
2,46.0,1.0,24.0,1.0,2100.0,27.0,17.0,50.0,2.0,1.0,1300.0
3,19.0,3.0,18.0,5.0,2440.0,55.0,17.0,12.0,2.0,1.0,4720.0
4,18.0,1.0,18.0,5.0,4720.0,27.0,17.0,30.0,1.0,6.0,130000.0
...,...,...,...,...,...,...,...,...,...,...,...
31016,43.0,6.0,21.0,1.0,2145.0,8.0,1.0,55.0,2.0,1.0,133400.0
31017,17.0,1.0,16.0,5.0,4760.0,27.0,2.0,16.0,2.0,1.0,68900.0
31018,45.0,1.0,19.0,3.0,7150.0,27.0,0.0,38.0,1.0,1.0,1500.0
31019,24.0,3.0,16.0,5.0,4055.0,27.0,2.0,20.0,2.0,1.0,107000.0


In [66]:
US_states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI",
              "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
              "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
              "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
              "VT", "VA", "WA", "WV", "WI", "WY", "PR"]

In [67]:
for state in US_states:
    get_state(state)

state AL have data 47777
state AK have data 6711
state AZ have data 69990
state AR have data 30503
state CA have data 378817
state CO have data 55928
state CT have data 36287
state DE have data 9123
state FL have data 202160
state GA have data 100855
state HI have data 14400
state ID have data 16711
state IL have data 126456
state IN have data 67680
state IA have data 32362
state KS have data 29567
state KY have data 45475
state LA have data 43589
state ME have data 13275
state MD have data 59840
state MA have data 70131
state MI have data 99419
state MN have data 55783
state MS have data 29124
state MO have data 62416
state MT have data 10336
state NE have data 19451
state NV have data 28927
state NH have data 13780
state NJ have data 88586
state NM have data 19247
state NY have data 196967
state NC have data 102523
state ND have data 7876
state OH have data 119086
state OK have data 37648
state OR have data 42117
state PA have data 129066
state RI have data 10489
state SC have data 4

In [68]:
get_state('TX')

state TX have data 268100


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,21.0,3.0,16.0,5.0,3500.0,48.0,17.0,10.0,2.0,2.0,3200.0
1,20.0,1.0,16.0,5.0,4720.0,39.0,17.0,50.0,2.0,1.0,0.0
2,31.0,1.0,17.0,5.0,4110.0,48.0,17.0,30.0,1.0,1.0,0.0
3,39.0,1.0,21.0,1.0,4255.0,18.0,16.0,40.0,1.0,1.0,35000.0
4,18.0,1.0,16.0,5.0,4055.0,48.0,17.0,20.0,1.0,6.0,10000.0
...,...,...,...,...,...,...,...,...,...,...,...
135919,19.0,1.0,19.0,5.0,9620.0,48.0,2.0,40.0,1.0,1.0,80700.0
135920,24.0,1.0,19.0,5.0,4760.0,48.0,10.0,40.0,1.0,1.0,100000.0
135921,40.0,1.0,15.0,5.0,6260.0,48.0,12.0,40.0,1.0,1.0,0.0
135922,43.0,1.0,1.0,2.0,6330.0,17.0,0.0,48.0,1.0,1.0,40000.0


In [69]:
get_state('CA')

state CA have data 378817


Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,30.0,6.0,14.0,1.0,9610.0,6.0,16.0,40.0,1.0,8.0,48500.0
1,21.0,4.0,16.0,5.0,1970.0,6.0,17.0,20.0,1.0,1.0,0.0
2,65.0,2.0,22.0,5.0,2040.0,6.0,17.0,8.0,1.0,1.0,13100.0
3,33.0,1.0,14.0,3.0,9610.0,36.0,16.0,40.0,1.0,1.0,0.0
4,18.0,2.0,19.0,5.0,1021.0,6.0,17.0,18.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
195660,38.0,1.0,22.0,1.0,1021.0,210.0,0.0,40.0,1.0,6.0,26000.0
195661,39.0,1.0,22.0,1.0,1021.0,210.0,1.0,40.0,2.0,6.0,
195662,61.0,1.0,19.0,1.0,5240.0,17.0,0.0,45.0,1.0,1.0,
195663,69.0,7.0,24.0,1.0,2040.0,207.0,0.0,45.0,1.0,6.0,100000.0
