In [1]:
import numpy as np
import pandas as pd

In [2]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OrdinalEncoder

# Get data with folktables

ACS data extracted from the US Census using folktables: https://github.com/socialfoundations/folktables

Related paper:
Ding, Frances; Hardt, Moritz; Miller, John; Schmidt, Ludwig. **Retiring Adult: New Datasets for Fair Machine Learning**. Advances in Neural Information Processing Systems, vol. 34, 2021.

In [3]:
from folktables import ACSDataSource, ACSEmployment

In [4]:
def _load_employment_year(year):
    """Fetch one survey year of ACS person data for MA as a labelled frame.

    The raw 1-Year person survey is downloaded through folktables, converted
    with the ACSEmployment task definition, and returned as a DataFrame whose
    columns are the task features, the task target, and a ``year`` column.
    """
    source = ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    raw = source.get_data(states=["MA"], download=True)
    # df_to_numpy returns (features, labels, group); the group array is not needed here.
    X, y, _ = ACSEmployment.df_to_numpy(raw)
    frame = pd.DataFrame(X, columns=ACSEmployment.features)
    frame[ACSEmployment.target] = y
    frame['year'] = year
    return frame


# Survey years 2014 through 2018 inclusive.
years = list(range(2014, 2019))
dfs = [_load_employment_year(year) for year in years]

# Stack the yearly frames under a single fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR,year
0,30.0,19.0,1.0,0.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,True,2014
1,24.0,19.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,False,2014
2,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,False,2014
3,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,0.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,False,2014
4,83.0,22.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,False,2014


Feature description from: https://www.census.gov/programs-surveys/acs/microdata/documentation.html

AGEP - Age of person, numeric

SCHL 
Educational attainment
* bb .N/A (less than 3 years old)
* 01 .No schooling completed
* 02 .Nursery school, preschool
* 03 .Kindergarten
* 04 .Grade 1
* 05 .Grade 2
* 06 .Grade 3
* 07 .Grade 4
* 08 .Grade 5
* 09 .Grade 6
* 10 .Grade 7
* 11 .Grade 8
* 12 .Grade 9
* 13 .Grade 10
* 14 .Grade 11
* 15 .12th grade - no diploma
* 16 .Regular high school diploma
* 17 .GED or alternative credential
* 18 .Some college, but less than 1 year
* 19 .1 or more years of college credit, no degree
* 20 .Associate's degree
* 21 .Bachelor's degree
* 22 .Master's degree
* 23 .Professional degree beyond a bachelor's degree
* 24 .Doctorate degree


MAR Character 1
Marital status
* 1 .Married
* 2 .Widowed
* 3 .Divorced
* 4 .Separated
* 5 .Never married or under 15 years old

RELP Character 2
Relationship
* 00 .Reference person
* 01 .Husband/wife
* 02 .Biological son or daughter
* 03 .Adopted son or daughter
* 04 .Stepson or stepdaughter
* 05 .Brother or sister
* 06 .Father or mother
* 07 .Grandchild
* 08 .Parent-in-law
* 09 .Son-in-law or daughter-in-law
* 10 .Other relative
* 11 .Roomer or boarder
* 12 .Housemate or roommate
* 13 .Unmarried partner
* 14 .Foster child
* 15 .Other nonrelative
* 16 .Institutionalized group quarters population
* 17 .Noninstitutionalized group quarters population
 
DIS Character 1
Disability recode
* 1 .With a disability
* 2 .Without a disability

ESP Character 1
Employment status of parents
* b .N/A (not own child of householder, and not child in subfamily)
* 1 .Living with two parents: both parents in labor force
* 2 .Living with two parents: Father only in labor force
* 3 .Living with two parents: Mother only in labor force
* 4 .Living with two parents: Neither parent in labor force
* 5 .Living with father: Father in the labor force
* 6 .Living with father: Father not in labor force
* 7 .Living with mother: Mother in the labor force
* 8 .Living with mother: Mother not in labor force

CIT Character 1
Citizenship status
* 1 .Born in the U.S.
* 2 .Born in Puerto Rico, Guam, the U.S. Virgin Islands, or the Northern Marianas
* 3 .Born abroad of American parent(s)
* 4 .U.S. citizen by naturalization
* 5 .Not a citizen of the U.S.

MIG Character 1
Mobility status (lived here 1 year ago)
* b .N/A (less than 1 year old)
* 1 .Yes, same house (nonmovers)
* 2 .No, outside US and Puerto Rico
* 3 .No, different house in US or Puerto Rico

MIL Character 1
Military service
* b .N/A (less than 17 years old)
* 1 .Now on active duty
* 2 .On active duty in the past, but not now
* 3 .Only on active duty for training in Reserves/National Guard
* 4 .Never served in the military

ANC Character 1
Ancestry recode
* 1 .Single
* 2 .Multiple
* 3 .Unclassified
* 4 .Not reported
* 8 .Suppressed for data year 2018 for select PUMAs


NATIVITY Character 1
Nativity
* 1 .Native
* 2 .Foreign born

DEAR Character 1
Hearing difficulty
* 1 .Yes
* 2 .No

DEYE Character 1
Vision difficulty
* 1 .Yes
* 2 .No

DREM Character 1
Cognitive difficulty
* b .N/A (Less than 5 years old)
* 1 .Yes
* 2 .No

SEX Character 1
Sex
* 1 .Male
* 2 .Female

RAC1P Character 1
Recoded detailed race code
* 1 .White alone
* 2 .Black or African American alone
* 3 .American Indian alone
* 4 .Alaska Native alone
* 5 .American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
* 6 .Asian alone
* 7 .Native Hawaiian and Other Pacific Islander alone
* 8 .Some Other Race alone
* 9 .Two or More Races

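ESR - target (employment status, binarized)
* True - employed
* False - not employed (this covers everyone else, e.g. children and people outside the labor force, not only the unemployed)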

# Preprocess

## Split partitions

In [5]:
# Label each row by survey year: 2014 -> train, 2015 -> test, later years -> prod.
# Rows matching none of the rules keep the initial None.
df['partition'] = None
df.loc[df['year'] == 2014, 'partition'] = 'train'
df.loc[df['year'] == 2015, 'partition'] = 'test'
df.loc[df['year'] > 2015, 'partition'] = 'prod'

In [6]:
# Sanity check: number of rows that landed in each partition.
df['partition'].value_counts()

prod     208892
test      68785
train     68544
Name: partition, dtype: int64

In [7]:
# Per-column flag: does the column contain any missing value?
df.isna().any()

AGEP         False
SCHL         False
MAR          False
RELP         False
DIS          False
ESP          False
CIT          False
MIG          False
MIL          False
ANC          False
NATIVITY     False
DEAR         False
DEYE         False
DREM         False
SEX          False
RAC1P        False
ESR          False
year         False
partition    False
dtype: bool

In [8]:
# Coded (nominal) ACS columns; these go through the ordinal encoder and are
# declared as categorical when the model is fitted.
categorical_features = [
    'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL',
    'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P',
]

# SCHL is an ordered scale, so it stays numeric alongside AGEP and the
# tree model is left to pick its own split points on it.
numeric_features = ['AGEP', 'SCHL']

# Column holding the label the model is trained on.
target_col = 'ESR'

In [9]:
# Learn the category -> integer mapping from the training partition only.
# Category values that were not seen during fitting are encoded as -1 at
# transform time (handle_unknown="use_encoded_value", unknown_value=-1).
categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
# fit() instead of fit_transform(): the transformed training rows were being
# computed and immediately discarded. A single .loc selection also avoids the
# intermediate frame created by chained df[mask][cols] indexing.
categorical_encoder.fit(df.loc[df['partition'] == 'train', categorical_features])
# Overwrite the categorical columns of every partition with their encoded values.
# NOTE: this mutates df in place, so the raw category codes are no longer
# available in df after this cell has run.
df[categorical_features] = categorical_encoder.transform(df[categorical_features])

In [10]:
# Model inputs: the numeric columns first, followed by the categorical ones.
features = [*numeric_features, *categorical_features]

In [11]:
# Independent copy of the rows labelled 'train'.
df_train = df.loc[df['partition'] == 'train'].copy()

## Train client model and get predictions

In [12]:
# LightGBM classifier built with its default constructor arguments; it is fitted in the next cell.
client_model = LGBMClassifier()

In [13]:
# Fit on the training partition only, telling LightGBM which inputs are categorical.
is_train = df['partition'] == 'train'
client_model.fit(
    df.loc[is_train, features],
    df.loc[is_train, target_col],
    categorical_feature=categorical_features,
)

# Score every row of df (all partitions) with the fitted model.
all_inputs = df[features]
# Column 1 of predict_proba is the probability of the second class
# (presumably ESR == True; confirm against client_model.classes_).
df['y_pred_proba'] = client_model.predict_proba(all_inputs)[:, 1]
df['y_pred'] = client_model.predict(all_inputs)



In [14]:
# Display the frame, which now also carries the y_pred_proba and y_pred columns.
df

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,...,DEAR,DEYE,DREM,SEX,RAC1P,ESR,year,partition,y_pred_proba,y_pred
0,30.0,19.0,0.0,0.0,1.0,0.0,0.0,3.0,4.0,0.0,...,1.0,1.0,2.0,0.0,1.0,True,2014,train,0.864899,True
1,24.0,19.0,0.0,1.0,1.0,0.0,0.0,3.0,4.0,1.0,...,1.0,1.0,2.0,1.0,1.0,False,2014,train,0.706134,True
2,5.0,2.0,4.0,4.0,1.0,2.0,0.0,3.0,0.0,1.0,...,1.0,1.0,2.0,1.0,1.0,False,2014,train,0.000107,False
3,5.0,2.0,4.0,4.0,1.0,2.0,0.0,3.0,0.0,1.0,...,1.0,1.0,2.0,0.0,1.0,False,2014,train,0.000084,False
4,83.0,22.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,...,1.0,1.0,2.0,0.0,0.0,False,2014,train,0.038189,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346216,46.0,21.0,0.0,1.0,1.0,0.0,0.0,1.0,4.0,0.0,...,1.0,1.0,2.0,0.0,0.0,True,2018,prod,0.946687,True
346217,15.0,11.0,4.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,2.0,0.0,0.0,False,2018,prod,0.000140,False
346218,36.0,22.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,1.0,...,1.0,1.0,2.0,1.0,0.0,True,2018,prod,0.857156,True
346219,35.0,21.0,0.0,1.0,1.0,0.0,0.0,1.0,4.0,1.0,...,1.0,1.0,2.0,0.0,0.0,True,2018,prod,0.953431,True


# Save reference and production data

In [15]:
# Output directory for the parquet files written below (a relative path).
# The trailing '/' is required: the following cells build file paths with
# plain string concatenation (data_dir + "<file name>").
data_dir = '../../../nannyml/nannyml/datasets/data/'

In [16]:
# Reference set: the 'test' partition with every column kept, including the
# target and the model outputs.
reference_df = df.loc[df['partition'] == 'test']
reference_df.to_parquet(data_dir + "employment_MA_reference.pq")

In [17]:
# Analysis set: the 'prod' partition with the target column removed.
analysis_df = df.loc[df['partition'] == 'prod'].drop(columns=[target_col])
analysis_df.to_parquet(data_dir + "employment_MA_analysis.pq")

In [18]:
# Targets for the analysis set, stored separately: only the target column of
# the 'prod' rows, sharing the row index of the analysis file.
analysis_target_df = df.loc[df['partition'] == 'prod', [target_col]]
analysis_target_df.to_parquet(data_dir + "employment_MA_analysis_target.pq")