In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from folktables import ACSDataSource, ACSEmployment

In [2]:
from docs.utils import print_table

In [3]:
# Get data with folktables

In [4]:
# ACS data extracted from the US Census using folktables: https://github.com/socialfoundations/folktables
# 
# Related paper:
# Frances Ding, Moritz Hardt, John Miller, and Ludwig Schmidt, **Retiring Adult: New Datasets for Fair Machine Learning**,
# Advances in Neural Information Processing Systems, vol. 34, 2021

In [5]:
# Pull one ACS 1-Year person-level extract per survey year (2014 through 2018)
# for states=["MA"] and stack them into a single frame.
years = list(range(2014, 2019))
dfs = []
for year in years:
    data_source = ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    data = data_source.get_data(states=["MA"], download=True)
    # df_to_numpy returns three values; the third one is not needed here.
    features, labels, _ = ACSEmployment.df_to_numpy(data)
    # Name the feature columns, then append the target and the survey year.
    dfs.append(
        pd.DataFrame(features, columns=ACSEmployment.features)
        .assign(**{ACSEmployment.target: labels, 'year': year})
    )

# Concatenate all years and renumber the rows 0..n-1.
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR,year
0,30.0,19.0,1.0,0.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,True,2014
1,24.0,19.0,1.0,1.0,2.0,0.0,1.0,3.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,False,2014
2,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,False,2014
3,5.0,2.0,5.0,4.0,2.0,2.0,1.0,3.0,0.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,False,2014
4,83.0,22.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,False,2014


In [6]:
# Preview of the same first rows, rendered through the `print_table` helper
# imported from docs.utils.
print_table(df.head())

+----+--------+--------+-------+--------+-------+-------+-------+-------+-------+-------+------------+--------+--------+--------+-------+---------+-------+--------+
|    | AGEP   | SCHL   | MAR   | RELP   | DIS   | ESP   | CIT   | MIG   | MIL   | ANC   | NATIVITY   | DEAR   | DEYE   | DREM   | SEX   | RAC1P   | ESR   | year   |
| 0  | 30     | 19     | 1     | 0      | 2     | 0     | 1     | 3     | 4     | 1     | 1          | 2      | 2      | 2      | 1     | 2       | True  | 2014   |
+----+--------+--------+-------+--------+-------+-------+-------+-------+-------+-------+------------+--------+--------+--------+-------+---------+-------+--------+
| 1  | 24     | 19     | 1     | 1      | 2     | 0     | 1     | 3     | 4     | 2     | 1          | 2      | 2      | 2      | 2     | 2       | False | 2014   |
+----+--------+--------+-------+--------+-------+-------+-------+-------+-------+-------+------------+--------+--------+--------+-------+---------+-------+--------+
| 2  | 5  

In [7]:
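# Feature descriptions, taken from the PUMS data dictionary: https://www.census.gov/programs-surveys/acs/microdata/documentation.html
# 
# Note: the frame built above contains no missing values; the N/A codes listed below (b / bb) appear to be encoded as 0.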
# 
# AGEP - age person, numeric
# 
# SCHL 
# Educational attainment
# * bb .N/A (less than 3 years old)
# * 01 .No schooling completed
# * 02 .Nursery school, preschool
# * 03 .Kindergarten
# * 04 .Grade 1
# * 05 .Grade 2
# * 06 .Grade 3
# * 07 .Grade 4
# * 08 .Grade 5
# * 09 .Grade 6
# * 10 .Grade 7
# * 11 .Grade 8
# * 12 .Grade 9
# * 13 .Grade 10
# * 14 .Grade 11
# * 15 .12th grade - no diploma
# * 16 .Regular high school diploma
# * 17 .GED or alternative credential
# * 18 .Some college, but less than 1 year
# * 19 .1 or more years of college credit, no degree
# * 20 .Associate's degree
# * 21 .Bachelor's degree
# * 22 .Master's degree
# * 23 .Professional degree beyond a bachelor's degree
# * 24 .Doctorate degree
# 
# 
# MAR Character 1
# Marital status
# * 1 .Married
# * 2 .Widowed
# * 3 .Divorced
# * 4 .Separated
# * 5 .Never married or under 15 years old
# 
# RELP Character 2
# Relationship
# * 00 .Reference person
# * 01 .Husband/wife
# * 02 .Biological son or daughter
# * 03 .Adopted son or daughter
# * 04 .Stepson or stepdaughter
# * 05 .Brother or sister
# * 06 .Father or mother
# * 07 .Grandchild
# * 08 .Parent-in-law
# * 09 .Son-in-law or daughter-in-law
# * 10 .Other relative
# * 11 .Roomer or boarder
# * 12 .Housemate or roommate
# * 13 .Unmarried partner
# * 14 .Foster child
# * 15 .Other nonrelative
# * 16 .Institutionalized group quarters population
# * 17 .Noninstitutionalized group quarters population
#  
# DIS Character 1
# Disability recode
# * 1 .With a disability
# * 2 .Without a disability
# 
# ESP Character 1
# Employment status of parents
# * b .N/A (not own child of householder, and not child in subfamily)
# * 1 .Living with two parents: both parents in labor force
# * 2 .Living with two parents: Father only in labor force
# * 3 .Living with two parents: Mother only in labor force
# * 4 .Living with two parents: Neither parent in labor force
# * 5 .Living with father: Father in the labor force
# * 6 .Living with father: Father not in labor force
# * 7 .Living with mother: Mother in the labor force
# * 8 .Living with mother: Mother not in labor force
# 
# CIT Character 1
# Citizenship status
# * 1 .Born in the U.S.
# * 2 .Born in Puerto Rico, Guam, the U.S. Virgin Islands, or the Northern Marianas
# * 3 .Born abroad of American parent(s)
# * 4 .U.S. citizen by naturalization
# * 5 .Not a citizen of the U.S.
# 
# MIG Character 1
# Mobility status (lived here 1 year ago)
# * b .N/A (less than 1 year old)
# * 1 .Yes, same house (nonmovers)
# * 2 .No, outside US and Puerto Rico
# * 3 .No, different house in US or Puerto Rico
# 
# MIL Character 1
# Military service
# * b .N/A (less than 17 years old)
# * 1 .Now on active duty
# * 2 .On active duty in the past, but not now
# * 3 .Only on active duty for training in Reserves/National Guard
# * 4 .Never served in the military
# 
# ANC Character 1
# Ancestry recode
# * 1 .Single
# * 2 .Multiple
# * 3 .Unclassified
# * 4 .Not reported
# * 8 .Suppressed for data year 2018 for select PUMAs
# 
# 
# NATIVITY Character 1
# Nativity
# * 1 .Native
# * 2 .Foreign born
# 
# DEAR Character 1
# Hearing difficulty
# * 1 .Yes
# * 2 .No
# 
# DEYE Character 1
# Vision difficulty
# * 1 .Yes
# * 2 .No
# 
# DREM Character 1
# Cognitive difficulty
# * b .N/A (Less than 5 years old)
# * 1 .Yes
# * 2 .No
# 
# SEX Character 1
# Sex
# * 1 .Male
# * 2 .Female
# 
# RAC1P Character 1
# Recoded detailed race code
# * 1 .White alone
# * 2 .Black or African American alone
# * 3 .American Indian alone
# * 4 .Alaska Native alone
# * 5 .American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races
# * 6 .Asian alone
# * 7 .Native Hawaiian and Other Pacific Islander alone
# * 8 .Some Other Race alone
# * 9 .Two or More Races
# 
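# ESR - target
# * True - employed
# * False - not employed (covers everyone else, not only the unemployed: e.g. the 5-year-olds in the preview above are labelled False)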

In [8]:
# Preprocess

In [9]:
## Split partitions

In [10]:
# Label every row with a partition derived from its survey year:
#   2014 -> 'train', 2015 -> 'test', any later year -> 'prod'.
# Rows matching none of the rules keep the initial None.
df['partition'] = None
df.loc[df['year'] == 2014, 'partition'] = 'train'
df.loc[df['year'] == 2015, 'partition'] = 'test'
df.loc[df['year'] > 2015, 'partition'] = 'prod'

In [11]:
# Number of rows that ended up in each partition.
df['partition'].value_counts()

prod     208892
test      68785
train     68544
Name: partition, dtype: int64

In [12]:
# Per-column missing-value check. A True for `partition` would mean some row
# kept the initial None, i.e. was not assigned to train/test/prod.
df.isna().any()

AGEP         False
SCHL         False
MAR          False
RELP         False
DIS          False
ESP          False
CIT          False
MIG          False
MIL          False
ANC          False
NATIVITY     False
DEAR         False
DEYE         False
DREM         False
SEX          False
RAC1P        False
ESR          False
year         False
partition    False
dtype: bool

In [13]:
# Independent copy of the training rows, used below to inspect category frequencies.
df_train = df[df['partition']=='train'].copy()

In [14]:
# Column roles for the model: age is the only numeric input,
# every other feature is a coded category.
numeric_features = ['AGEP']

categorical_features = [
    'SCHL', 'MAR', 'RELP', 'DIS', 'ESP',
    'CIT', 'MIG', 'MIL', 'ANC', 'NATIVITY',
    'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P',
]

In [15]:
# Show how often each code occurs in the training rows, one table per categorical feature.
for feature in categorical_features:
    code_counts = df_train[feature].value_counts()
    display(code_counts)

21.0    12438
16.0    11780
19.0     7380
22.0     6853
20.0     3942
18.0     3201
1.0      1949
0.0      1858
17.0     1511
13.0     1504
14.0     1503
23.0     1475
11.0     1420
12.0     1368
24.0     1364
9.0      1167
2.0      1013
10.0      986
7.0       926
8.0       900
15.0      891
6.0       813
3.0       774
4.0       764
5.0       764
Name: SCHL, dtype: int64

5.0    30538
1.0    28279
3.0     5255
2.0     3565
4.0      907
Name: MAR, dtype: int64

0.0     26441
2.0     16614
1.0     13222
17.0     3175
16.0     1517
13.0     1514
12.0     1174
7.0       905
6.0       656
10.0      649
15.0      583
5.0       573
4.0       432
3.0       395
11.0      261
8.0       194
9.0       182
14.0       57
Name: RELP, dtype: int64

2.0    59417
1.0     9127
Name: DIS, dtype: int64

0.0    55887
1.0     6636
2.0     2223
7.0     2218
5.0      589
8.0      516
3.0      302
4.0      114
6.0       59
Name: ESP, dtype: int64

1.0    57107
4.0     5364
5.0     4365
2.0      999
3.0      709
Name: CIT, dtype: int64

1.0    58916
3.0     8343
2.0      670
0.0      615
Name: MIG, dtype: int64

4.0    51333
0.0    12496
2.0     3824
3.0      844
1.0       47
Name: MIL, dtype: int64

1.0    36098
2.0    23137
4.0     8765
3.0      544
Name: ANC, dtype: int64

1.0    58815
2.0     9729
Name: NATIVITY, dtype: int64

2.0    65899
1.0     2645
Name: DEAR, dtype: int64

2.0    66971
1.0     1573
Name: DEYE, dtype: int64

2.0    61529
1.0     3817
0.0     3198
Name: DREM, dtype: int64

2.0    35700
1.0    32844
Name: SEX, dtype: int64

1.0    56067
2.0     4180
6.0     4063
8.0     2198
9.0     1884
3.0       92
5.0       47
7.0       13
Name: RAC1P, dtype: int64

In [16]:
# The categorical features are already encoded in a form LGBM accepts (non-negative, integer-like values), so we only need to cast them to integers.

In [17]:
# The target column is exposed under a readable name instead of the raw 'ESR' code.
target_col = 'employed'
df = df.rename(columns={'ESR': target_col})

# Model inputs: the numeric column(s) first, followed by the categorical ones.
features = [*numeric_features, *categorical_features]

# Cast the boolean target to 0/1 and the float-coded categories to integers.
df[target_col] = df[target_col].astype(int)
df[categorical_features] = df[categorical_features].astype(int)

In [18]:
## Train client model and get predictions

In [19]:
# LightGBM classifier with library defaults and a fixed seed (random_state=1).
client_model = LGBMClassifier(random_state=1)

In [20]:
# Fit on the training partition only.
train_rows = df[df['partition'] == 'train']
client_model.fit(
    train_rows[features],
    train_rows[target_col],
    categorical_feature=categorical_features,
)

# Score every row (train, test and prod alike) with the fitted model.
model_inputs = df[features]
df['prediction'] = client_model.predict(model_inputs)
# Column 1 of predict_proba: presumably the probability of class 1 (employed),
# given the 0/1 integer target.
df['predicted_probability'] = client_model.predict_proba(model_inputs)[:, 1]



In [21]:
# Importance score per model input (16 values for the 16 entries of `features`);
# presumably listed in the same order as `features`, i.e. AGEP first - TODO confirm.
client_model.feature_importances_

array([1129,  394,  126,  285,  106,   13,   86,  145,   50,  180,   54,
         51,   34,   67,  203,   77], dtype=int32)

In [22]:
# Save reference and production data

In [23]:
# Store each categorical feature as a pandas `category` whose labels are the
# string form of the integer codes.
for feat in categorical_features:
    codes_as_text = df[feat].astype(str)
    df[feat] = codes_as_text.astype('category')

In [24]:
# Inspect the column dtypes after the conversion above: the categorical
# features should now report `category`.
df.dtypes

AGEP                      float64
SCHL                     category
MAR                      category
RELP                     category
DIS                      category
ESP                      category
CIT                      category
MIG                      category
MIL                      category
ANC                      category
NATIVITY                 category
DEAR                     category
DEYE                     category
DREM                     category
SEX                      category
RAC1P                    category
employed                    int64
year                        int64
partition                  object
prediction                  int64
predicted_probability     float64
dtype: object

In [25]:
# Row selectors for the two exported periods.
is_reference = df['partition'] == 'test'
is_analysis = df['partition'] == 'prod'

# Reference set: the 'test' rows with targets and predictions, minus the helper column.
full_reference_data = df[is_reference].drop(columns='partition').reset_index(drop=True)

# Analysis set: the 'prod' rows, split into inputs/predictions and a separate
# single-column target frame. Both share the same 0..n-1 row numbering.
analysis_rows = df[is_analysis].reset_index(drop=True)
analysis_wo_targets = analysis_rows.drop(columns=[target_col, 'partition'])
analysis_targets = analysis_rows[[target_col]]

In [26]:
# Output folder, relative to the notebook's working directory.
data_dir = '../../../nannyml/nannyml/datasets/data/'

# File name -> frame to write; written in the order listed.
parquet_outputs = {
    "employment_MA_reference.pq": full_reference_data,
    "employment_MA_analysis.pq": analysis_wo_targets,
    "employment_MA_analysis_target.pq": analysis_targets,
}
for file_name, frame in parquet_outputs.items():
    frame.to_parquet(data_dir + file_name)