In [41]:
import os
import pandas as pd
import numpy as np

# Display settings: None removes the column/row limits pandas applies when
# rendering a frame, so displayed tables are never truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Directory layout, relative to the notebook's working directory. Every path
# ends with a slash because later cells build file names by concatenation.
GLOBAL_PATH, DATA_PATH, OUTPUT_PATH = '../', '../Data/', '../Output/'

In [42]:
### Load the raw input tables ###

# Each file name is appended to DATA_PATH and read with pandas' defaults.
# The names on the left are bound in the same order as the file names below.
df_cps20, df_cps21, df_medicaid, df_fiscal, df_gdp, df_minwage = [
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in (
        'estimatesCPS2020.csv',
        'estimatesCPS2021.csv',
        'medicaid19.csv',
        'covid_aid.csv',
        'gdp.csv',
        'minwage.csv',
    )
]

In [43]:
### Append the 2020 and 2021 CPS datasets ###

# Tag each frame with its survey year first, so rows from the two files stay
# distinguishable once they are stacked.
df_cps20['Year'] = 2020
df_cps21['Year'] = 2021

# Stack the two years row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it both inputs keep their own row labels and the
# overlapping labels end up duplicated in df_cps.
df_cps = pd.concat([df_cps20, df_cps21], axis=0, ignore_index=True)
df_cps.shape

(1224, 14)

In [44]:
### Merge the CPS data with the other datasets on State ###

# 1. Merge the medicaid dataset with the fiscal aid dataset.
# validate='one_to_one' makes pandas raise a MergeError if either table holds
# more than one row for a State, instead of silently multiplying rows.
df_medicaid_fiscal = pd.merge(df_medicaid, df_fiscal, on='State', validate='one_to_one')
# Shape when last run: (51, 9)
df_medicaid_fiscal.head()

Unnamed: 0,State,Medicaid_Year,Medicaid_Services,CHIP,Medicaid_Admin,Medicaid_Total,Award_Obligations,Award_Outlays,Num_Awards
0,Alabama,2019,5880234000.0,391366800.0,216223800.0,6487824000.0,27100000000.0,24600000000.0,297743
1,Alaska,2019,2096340000.0,36966550.0,147328900.0,2280636000.0,12200000000.0,10900000000.0,47985
2,Arizona,2019,13167870000.0,234392500.0,314217500.0,13716480000.0,42100000000.0,40100000000.0,335840
3,Arkansas,2019,6842931000.0,189476300.0,400691500.0,7433099000.0,14300000000.0,13500000000.0,165436
4,California,2019,87855980000.0,3650584000.0,6243263000.0,97749830000.0,276400000000.0,252300000000.0,2732743


In [45]:
# 2. Merge the minwage dataset with the gdp dataset.
# validate='one_to_one' makes pandas raise a MergeError if either table holds
# more than one row for a State, instead of silently multiplying rows.
df_minwage_gdp = pd.merge(df_minwage, df_gdp, on='State', validate='one_to_one')
# Merged shape when last run: (51, 33).
# Keep only State plus the 2020/2021 minimum wage and GDP columns.
df_minwage_gdp = df_minwage_gdp[['State', 'MinWage20', 'MinWage21', 'GDP20', 'GDP21']]
df_minwage_gdp.head()

Unnamed: 0,State,MinWage20,MinWage21,GDP20,GDP21
0,Alabama,7.25,7.25,199880800000.0,209979300000.0
1,Alaska,10.19,10.34,50705200000.0,50869400000.0
2,Arizona,12.0,12.15,327178000000.0,347656000000.0
3,Arkansas,10.0,11.0,117268200000.0,123347300000.0
4,California,13.0,14.0,2667221000000.0,2874731000000.0


In [46]:
# 3. Merge df_cps and df_medicaid_fiscal on state

# df_cps stores State as a numeric code while the other tables use state
# names, so the codes are translated to names before merging.
State_dict = {1: 'Alabama', 2: 'Alaska', 4: 'Arizona', 5: 'Arkansas', 6: 'California', 8: 'Colorado', 9: 'Connecticut', 10: 'Delaware',
            11: 'District of Columbia', 12: 'Florida', 13: 'Georgia', 15: 'Hawaii', 16: 'Idaho', 17: 'Illinois', 18: 'Indiana', 19: 'Iowa',
            20: 'Kansas', 21: 'Kentucky', 22: 'Louisiana', 23: 'Maine', 24: 'Maryland', 25: 'Massachusetts', 26: 'Michigan', 27: 'Minnesota',
            28: 'Mississippi', 29: 'Missouri', 30: 'Montana', 31: 'Nebraska', 32: 'Nevada', 33: 'New Hampshire', 34: 'New Jersey',
            35: 'New Mexico', 36: 'New York', 37: 'North Carolina', 38: 'North Dakota', 39: 'Ohio', 40: 'Oklahoma', 41: 'Oregon',
            42: 'Pennsylvania', 44: 'Rhode Island', 45: 'South Carolina', 46: 'South Dakota', 47: 'Tennessee', 48: 'Texas', 49: 'Utah',
            50: 'Vermont', 51: 'Virginia', 53: 'Washington', 54: 'West Virginia', 55: 'Wisconsin', 56: 'Wyoming'}

# Translate only while State is still numeric. Mapping a second time (for
# example when this cell is re-run) would look the names up in State_dict,
# find nothing, and turn every State into NaN.
if pd.api.types.is_numeric_dtype(df_cps['State']):
    state_names = df_cps['State'].map(State_dict)
    unmapped_codes = df_cps.loc[state_names.isna(), 'State'].unique().tolist()
    if unmapped_codes:
        # A code missing from State_dict becomes NaN and would then fail to
        # match any state name in the merge below, so stop here instead.
        raise ValueError(f'State codes missing from State_dict: {unmapped_codes}')
    df_cps['State'] = state_names

# validate='many_to_one' makes pandas raise a MergeError if df_medicaid_fiscal
# holds more than one row for a State, which would duplicate CPS rows.
df_cps_medicaid_fiscal = pd.merge(df_cps, df_medicaid_fiscal, on='State', validate='many_to_one')
df_cps_medicaid_fiscal.shape # (1224, 22) 1224 = 51 * 12 * 2


(1224, 22)

In [47]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CPS
# CSV files -- TODO confirm against the raw files).
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# plain drop would raise KeyError on the second execution.
df_cps_medicaid_fiscal = df_cps_medicaid_fiscal.drop(columns=['Unnamed: 0'], errors='ignore')

# Last expression of the cell, so the cleaned frame is actually displayed.
df_cps_medicaid_fiscal.head()

In [50]:
# 4. Merge df_cps_medicaid_fiscal and df_minwage_gdp on state.
# validate='many_to_one' makes pandas raise a MergeError if df_minwage_gdp
# holds more than one row for a State, which would duplicate CPS rows.
df_final = pd.merge(df_cps_medicaid_fiscal, df_minwage_gdp, on='State', validate='many_to_one')
# Shape here is (1224, 25) once 'Unnamed: 0' has been dropped: 21 + 4 columns.

# The selection below treats every row that is not 2020 as 2021, so fail
# loudly if any other year is present rather than assigning it 2021 values.
unexpected_years = df_final.loc[~df_final['Year'].isin([2020, 2021]), 'Year'].unique().tolist()
if unexpected_years:
    raise ValueError(f'Unexpected Year values in df_final: {unexpected_years}')

# Year 2020 rows take MinWage20 / GDP20; year 2021 rows take MinWage21 / GDP21.
df_final['MinWage'] = np.where(df_final['Year'] == 2020, df_final['MinWage20'], df_final['MinWage21'])
df_final['GDP'] = np.where(df_final['Year'] == 2020, df_final['GDP20'], df_final['GDP21'])

# The year-specific source columns are redundant once MinWage and GDP exist.
df_final = df_final.drop(columns=['MinWage20', 'MinWage21', 'GDP20', 'GDP21'])

# check columns
df_final.columns




Index(['Month', 'State', 'Civil_Pop', 'LaborForce_Pop', 'Unemp_Pop', 'Emp_Pop',
       'Emp_Rate', 'Unemp_Rate', 'LF_Particip_Rate', 'Emp_Pop_Ratio',
       'Union_Pop', 'Union_Rate', 'Year', 'Medicaid_Year', 'Medicaid_Services',
       'CHIP', 'Medicaid_Admin', 'Medicaid_Total', 'Award_Obligations',
       'Award_Outlays', 'Num_Awards', 'MinWage', 'GDP'],
      dtype='object')

In [51]:
# Preview the first five rows of the merged table.
df_final.head(n=5)

Unnamed: 0,Month,State,Civil_Pop,LaborForce_Pop,Unemp_Pop,Emp_Pop,Emp_Rate,Unemp_Rate,LF_Particip_Rate,Emp_Pop_Ratio,Union_Pop,Union_Rate,Year,Medicaid_Year,Medicaid_Services,CHIP,Medicaid_Admin,Medicaid_Total,Award_Obligations,Award_Outlays,Num_Awards,MinWage,GDP
0,1,Alabama,3871113.0,2191401.0,71895.1913,2119506.0,96.719214,3.280786,56.609074,54.751851,87503.6079,3.993044,2020,2019,5880234000.0,391366847.0,216223796.0,6487824000.0,27100000000.0,24600000000.0,297743,7.25,199880800000.0
1,2,Alabama,3872006.0,2204607.0,54473.3402,2150134.0,97.529114,2.470886,56.937082,55.530231,154302.9432,6.999113,2020,2019,5880234000.0,391366847.0,216223796.0,6487824000.0,27100000000.0,24600000000.0,297743,7.25,199880800000.0
2,3,Alabama,3873022.0,2201768.0,71239.6282,2130529.0,96.764436,3.235564,56.848848,55.009467,205893.2337,9.351267,2020,2019,5880234000.0,391366847.0,216223796.0,6487824000.0,27100000000.0,24600000000.0,297743,7.25,199880800000.0
3,4,Alabama,3874113.0,2169832.0,295493.1542,1874339.0,86.381749,13.618251,56.008484,48.381108,170584.1993,7.861632,2020,2019,5880234000.0,391366847.0,216223796.0,6487824000.0,27100000000.0,24600000000.0,297743,7.25,199880800000.0
4,5,Alabama,3875399.0,2241937.0,203557.9046,2038379.0,90.920445,9.079555,57.85048,52.597914,173450.7058,7.736645,2020,2019,5880234000.0,391366847.0,216223796.0,6487824000.0,27100000000.0,24600000000.0,297743,7.25,199880800000.0


In [52]:
# Write the merged table to final_data.csv inside DATA_PATH.
# index=False leaves the row index out of the file.
df_final.to_csv(os.path.join(DATA_PATH, 'final_data.csv'), index=False)