In [1]:
import pandas as pd
import numpy as np
import warnings

pd.set_option("mode.copy_on_write", True)

# Poverty Data in 2021

In [2]:
poverty_data=pd.read_csv("USDA_data/Poverty.csv")
# columns name : ['FIPS*', 'Name', 'RUC Code', 'All people in poverty (2021) Percent', 'Children ages 0-17 in poverty (2021) Percent']
poverty_data.rename(columns={'FIPS*': 'FIPS'}, inplace=True)
poverty_data["YR"]="2021" # add a column as Year
poverty_data.head(5)

Unnamed: 0,FIPS,Name,RUC Code,All people in poverty (2021) Percent,Children ages 0-17 in poverty (2021) Percent,YR
0,23000,Maine,,11.2,13.8,2021
1,23001,Androscoggin,3.0,14.0,19.1,2021
2,23003,Aroostook,7.0,14.9,18.9,2021
3,23005,Cumberland,2.0,7.7,8.7,2021
4,23007,Franklin,8.0,11.7,15.7,2021


In [3]:
poverty_data=poverty_data[['FIPS', 'All people in poverty (2021) Percent','Children ages 0-17 in poverty (2021) Percent']]
poverty_data

Unnamed: 0,FIPS,All people in poverty (2021) Percent,Children ages 0-17 in poverty (2021) Percent
0,23000,11.2,13.8
1,23001,14.0,19.1
2,23003,14.9,18.9
3,23005,7.7,8.7
4,23007,11.7,15.7
...,...,...,...
528,12125,22.7,24.3
529,12127,13.8,19.1
530,12129,10.1,16.8
531,12131,12.4,18.8


# Education Data

In [4]:
education_CompletingCollege=pd.read_csv("USDA_data/Education(CompletingCollege).csv")
education_CompletingCollege.rename(columns={'2023 Rural-urban Continuum Code*': 'RUC Code'}, inplace=True)

education_CompletingCollege_melt = education_CompletingCollege.melt(
    id_vars=['FIPS', 'Name','RUC Code'],
    var_name='YR',
    value_name='Completing College'
)

education_CompletingHighSchool_only=pd.read_csv("USDA_data/Education(CompletingHighSchool_only).csv")
education_CompletingHighSchool_only.rename(columns={'2023 Rural-urban Continuum Code*': 'RUC Code'}, inplace=True)

education_CompletingHighSchool_only_melt  = education_CompletingHighSchool_only.melt(
    id_vars=['FIPS', 'Name','RUC Code'],
    var_name='YR',
    value_name='Completing High School Only'
)


education_NotCompletingHighSchool=pd.read_csv("USDA_data/Education(NotCompletingHighSchool).csv")
education_NotCompletingHighSchool.rename(columns={'2023 Rural-urban Continuum Code*': 'RUC Code'}, inplace=True)
education_NotCompletingHighSchool_melt = education_NotCompletingHighSchool.melt(
    id_vars=['FIPS', 'Name','RUC Code'],
    var_name='YR',
    value_name='Not Completing High School'
)


In [5]:
education_CompletingHighSchool_only_2 = education_CompletingHighSchool_only_melt.drop(columns=['Name', 'RUC Code'])
education_NotCompletingHighSchool_2 = education_NotCompletingHighSchool_melt.drop(columns=['Name', 'RUC Code'])
# in case RUC Code and Name different expression

education_data = (
    education_CompletingCollege_melt
    .merge(education_CompletingHighSchool_only_2, on=['FIPS', 'YR'], how='inner')  # Inner join on FIPS and YR
    .merge(education_NotCompletingHighSchool_2, on=['FIPS', 'YR'], how='inner')   # Another inner join
    
)


# education_data=education_data[['FIPS','YR','Completing College','Completing High School Only','Not Completing High School']]
education_data.head(5)


Unnamed: 0,FIPS,Name,RUC Code,YR,Completing College,Completing High School Only,Not Completing High School
0,12000,Florida,,1970,10.3%,30.7%,47.4%
1,12001,"Alachua, FL",2.0,1970,23.1%,24.5%,40.2%
2,12003,"Baker, FL",1.0,1970,3.6%,26.5%,65.1%
3,12005,"Bay, FL",3.0,1970,9.2%,31.5%,49.0%
4,12007,"Bradford, FL",6.0,1970,4.5%,25.7%,63.5%


In [6]:
# unemployment_income=pd.read_csv("USDA_data/UnemploymentRate(%).csv")
# unemployment_income.columns = unemployment_income.iloc[0]
# unemployment_income = unemployment_income[2:].reset_index(drop=True)
# unemployment_income = unemployment_income.iloc[:, :-1]
# unemployment_income.rename(columns={'FIPS ': 'FIPS'}, inplace=True)
# unemployment_income.head(5)


In [7]:
# income_data = unemployment_income[['FIPS', 'Name', 'Median Household Income (2021)']]
# income_data["YR"]="2021" # add a column as Year
# income_data.head(5)

In [8]:
# unemployment_data=unemployment_income.iloc[:, :-1]
# unemployment_data.head(5)
# unemployment_data = unemployment_data.melt(
#     id_vars=['FIPS', 'Name'],
#     var_name='YR',
#     value_name='Unemployment Rate (%)'
# )
# unemployment_data['YR'] = unemployment_data['YR'].astype(int)
# unemployment_data.head(5)

# Unemployment Data

In [9]:
unemployment_all=pd.read_csv("USDA_data/Unemployment.csv")
unemployment_all=unemployment_all[unemployment_all["State"].isin(["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"])]
unemployment_all = unemployment_all.iloc[:, :-2]
unemployment_all_melt = unemployment_all.melt(
    id_vars=['FIPS_Code', 'State', 'Area_Name'],
    var_name='Metric',
    value_name='value'
)
unemployment_all_melt['YR'] = unemployment_all_melt['Metric'].str[-4:]
unemployment_all_melt['Metric'] = unemployment_all_melt['Metric'].str[:-5]

unemployment_all_pivot = unemployment_all_melt.pivot(
    index=['FIPS_Code', 'State', 'Area_Name', 'YR'],  # Identifiers for rows
    columns='Metric',                              # Column headers
    values='value'                                 # Values to populate
)

unemployment_all_pivot.reset_index(inplace=True)
unemployment_all_data = unemployment_all_pivot[["FIPS_Code", "State", "Area_Name", "YR", "Civilian_labor_force", "Unemployment_rate"]]
unemployment_all_data.rename(columns={'FIPS_Code': 'FIPS'}, inplace=True)
unemployment_all_data.rename(columns={'Area_Name': 'Name'}, inplace=True)
unemployment_all_data.head(5)


Metric,FIPS,State,Name,YR,Civilian_labor_force,Unemployment_rate
0,1000,AL,Alabama,2000,2147173,4.6
1,1000,AL,Alabama,2001,2128027,5.2
2,1000,AL,Alabama,2002,2112621,5.9
3,1000,AL,Alabama,2003,2128668,6.0
4,1000,AL,Alabama,2004,2138306,5.6


# Med-Income Data

In [10]:
pre_income=pd.read_csv("USDA_data/Unemployment.csv")
pre_income=pre_income[pre_income["State"].isin(["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"])]
med_income_data=pre_income[["FIPS_Code","Median_Household_Income_2021", "Med_HH_Income_Percent_of_State_Total_2021"]]
med_income_data.head(5)


Unnamed: 0,FIPS_Code,Median_Household_Income_2021,Med_HH_Income_Percent_of_State_Total_2021
1,1000,53990,100.0
2,1001,66444,123.1
3,1003,65658,121.6
4,1005,38649,71.6
5,1007,48454,89.7


# Population Data

In [11]:
population=pd.read_csv("USDA_data/Population.csv")
population_filted=population[population["State"].isin(["FL", "WA", "OR", "GA", "OK", "AL", "CO", "ME"])]
population_filted_melt = population_filted.melt(
    id_vars=['FIPStxt', 'State', 'Area_Name'],
    var_name='Metric',
    value_name='value'
)
population_filted_melt['YR'] = population_filted_melt['Metric'].str[-4:]
population_filted_melt['Metric'] = population_filted_melt['Metric'].str[:-5]
population_filted_pivot = population_filted_melt.pivot(
    index=['FIPStxt', 'State', 'Area_Name', 'YR'],  # Identifiers for rows
    columns='Metric',                              # Column headers
    values='value'                                 # Values to populate
)

population_filted_pivot.reset_index(inplace=True)
population_filted_pivot.columns


Index(['FIPStxt', 'State', 'Area_Name', 'YR', 'BIRTHS', 'CENSUS_202', 'DEATHS',
       'DOMESTIC_MIG', 'ESTIMATES_BASE', 'Economic_typology', 'GQ_ESTIMATES',
       'GQ_ESTIMATES_BASE', 'INTERNATIONAL_MIG', 'NATURAL_CHG', 'NET_MIG',
       'N_POP_CHG', 'POP_ESTIMATE', 'RESIDUAL', 'R_BIRTH', 'R_DEATH',
       'R_DOMESTIC_MIG', 'R_INTERNATIONAL_MIG', 'R_NATURAL_CHG', 'R_NET_MIG',
       'Rural_Urban_Continuum_Code', 'Urban_Influence'],
      dtype='object', name='Metric')

CENSUS_202: Could indicate population counts or census data for 2020.  
ESTIMATES_BASE: Likely represents the baseline population estimates.  
POP_ESTIMATE: Most likely the estimated total population.  
N_POP_CHG: Represents net population change.  
R_NATURAL_CHG: Rate of natural population change (births minus deaths).  
R_NET_MIG: Rate of net migration, impacting population.  

In [12]:

population2021_data=population_filted_pivot[population_filted_pivot["YR"]=="2021"]
population2021_data = population2021_data[["FIPStxt", "State", "Area_Name", "YR", "POP_ESTIMATE"]]
population2021_data.rename(columns={'FIPStxt': 'FIPS'}, inplace=True)
population2021_data.rename(columns={'Area_Name': 'Name'}, inplace=True)
population2021_data.rename(columns={'POP_ESTIMATE': 'Estimated Population'}, inplace=True)
# population_data.rename(columns={'N_POP_CHG': 'Net Population Change'}, inplace=True)
# population_data.rename(columns={'R_NATURAL_CHG': 'Rate of Poulation Change(birth minus deaths)'}, inplace=True)
population2021_data = population2021_data.dropna(subset=["Estimated Population"])
population2021_data


Metric,FIPS,State,Name,YR,Estimated Population
3,1000,AL,Alabama,2021,5050380
10,1001,AL,Autauga County,2021,59203
17,1003,AL,Baldwin County,2021,239439
24,1005,AL,Barbour County,2021,24533
31,1007,AL,Bibb County,2021,22359
...,...,...,...,...,...
3699,53069,WA,Wahkiakum County,2021,4586
3706,53071,WA,Walla Walla County,2021,62211
3713,53073,WA,Whatcom County,2021,226701
3720,53075,WA,Whitman County,2021,43224


# Merge all dataset

In [13]:
#check data type, prepare for merging

# print(education_data.dtypes)
# print(poverty_data.dtypes)
# print(income_data.dtypes)
# print(unemployment_data.dtypes)
# print(population2021_data.dtypes)
# print(unemployment_all_data.dtypes)
education_data['YR'] = education_data['YR'].astype('str')
unemployment_all_data['YR'] = unemployment_all_data['YR'].astype('str')


In [14]:

# income_data=income_data.drop(columns='Name')
# unemployment_data=unemployment_data.drop(columns='Name')

unemployment_all_data=unemployment_all_data.drop(columns=['Name', 'State'])

In [15]:

# Merge all the datasets on 'FIPS' and 'YR'
merged_data = (
    education_data
    # .merge(income_data, on=['FIPS', 'YR'], how='outer')
    # .merge(unemployment_data, on=['FIPS', 'YR'], how='outer')
    .merge(unemployment_all_data,on=['FIPS', 'YR'], how='outer')
)
merged_data[merged_data["YR"].isin(["2021", "2018-2022"])]

Unnamed: 0,FIPS,Name,RUC Code,YR,Completing College,Completing High School Only,Not Completing High School,Civilian_labor_force,Unemployment_rate
23,1000,Alabama,,2018-2022,27.2%,30.4%,12.3%,,
26,1000,,,2021,,,,2259349,3.4
51,1001,"Autauga, AL",2.0,2018-2022,29.6%,31.1%,9.6%,,
54,1001,,,2021,,,,26545,2.8
79,1003,"Baldwin, AL",3.0,2018-2022,32.6%,27.8%,8.4%,,
...,...,...,...,...,...,...,...,...,...
14866,53073,,,2021,,,,112503,6.1
14891,53075,"Whitman, WA",4.0,2018-2022,50.9%,14.3%,4.2%,,
14894,53075,,,2021,,,,23425,4.4
14919,53077,"Yakima, WA",2.0,2018-2022,18.2%,28.4%,24.3%,,


In [None]:
merged_data.to_parquet("data/USDA_education_unemployment.parquet", index=False)
population2021_data.to_parquet("data/USDA_population2021.parquet", index=False)
poverty_data.to_parquet("data/USDA_poverty2021.parquet", index=False)
med_income_data.to_parquet("data/USDA_medIncome2021.parquet", index=False)