In [1]:
import pandas as pd

def compute_employment_growth(data_2019: pd.DataFrame,
                              data_2022: pd.DataFrame) -> pd.DataFrame:
    """Return 2019 -> 2022 employment growth per (STATEA, COUNTYA, naics).

    Both frames must contain the columns ``STATEA``, ``COUNTYA``, ``naics``
    and a numeric ``EMP`` column.  Neither input frame is modified.

    The frames are inner-joined on the three key columns, so only keys
    present in both years appear in the result.  The result has the columns
    ``STATEA, COUNTYA, naics, EMP_2019, EMP_2022, Growth_Rate`` where
    ``Growth_Rate`` is ``(EMP_2022 - EMP_2019) / EMP_2019 * 100`` (percent).

    Rows where either employment figure is missing are dropped.  Rows whose
    2019 employment is zero are kept, but their ``Growth_Rate`` is NaN:
    growth relative to a zero baseline is undefined, and letting the division
    produce ``inf`` would poison any downstream mean/sum over the column.
    """
    keys = ['STATEA', 'COUNTYA', 'naics']

    merged = pd.merge(
        data_2019[keys + ['EMP']].rename(columns={'EMP': 'EMP_2019'}),
        data_2022[keys + ['EMP']].rename(columns={'EMP': 'EMP_2022'}),
        on=keys,
        how='inner',
    )

    baseline = merged['EMP_2019']
    # Mask a zero baseline to NaN so the division yields NaN rather than +/-inf.
    safe_baseline = baseline.where(baseline != 0)
    merged['Growth_Rate'] = ((merged['EMP_2022'] - baseline) / safe_baseline) * 100

    # Drop rows with missing EMP_2019 or EMP_2022 values
    return merged.dropna(subset=['EMP_2019', 'EMP_2022'])


# In a notebook cell or a direct script run __name__ is "__main__", so the
# pipeline below executes exactly as before and its variables stay global.
# The guard only keeps the helper importable without touching the CSV files.
if __name__ == "__main__":
    # Load the 2019 and 2022 datasets.
    # NOTE: the two paths were previously cross-wired (the 2022 variable
    # pointed at the 2019 file and vice versa), which inverted the growth rate.
    data_2019_path = 'data2019.csv'
    data_2022_path = 'data2022.csv'

    data_2022 = pd.read_csv(data_2022_path)
    data_2019 = pd.read_csv(data_2019_path)

    # Ensure EMP is numeric; unparseable entries become NaN and are dropped
    # later by compute_employment_growth.
    data_2022['EMP'] = pd.to_numeric(data_2022['EMP'], errors='coerce')
    data_2019['EMP'] = pd.to_numeric(data_2019['EMP'], errors='coerce')

    # Merge on (STATEA, COUNTYA, naics) and compute the percent growth rate
    merged_data = compute_employment_growth(data_2019, data_2022)

    # Save the result to a new CSV file
    merged_data.to_csv('employment_growth_rate.csv', index=False)

    print("results saved as employment_growth_rate.csv")

results saved as employment_growth_rate.csv
