## Population Data Cleaning

Process U.S. Census Bureau county population estimates (2000-2024) by combining three separate datasets and creating standardized county identifiers.

### Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option("mode.copy_on_write", True)

### Load and Process 2000-2010 Population Data

Filter to county level (SUMLEV==50) and reshape from wide to long format for time series analysis.

In [2]:
# Load and process 2000-2010 population data
population_00_10 = pd.read_csv(
    "../01_data/raw/population/co-est00int-tot.csv", encoding="latin1"
)

population_00_10 = population_00_10[population_00_10["SUMLEV"] == 50]

year_cols = [col for col in population_00_10.columns if col.startswith("POPESTIMATE")]
keep_cols = ["STATE", "COUNTY", "STNAME", "CTYNAME"] + year_cols
population_00_10 = population_00_10[keep_cols]

population_00_10_long = population_00_10.melt(
    id_vars=["STATE", "COUNTY", "STNAME", "CTYNAME"],
    value_vars=year_cols,
    var_name="year",
    value_name="population",
)

population_00_10_long["year"] = (
    population_00_10_long["year"].str.extract(r"(\d+)").astype(int)
)

population_00_10_long["fips"] = population_00_10_long["STATE"].astype(str).str.zfill(
    2
) + population_00_10_long["COUNTY"].astype(str).str.zfill(3)

### Load and Process 2010-2019 Population Data

In [3]:
# Load and process 2010-2019 population data
population_10_19 = pd.read_csv(
    "../01_data/raw/population/co-est2019-alldata.csv", encoding="latin1"
)

population_10_19 = population_10_19[population_10_19["SUMLEV"] == 50]

year_cols = [c for c in population_10_19.columns if c.startswith("POPESTIMATE")]
keep_cols = ["STATE", "COUNTY", "STNAME", "CTYNAME"] + year_cols
population_10_19 = population_10_19[keep_cols]

population_10_19_long = population_10_19.melt(
    id_vars=["STATE", "COUNTY", "STNAME", "CTYNAME"],
    value_vars=year_cols,
    var_name="year",
    value_name="population",
)

population_10_19_long["year"] = (
    population_10_19_long["year"].str.extract(r"(\d+)").astype(int)
)

population_10_19_long["fips"] = population_10_19_long["STATE"].astype(str).str.zfill(
    2
) + population_10_19_long["COUNTY"].astype(str).str.zfill(3)

### Load and Process 2020-2024 Population Data

In [4]:
# Load and process 2020-2024 population data
population_20_24 = pd.read_csv(
    "../01_data/raw/population/co-est2024-alldata.csv", encoding="latin1"
)

population_20_24 = population_20_24[population_20_24["SUMLEV"] == 50]

year_cols = [c for c in population_20_24.columns if c.startswith("POPESTIMATE")]
keep_cols = ["STATE", "COUNTY", "STNAME", "CTYNAME"] + year_cols
population_20_24 = population_20_24[keep_cols]

population_20_24_long = population_20_24.melt(
    id_vars=["STATE", "COUNTY", "STNAME", "CTYNAME"],
    value_vars=year_cols,
    var_name="year",
    value_name="population",
)

population_20_24_long["year"] = (
    population_20_24_long["year"].str.extract(r"(\d+)").astype(int)
)

population_20_24_long["fips"] = population_20_24_long["STATE"].astype(str).str.zfill(
    2
) + population_20_24_long["COUNTY"].astype(str).str.zfill(3)

### Combine All Years and Filter to Study States

Merge the three time periods and retain only the 9 states relevant to the opioid policy analysis.

In [5]:
# Combine all years and filter to study states
population_2000_2024 = pd.concat(
    [population_00_10_long, population_10_19_long, population_20_24_long],
    ignore_index=True,
)

study_states = [
    "Florida",
    "Washington",
    "North Carolina",
    "Georgia",
    "Oregon",
    "South Carolina",
    "Idaho",
    "Montana",
    "Tennessee",
    "Mississippi",
    "Colorado",
    "California",
]
pop_filtered = population_2000_2024[
    population_2000_2024["STNAME"].isin(study_states)
].copy()

print(f"Total rows: {len(population_2000_2024):,}")
print(f"Filtered rows: {len(pop_filtered):,}")
print(f"States: {sorted(pop_filtered['STNAME'].unique())}")

pop_filtered.head()

Total rows: 81,713
Filtered rows: 21,996
States: ['California', 'Colorado', 'Florida', 'Georgia', 'Idaho', 'Mississippi', 'Montana', 'North Carolina', 'Oregon', 'South Carolina', 'Tennessee', 'Washington']


Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,year,population,fips
186,6,1,California,Alameda County,2000,1449840,6001
187,6,3,California,Alpine County,2000,1209,6003
188,6,5,California,Amador County,2000,35153,6005
189,6,7,California,Butte County,2000,203807,6007
190,6,9,California,Calaveras County,2000,40645,6009


### Export Cleaned Data

In [6]:
# Export cleaned data
pop_filtered.to_csv("../01_data/clean/population_2000_2024.csv", index=False)
print("Exported to: ../01_data/clean/population_2000_2024.csv")

Exported to: ../01_data/clean/population_2000_2024.csv
