In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests

This step imports the FIPS data and subsets it to the group of desired states in our analysis.

In [2]:
# Scrape the county-level FIPS table from Wikipedia into a DataFrame and
# persist it as county_fips.csv.
url = "https://en.wikipedia.org/wiki/List_of_United_States_FIPS_codes_by_county"

# A timeout keeps Restart & Run All from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

soup = BeautifulSoup(html_content, "html.parser")

# Only the first table with class "wikitable" is used.
table = soup.find("table", {"class": "wikitable"})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

rows = table.find_all("tr")
data = []
for row in rows[1:]:  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 2:  # Rows without at least two <td> cells are ignored
        continue
    # Both values are kept as stripped text, so FIPS codes stay strings.
    data.append(
        {
            "FIPS Code": cols[0].text.strip(),
            "County Name": cols[1].text.strip(),
        }
    )

county_fips_df = pd.DataFrame(data)

county_fips_df.to_csv("county_fips.csv", index=False)

print(county_fips_df.head(10))

  FIPS Code      County Name
0     01001   Autauga County
1     01003   Baldwin County
2     01005   Barbour County
3     01007      Bibb County
4     01009    Blount County
5     01011   Bullock County
6     01013    Butler County
7     01015   Calhoun County
8     01017  Chambers County
9     01019  Cherokee County


Below we add a mapping for:
- Each State FIPS (first 2 digits of a FIPS code)
- The corresponding two-letter abbreviation for each of these states

In [3]:
# State FIPS prefix (first two digits of a county FIPS code) -> postal
# abbreviation for the states kept in the analysis.
mapping = {
    "53": "WA",
    "01": "AL",
    "13": "GA",
    "23": "ME",
    "08": "CO",
    "12": "FL",
    "40": "OK",
    "41": "OR",
}
# The prefixes to keep are exactly the mapping's keys, in the same order.
first_2 = list(mapping)

# The state prefix is added to the scraped frame itself.
county_fips_df["state_FIPS"] = county_fips_df["FIPS Code"].str[:2]

# Keep only the selected states, attach the abbreviation, then rename the
# scraped columns.
fips_df = (
    county_fips_df.loc[county_fips_df["state_FIPS"].isin(first_2)]
    .copy()
    .assign(State=lambda frame: frame["state_FIPS"].map(mapping))
    .rename(columns={"FIPS Code": "County_FIPS", "County Name": "County"})
)

print(fips_df.head(5))

  County_FIPS          County state_FIPS State
0       01001  Autauga County         01    AL
1       01003  Baldwin County         01    AL
2       01005  Barbour County         01    AL
3       01007     Bibb County         01    AL
4       01009   Blount County         01    AL


In [4]:
import os

import pandas as pd
import requests
from dotenv import load_dotenv

# Load variables from a .env file (if one exists) into the process
# environment; without this call a token stored in .env is never seen.
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


url = "https://raw.githubusercontent.com/MIDS-at-Duke/opioids-2024-data-queens-king/main/data/USDA_education_poverty_unemployment_income.parquet"
epu_path = "USDA_education_poverty_unemployment_income.parquet"

# Only send the Authorization header when a token is actually available,
# rather than sending the literal string "Bearer None".
headers = {"Authorization": f"Bearer {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Download the file only when no local copy exists, and save the payload so
# the read below has something to open on a fresh checkout.
if not os.path.exists(epu_path):
    response = requests.get(url, headers=headers, timeout=60)
    response.raise_for_status()
    with open(epu_path, "wb") as parquet_file:
        parquet_file.write(response.content)


epu_df = pd.read_parquet(epu_path)
print(epu_df.head(5))

   FIPS     Name  RUC Code         YR Completing College  \
0  1000  Alabama       NaN       1970               7.8%   
1  1000  Alabama       NaN       1980              12.2%   
2  1000  Alabama       NaN       1990              15.7%   
3  1000  Alabama       NaN       2000              19.0%   
4  1000  Alabama       NaN  2008-2012              22.3%   

  Completing High School Only Not Completing High School  \
0                       25.9%                      58.7%   
1                       31.8%                      43.5%   
2                       29.4%                      33.1%   
3                       30.4%                      24.7%   
4                       31.3%                      17.4%   

   All people in poverty (2021) Percent  \
0                                   NaN   
1                                   NaN   
2                                   NaN   
3                                   NaN   
4                                   NaN   

   Children ages 0-

In [5]:
# Load the opioid shipment data and build composite string keys used to
# check whether each state-county-year combination appears exactly once.
file_path = "data/opioid_shipment_WA_FL_andconstants.parquet"

opioid = pd.read_parquet(file_path)

# "STATE|COUNTY" key, then "STATE|COUNTY|YEAR" key.
opioid["state-county"] = opioid["BUYER_STATE"] + "|" + opioid["BUYER_COUNTY"]
opioid["state-county-yr"] = opioid["state-county"] + "|" + opioid["year"].astype("str")

unique_id_of_op = opioid["state-county-yr"].nunique()
length_op = len(opioid)

# Report the row count and the distinct-key count under the correct labels;
# when the two numbers match, the key is unique per row.
print(
    f"the length of the opioid dataset is {length_op} while the distinct count of the unique id state-county-yr is {unique_id_of_op}"
)

opioid["year"].value_counts()

the length of the opioid dataset is 7128 while the distinctcount of the unique id state-county-yr is 7128


year
2006    513
2011    511
2010    511
2014    511
2009    510
2007    510
2013    510
2012    510
2008    509
2016    508
2019    507
2018    507
2015    506
2017    505
Name: count, dtype: int64

Below is a skeleton model for merging the main opioid data with the FIPS dataset. I assume here that the naming convention for each county is consistent between the FIPS reference table and the main opioid dataset. If not, an additional dictionary will be needed to fix the names prior to the merge. This merge connects:
- The Main Opioid Dataset
- FIPS Code for each county

In [6]:
# Toy FIPS reference table: one row per county, keyed by "State|County".
df3 = pd.DataFrame(
    {
        "FIPS": [134, 234, 333],
        "County": ["County A", "County B", "County C"],
        "State": ["AB", "BB", "AA"],
    }
).assign(**{"state-county": lambda ref: ref["State"] + "|" + ref["County"]})

# Toy opioid table carrying the same "State|County" key.
df1 = pd.DataFrame(
    {
        "Year": [2005, 2006, 2007],
        "State": ["AB", "BB", "AA"],
        "County": ["County A", "County B", "County C"],
        "Opioid": [10, 20, 30],
    }
).assign(**{"state-county": lambda obs: obs["State"] + "|" + obs["County"]})

# Left-join the FIPS table onto the opioid rows. validate="m:1" makes pandas
# raise if the key is duplicated on the FIPS side, and indicator=True adds a
# _merge column showing which rows found a match.
df1 = df1.merge(
    df3,
    how="left",
    on="state-county",
    indicator=True,
    validate="m:1",
)

print(df1.head(3))

   Year State_x  County_x  Opioid state-county  FIPS  County_y State_y _merge
0  2005      AB  County A      10  AB|County A   134  County A      AB   both
1  2006      BB  County B      20  BB|County B   234  County B      BB   both
2  2007      AA  County C      30  AA|County C   333  County C      AA   both


Below is some dummy data to test my merge operation prior to setting it in main_merge_viz. In this instance we merge two datasets:
- The Main Opioid Dataset
- The Dataset that contains Population per FIPS (or population per FIPS-year)

In [7]:
# df1 = pd.DataFrame({
#     'FIPS': [134, 234, 333],
#     'Year': [2005,2006,2007],
#     'State': ['GA', 'FL', 'WA'],
#     'County': ['County A', 'County B', 'County C'],
#     'Value1': [10, 20, 30]
# })


# df2 = pd.DataFrame({
#     'FIPS':[134, 234, 333],
#     'County Population':[1000, 200, 3000],
# })

In [8]:
# Testing Merge Functions assuring we get a One-One OR Many-One Merge

In [9]:
# merged_data = pd.merge(df1,df2, how='left', on=['FIPS'], indicator=True, validate='m:1')

# print(merged_data.sample(2))