In [1]:
""" 
This code is used to map FIPS Code to county based on county name and state name
The FIPS Codes have been taken from https://github.com/kjhealy/fips-codes/raw/master/state_and_county_fips_master.csv
Some basic data cleaning is required, such as making the county name lowercase and removing the word "county" from the county name
After this, specific changes have been done for counties that had specific issues - such as replacing st. with saint and removing spaces
"""

' \nThis code is used to map FIPS Code to county based on county name and state name\nThe FIPS Codes have been taken from https://github.com/kjhealy/fips-codes/raw/master/state_and_county_fips_master.csv\nSome basic data cleaning is required, such as making the county name lowercase and removing the word "county" from the county name\nAfter this, specific changes have been done for counties that had specific issues - such as replacing st. with saint and removing spaces\n'

In [2]:
# Importing Libraries
import pandas as pd

# Turn on pandas copy-on-write for the whole notebook.
pd.options.mode.copy_on_write = True

In [3]:
###################################################################
# Reading and cleaning the Opioid data
###################################################################

In [4]:
# Load the cleaned dosage table. It identifies each county by name only
# (no county FIPS code), which is why the FIPS lookup below is needed.
# Earlier export, kept for reference:
# https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/cleaning_dosage/20_Intermediate_Files/Dosage_FULL.parquet
dos_data_path = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/cleaning_dosage/20_Intermediate_Files/AllDosage.parquet"
dos_data = pd.read_parquet(path=dos_data_path)

In [5]:
# A notebook cell only renders its LAST expression, so a bare `dos_data.shape`
# followed by another expression is silently discarded. Print it explicitly.
print(dos_data.shape)  # expected: (43152, 4)

# Preview the first rows of the dosage table
dos_data.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME
0,ABBEVILLE,SC,2006,3136215.0
1,ABBEVILLE,SC,2007,3232603.0
2,ABBEVILLE,SC,2008,3070698.0
3,ABBEVILLE,SC,2009,3827607.0
4,ABBEVILLE,SC,2010,4612935.0


In [6]:
# Normalise the county name so it can be matched against the FIPS table:
# drop every space and lowercase the result (e.g. "SAN SABA" -> "sansaba").
# No "County" suffix is added here; the suffix is instead stripped from the
# FIPS names later, so both sides end up in the same bare form.
dos_data["BUYER_COUNTY"] = (
    dos_data["BUYER_COUNTY"].str.replace(" ", "", regex=False).str.lower()
)

dos_data.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME
0,abbeville,SC,2006,3136215.0
1,abbeville,SC,2007,3232603.0
2,abbeville,SC,2008,3070698.0
3,abbeville,SC,2009,3827607.0
4,abbeville,SC,2010,4612935.0


In [7]:
# Give the dosage key columns the same names as the FIPS table ("name" and
# "state") so the two frames can later be joined on those columns.
new_column_names = {"BUYER_STATE": "state", "BUYER_COUNTY": "name"}
dos_data1 = dos_data.rename(columns=new_column_names)

# Number of dosage rows before the merge
dos_data1.shape[0]

43152

In [8]:
###################################################################
# Reading and cleaning the FIPS Code data
###################################################################

In [9]:
# Download the master list of state and county FIPS codes
data_code = pd.read_csv(
    filepath_or_buffer="https://github.com/kjhealy/fips-codes/raw/master/state_and_county_fips_master.csv"
)

In [10]:
# Fixed seed so the preview shows the same rows on every Restart & Run All
data_code.sample(5, random_state=0)

Unnamed: 0,fips,name,state
1399,27121,Pope County,MN
2773,48411,San Saba County,TX
2205,40073,Kingfisher County,OK
72,2020,Anchorage Municipality,AK
1824,34037,Sussex County,NJ


In [11]:
# The FIPS file mixes state-level and county-level rows. Only rows with a
# non-null "state" value are kept as counties.
has_state = data_code["state"].notna()
data_code1 = data_code.loc[has_state]

# Number of county rows per state
data_code1["state"].value_counts()

state
TX    254
GA    159
VA    134
KY    120
MO    115
KS    105
IL    102
NC    100
IA     99
TN     95
NE     93
IN     92
OH     88
MN     87
MI     83
MS     82
OK     77
AR     75
WI     72
PA     67
AL     67
FL     67
SD     66
CO     64
LA     64
NY     62
CA     58
MT     56
WV     55
ND     53
SC     46
ID     44
WA     39
OR     36
NM     33
AK     29
UT     29
MD     24
WY     23
NJ     21
NV     17
ME     16
AZ     15
VT     14
MA     14
NH     10
CT      8
RI      5
HI      5
DE      3
DC      1
Name: count, dtype: int64

In [12]:
# Bring the FIPS county names into the same bare form as the dosage data.
# The steps run in this order:
#   1. drop the redundant word "county" (any capitalisation)
#   2. drop all spaces
#   3. lowercase
#   4. drop backticks and apostrophes
data_code1["name"] = (
    data_code1["name"]
    .str.replace("county", "", case=False)
    .str.replace(" ", "")
    .str.lower()
    .str.replace("`", "")
    .str.replace("'", "")
)

In [13]:
# The FIPS names carry the word "parish" for some counties, while the dosage
# data omits it for the same counties. Strip it from the FIPS side.
names_without_parish = data_code1["name"].str.replace("parish", "")
data_code1["name"] = names_without_parish

In [14]:
# The FIPS names abbreviate "Saint" as "st." while the dosage data spells it
# out, so expand the abbreviation on the FIPS side.
# regex=False forces a literal match. Read as a regular expression, "st."
# means "st" followed by ANY character, and pandas versions before 2.0 treat
# multi-character patterns as regexes by default. That would corrupt names
# such as "stewart" or "houston".
data_code1["name"] = data_code1["name"].str.replace("st.", "saint", regex=False)

In [15]:
# After the "st." expansion this one parish is spelled differently from the
# dosage data, which keeps the abbreviated form. Revert it by hand.
fips_spelling = "saintjohnthebaptist"
dosage_spelling = "stjohnthebaptist"
data_code1["name"] = data_code1["name"].str.replace(fips_spelling, dosage_spelling)

In [16]:
# Spot-check the cleaned names: show every FIPS row whose name contains "john"
name_has_john = data_code1["name"].str.contains("john")
data_code1[name_has_john]

Unnamed: 0,fips,name,state
151,5071,johnson,AR
385,12109,saintjohns,FL
481,13167,johnson,GA
653,17087,johnson,IL
753,18081,johnson,IN
857,19103,johnson,IA
951,20091,johnson,KS
1069,21115,johnson,KY
1180,22095,stjohnthebaptist,LA
1560,29101,johnson,MO


In [17]:
# Copy of the cleaned FIPS table with the join keys duplicated as
# "name1" / "state1".
# NOTE(review): the merge further down joins data_code1, not data_code2, so
# these extra columns do not reach merged_df. Confirm whether that is intended.
data_code2 = data_code1.copy()
for key_column in ("name", "state"):
    data_code2[f"{key_column}1"] = data_code2[key_column]

In [18]:
###################################################################
# Merging the two datasets to get the FIPS code - this will be a left join to the dosage data
###################################################################

In [19]:
# Left join: every dosage row is kept and gains a "fips" column. Rows with no
# matching (name, state) pair in the FIPS table get NaN, which is why the
# column shows up as float in the preview below.
join_keys = ["name", "state"]
merged_df = pd.merge(left=dos_data1, right=data_code1, how="left", on=join_keys)

In [20]:
# Fixed seed so the preview shows the same rows on every Restart & Run All
merged_df.sample(5, random_state=0)

Unnamed: 0,name,state,Year,MME,fips
38890,tunica,MS,2012,1262613.0,28143.0
37118,stewart,TN,2016,6758604.0,47161.0
13894,gallia,OH,2019,7889651.0,39053.0
2082,bastrop,TX,2019,14119900.0,48021.0
4732,calaveras,CA,2016,22296220.0,6009.0


In [21]:
# Dosage rows that found no FIPS match in the left join
fips_is_missing = merged_df["fips"].isna()
missing_fips_rows = merged_df.loc[fips_is_missing]

# Number of unmatched rows per (county name, state) pair
unmatched_keys = missing_fips_rows[["name", "state"]]
unmatched_keys.value_counts()

name             state
adjuntas         PR       14
naguabo          PR       14
rincon           PR       14
radford          VA       14
quebradillas     PR       14
                          ..
saipan           MP        5
bristolbay       AK        4
northwestarctic  AK        3
aleutianseast    AK        2
skagway          AK        1
Name: count, Length: 111, dtype: int64

In [22]:
# Same count as above, leaving out the three states PR, AK and VA to show the
# remaining unmatched (county name, state) pairs.
states_to_skip = ["PR", "AK", "VA"]
outside_skipped = ~missing_fips_rows["state"].isin(states_to_skip)
missing_fips_rows.loc[outside_skipped, ["name", "state"]].value_counts()

name                    state
donaana                 NM       14
guam                    GU       14
saintcroix              VI       14
saintegenevieve         MO       14
saintjohn               VI       14
saintthomas             VI       14
stjoseph                IN       14
northernmarianaislands  MP        9
palau                   PW        5
saipan                  MP        5
Name: count, dtype: int64

In [23]:
# Number of unmatched dosage rows per state
unmatched_states = missing_fips_rows["state"]
unmatched_states.value_counts()

state
PR    1063
AK     244
VI      42
VA      28
NM      14
GU      14
MP      14
MO      14
IN      14
PW       5
Name: count, dtype: int64

In [24]:
"""
Since none of these states are in our list of test and control states, we will not be using these rows for our analysis, so we will not be imputing the FIPS codes for these rows
"""

'\nSince none of these states are in our list of test and control states, we will not be using these rows for our analysis, so we will not be imputing the FIPS codes for these rows\n'

In [25]:
# Write the merged table into the repository's intermediate-files folder.
# A relative path keeps the notebook runnable on any machine, instead of
# pointing into one user's local OneDrive directory.
# NOTE(review): assumes the notebook is run from a folder that sits next to
# 20_Intermediate_Files inside the repository. Adjust if it lives elsewhere.
file_path = "../20_Intermediate_Files/Dosage_with_CountyCode.csv"

# Save the DataFrame to a CSV file (row index omitted)
merged_df.to_csv(file_path, index=False)