## PDS Group 7

In [1]:
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)

In [2]:
# Load the processed ARCOs data and display its info
arcos = pd.read_parquet("arcos_processed.parquet")
arcos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162771 entries, 0 to 162770
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   BUYER_STATE   162771 non-null  object 
 1   BUYER_COUNTY  162771 non-null  object 
 2   YEAR          162771 non-null  int32  
 3   TOTAL_MME     162771 non-null  float64
dtypes: float64(1), int32(1), object(2)
memory usage: 4.3+ MB


In [3]:
# check for missing values in each column
arcos.isna().sum()

BUYER_STATE     0
BUYER_COUNTY    0
YEAR            0
TOTAL_MME       0
dtype: int64

In [4]:
arcos.sample(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME
115898,NC,PAMLICO,2007,274.883485
101780,GA,BROOKS,2012,1172.383954
22894,GA,MCINTOSH,2016,1129.177587
44223,FL,SEMINOLE,2009,1758.096255
73103,GA,DADE,2009,331.566305


In [5]:
# Create a new column for merging with population data
arcos["merge_on_arcos"] = (
    arcos["BUYER_COUNTY"].str.upper() + " COUNTY, " + arcos["BUYER_STATE"]
)

In [6]:
arcos["merge_on_arcos"].sample(5)

142822       PAMLICO COUNTY, NC
25554     GREENVILLE COUNTY, SC
83059        LIBERTY COUNTY, GA
92126        LIBERTY COUNTY, GA
28068         ORANGE COUNTY, NC
Name: merge_on_arcos, dtype: object

In [7]:
# Load the population and mortality data and display its info
pop = pd.read_csv("merged_mortality_population.csv")
pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61559 entries, 0 to 61558
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   County                           61559 non-null  object
 1   County Code                      61559 non-null  int64 
 2   Year                             61559 non-null  int64 
 3   Drug/Alcohol Induced Cause       61559 non-null  object
 4   Drug/Alcohol Induced Cause Code  61559 non-null  object
 5   Deaths                           61559 non-null  int64 
 6   STATE                            61559 non-null  int64 
 7   COUNTY                           61559 non-null  int64 
 8   STNAME                           61559 non-null  object
 9   CTYNAME                          61559 non-null  object
 10  year                             61559 non-null  int64 
 11  population                       61559 non-null  int64 
 12  fips                            

In [8]:
pop["merge_on_pop"] = pop["County"].str.upper()
pop["merge_on_pop"].sample(5)

15771      DAVIDSON COUNTY, TN
58147       HANCOCK COUNTY, IA
8642     KIT CARSON COUNTY, CO
8373       FAULKNER COUNTY, AR
39953      MITCHELL COUNTY, KS
Name: merge_on_pop, dtype: object

In [9]:
# Merge the ARCOs data with the population and mortality data
merged = arcos.merge(
    pop,
    how="inner",
    left_on=["merge_on_arcos", "YEAR"],
    right_on=["merge_on_pop", "Year"],
)

In [10]:
merged.sample(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,merge_on_arcos,County,County Code,Year,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,STATE,COUNTY,STNAME,CTYNAME,year,population,fips,merge_on_pop
118064,GA,COFFEE,2007,1854.86844,"COFFEE COUNTY, GA","Coffee County, GA",13069,2007,All other non-drug and non-alcohol causes,O9,322,13,69,Georgia,Coffee County,2007,41299,13069,"COFFEE COUNTY, GA"
180028,NC,CHEROKEE,2011,1712.373016,"CHEROKEE COUNTY, NC","Cherokee County, NC",37039,2011,All other non-drug and non-alcohol causes,O9,341,37,39,North Carolina,Cherokee County,2011,27180,37039,"CHEROKEE COUNTY, NC"
2613,NC,HERTFORD,2015,1940.880636,"HERTFORD COUNTY, NC","Hertford County, NC",37091,2015,All other non-drug and non-alcohol causes,O9,253,37,91,North Carolina,Hertford County,2015,24373,37091,"HERTFORD COUNTY, NC"
180715,NC,NEW HANOVER,2008,3780.531507,"NEW HANOVER COUNTY, NC","New Hanover County, NC",37129,2008,All other non-drug and non-alcohol causes,O9,1512,37,129,North Carolina,New Hanover County,2008,197709,37129,"NEW HANOVER COUNTY, NC"
69888,FL,HIGHLANDS,2010,883.247591,"HIGHLANDS COUNTY, FL","Highlands County, FL",12055,2010,All other alcohol-induced causes,A9,10,12,55,Florida,Highlands County,2010,98700,12055,"HIGHLANDS COUNTY, FL"


In [27]:
merged_cleaned = merged.drop(
    columns=[
        "merge_on_arcos",
        "merge_on_pop",
        "YEAR",
        "Year",
        "STATE",
        "COUNTY",
        "STNAME",
        "CTYNAME",
        "fips",
    ]
)

merged_cleaned = merged_cleaned.rename(
    columns={
        "BUYER_STATE": "Buyer State",
        "BUYER_COUNTY": "Buyer County",
        "TOTAL_MME": "Total_MME",
        "population": "Population",
        "year": "Year",
    }
)

merged_cleaned.sample(5)

Unnamed: 0,Buyer State,Buyer County,Total_MME,County,County Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,Year,Population
194595,NC,LINCOLN,1416.689425,"Lincoln County, NC",37109,Drug poisonings (overdose) Unintentional (X40-...,D1,15,2013,79111
143267,GA,HENRY,660.762993,"Henry County, GA",13151,Drug poisonings (overdose) Unintentional (X40-...,D1,26,2011,206911
115868,FL,PINELLAS,793270.702065,"Pinellas County, FL",12103,All other alcohol-induced causes,A9,192,2012,921642
170603,GA,DODGE,445.053086,"Dodge County, GA",13091,All other non-drug and non-alcohol causes,O9,239,2011,21614
51988,FL,COLUMBIA,3105.871843,"Columbia County, FL",12023,Drug poisonings (overdose) Unintentional (X40-...,D1,11,2014,67779


In [28]:
# Analyze the "Drug/Alcohol Induced Cause Code" column
merged_cleaned["Drug/Alcohol Induced Cause"].value_counts()

Drug/Alcohol Induced Cause
All other non-drug and non-alcohol causes             159321
Drug poisonings (overdose) Unintentional (X40-X44)     51836
All other alcohol-induced causes                       43345
Drug poisonings (overdose) Suicide (X60-X64)           12452
All other drug-induced causes                           2753
Drug poisonings (overdose) Undetermined (Y10-Y14)       1276
Alcohol poisonings (overdose) (X45, X65, Y15)            984
Name: count, dtype: int64

In [29]:
# Analyze the corresponding "Drug/Alcohol Induced Cause Code" column
merged_cleaned["Drug/Alcohol Induced Cause Code"].value_counts()

Drug/Alcohol Induced Cause Code
O9    159321
D1     51836
A9     43345
D2     12452
D9      2753
D4      1276
A1       984
Name: count, dtype: int64

In [30]:
# Filter rows with specific cause codes
merged_cleaned = merged_cleaned[
    merged_cleaned["Drug/Alcohol Induced Cause Code"].isin(["D1", "D4", "D9"])
]

In [31]:
# Inspect the cause codes value counts again
merged_cleaned["Drug/Alcohol Induced Cause Code"].value_counts()

Drug/Alcohol Induced Cause Code
D1    51836
D9     2753
D4     1276
Name: count, dtype: int64

In [32]:
merged_cleaned.to_csv("final_merged.csv", index=False)

In [33]:
merged_cleaned.sample(10)

Unnamed: 0,Buyer State,Buyer County,Total_MME,County,County Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,Year,Population
228873,NC,NEW HANOVER,5933.04,"New Hanover County, NC",37129,Drug poisonings (overdose) Unintentional (X40-...,D1,36,2011,205966
263669,OR,CLACKAMAS,330082.9,"Clackamas County, OR",41005,Drug poisonings (overdose) Unintentional (X40-...,D1,32,2010,376788
233272,WA,THURSTON,9877.971,"Thurston County, WA",53067,Drug poisonings (overdose) Unintentional (X40-...,D1,25,2012,258527
117700,GA,BARTOW,4531.31,"Bartow County, GA",13015,Drug poisonings (overdose) Unintentional (X40-...,D1,20,2010,100195
120266,GA,FLOYD,9093.416,"Floyd County, GA",13115,Drug poisonings (overdose) Unintentional (X40-...,D1,17,2010,96274
45088,FL,MIAMI-DADE,51857.24,"Miami-Dade County, FL",12086,Drug poisonings (overdose) Unintentional (X40-...,D1,77,2010,2506972
168450,GA,CLARKE,2202894.0,"Clarke County, GA",13059,Drug poisonings (overdose) Unintentional (X40-...,D1,12,2012,119891
249692,NC,DAVIDSON,4111.052,"Davidson County, NC",37057,Drug poisonings (overdose) Unintentional (X40-...,D1,28,2006,157209
266299,OR,DESCHUTES,13727.16,"Deschutes County, OR",41017,Drug poisonings (overdose) Unintentional (X40-...,D1,10,2008,156820
18903,NC,MECKLENBURG,21.09608,"Mecklenburg County, NC",37119,Drug poisonings (overdose) Unintentional (X40-...,D1,98,2015,1033238


In [34]:
merged_cleaned.to_csv("final_merged.csv", index=False)