## PDS Group 7

In [2]:
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)

In [3]:
# Load the processed ARCOs data and display its info
arcos = pd.read_parquet("arcos_processed.parquet")
arcos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162771 entries, 0 to 162770
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   BUYER_STATE   162771 non-null  object 
 1   BUYER_COUNTY  162771 non-null  object 
 2   YEAR          162771 non-null  int32  
 3   TOTAL_MME     162771 non-null  float64
dtypes: float64(1), int32(1), object(2)
memory usage: 4.3+ MB


In [4]:
# check for missing values in each column
arcos.isna().sum()

BUYER_STATE     0
BUYER_COUNTY    0
YEAR            0
TOTAL_MME       0
dtype: int64

In [5]:
arcos.sample(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME
12738,FL,MADISON,2019,1154.24448
105446,GA,TROUP,2011,1290.712015
31173,FL,MADISON,2009,147.217714
103520,GA,GLYNN,2014,364.569874
154517,SC,PICKENS,2011,8290.317597


In [6]:
arcos["merge_on_arcos"] = (
    arcos["BUYER_COUNTY"].str.upper() + " COUNTY, " + arcos["BUYER_STATE"]
)

In [7]:
arcos["merge_on_arcos"].sample(5)

138678      THURSTON COUNTY, WA
100134    OGLETHORPE COUNTY, GA
70480        EMANUEL COUNTY, GA
122826        LENOIR COUNTY, NC
51831          BAKER COUNTY, FL
Name: merge_on_arcos, dtype: object

In [8]:
pop = pd.read_csv("merged_mortality_population.csv")
pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61559 entries, 0 to 61558
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   County                           61559 non-null  object
 1   County Code                      61559 non-null  int64 
 2   Year                             61559 non-null  int64 
 3   Drug/Alcohol Induced Cause       61559 non-null  object
 4   Drug/Alcohol Induced Cause Code  61559 non-null  object
 5   Deaths                           61559 non-null  int64 
 6   STATE                            61559 non-null  int64 
 7   COUNTY                           61559 non-null  int64 
 8   STNAME                           61559 non-null  object
 9   CTYNAME                          61559 non-null  object
 10  year                             61559 non-null  int64 
 11  population                       61559 non-null  int64 
 12  fips                            

In [9]:
pop["merge_on_pop"] = pop["County"].str.upper()
pop["merge_on_pop"].sample(5)

11704       DALLAS COUNTY, TX
43226      CARROLL COUNTY, AR
24643     HARDEMAN COUNTY, TX
42055        JONES COUNTY, SD
15438    JEFFERSON COUNTY, OR
Name: merge_on_pop, dtype: object

In [10]:
merged = arcos.merge(
    pop,
    how="inner",
    left_on=["merge_on_arcos", "YEAR"],
    right_on=["merge_on_pop", "Year"],
)

In [11]:
merged.sample(5)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,YEAR,TOTAL_MME,merge_on_arcos,County,County Code,Year,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,STATE,COUNTY,STNAME,CTYNAME,year,population,fips,merge_on_pop
96199,FL,CHARLOTTE,2009,5402.395473,"CHARLOTTE COUNTY, FL","Charlotte County, FL",12015,2009,All other non-drug and non-alcohol causes,O9,2155,12,15,Florida,Charlotte County,2009,159629,12015,"CHARLOTTE COUNTY, FL"
266023,OR,MULTNOMAH,2011,98519.945417,"MULTNOMAH COUNTY, OR","Multnomah County, OR",41051,2011,All other non-drug and non-alcohol causes,O9,5165,41,51,Oregon,Multnomah County,2011,749742,41051,"MULTNOMAH COUNTY, OR"
210646,NC,UNION,2010,1160.347976,"UNION COUNTY, NC","Union County, NC",37179,2010,All other non-drug and non-alcohol causes,O9,1202,37,179,North Carolina,Union County,2010,202109,37179,"UNION COUNTY, NC"
161312,GA,COBB,2010,7726.013568,"COBB COUNTY, GA","Cobb County, GA",13067,2010,All other alcohol-induced causes,A9,28,13,67,Georgia,Cobb County,2010,690063,13067,"COBB COUNTY, GA"
49472,FL,MANATEE,2009,6515.54836,"MANATEE COUNTY, FL","Manatee County, FL",12081,2009,Drug poisonings (overdose) Unintentional (X40-...,D1,58,12,81,Florida,Manatee County,2009,320711,12081,"MANATEE COUNTY, FL"


In [16]:
merged_cleaned = merged.drop(
    columns=[
        "merge_on_arcos",
        "merge_on_pop",
        "YEAR",
        "Year",
        "STATE",
        "COUNTY",
        "STNAME",
        "CTYNAME",
        "fips",
    ]
)

merged_cleaned = merged_cleaned.rename(
    columns={
        "BUYER_STATE": "State",
        "BUYER_COUNTY": "County",
        "TOTAL_MME": "Total_MME",
        "population": "Population",
    }
)

merged_cleaned.sample(5)

Unnamed: 0,State,County,Total_MME,County.1,County Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,year,Population
189461,NC,CUMBERLAND,6058.068,"Cumberland County, NC",37051,All other alcohol-induced causes,A9,14,2011,330503
206811,NC,GATES,65.47872,"Gates County, NC",37073,All other non-drug and non-alcohol causes,O9,128,2010,12165
43545,FL,MARTIN,2250.303,"Martin County, FL",12085,All other non-drug and non-alcohol causes,O9,1595,2006,141802
126236,GA,HALL,653267500.0,"Hall County, GA",13139,All other non-drug and non-alcohol causes,O9,1116,2010,180033
221832,NC,CABARRUS,15165.84,"Cabarrus County, NC",37025,Drug poisonings (overdose) Unintentional (X40-...,D1,20,2010,178588


In [18]:
merged_cleaned.to_csv("final_merged.csv", index=False)

In [20]:
merged_cleaned["Drug/Alcohol Induced Cause"].value_counts()

Drug/Alcohol Induced Cause
All other non-drug and non-alcohol causes             159321
Drug poisonings (overdose) Unintentional (X40-X44)     51836
All other alcohol-induced causes                       43345
Drug poisonings (overdose) Suicide (X60-X64)           12452
All other drug-induced causes                           2753
Drug poisonings (overdose) Undetermined (Y10-Y14)       1276
Alcohol poisonings (overdose) (X45, X65, Y15)            984
Name: count, dtype: int64

In [21]:
merged_cleaned["Drug/Alcohol Induced Cause Code"].value_counts()

Drug/Alcohol Induced Cause Code
O9    159321
D1     51836
A9     43345
D2     12452
D9      2753
D4      1276
A1       984
Name: count, dtype: int64

In [33]:
code_mask = merged_cleaned["Drug/Alcohol Induced Cause Code"].isin(["D1", "D4", "D9"])
code_filter = merged_cleaned[code_mask]

merged_cleaned = merged_cleaned[~code_mask]

In [34]:
merged_cleaned.sample(5)

Unnamed: 0,State,County,Total_MME,County.1,County Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths,year,Population
6473,GA,APPLING,32.69542,"Appling County, GA",13001,All other non-drug and non-alcohol causes,O9,192,2013,18370
217664,NC,GUILFORD,1467472.0,"Guilford County, NC",37081,All other alcohol-induced causes,A9,35,2011,494953
268818,OR,CROOK,1894.461,"Crook County, OR",41013,All other alcohol-induced causes,A9,12,2009,21410
46032,FL,DUVAL,39730.05,"Duval County, FL",12031,Drug poisonings (overdose) Suicide (X60-X64),D2,26,2010,865876
83275,FL,LAFAYETTE,83.26697,"Lafayette County, FL",12067,All other non-drug and non-alcohol causes,O9,69,2012,8791


In [31]:
mme_by_county = (
    code_filter.groupby("County Code", as_index=False)["Total_MME"]
    .sum()
    .rename(columns={"Total_MME	": "MME_D1D4D9"})
)