In [1]:
# Importing required packages
import pandas as pd
import numpy as np
import zipfile

# Turn on pandas copy-on-write for the whole notebook, so that frames derived
# from another frame behave as independent copies when they are modified later
pd.options.mode.copy_on_write = True

In [2]:
# Open the raw mortality archive (read mode) and list every member it contains
raw_zip_path = "../../Data/raw/raw_mortality.zip"
z = zipfile.ZipFile(raw_zip_path)
z.namelist()

['Underlying Cause of Death, 2009.txt',
 '__MACOSX/',
 '__MACOSX/._Underlying Cause of Death, 2009.txt',
 'Underlying Cause of Death, 2008.txt',
 'Underlying Cause of Death, 2003.txt',
 'Underlying Cause of Death, 2014.txt',
 'Underlying Cause of Death, 2015.txt',
 'Underlying Cause of Death, 2005.txt',
 'Underlying Cause of Death, 2011.txt',
 'Underlying Cause of Death, 2010.txt',
 'Underlying Cause of Death, 2004.txt',
 'Underlying Cause of Death, 2012.txt',
 'Underlying Cause of Death, 2006.txt',
 'Underlying Cause of Death, 2007.txt',
 'Underlying Cause of Death, 2013.txt']

In [3]:
# Keep only the yearly data files, i.e. members whose name starts with
# "Underlying"; this leaves out the "__MACOSX/" system entries.
# Sorting the names puts the files in year order.
file_list = [name for name in z.namelist() if name.startswith("Underlying")]
file_list.sort()
file_list

['Underlying Cause of Death, 2003.txt',
 'Underlying Cause of Death, 2004.txt',
 'Underlying Cause of Death, 2005.txt',
 'Underlying Cause of Death, 2006.txt',
 'Underlying Cause of Death, 2007.txt',
 'Underlying Cause of Death, 2008.txt',
 'Underlying Cause of Death, 2009.txt',
 'Underlying Cause of Death, 2010.txt',
 'Underlying Cause of Death, 2011.txt',
 'Underlying Cause of Death, 2012.txt',
 'Underlying Cause of Death, 2013.txt',
 'Underlying Cause of Death, 2014.txt',
 'Underlying Cause of Death, 2015.txt']

In [4]:
# Load only the first file in the sorted list to inspect its layout and work
# out which cleaning rules are needed before reading the rest
sample_member = z.open(file_list[0])
test = pd.read_csv(sample_member, sep="\t")
test.sample(n=5)

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
783,,"Rockdale County, GA",13247.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,497.0
641,,"Walton County, FL",12131.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,438.0
1225,,"Comanche County, KS",20033.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,34.0
2758,,"Montgomery County, OH",39113.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,5410.0
1011,,"Clark County, IN",18019.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,985.0


In [5]:
# Column dtypes and non-null counts of the sample file
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4102 entries, 0 to 4101
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Notes                            15 non-null     object 
 1   County                           4087 non-null   object 
 2   County Code                      4087 non-null   float64
 3   Year                             4087 non-null   float64
 4   Year Code                        4087 non-null   float64
 5   Drug/Alcohol Induced Cause       4087 non-null   object 
 6   Drug/Alcohol Induced Cause Code  4087 non-null   object 
 7   Deaths                           4087 non-null   float64
dtypes: float64(4), object(4)
memory usage: 256.5+ KB


In [6]:
# Show the rows that carry a value in "Notes"
test.loc[test["Notes"].notna()]

Unnamed: 0,Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
4087,---,,,,,,,
4088,"Dataset: Underlying Cause of Death, 1999-2017",,,,,,,
4089,Query Parameters:,,,,,,,
4090,Group By: County; Year; Drug/Alcohol Induced C...,,,,,,,
4091,Show Totals: Disabled,,,,,,,
4092,Show Zero Values: Disabled,,,,,,,
4093,Show Suppressed: False,,,,,,,
4094,---,,,,,,,
4095,Help: See http://wonder.cdc.gov/wonder/help/uc...,,,,,,,
4096,---,,,,,,,


In [7]:
# Check whether any row that has a County also has a note attached
test.loc[test["County"].notna(), "Notes"].value_counts()

Series([], Name: count, dtype: int64)

In [8]:
# Read every yearly file from the archive and collect the cleaned frames
df_list = []
for file in file_list:
    # Open the archive member in a context manager so its handle is closed
    # as soon as the file has been parsed
    with z.open(file) as member:
        df_temp = pd.read_csv(member, sep="\t")

    # Drop the "Notes" column, then drop every row without a County value.
    # In the 2003 sample file those are the trailing footer rows (query
    # metadata). No state-level filtering (e.g. Alaska) happens here.
    df_temp = df_temp.drop(columns=["Notes"]).dropna(subset=["County"])

    # Add the cleaned frame for this file to the list
    df_list.append(df_temp)

In [9]:
# Stack the per-file frames row-wise into one frame with a fresh 0..n-1 index
df = pd.concat(df_list, axis=0, ignore_index=True)
df.sample(n=5)

Unnamed: 0,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
56292,"Montgomery County, TN",47125.0,2015.0,2015.0,All other alcohol-induced causes,A9,20.0
36505,"Boone County, NE",31011.0,2011.0,2011.0,All other non-drug and non-alcohol causes,O9,70.0
48815,"Bureau County, IL",17011.0,2014.0,2014.0,All other non-drug and non-alcohol causes,O9,374.0
8501,"Mariposa County, CA",6043.0,2005.0,2005.0,All other non-drug and non-alcohol causes,O9,170.0
1006,"Boone County, IN",18011.0,2003.0,2003.0,All other non-drug and non-alcohol causes,O9,436.0


In [10]:
# Count missing values per column in the combined frame
df.isna().sum()

County                             0
County Code                        0
Year                               0
Year Code                          0
Drug/Alcohol Induced Cause         0
Drug/Alcohol Induced Cause Code    0
Deaths                             0
dtype: int64

In [11]:
# Column dtypes and non-null counts of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57241 entries, 0 to 57240
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   County                           57241 non-null  object 
 1   County Code                      57241 non-null  float64
 2   Year                             57241 non-null  float64
 3   Year Code                        57241 non-null  float64
 4   Drug/Alcohol Induced Cause       57241 non-null  object 
 5   Drug/Alcohol Induced Cause Code  57241 non-null  object 
 6   Deaths                           57241 non-null  object 
dtypes: float64(3), object(4)
memory usage: 3.1+ MB


In [12]:
# Build a cleaned copy of the combined frame; `df` itself is left untouched.
df2 = df.assign(
    **{
        # County code as a zero-padded 5-character string, for consistency
        # with other data sets
        "County Code": df["County Code"].astype(int).astype(str).str.zfill(5),
        # Year as a plain integer
        "Year": df["Year"].astype(int),
        # Deaths as nullable Int64: the "Missing" placeholder becomes a null,
        # and the nullable dtype lets those nulls be retained
        "Deaths": df["Deaths"]
        .replace("Missing", np.nan)
        .astype(float)
        .astype("Int64"),
    }
)

In [13]:
# List the distinct cause labels present in the data
pd.unique(df2["Drug/Alcohol Induced Cause"])

array(['All other non-drug and non-alcohol causes',
       'Drug poisonings (overdose) Unintentional (X40-X44)',
       'All other alcohol-induced causes',
       'All other drug-induced causes',
       'Drug poisonings (overdose) Suicide (X60-X64)',
       'Drug poisonings (overdose) Undetermined (Y10-Y14)',
       'Alcohol poisonings (overdose) (X45, X65, Y15)',
       'Drug poisonings (overdose) Homicide (X85)'], dtype=object)

In [14]:
# Cause labels that count as drug-related. The alcohol-induced and
# non-drug/non-alcohol labels are deliberately not listed.
DRUG_RELATED_CAUSES = (
    "Drug poisonings (overdose) Unintentional (X40-X44)",
    "All other drug-induced causes",
    "Drug poisonings (overdose) Homicide (X85)",
    "Drug poisonings (overdose) Suicide (X60-X64)",
    "Drug poisonings (overdose) Undetermined (Y10-Y14)",
)
required_causes = list(DRUG_RELATED_CAUSES)

In [15]:
# Keep only the rows whose cause is one of the drug-related labels
is_drug_related = df2["Drug/Alcohol Induced Cause"].isin(required_causes)
df3 = df2.loc[is_drug_related]

In [16]:
# Drop the two code columns that are not needed downstream
extra_columns = ["Year Code", "Drug/Alcohol Induced Cause Code"]
df3 = df3.drop(columns=extra_columns)

In [17]:
# Display the filtered, drug-related rows
df3

Unnamed: 0,County,County Code,Year,Drug/Alcohol Induced Cause,Deaths
1,"Baldwin County, AL",01003,2003,Drug poisonings (overdose) Unintentional (X40-...,10
38,"Jefferson County, AL",01073,2003,Drug poisonings (overdose) Unintentional (X40-...,37
39,"Jefferson County, AL",01073,2003,All other drug-induced causes,32
54,"Mobile County, AL",01097,2003,Drug poisonings (overdose) Unintentional (X40-...,26
78,"Anchorage Borough, AK",02020,2003,Drug poisonings (overdose) Unintentional (X40-...,31
...,...,...,...,...,...
57202,"Waukesha County, WI",55133,2015,Drug poisonings (overdose) Unintentional (X40-...,34
57208,"Winnebago County, WI",55139,2015,Drug poisonings (overdose) Unintentional (X40-...,22
57218,"Fremont County, WY",56013,2015,Drug poisonings (overdose) Unintentional (X40-...,10
57224,"Laramie County, WY",56021,2015,Drug poisonings (overdose) Unintentional (X40-...,13


In [21]:
# Shorten the cause column name and remove the space from the county-code name
new_names = {"Drug/Alcohol Induced Cause": "Cause", "County Code": "County_Code"}
df3 = df3.rename(columns=new_names)

In [22]:
# Load the county FIPS lookup, used to attach a proper county name and state.
# The code is stored as a zero-padded 5-character string so it can be matched
# against County_Code.
fips = pd.read_csv("../../Data/raw/county_fips.csv")
fips = fips.assign(countyfips=fips["countyfips"].astype(str).str.zfill(5))

In [23]:
# Left-join the mortality rows onto the FIPS lookup to pick up county names.
# validate="m:1" raises if a FIPS code appears more than once in the lookup;
# indicator=True adds a "_merge" column so unmatched rows can be inspected.
df4 = df3.merge(
    fips,
    how="left",
    left_on="County_Code",
    right_on="countyfips",
    validate="m:1",
    indicator=True,
)

In [29]:
# Rows that found no match in the FIPS lookup
df4.loc[df4["_merge"].eq("left_only")]

Unnamed: 0,County,County_Code,Year,Cause,Deaths,BUYER_COUNTY,BUYER_STATE,countyfips,_merge


In [30]:
# List the columns available after the merge
df4.columns

Index(['County', 'County_Code', 'Year', 'Cause', 'Deaths', 'BUYER_COUNTY',
       'BUYER_STATE', 'countyfips', '_merge'],
      dtype='object')

In [32]:
# Keep only the required columns, then rename the FIPS-derived fields to
# "State" and "County"
keep_cols = ["BUYER_STATE", "BUYER_COUNTY", "County_Code", "Year", "Cause", "Deaths"]
df5 = df4[keep_cols].rename(
    columns={"BUYER_COUNTY": "County", "BUYER_STATE": "State"}
)

In [33]:
# Display the final frame
df5

Unnamed: 0,State,County,County_Code,Year,Cause,Deaths
0,AL,BALDWIN,01003,2003,Drug poisonings (overdose) Unintentional (X40-...,10
1,AL,JEFFERSON,01073,2003,Drug poisonings (overdose) Unintentional (X40-...,37
2,AL,JEFFERSON,01073,2003,All other drug-induced causes,32
3,AL,MOBILE,01097,2003,Drug poisonings (overdose) Unintentional (X40-...,26
4,AK,ANCHORAGE,02020,2003,Drug poisonings (overdose) Unintentional (X40-...,31
...,...,...,...,...,...,...
10427,WI,WAUKESHA,55133,2015,Drug poisonings (overdose) Unintentional (X40-...,34
10428,WI,WINNEBAGO,55139,2015,Drug poisonings (overdose) Unintentional (X40-...,22
10429,WY,FREMONT,56013,2015,Drug poisonings (overdose) Unintentional (X40-...,10
10430,WY,LARAMIE,56021,2015,Drug poisonings (overdose) Unintentional (X40-...,13


In [None]:
# write to parquet in main file