# Loading Data

In [None]:
import glob
# Gather the yearly export files in a stable, sorted order.
files = sorted(glob.glob("Vitalstatistics/Underlying Cause of Death*.txt"))

# Print the number of lines in each file as a quick size check.
# Undecodable bytes are ignored so a stray character cannot abort the count.
for f in files:
    n = 0
    with open(f, "r", encoding="utf-8", errors="ignore") as fp:
        for _ in fp:
            n += 1
    print(f"{f}: {n} rows")

Vitalstatistics\Underlying Cause of Death, 2003.txt: 4103 rows
Vitalstatistics\Underlying Cause of Death, 2004.txt: 4151 rows
Vitalstatistics\Underlying Cause of Death, 2005.txt: 4218 rows
Vitalstatistics\Underlying Cause of Death, 2006.txt: 4278 rows
Vitalstatistics\Underlying Cause of Death, 2007.txt: 4320 rows
Vitalstatistics\Underlying Cause of Death, 2008.txt: 4385 rows
Vitalstatistics\Underlying Cause of Death, 2009.txt: 4380 rows
Vitalstatistics\Underlying Cause of Death, 2010.txt: 4457 rows
Vitalstatistics\Underlying Cause of Death, 2011.txt: 4531 rows
Vitalstatistics\Underlying Cause of Death, 2012.txt: 4550 rows
Vitalstatistics\Underlying Cause of Death, 2013.txt: 4610 rows
Vitalstatistics\Underlying Cause of Death, 2014.txt: 4667 rows
Vitalstatistics\Underlying Cause of Death, 2015.txt: 4799 rows


In [None]:
import polars as pl

# Lazily scan each tab-separated yearly file. infer_schema_length=0 disables
# schema inference, so every column is read as a string.
lazy_frames = [
    pl.scan_csv(path, separator="\t", infer_schema_length=0) for path in files
]

# Stack the yearly scans and write the combined result to a single parquet file.
vital_stats = pl.concat(lazy_frames)
vital_stats.sink_parquet("vital_stats_2003_2015.parquet")


In [2]:
# Load the combined raw parquet eagerly and preview its first five rows.
df = pl.read_parquet("vital_stats_2003_2015.parquet")
df.head(5)

Notes,County,County Code,Year,Year Code,Drug/Alcohol Induced Cause,Drug/Alcohol Induced Cause Code,Deaths
str,str,str,str,str,str,str,str
,"""Autauga County, AL""","""1001""","""2003.0""","""2003.0""","""All other non-drug and non-alc…","""O9""","""397.0"""
,"""Baldwin County, AL""","""1003""","""2003.0""","""2003.0""","""Drug poisonings (overdose) Uni…","""D1""","""10.0"""
,"""Baldwin County, AL""","""1003""","""2003.0""","""2003.0""","""All other alcohol-induced caus…","""A9""","""14.0"""
,"""Baldwin County, AL""","""1003""","""2003.0""","""2003.0""","""All other non-drug and non-alc…","""O9""","""1479.0"""
,"""Barbour County, AL""","""1005""","""2003.0""","""2003.0""","""All other non-drug and non-alc…","""O9""","""287.0"""


## Data Cleaning & Column Drop

In [20]:
import polars as pl

lf = pl.scan_parquet("vital_stats_2003_2015.parquet")

# Raw column name -> snake_case name. Columns not listed here are dropped.
RENAMES = {
    "County": "county",
    "County Code": "county_code",
    "Year": "year",
    "Drug/Alcohol Induced Cause": "cause",
    "Drug/Alcohol Induced Cause Code": "cause_code",
    "Deaths": "deaths",
}
# Columns holding integers stored as text such as "2003.0".
INT_COLS = ["year", "county_code", "deaths"]
# Columns that stay as text.
TEXT_COLS = ["county", "cause", "cause_code"]


def _unquote(name):
    """Return an expression for column `name` with every double quote removed.

    `str.replace` only replaces the first match, which would leave the closing
    quote of a quoted value in place; `str.replace_all` removes all of them.
    `literal=True` treats the pattern as plain text rather than a regex.
    """
    return pl.col(name).str.replace_all('"', "", literal=True)


vital_clean = (
    lf
    .select([pl.col(raw).alias(clean) for raw, clean in RENAMES.items()])
    .with_columns(
        # Quotes are stripped *before* the trailing ".0" so that the
        # `$`-anchored pattern can still match a value wrapped in quotes.
        [_unquote(c).str.replace(r"\.0$", "") for c in INT_COLS]
        + [_unquote(c) for c in TEXT_COLS]
    )
    .with_columns(
        # strict=False turns values that cannot be parsed into nulls instead
        # of raising, so malformed rows surface later as null entries.
        [pl.col(c).cast(pl.Int64, strict=False) for c in INT_COLS]
    )
)

vital_clean.sink_parquet("vital_stats_2003_2015_clean.parquet")


In [28]:
# Reload the cleaned parquet and report its (rows, columns) dimensions.
df = pl.read_parquet("vital_stats_2003_2015_clean.parquet")
(df.height, df.width)

(57436, 6)

## Basic Checks

In [None]:
# Data types check: show the column name -> dtype mapping of the cleaned frame
# to confirm the three numeric columns were cast to Int64.
df.schema 

Schema([('county', String),
        ('county_code', Int64),
        ('year', Int64),
        ('cause', String),
        ('cause_code', String),
        ('deaths', Int64)])

In [None]:
# Year check: count rows per year, then inspect the rows whose year is null.
# Weird: the null-year rows previewed below are null in every column.
year_counts = df.group_by("year").len().sort("year")
# Only a cell's last expression is rendered automatically, so the per-year
# counts have to be displayed explicitly or they are silently discarded.
display(year_counts)
df.filter(pl.col("year").is_null()).head(20)

county,county_code,year,cause,cause_code,deaths
str,i64,i64,str,str,i64
,,,,,
,,,,,
,,,,,
,,,,,
,,,,,
…,…,…,…,…,…
,,,,,
,,,,,
,,,,,
,,,,,


In [None]:
# Drop the rows that have no year and persist the remaining rows.
n_null_year = df.filter(pl.col("year").is_null()).height  # 195 when last run
df_no_null = df.filter(pl.col("year").is_not_null())

# The null and non-null subsets must add up to the original row count.
assert df_no_null.height + n_null_year == df.height

# Report what was dropped. Bare expressions in the middle of a cell are not
# rendered, so the summary is printed/displayed explicitly.
print(f"Dropped {n_null_year} rows with a null year; remaining shape: {df_no_null.shape}")
display(df_no_null.group_by("year").len().sort("year"))

df_no_null.write_parquet("vital_stats_2003_2015_fully_cleaned.parquet")

In [None]:
df = pl.read_parquet("vital_stats_2003_2015_fully_cleaned.parquet")

# Duplicate check: each (county_code, year, cause_code) combination is
# expected to appear once, so any group with more than one row is a duplicate.
key_cols = ["county_code", "year", "cause_code"]
group_sizes = df.group_by(key_cols).len()
dup = group_sizes.filter(pl.col("len") > 1).sort("len", descending=True)

# Show the most heavily duplicated keys first (empty frame means no duplicates).
dup.head(20)


county_code,year,cause_code,len
i64,i64,str,u32


In [None]:
# Range check on the death counts: smallest and largest value in the column.
df.select(
    deaths_min=pl.col("deaths").min(),
    deaths_max=pl.col("deaths").max(),
)

deaths_min,deaths_max
i64,i64
10,60312
