# Preprocessing US Vital Statistics Data

This notebook processes the US Vital Statistics mortality data to create a county-year level dataset of drug-induced deaths (all cause codes starting with `D`), which serves here as the measure of opioid overdose deaths.

- **Goal:** Create a dataset compatible with the ARCOS and Population data for DiD analysis.
- **Input:** `data/raw/US_VitalStatistics/Underlying Cause of Death, YYYY.txt` (one file per year)
- **Output:** `data/processed/vital_stats_deaths_2006_2015.parquet`


In [13]:
import pandas as pd
import os
import glob

# --- Paths (relative to the notebook's working directory) ---
raw_data_dir = '../data/raw/US_VitalStatistics'
output_dir = '../data/processed'

# --- Analysis scope: the 14 states kept for the DiD analysis ---
states_filter = [
    'FL', 'WA', 'GA', 'AL', 'SC', 'NC', 'TN',
    'MS', 'OR', 'CO', 'MN', 'NV', 'CA', 'VA',
]

# First and last year (both inclusive) of the yearly files to process
year_min = 2006
year_max = 2015

# Make sure the output folder exists before any cell tries to write to it
os.makedirs(output_dir, exist_ok=True)

print("Libraries imported and configuration set.")

Libraries imported and configuration set.


In [14]:
# Load a single year's export to learn the column layout and the
# cause-of-death coding scheme before processing every year.
sample_file = os.path.join(raw_data_dir, 'Underlying Cause of Death, 2006.txt')
df_sample = pd.read_csv(sample_file, sep='\t')

print("Columns:", list(df_sample.columns))

# Distinct (code, description) pairs, ordered by code
cause_columns = ['Drug/Alcohol Induced Cause Code', 'Drug/Alcohol Induced Cause']
cause_lookup = df_sample[cause_columns].drop_duplicates()
cause_lookup = cause_lookup.sort_values('Drug/Alcohol Induced Cause Code')

print("\nUnique Causes of Death:")
print(cause_lookup)


Columns: ['Notes', 'County', 'County Code', 'Year', 'Year Code', 'Drug/Alcohol Induced Cause', 'Drug/Alcohol Induced Cause Code', 'Deaths']

Unique Causes of Death:
     Drug/Alcohol Induced Cause Code  \
2                                 A9   
1                                 D1   
119                               D2   
56                                D4   
40                                D9   
0                                 O9   
4262                             NaN   

                             Drug/Alcohol Induced Cause  
2                      All other alcohol-induced causes  
1     Drug poisonings (overdose) Unintentional (X40-...  
119        Drug poisonings (overdose) Suicide (X60-X64)  
56    Drug poisonings (overdose) Undetermined (Y10-Y14)  
40                        All other drug-induced causes  
0             All other non-drug and non-alcohol causes  
4262                                                NaN  


In [15]:
# Process all files
def _load_drug_deaths(file_path, states):
    """Return county-year drug-induced death totals from one yearly export.

    Parameters
    ----------
    file_path : str
        Path to a tab-separated 'Underlying Cause of Death, YYYY.txt' file.
    states : list of str
        State abbreviations to keep, matched against the text after the
        last ', ' in the 'County' column.

    Returns
    -------
    pandas.DataFrame
        One row per ('County Code', 'Year', 'State') with the summed 'Deaths'
        over all cause codes starting with 'D'.
    """
    df = pd.read_csv(file_path, sep='\t')

    # Drop notes/footer rows (where County Code is NaN)
    df = df.dropna(subset=['County Code'])

    # Extract State; .assign returns a new frame, so the filtered frame is
    # never written to in place (avoids chained-assignment warnings).
    df = df.assign(State=df['County'].str.split(', ').str[-1])

    # Keep the selected states and the drug-related causes (codes starting with 'D')
    # D1: Unintentional, D2: Suicide, D3: Homicide, D4: Undetermined, D9: Other
    in_selected_state = df['State'].isin(states)
    is_drug_cause = df['Drug/Alcohol Induced Cause Code'].str.startswith('D', na=False)
    df_drug = df[in_selected_state & is_drug_cause].copy()

    # Ensure County Code is integer
    df_drug['County Code'] = df_drug['County Code'].astype(int)

    # Convert Deaths to numeric before aggregating.
    # NOTE(review): non-numeric entries (e.g. suppressed counts) are counted
    # as 0 here, which understates the totals -- confirm this is acceptable
    # for the downstream analysis.
    df_drug['Deaths'] = pd.to_numeric(df_drug['Deaths'], errors='coerce').fillna(0)

    # Total drug deaths per county and year
    return df_drug.groupby(['County Code', 'Year', 'State'])['Deaths'].sum().reset_index()


all_data = []

for year in range(year_min, year_max + 1):
    file_path = os.path.join(raw_data_dir, f'Underlying Cause of Death, {year}.txt')

    # A missing year is tolerated: warn and carry on with the remaining years
    if not os.path.exists(file_path):
        print(f"Warning: File not found for {year}")
        continue

    print(f"Processing {year}...")
    all_data.append(_load_drug_deaths(file_path, states_filter))

# Fail fast when nothing was loaded: otherwise `final_df` would be undefined
# (or stale from an earlier run of this cell) when the following cells use it.
if not all_data:
    raise FileNotFoundError(
        f"No 'Underlying Cause of Death, YYYY.txt' files for "
        f"{year_min}-{year_max} found in {raw_data_dir}"
    )

# Combine all years
final_df = pd.concat(all_data, ignore_index=True)
print(f"\nProcessing complete. Total rows: {len(final_df)}")
print(final_df.head())


Processing 2006...
Processing 2007...
Processing 2008...
Processing 2009...
Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...

Processing complete. Total rows: 2569
   County Code    Year State  Deaths
0         1003  2006.0    AL    11.0
1         1021  2006.0    AL    13.0
2         1073  2006.0    AL    86.0
3         1097  2006.0    AL    33.0
4         1101  2006.0    AL    12.0


In [16]:
# Clean up and save
final_df = final_df.rename(columns={'County Code': 'fips', 'Year': 'year', 'Deaths': 'drug_deaths'})

# Ensure correct data types
final_df['year'] = final_df['year'].astype(int)
final_df['fips'] = final_df['fips'].astype(int)

# Handle potential non-numeric values in drug_deaths (e.g. "suppressed" or strings):
# anything that cannot be parsed becomes 0 before the integer cast.
final_df['drug_deaths'] = (
    pd.to_numeric(final_df['drug_deaths'], errors='coerce').fillna(0).astype(int)
)

# The output is meant to be a county-year panel, so each (fips, year) pair
# must appear at most once; stop before writing a file that violates this.
duplicate_keys = final_df.duplicated(subset=['fips', 'year'])
assert not duplicate_keys.any(), (
    f"{int(duplicate_keys.sum())} duplicate (fips, year) rows in final_df"
)

# Save to Parquet
output_file = os.path.join(output_dir, 'vital_stats_deaths_2006_2015.parquet')
final_df.to_parquet(output_file, index=False)

print(f"Saved to {output_file}")
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
final_df.info()
print(final_df.head())

Saved to ../data/processed\vital_stats_deaths_2006_2015.parquet
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   fips         2569 non-null   int64 
 1   year         2569 non-null   int64 
 2   State        2569 non-null   object
 3   drug_deaths  2569 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 80.4+ KB
None
   fips  year State  drug_deaths
0  1003  2006    AL           11
1  1021  2006    AL           13
2  1073  2006    AL           86
3  1097  2006    AL           33
4  1101  2006    AL           12


In [17]:
# Verify the saved file by reading it back from disk
df_verify = pd.read_parquet(output_file)
print(df_verify.head())
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df_verify.info()
print("\nTotal Drug Deaths:", df_verify['drug_deaths'].sum())
print("Years:", df_verify['year'].unique())
print("States:", df_verify['State'].unique())


   fips  year State  drug_deaths
0  1003  2006    AL           11
1  1021  2006    AL           13
2  1073  2006    AL           86
3  1097  2006    AL           33
4  1101  2006    AL           12
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   fips         2569 non-null   int64 
 1   year         2569 non-null   int64 
 2   State        2569 non-null   object
 3   drug_deaths  2569 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 80.4+ KB
None

Total Drug Deaths: 127049
Years: [2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]
States: ['AL' 'CA' 'CO' 'FL' 'GA' 'MN' 'MS' 'NV' 'NC' 'OR' 'SC' 'TN' 'VA' 'WA']
Years: [2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]
States: ['AL' 'CA' 'CO' 'FL' 'GA' 'MN' 'MS' 'NV' 'NC' 'OR' 'SC' 'TN' 'VA' 'WA']


In [18]:
# Show the ten county-year rows with the highest drug death counts
rows_by_deaths_desc = df_verify.sort_values(by='drug_deaths', ascending=False)
print(rows_by_deaths_desc.head(10))


      fips  year State  drug_deaths
1741  6037  2013    CA          846
17    6037  2006    CA          825
707   6037  2009    CA          785
235   6037  2007    CA          781
2009  6037  2014    CA          760
2293  6037  2015    CA          749
463   6037  2008    CA          732
1475  6037  2012    CA          716
946   6037  2010    CA          715
1201  6037  2011    CA          706
