To start, we'll load a few modules with the `import` statement. Once a module is imported, the functions it provides are accessible through the module's name (or its alias, as in `pd.read_csv`). Modules and import statements help programmers avoid naming conflicts: you can use short, straightforward names for functions and variables without worrying that they're already taken. Matlab has no direct equivalent of Python's module system, which can make Matlab code harder to read.

In [1]:
import sqlite3
import pandas as pd
import re
import numpy as np

## Download Data

Notes:

1) Optional, for inspecting the database outside of Python: download SQLite from https://www.sqlite.org/download.html
or the SQLite browser from https://sqlitebrowser.org/dl/
2) Need the sqlite3 module (part of the Python standard library) and pandas

In [3]:
# Data sets available at https://catalog.data.gov/dataset
# The delimiter is passed by keyword: pandas 2.0+ accepts only the path
# positionally, so a positional ',' raises TypeError there.
data = pd.read_csv('NCHS_-_Leading_Causes_of_Death__United_States.csv', sep=',')


In [108]:
# Look at data: preview the first few rows of the loaded DataFrame
data.head()

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
0,2012,"Nephritis, nephrotic syndrome and nephrosis (N...",Kidney disease,Vermont,21,2.6
1,2016,"Nephritis, nephrotic syndrome and nephrosis (N...",Kidney disease,Vermont,30,3.7
2,2013,"Nephritis, nephrotic syndrome and nephrosis (N...",Kidney disease,Vermont,30,3.8
3,2000,"Intentional self-harm (suicide) (*U03,X60-X84,...",Suicide,District of Columbia,23,3.8
4,2014,"Nephritis, nephrotic syndrome and nephrosis (N...",Kidney disease,Arizona,325,4.1


In [131]:
# Show the DataFrame's column names as a plain list
print(list(data.columns))

['Year', 'Cause_Description', 'Cause_Name', 'State', 'Deaths', 'Age_adjusted_Death_Rate']
10296


In [110]:
# Clean names
def clean_names(df):
    """Rename the columns of ``df`` in place so they hold no whitespace or hyphens.

    Every run of whitespace and every single hyphen in a column name is
    replaced by one underscore. The frame is modified directly and nothing
    is returned.
    """
    df.columns = [re.sub(r"\s+|-", '_', name) for name in df.columns]
        
        

In [111]:
# Normalise the column names in place, then list the new names
clean_names(data)
print(list(data.columns))

['Year', '113_Cause_Name', 'Cause_Name', 'State', 'Deaths', 'Age_adjusted_Death_Rate']


In [112]:
# '113_Cause_Name' will still cause trouble: a name that starts with a digit
# cannot be reached with attribute access (data.<name>), so the second column
# is given a new name.
cols = list(data.columns)
cols[1] = 'Cause_Description'
data.columns = cols
print(list(data.columns))

['Year', 'Cause_Description', 'Cause_Name', 'State', 'Deaths', 'Age_adjusted_Death_Rate']


In [126]:
# Make year table: one row per distinct year found in the data
# Sort list of unique years
yr_unique = np.sort(data.Year.unique()).tolist()
sql_statement1 = ("DROP TABLE IF EXISTS Year")
sql_statement2 = '''CREATE TABLE Year(
                    YearID INTEGER PRIMARY KEY,
                    Year INTEGER NOT NULL
                    )'''
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    cur = conn.cursor()
    # Rebuild the table from scratch so the cell can be re-run safely
    cur.execute(sql_statement1)
    cur.execute(sql_statement2)
    # YearID is omitted from the INSERT so SQLite assigns it.
    # All rows go in with one call and one commit instead of a commit per row.
    cur.executemany("INSERT INTO Year (Year) VALUES (?)",
                    [(yr,) for yr in yr_unique])
    conn.commit()
finally:
    # Close the connection even if one of the statements above fails
    conn.close()


In [127]:
# Check year table created: read it back into a DataFrame
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    df_year = pd.read_sql_query("SELECT * FROM Year", conn)
finally:
    # Close the connection even if the read fails
    conn.close()
df_year.head()

Unnamed: 0,YearID,Year
0,1,1999
1,2,2000
2,3,2001
3,4,2002
4,5,2003


In [128]:
# Create Cause Table: one row per distinct (cause name, cause description) pair
# Take the pairs straight from the rows so that every name is stored with its
# own description. Two independent unique() lists paired by index only line up
# when both columns happen to share the same first-appearance order and count.
cause_pairs = data[['Cause_Name', 'Cause_Description']].drop_duplicates()
cause_unique = cause_pairs['Cause_Name'].tolist()
cause_desc_unique = cause_pairs['Cause_Description'].tolist()
# Longest value in each column, used as the declared VARCHAR width
maxLengthCause = max(len(item) for item in cause_unique)
maxLengthDesc = max(len(item) for item in cause_desc_unique)
sql_statement1 = ("DROP TABLE IF EXISTS Cause")
sql_statement2 = '''CREATE TABLE Cause(
                    CauseID INTEGER PRIMARY KEY,
                    Cause_Name VARCHAR({0}),
                    CauseDescription VARCHAR({1})
                    )'''.format(maxLengthCause,maxLengthDesc)
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    cur = conn.cursor()
    # Rebuild the table from scratch so the cell can be re-run safely
    cur.execute(sql_statement1)
    cur.execute(sql_statement2)
    # CauseID is omitted so SQLite assigns it; one commit covers all rows
    cur.executemany("INSERT INTO Cause (Cause_Name,CauseDescription) VALUES (?,?)",
                    list(zip(cause_unique, cause_desc_unique)))
    conn.commit()
finally:
    # Close the connection even if one of the statements above fails
    conn.close()

In [123]:
# Check cause table created: read it back into a DataFrame
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    df_cause = pd.read_sql_query("SELECT * FROM Cause", conn)
finally:
    # Close the connection even if the read fails
    conn.close()
df_cause.head()

Unnamed: 0,CauseID,Cause_Name,CauseDescription
0,1,Kidney disease,"Nephritis, nephrotic syndrome and nephrosis (N..."
1,2,Suicide,"Intentional self-harm (suicide) (*U03,X60-X84,..."
2,3,Alzheimer's disease,Alzheimer's disease (G30)
3,4,Influenza and pneumonia,Influenza and pneumonia (J09-J18)
4,5,Diabetes,Diabetes mellitus (E10-E14)


In [117]:
# Make state table: one row per distinct state found in the data
# Sort list of unique states
state_unique = np.sort(data.State.unique()).tolist()
# Longest state name, used as the declared VARCHAR width
maxLength = max(len(item) for item in state_unique)
sql_statement1 = ("DROP TABLE IF EXISTS State")
sql_statement2 = '''CREATE TABLE State(
                    StateID INTEGER PRIMARY KEY,
                    State VARCHAR({0})
                    )'''.format(maxLength)
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    cur = conn.cursor()
    # Rebuild the table from scratch so the cell can be re-run safely
    cur.execute(sql_statement1)
    cur.execute(sql_statement2)
    # StateID is omitted so SQLite assigns it; one commit covers all rows
    cur.executemany("INSERT INTO State (State) VALUES (?)",
                    [(state,) for state in state_unique])
    conn.commit()
finally:
    # Close the connection even if one of the statements above fails
    conn.close()

In [118]:
# Check state table created: read it back into a DataFrame
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    df_state = pd.read_sql_query("SELECT * FROM State", conn)
finally:
    # Close the connection even if the read fails
    conn.close()
df_state.head()

Unnamed: 0,StateID,State
0,1,Alabama
1,2,Alaska
2,3,Arizona
3,4,Arkansas
4,5,California


In [137]:
# Make deaths table: one row per record in the data, with the state, cause
# and year replaced by the IDs from the lookup tables.
# Merge ID values to dataframe (df_state, df_cause and df_year are the frames
# read back from the State, Cause and Year tables)
merged = pd.merge(data,df_state,how='outer',on=['State'])
merged = pd.merge(merged,df_cause,how='outer',on=['Cause_Name'])
merged = pd.merge(merged, df_year, how ='outer',on=['Year'])

sql_statement1 = ("DROP TABLE IF EXISTS Deaths")
sql_statement2 = '''CREATE TABLE Deaths (
                    ID INTEGER PRIMARY KEY,  
                    Deaths INTEGER,
                    Age_adjusted_Death_Rate FLOAT,
                    CauseID INTEGER,
                    YearID INTEGER,
                    StateID INTEGER,
                    FOREIGN KEY (CauseID) REFERENCES Cause(CauseID),
                    FOREIGN KEY (YearID) REFERENCES Year(YearID),
                    FOREIGN KEY (StateID) REFERENCES State(StateID)
                    );'''

# Get data into right format: plain Python lists, one per inserted column
deaths = merged['Deaths'].values.tolist()
age_adj_rate = merged['Age_adjusted_Death_Rate'].values.tolist()
causeID = merged['CauseID'].values.tolist()
yearID = merged['YearID'].values.tolist()
stateID = merged['StateID'].values.tolist()

# The connection is opened only once the data is ready, and is always closed
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    cur = conn.cursor()
    # Rebuild the table from scratch so the cell can be re-run safely
    cur.execute(sql_statement1)
    cur.execute(sql_statement2)
    # One executemany and a single commit replace a commit per inserted row
    cur.executemany('''INSERT INTO Deaths 
    (Deaths,Age_adjusted_Death_Rate,CauseID,YearID,StateID) 
    VALUES (?,?,?,?,?)''',
                    list(zip(deaths, age_adj_rate, causeID, yearID, stateID)))
    conn.commit()
finally:
    conn.close()

In [138]:
# Check death table created: read it back into a DataFrame
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    df_death = pd.read_sql_query("SELECT * FROM Deaths", conn)
finally:
    # Close the connection even if the read fails
    conn.close()
df_death.head()

Unnamed: 0,ID,Deaths,Age_adjusted_Death_Rate,CauseID,YearID,StateID
0,1,21,2.6,1,14,47
1,2,44,7.2,1,14,9
2,3,414,5.7,1,14,3
3,4,58,5.6,1,14,42
4,5,482,6.6,1,14,49


In [161]:
# Queries
def query(db,query):
    """Run a SQL statement against a SQLite database and return the result.

    Parameters
    ----------
    db : str
        Database location handed to ``sqlite3.connect``.
    query : str
        SQL text handed to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame or None
        The result set, or ``None`` when running the statement raised an
        ``Exception`` (the error is printed, prefixed with ``ERROR``).
    """
    conn = sqlite3.connect(db)
    try:
        return pd.read_sql_query(query, conn)
    except Exception as e:
        # Best-effort helper for interactive use: report the failure and
        # return None rather than raising.
        print("ERROR",e)
        return None
    finally:
        # Runs on every exit path, so the connection is never left open
        conn.close()
    

In [162]:
# List the names of the tables stored in the database
conn = sqlite3.connect("leading_cases_of_death.sqlite")
try:
    cur = conn.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cur.fetchall())
finally:
    # The original cell never closed this connection
    conn.close()

[('State',), ('Year',), ('Cause',), ('Deaths',)]


In [182]:
# What were the leading causes of death in Oregon in 2016?
db = "leading_cases_of_death.sqlite"
# Deaths stores only IDs, so the State, Year and Cause tables are joined in
# to get readable values before filtering on Oregon and 2016.
# NOTE(review): the statement has no ORDER BY, so rows are not ranked by Deaths.
# Possible tidy-up: alias the tables (Year AS y, Cause AS c) and select
# y.Year, c.Cause_Name
oregon_2016_sql = '''SELECT State.State, Deaths.Deaths, Year.Year, Cause.Cause_Name 
      FROM Deaths LEFT OUTER JOIN State
      ON Deaths.StateID = State.StateID
      LEFT OUTER JOIN Year ON Deaths.YearID = Year.YearID
      LEFT OUTER JOIN Cause on Deaths.CauseID = Cause.CauseID
      WHERE State.State = 'Oregon' AND Year.Year = '2016';'''
df = query(db, oregon_2016_sql)

In [184]:
# Display the DataFrame returned by the query above
df

Unnamed: 0,State,Deaths,Year,Cause_Name
0,Oregon,398,2016,Kidney disease
1,Oregon,452,2016,Influenza and pneumonia
2,Oregon,772,2016,Suicide
3,Oregon,1240,2016,Diabetes
4,Oregon,1786,2016,Alzheimer's disease
5,Oregon,1943,2016,Stroke
6,Oregon,2105,2016,Unintentional injuries
7,Oregon,2080,2016,CLRD
8,Oregon,6968,2016,Heart disease
9,Oregon,8078,2016,Cancer
