# Install Dependent Libraries (Databricks)

Note: If running this notebook in Databricks, you will need the following libraries. If these libraries are not installed on your Databricks Cluster, you can simply uncomment and run the following cell to install those libraries in the notebook before you import the dependencies.

Libraries needed:
- koalas
- mlflow
- tensorflow
- imblearn

In [1]:
# dbutils.library.installPyPI("koalas")
# dbutils.library.installPyPI("mlflow")
# dbutils.library.installPyPI("tensorflow")
# dbutils.library.installPyPI("imblearn")
# dbutils.library.restartPython()

# Import Dependencies

In [2]:
# import databricks.koalas as ks
import pandas as pd

import numpy as np
import gzip

In [3]:
import mlflow.sklearn
import mlflow.keras

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.preprocessing import StandardScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [7]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, utils

# Create a Keras model that's compatible with scikit-learn
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import pickle
import tempfile
from tensorflow.keras.models import Sequential, load_model, save_model, Model
from tensorflow.keras.layers import Dense

# Connect to the AWS S3 Mount and Read CSV (Databricks only)

In [8]:
# ACCESS_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-access-key")
# SECRET_KEY = "ENTER_YOUR_KEY_HERE" # dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
# ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
# AWS_BUCKET_NAME = "ENTER_YOUR_BUCKET_HERE" #Or the bucket you saved your data to
# MOUNT_NAME = "mnt_s3"
# s3_uri = f"s3a://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_BUCKET_NAME}"
# mount_uri = f"/mnt/{MOUNT_NAME}"
# display(dbutils.fs.ls(mount_uri))

In [9]:
# # Read CSVs
# df = pd.read_csv("/dbfs/mnt/%s/Project 3 Stuff/cod_clean.csv.gz" % MOUNT_NAME, compression="gzip")

# Read the CSV (Local Jupyter Notebook only)

In [10]:
# Read CSVs
df = pd.read_csv("../data/cod_clean.csv.gz", compression="gzip")

# Select your Features and Labels

In [11]:
# Drop unnecessary column
df = df.drop(columns="ICD Code")
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,Cause of Death,Race,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,2005,All other forms of chronic ischemic heart dise...,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,Other chronic obstructive pulmonary disease,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,"Of trachea, bronchus and lung",White,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,2005,Intentional self-harm,White,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,"Stroke, not specified as hemorrhage or infarct...",White,Diseases of the circulatory system


In [12]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Education Level,Month of Death,Sex/Gender,Age Groups,Marital Status,Year,Cause of Death,Race,Cause of Death Category
0,8th grade or less,June,M,85 years and over,Married,2005,All other forms of chronic ischemic heart dise...,White,Diseases of the circulatory system
1,"9 - 12th grade, no diploma",January,F,45 - 54 years,Married,2005,Other chronic obstructive pulmonary disease,White,Diseases of the respiratory system
2,high school graduate or GED completed,January,F,65 - 74 years,Widowed,2005,"Of trachea, bronchus and lung",White,Neoplasms
3,high school graduate or GED completed,January,M,55 - 64 years,Married,2005,Intentional self-harm,White,External causes of morbidity and mortality
4,high school graduate or GED completed,January,M,75 - 84 years,Married,2005,"Stroke, not specified as hemorrhage or infarct...",White,Diseases of the circulatory system


In [13]:
# Select desired features
cleanup_df = df[["Cause of Death", "Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]

In [14]:
# Select desired labels (5 causes of death, plus a "control group")
cleanup_df = cleanup_df.loc[(cleanup_df["Cause of Death"] == "Other cerebrovascular diseases and their sequelae ") | 
                            (cleanup_df["Cause of Death"] == "All other diseases of respiratory system ") |
                            (cleanup_df["Cause of Death Category"] == "External causes of morbidity and mortality") |
                            (cleanup_df["Cause of Death"] == "Alzheimer's disease ") |
                            (cleanup_df["Cause of Death"] == "Diabetes mellitus ") |
                            (cleanup_df["Cause of Death"] == "All other symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified ")
                            ]
cleanup_df = cleanup_df.reset_index(drop=True)

In [15]:
# Standardize values
cleanup_df = cleanup_df.replace({
    "Endocrine, nutritional and metabolic diseases": "Diabetes mellitus",
    "Diseases of the nervous system": "Alzheimer's Disease",
    "Diseases of the circulatory system": "Cerebrovascular Diseases",
    "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified": "Other"
})

In [16]:
# Arrange final columns
selected_features = cleanup_df[["Cause of Death Category", "Year", "Month of Death", "Sex/Gender",
                        "Marital Status", "Age Groups", "Education Level", "Race"]]
selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,External causes of morbidity and mortality,2005,January,M,Married,55 - 64 years,high school graduate or GED completed,White
1,Cerebrovascular Diseases,2005,January,F,Widowed,75 - 84 years,8th grade or less,White
2,Alzheimer's Disease,2005,January,M,Widowed,85 years and over,"some college credit, but no degree",White
3,Alzheimer's Disease,2005,January,M,Married,75 - 84 years,"some college credit, but no degree",White
4,Diabetes mellitus,2005,January,M,Married,65 - 74 years,Master’s degree,White


In [17]:
# Print labels
for x in selected_features["Cause of Death Category"].unique():
    print(x)

External causes of morbidity and mortality
Cerebrovascular Diseases
Alzheimer's Disease
Diabetes mellitus
Diseases of the respiratory system
Other


# Select Labels for Test and Control Groups

In [18]:
selected_features = selected_features.loc[(selected_features["Cause of Death Category"] == "External causes of morbidity and mortality") |
                                         (selected_features["Cause of Death Category"] == "Other")]
selected_features = selected_features.reset_index(drop=True)

In [19]:
selected_features

Unnamed: 0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
0,External causes of morbidity and mortality,2005,January,M,Married,55 - 64 years,high school graduate or GED completed,White
1,External causes of morbidity and mortality,2005,January,F,"Never married, single",5 - 14 years,8th grade or less,American Indian
2,External causes of morbidity and mortality,2005,January,F,"Never married, single",5 - 14 years,8th grade or less,American Indian
3,External causes of morbidity and mortality,2005,January,M,Widowed,85 years and over,"some college credit, but no degree",White
4,External causes of morbidity and mortality,2005,January,M,"Never married, single",35 - 44 years,Associate degree,White
...,...,...,...,...,...,...,...,...
1525972,External causes of morbidity and mortality,2015,December,F,"Never married, single",45 - 54 years,"some college credit, but no degree",Black
1525973,External causes of morbidity and mortality,2015,December,F,"Never married, single",55 - 64 years,high school graduate or GED completed,Black
1525974,External causes of morbidity and mortality,2015,December,M,"Never married, single",55 - 64 years,high school graduate or GED completed,Black
1525975,External causes of morbidity and mortality,2015,December,M,"Never married, single",25 - 34 years,high school graduate or GED completed,White


# Preview Data Distribution by Features and Classes

In [20]:
cod = selected_features.groupby("Cause of Death Category").count()
cod.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level,Race
Cause of Death Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
External causes of morbidity and mortality,1412694,1412694,1412694,1412694,1412694,1412694,1412694
Other,113283,113283,113283,113283,113283,113283,113283


In [21]:
age = selected_features.groupby("Age Groups").count()
age.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Education Level,Race
Age Groups,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
45 - 54 years,243069,243069,243069,243069,243069,243069,243069
85 years and over,210517,210517,210517,210517,210517,210517,210517
25 - 34 years,204668,204668,204668,204668,204668,204668,204668
35 - 44 years,198380,198380,198380,198380,198380,198380,198380
55 - 64 years,181627,181627,181627,181627,181627,181627,181627
15 - 24 years,174319,174319,174319,174319,174319,174319,174319
75 - 84 years,152466,152466,152466,152466,152466,152466,152466
65 - 74 years,118595,118595,118595,118595,118595,118595,118595
5 - 14 years,18238,18238,18238,18238,18238,18238,18238
1 - 4 years,12989,12989,12989,12989,12989,12989,12989


In [22]:
race = selected_features.groupby("Race").count()
race.sort_values(by="Year", ascending=False)

Unnamed: 0_level_0,Cause of Death Category,Year,Month of Death,Sex/Gender,Marital Status,Age Groups,Education Level
Race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
White,1286957,1286957,1286957,1286957,1286957,1286957,1286957
Black,184780,184780,184780,184780,184780,184780,184780
Asian or Pacific Islander,32854,32854,32854,32854,32854,32854,32854
American Indian,21386,21386,21386,21386,21386,21386,21386


# Apply One-Hot Encoding

In [23]:
label_encoder = LabelEncoder()
onehotencoder = OneHotEncoder()

### Encode X data (features)

In [24]:
column_list = ["Month of Death", "Age Groups", "Education Level", "Sex/Gender", "Marital Status", "Race"]

In [25]:
for column in column_list:
    # Reshape column data; fit to the one-hot-encoder (expands columns)
    X = onehotencoder.fit_transform(selected_features[column].values.reshape(-1,1)).toarray()
    
    # Send the one-hot-encoded information from that column to a new dataframe
    dfOneHot = pd.DataFrame(X, columns = [column+str(int(i)) for i in range(X.shape[1])])
    
    # Merge the one-hot-encoded dataframe to the master dataframe
    selected_features = selected_features.merge(dfOneHot, how="right", right_index=True, left_index=True)
    
    # Drop the column selected (no longer needed)
    selected_features = selected_features.drop([column], axis=1)

selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,External causes of morbidity and mortality,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,External causes of morbidity and mortality,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,External causes of morbidity and mortality,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,External causes of morbidity and mortality,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,External causes of morbidity and mortality,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Encode Y data (labels/categories)

In [26]:
selected_features["Cause of Death Category"] = label_encoder.fit_transform(selected_features["Cause of Death Category"])
selected_features.head()

Unnamed: 0,Cause of Death Category,Year,Month of Death0,Month of Death1,Month of Death2,Month of Death3,Month of Death4,Month of Death5,Month of Death6,Month of Death7,...,Sex/Gender1,Marital Status0,Marital Status1,Marital Status2,Marital Status3,Marital Status4,Race0,Race1,Race2,Race3
0,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0,2005,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Select Data Values

In [27]:
X = selected_features.iloc[:, 1:45]
y = selected_features.iloc[:, 0]

# Create a Train Test Split

In [28]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Make Keras Pickle-able

Boiler plate code found at https://github.com/tensorflow/tensorflow/issues/34697

In [29]:
# Hotfix function
def make_keras_picklable():
    def __getstate__(self):
        model_str = ""
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            save_model(self, fd.name, overwrite=True)
            model_str = fd.read()
        d = {'model_str': model_str}
        return d

    def __setstate__(self, state):
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            fd.write(state['model_str'])
            fd.flush()
            model = load_model(fd.name)
        self.__dict__ = model.__dict__


    cls = Model
    cls.__getstate__ = __getstate__
    cls.__setstate__ = __setstate__

# Run the function
make_keras_picklable()

# Make a Keras Deep Learning Classifier

In [30]:
classifier = models.Sequential()
number_inputs = 44
first_hidden_layer = 60
second_hidden_layer = 48
third_hidden_layer = 36
fourth_hidden_layer = 24
fifth_hidden_layer = 12
sixth_hidden_layer = 6
number_classes = 2

classifier.add(layers.Dense(units=first_hidden_layer, activation='relu', input_dim=number_inputs))
classifier.add(layers.Dense(units=second_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=third_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=fourth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=fifth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=sixth_hidden_layer, activation='relu'))
classifier.add(layers.Dense(units=number_classes, activation='softmax'))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
# Perform Random Under Sampling and Standard Scaling
data_transform = make_pipeline_imb(StandardScaler(), RandomUnderSampler())

X_train_resample, y_train_resample = RandomUnderSampler().fit_resample(X_train, y_train)
# X_train_resample = StandardScaler().fit_transform(X_train_resample)

X_test_resample, y_test_resample = RandomUnderSampler().fit_resample(X_test, y_test)
X_test_resample = StandardScaler().fit_transform(X_test_resample)

# Fit Model

In [32]:
classifier.fit(X_train_resample, y_train_resample, epochs=80)

  tensor_proto.tensor_content = nparray.tostring()


Epoch 1/80


  if not isinstance(values, collections.Sequence):
  if not isinstance(wrapped_dict, collections.Mapping):


Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80


Epoch 80/80


<tensorflow.python.keras.callbacks.History at 0x1e56edf5c88>

In [33]:
classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 60)                2700      
_________________________________________________________________
dense_1 (Dense)              (None, 48)                2928      
_________________________________________________________________
dense_2 (Dense)              (None, 36)                1764      
_________________________________________________________________
dense_3 (Dense)              (None, 24)                888       
_________________________________________________________________
dense_4 (Dense)              (None, 12)                300       
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 78        
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 1

# Perform Predictions

In [34]:
results = classifier.evaluate(X_test_resample, y_test_resample)

   1/1755 [..............................] - ETA: 0s - loss: 0.6932 - accuracy: 1.0000

  tensor_proto.tensor_content = nparray.tostring()




In [35]:
print(f"Model Accuracy: {results[1]*100}%")

Model Accuracy: 49.99109506607056%


In [36]:
predictions = classifier.predict(X_test_resample[[0]])
predicted_class_num = classifier.predict_classes(X_test_resample[[0]])
predicted_class_string = label_encoder.inverse_transform(predicted_class_num)
predicted_accuracy = predictions[0,1]*100

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [37]:
print(f"Class: {predicted_class_string}")
print(f"Probability: {predicted_accuracy}")

Class: ['External causes of morbidity and mortality']
Probability: 50.0


# Save the Model

In [38]:
classifier.save("saved_model/Model_1_External_Causes.h5")

In [39]:
External_Causes_Model = tf.keras.models.load_model("saved_model/Model_1_External_Causes.h5")

In [40]:
X_test.iloc[0].T.to_csv("sample.csv")

In [41]:
test_df = pd.DataFrame(X_test.iloc[0, :])
test_df = test_df.T
test_df.to_csv("sample.csv", index=False)