# Data 622
## Assignment 5 - FeedForward Neural Network
Mark Ly
Student ID: 00504696

In [314]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Preparing dataset
After loading the dataset from UCI Machine Learning Heart Disease Data, we need to perform some data wrangling. There
 are 13 features and the last feature is the target variable with 5 classes from 0-4
 * Impute the missing data with the mean
 * one-hot encode categorical features
 * Scale each continuous feature with zero mean and unit variance
 * one-hot encode target variable using _keras.utils.to categorical_
 * Randomly split data into 70% training and 30% test.

To minimize data leakage, we will first one hot encode all categorical data before splitting. Afterwards we will
impute the missing variables for both X train and X test with the mean before the scaling the continuous features.

In [315]:
df_hd = pd.read_csv("processed.cleveland.data", header=None, na_values="?")
clev_cols = {0:'age',1:'sex',2:'cp',3:'trestbps',4:'chol',5:'fbs',6:'restecg',7:'thalach',8:'exang',
              9:'oldpeak',10:'slope',11:'ca',12:'thal',13:'num'}
df_hd_named = df_hd.rename(columns=clev_cols)
df_hd_named.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [316]:
df_hd_named.slope.value_counts()

1.0    142
2.0    140
3.0     21
Name: slope, dtype: int64

In [317]:
cat_var = ['cp','restecg']
enc = preprocessing.OneHotEncoder(categories='auto')
df_hd_named_enc = pd.DataFrame(enc.fit_transform(df_hd_named[cat_var]).toarray())
df_hd_named_enc.columns = enc.get_feature_names(cat_var)

df_ohe = pd.concat([df_hd_named, df_hd_named_enc], axis=1)
df_ohe.drop(cat_var,axis=1,inplace=True)
df_ohe.columns



Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'slope', 'ca', 'thal', 'num', 'cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0',
       'restecg_0.0', 'restecg_1.0', 'restecg_2.0'],
      dtype='object')

In [268]:
df_ohe.isnull().sum()

age            0
sex            0
trestbps       0
chol           0
fbs            0
thalach        0
exang          0
oldpeak        0
slope          0
ca             4
thal           2
num            0
cp_1.0         0
cp_2.0         0
cp_3.0         0
cp_4.0         0
restecg_0.0    0
restecg_1.0    0
restecg_2.0    0
dtype: int64

In [318]:
from tensorflow.keras.utils import to_categorical
y = df_ohe['num'].values
y_imp = to_categorical(y, num_classes=5,dtype=np.float32)
X = df_ohe.drop(['num'],axis=1).values

In [319]:
X_train, X_test, y_train, y_test = train_test_split(X, y_imp, test_size=0.3, random_state=5)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(212, 18)
(212, 5)
(91, 18)
(91, 5)


In [320]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imp = imp.fit_transform(X_train)
X_test_imp = imp.fit_transform(X_test)

In [321]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_imp)
X_train_imp_scale = scaler.transform(X_train_imp)

In [322]:
scaler.fit(y_train)
y_train_imp_scale = scaler.transform(y_train)

In [323]:
scaler.fit(X_test_imp)
X_test_imp_scale = scaler.transform(X_test_imp)

In [324]:
scaler.fit(y_test)
y_test_imp_scale = scaler.transform(y_test)

# FeedForward neural network
We will hypertune the number of neurons in the hidden layer to find the network with the best number of neurons.
- 5-50 inclusive with a step 5

for the activation function for the hidden and output layers we will use:
- hidden = _ReLu_
- output = _softmax_

for model compile we will use:
- optimizer = _adam_
- loss = _categorical crossentropy_
- metrics = _accuracy_

Finally, we will train the network with
- _batch size_ = 32
- _validation split_ = 0.2
- _epochs_ = 25

In [310]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
neurons = np.arange(5,55,5)
temp = []
for i in neurons:
    model = Sequential([
        Dense(i, activation='relu'), # for the first layer you need to specify input_dim
        Dense(5, activation='softmax') # output layer with sigmoid to make sure the output is between 0 and 1
                           ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # temp.append([i,model.fit(X_train_imp_scale, y_train_imp_scale, validation_split=0.2, epochs=25, verbose=1,batch_size=32)])
    model.fit(X_train_imp_scale, y_train_imp_scale, validation_split=0.2, epochs=25, verbose=1,batch_size=32)
    temp.append([i,model.evaluate(X_test_imp_scale,  y_test_imp_scale, verbose=2)])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
3/3 - 0s - loss: 0.0014 - accuracy: 0.2088
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
3/3 - 0s - loss: -9.5814e-01 - accuracy: 0.3077
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
3/3 - 0s - loss: -1.0528e+00 - accur

In [386]:
accuracy = []
for i in range(len(temp)):
    accuracy.append((temp[i][1][1]))

d = {'num neurons':neurons,'accuracy':accuracy}
best_neuro = pd.DataFrame(d)
best_neuro[best_neuro['accuracy']==best_neuro['accuracy'].max()]



Unnamed: 0,num neurons,accuracy
5,30,0.505495


# Best number nuerons

From tunning, we see that 30 neurons gives us the highest accuracy rate at 0.5055.

In [388]:
model = Sequential([
      Dense(30, activation='relu'), # for the first layer you need to specify input_dim
      Dense(5, activation='softmax') # output layer with sigmoid to make sure the output is between 0 and 1
    ])
model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
model.fit(X_train_imp_scale, y_train_imp_scale, validation_split=0.2, epochs=25, verbose=1,batch_size=32)
history = model.evaluate(X_test_imp_scale,  y_test_imp_scale, verbose=2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
3/3 - 0s - loss: -2.4205e+00 - accuracy: 0.5495


[-2.4204678535461426, 0.5494505763053894]

In [None]:
print('Loss',history[0])
print('Accuracy',history[1])