In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt
%matplotlib inline
exec(open(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.7.3 (default, Mar 27 2019 16:54:48)
SparkSession available as 'spark'.


In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrameReader
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import *

In [3]:
# Obtain the SparkSession used by the rest of the notebook.
# NOTE(review): the bootstrap cell already exposes a session named `spark`, so
# getOrCreate() may hand back that existing session — confirm the driver
# classpath setting below actually takes effect in that case.
# NOTE(review): the classpath value has no ".jar" suffix or directory —
# presumably it should point at the PostgreSQL JDBC driver jar; verify.
spark = (
    SparkSession.builder
    .appName('pyspark_demo_app')
    .config('spark.driver.extraClassPath', 'postgresql-42.2.9')
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# JDBC background reading: https://sparkour.urizone.net/recipes/using-jdbc/#03
import json

# Connection settings live in an external JSON file so that credentials are
# not written into the notebook itself.
with open("../data-load/Resources/db-properties.json") as propertyFile:
    properties = json.load(propertyFile)

# JDBC connection string for the database.
jdbc_url = properties["jdbcUrl"]

# Save mode for JDBC writes; no visible cell uses it.
dbMode = "overwrite"

# Connection properties handed to the Spark JDBC reader.
dbConfig = {key: properties[key] for key in ("user", "password", "driver")}

In [5]:
# Read the full `heart_cardio_train` table over JDBC into a Spark DataFrame.
entireDF = spark.read.jdbc(
    url=jdbc_url,
    table='heart_cardio_train',
    properties=dbConfig,
)

# Show the inferred column types, then preview the first five rows.
entireDF.printSchema()
entireDF.limit(5).show()

root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- ap_hi: integer (nullable = true)
 |-- ap_lo: integer (nullable = true)
 |-- cholesterol: integer (nullable = true)
 |-- gluc: integer (nullable = true)
 |-- smoke: integer (nullable = true)
 |-- alco: integer (nullable = true)
 |-- active: integer (nullable = true)
 |-- cardio: integer (nullable = true)
 |-- age_yrs: integer (nullable = true)
 |-- weight_lbs: decimal(38,18) (nullable = true)
 |-- height_inches: decimal(38,18) (nullable = true)
 |-- bmi: decimal(38,18) (nullable = true)
 |-- bmi_category: string (nullable = true)

+-----+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+-------+--------------------+--------------------+--------------------+------------+
|   id|  age|gender|height|weight|ap_hi|ap_lo|cholesterol|gluc|smoke|alco|active|cardio|age_yr

In [6]:
# Collect every column of the Spark DataFrame onto the driver as a pandas
# DataFrame; all later preparation steps work on this in-memory copy.
heart_cardio = (
    entireDF
    .select("*")
    .toPandas()
)
heart_cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_yrs,weight_lbs,height_inches,bmi,bmi_category
0,54329,14395,1,158,46.0,110,70,1,1,0,0,1,0,39,101.41,62.0,18.43,Underweight
1,54400,20571,2,166,48.0,120,90,2,1,0,0,1,0,56,105.82,65.0,17.42,Underweight
2,14908,22007,1,162,38.0,100,70,1,1,0,0,1,0,60,83.78,64.0,14.48,Underweight
3,54860,18290,2,171,51.0,140,80,3,1,1,1,1,1,50,112.44,67.0,17.44,Underweight
4,54895,23113,2,167,48.0,140,80,1,1,0,0,1,1,63,105.82,66.0,17.21,Underweight


In [7]:
# Remove the row identifier, the raw age/height/weight columns (the frame also
# carries age_yrs, height_inches and weight_lbs) and the textual BMI category.
# NOTE: the result overwrites `heart_cardio`, so running this cell a second
# time fails because the columns are already gone.
heart_cardio = heart_cardio.drop(
    columns=['id', 'age', 'height', 'weight', 'bmi_category']
)
heart_cardio.head()

Unnamed: 0,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_yrs,weight_lbs,height_inches,bmi
0,1,110,70,1,1,0,0,1,0,39,101.41,62.0,18.43
1,2,120,90,2,1,0,0,1,0,56,105.82,65.0,17.42
2,1,100,70,1,1,0,0,1,0,60,83.78,64.0,14.48
3,2,140,80,3,1,1,1,1,1,50,112.44,67.0,17.44
4,2,140,80,1,1,0,0,1,1,63,105.82,66.0,17.21


In [8]:
# Give the abbreviated blood-pressure and alcohol columns readable names.
heart_cardio = heart_cardio.rename(
    columns={
        "ap_hi": "systolic pressure",
        "ap_lo": "diastolic pressure",
        "alco": "alcohol",
    }
)
heart_cardio.head()

Unnamed: 0,gender,systolic pressure,diastolic pressure,cholesterol,gluc,smoke,alcohol,active,cardio,age_yrs,weight_lbs,height_inches,bmi
0,1,110,70,1,1,0,0,1,0,39,101.41,62.0,18.43
1,2,120,90,2,1,0,0,1,0,56,105.82,65.0,17.42
2,1,100,70,1,1,0,0,1,0,60,83.78,64.0,14.48
3,2,140,80,3,1,1,1,1,1,50,112.44,67.0,17.44
4,2,140,80,1,1,0,0,1,1,63,105.82,66.0,17.21


In [9]:
# weight_lbs, height_inches and bmi are decimal(38,18) in the Spark schema.
# Cast them to plain integers; this discards the fractional part
# (e.g. a bmi of 18.43 becomes 18).
heart_cardio = heart_cardio.astype(
    {'weight_lbs': int, 'height_inches': int, 'bmi': int}
)
heart_cardio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68638 entries, 0 to 68637
Data columns (total 13 columns):
gender                68638 non-null int32
systolic pressure     68638 non-null int32
diastolic pressure    68638 non-null int32
cholesterol           68638 non-null int32
gluc                  68638 non-null int32
smoke                 68638 non-null int32
alcohol               68638 non-null int32
active                68638 non-null int32
cardio                68638 non-null int32
age_yrs               68638 non-null int32
weight_lbs            68638 non-null int64
height_inches         68638 non-null int64
bmi                   68638 non-null int64
dtypes: int32(10), int64(3)
memory usage: 4.2 MB


In [10]:
# Build the modelling frame by leaving out the lifestyle flags and height;
# `heart_cardio` itself is left untouched.
feature_selected_df = heart_cardio.drop(
    columns=['smoke', 'alcohol', 'height_inches', 'active']
)
feature_selected_df.head()

Unnamed: 0,gender,systolic pressure,diastolic pressure,cholesterol,gluc,cardio,age_yrs,weight_lbs,bmi
0,1,110,70,1,1,0,39,101,18
1,2,120,90,2,1,0,56,105,17
2,1,100,70,1,1,0,60,83,14
3,2,140,80,3,1,1,50,112,17
4,2,140,80,1,1,1,63,105,17


In [52]:
# duplicated = feature_selected_df.duplicated[feature_selected_df.duplicated(keep=False)]
# # duplicated = duplicated.sort_values(by=['age_yrs'], ascending= False) 
# # sorted the values to see duplication clearly

# duplicated.head(2) # Show us just 1 duplication of 24

In [11]:
# Standardise the continuous features with StandardScaler.
from sklearn.preprocessing import StandardScaler

columns_to_scale = ['age_yrs', 'bmi', 'weight_lbs', 'systolic pressure', 'diastolic pressure']
standardScaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in a later cell, so test rows contribute to the scaling statistics.
# Consider fitting on the training rows only.
# The scaled values are written back into `feature_selected_df` itself.
feature_selected_df[columns_to_scale] = standardScaler.fit_transform(
    feature_selected_df[columns_to_scale]
)

# `dataset` is another name for the same (now scaled) frame, not a copy.
dataset = feature_selected_df
dataset.head()

Unnamed: 0,gender,systolic pressure,diastolic pressure,cholesterol,gluc,cardio,age_yrs,weight_lbs,bmi
0,1,-0.992021,-1.185255,1,1,0,-2.043112,-1.96613,-1.711763
1,2,-0.395186,0.899744,2,1,0,0.468413,-1.839147,-1.90276
2,1,-1.588856,-1.185255,1,1,0,1.05936,-2.537552,-2.475752
3,2,0.798484,-0.142756,3,1,1,-0.418008,-1.616928,-1.90276
4,2,0.798484,-0.142756,1,1,1,1.50257,-1.839147,-1.90276


In [12]:
# Split the frame into predictors (`data`) and the `cardio` label (`target`).
data = dataset.drop(columns="cardio")
target = dataset["cardio"]

# Column labels of the predictor frame, kept for later reference.
feature_names = data.columns
data.head()

Unnamed: 0,gender,systolic pressure,diastolic pressure,cholesterol,gluc,age_yrs,weight_lbs,bmi
0,1,-0.992021,-1.185255,1,1,-2.043112,-1.96613,-1.711763
1,2,-0.395186,0.899744,2,1,0.468413,-1.839147,-1.90276
2,1,-1.588856,-1.185255,1,1,1.05936,-2.537552,-2.475752
3,2,0.798484,-0.142756,3,1,-0.418008,-1.616928,-1.90276
4,2,0.798484,-0.142756,1,1,1.50257,-1.839147,-1.90276


In [13]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the default split ratio; the fixed random_state
# makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

# Inspect the training columns/dtypes and the per-column test row counts.
X_train.info()
X_test.count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51478 entries, 58732 to 15795
Data columns (total 8 columns):
gender                51478 non-null int32
systolic pressure     51478 non-null float64
diastolic pressure    51478 non-null float64
cholesterol           51478 non-null int32
gluc                  51478 non-null int32
age_yrs               51478 non-null float64
weight_lbs            51478 non-null float64
bmi                   51478 non-null float64
dtypes: float64(5), int32(3)
memory usage: 2.9 MB


gender                17160
systolic pressure     17160
diastolic pressure    17160
cholesterol           17160
gluc                  17160
age_yrs               17160
weight_lbs            17160
bmi                   17160
dtype: int64

## Neural Network

In [14]:
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop,Nadam,Adadelta,Adam
from tensorflow.keras.layers import BatchNormalization,LeakyReLU
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import seaborn as sns
import scipy.stats as stats
import sklearn
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
import warnings
from tensorflow.keras.utils import to_categorical

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [15]:
# One-hot encode the labels for the two-unit network outputs.
# `y_train` / `y_test` are left as they are; the encoded copies get an `_nn`
# suffix.
num_classes = 2
y_train_nn, y_test_nn = (
    to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

In [16]:
# Re-create the one-hot test labels (same call as in the preceding cell).
y_test_nn = to_categorical(y_test, num_classes=num_classes)

In [17]:
# Experimental, wider network used while trying out hyperparameters.
NN_model = Sequential()
# First hidden layer; the input width equals the number of feature columns.
NN_model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
# Second hidden layer.
NN_model.add(Dense(100, activation='relu'))
# Output layer: one sigmoid unit per class (the model is fitted on the
# one-hot encoded labels).
NN_model.add(Dense(num_classes, activation='sigmoid'))
# `random_state` is not a compile() option: depending on the Keras version the
# extra keyword is either rejected or forwarded to the backend, and it never
# seeds anything. It has been removed. For reproducible runs, seed the
# NumPy / TensorFlow random generators before building the model instead.
NN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [18]:
# Model shared with Mary: small network with a two-unit output, trained on the
# one-hot encoded labels (`y_train_nn`).
NN_model_2 = Sequential([
    # Hidden layers: 32 then 8 ReLU units; input width = number of features.
    Dense(32, input_dim=X_train.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    # Output layer: one sigmoid unit per class.
    Dense(2, activation='sigmoid'),
])
NN_model_2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the full training set. No validation_split is passed here, so no
# validation metrics are reported during training.
NN_model_2.fit(
    X_train,
    y_train_nn,
    epochs=100,
    verbose=2,
)


Epoch 1/100
 - 3s - loss: 0.5611 - acc: 0.7236
Epoch 2/100
 - 2s - loss: 0.5499 - acc: 0.7303
Epoch 3/100
 - 2s - loss: 0.5476 - acc: 0.7310
Epoch 4/100
 - 2s - loss: 0.5464 - acc: 0.7332
Epoch 5/100
 - 2s - loss: 0.5459 - acc: 0.7321
Epoch 6/100
 - 2s - loss: 0.5452 - acc: 0.7333
Epoch 7/100
 - 2s - loss: 0.5447 - acc: 0.7331
Epoch 8/100
 - 2s - loss: 0.5443 - acc: 0.7339
Epoch 9/100
 - 2s - loss: 0.5440 - acc: 0.7336
Epoch 10/100
 - 2s - loss: 0.5437 - acc: 0.7335
Epoch 11/100
 - 2s - loss: 0.5433 - acc: 0.7343
Epoch 12/100
 - 2s - loss: 0.5431 - acc: 0.7342
Epoch 13/100
 - 2s - loss: 0.5431 - acc: 0.7338
Epoch 14/100
 - 2s - loss: 0.5429 - acc: 0.7349
Epoch 15/100
 - 2s - loss: 0.5426 - acc: 0.7349
Epoch 16/100
 - 2s - loss: 0.5426 - acc: 0.7345
Epoch 17/100
 - 2s - loss: 0.5423 - acc: 0.7348
Epoch 18/100
 - 2s - loss: 0.5422 - acc: 0.7352
Epoch 19/100
 - 2s - loss: 0.5422 - acc: 0.7341
Epoch 20/100
 - 2s - loss: 0.5423 - acc: 0.7341
Epoch 21/100
 - 2s - loss: 0.5420 - acc: 0.7340


<keras.callbacks.History at 0x1642e5320>

In [59]:
# Print the layer-by-layer architecture and parameter counts of NN_model_2.
# NOTE(review): the recorded output below shows a final layer of width 1,
# which does not match the two-unit output layer defined for NN_model_2 in
# this notebook — the output looks stale (cells run out of order); re-run
# after a kernel restart to confirm.
NN_model_2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 32)                288       
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 9         
Total params: 561
Trainable params: 561
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Train the wider network, holding out 10% of the training rows as a
# validation set so per-epoch val_loss / val_accuracy are reported.
NN_model.fit(
    X_train,
    y_train_nn,
    epochs=100,
    validation_split=0.1,
    verbose=2,
)

Train on 46330 samples, validate on 5148 samples
Epoch 1/100
 - 3s - loss: 0.5554 - accuracy: 0.7273 - val_loss: 0.5468 - val_accuracy: 0.7355
Epoch 2/100
 - 2s - loss: 0.5492 - accuracy: 0.7316 - val_loss: 0.5450 - val_accuracy: 0.7342
Epoch 3/100
 - 2s - loss: 0.5472 - accuracy: 0.7318 - val_loss: 0.5462 - val_accuracy: 0.7358
Epoch 4/100
 - 2s - loss: 0.5462 - accuracy: 0.7327 - val_loss: 0.5439 - val_accuracy: 0.7358
Epoch 5/100
 - 2s - loss: 0.5451 - accuracy: 0.7330 - val_loss: 0.5468 - val_accuracy: 0.7318
Epoch 6/100
 - 2s - loss: 0.5443 - accuracy: 0.7322 - val_loss: 0.5433 - val_accuracy: 0.7390
Epoch 7/100
 - 2s - loss: 0.5438 - accuracy: 0.7342 - val_loss: 0.5434 - val_accuracy: 0.7354
Epoch 8/100
 - 3s - loss: 0.5437 - accuracy: 0.7348 - val_loss: 0.5448 - val_accuracy: 0.7351
Epoch 9/100
 - 2s - loss: 0.5433 - accuracy: 0.7344 - val_loss: 0.5436 - val_accuracy: 0.7350
Epoch 10/100
 - 2s - loss: 0.5426 - accuracy: 0.7344 - val_loss: 0.5441 - val_accuracy: 0.7345
Epoch 11/1

Epoch 87/100
 - 2s - loss: 0.5189 - accuracy: 0.7488 - val_loss: 0.5713 - val_accuracy: 0.7181
Epoch 88/100
 - 2s - loss: 0.5182 - accuracy: 0.7496 - val_loss: 0.5707 - val_accuracy: 0.7170
Epoch 89/100
 - 2s - loss: 0.5184 - accuracy: 0.7489 - val_loss: 0.5683 - val_accuracy: 0.7158
Epoch 90/100
 - 2s - loss: 0.5176 - accuracy: 0.7500 - val_loss: 0.5712 - val_accuracy: 0.7191
Epoch 91/100
 - 2s - loss: 0.5167 - accuracy: 0.7503 - val_loss: 0.5736 - val_accuracy: 0.7158
Epoch 92/100
 - 2s - loss: 0.5171 - accuracy: 0.7498 - val_loss: 0.5719 - val_accuracy: 0.7178
Epoch 93/100
 - 2s - loss: 0.5160 - accuracy: 0.7514 - val_loss: 0.5699 - val_accuracy: 0.7207
Epoch 94/100
 - 2s - loss: 0.5158 - accuracy: 0.7510 - val_loss: 0.5761 - val_accuracy: 0.7165
Epoch 95/100
 - 3s - loss: 0.5162 - accuracy: 0.7509 - val_loss: 0.5772 - val_accuracy: 0.7155
Epoch 96/100
 - 2s - loss: 0.5153 - accuracy: 0.7517 - val_loss: 0.5749 - val_accuracy: 0.7174
Epoch 97/100
 - 2s - loss: 0.5150 - accuracy: 0.75

<keras.callbacks.callbacks.History at 0x132d4de50>

In [65]:
# Score NN_model_2 on the held-out test set (one-hot labels) and report the
# loss together with the accuracy as a percentage rounded to two decimals.
NN_model_2_loss, NN_model_2_accuracy = NN_model_2.evaluate(X_test, y_test_nn, verbose=2)
print(
    "Normal Neural Network - Loss: {}, Accuracy: {}".format(
        NN_model_2_loss, round(NN_model_2_accuracy*100, 2)
    )
)

Normal Neural Network - Loss: 0.5437893510976316, Accuracy: 73.22


In [68]:
# Saved model: same hidden layers as NN_model_2, but with a single sigmoid
# output trained directly on the 0/1 `cardio` labels (no one-hot encoding).
NN_model_3 = Sequential([
    # Hidden layers: 32 then 8 ReLU units; input width = number of features.
    Dense(32, input_dim=X_train.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    # Output layer: a single sigmoid unit.
    Dense(1, activation='sigmoid'),
])
NN_model_3.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the full training set. No validation_split is passed here, so no
# validation metrics are reported during training.
NN_model_3.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=2,
)

Epoch 1/100
 - 3s - loss: 0.5593 - accuracy: 0.7263
Epoch 2/100
 - 2s - loss: 0.5511 - accuracy: 0.7307
Epoch 3/100
 - 2s - loss: 0.5484 - accuracy: 0.7318
Epoch 4/100
 - 2s - loss: 0.5469 - accuracy: 0.7318
Epoch 5/100
 - 2s - loss: 0.5459 - accuracy: 0.7328
Epoch 6/100
 - 2s - loss: 0.5453 - accuracy: 0.7322
Epoch 7/100
 - 2s - loss: 0.5449 - accuracy: 0.7338
Epoch 8/100
 - 2s - loss: 0.5445 - accuracy: 0.7337
Epoch 9/100
 - 2s - loss: 0.5441 - accuracy: 0.7335
Epoch 10/100
 - 2s - loss: 0.5440 - accuracy: 0.7347
Epoch 11/100
 - 3s - loss: 0.5438 - accuracy: 0.7346
Epoch 12/100
 - 3s - loss: 0.5435 - accuracy: 0.7346
Epoch 13/100
 - 3s - loss: 0.5433 - accuracy: 0.7349
Epoch 14/100
 - 3s - loss: 0.5433 - accuracy: 0.7345
Epoch 15/100
 - 2s - loss: 0.5429 - accuracy: 0.7348
Epoch 16/100
 - 2s - loss: 0.5430 - accuracy: 0.7338
Epoch 17/100
 - 3s - loss: 0.5428 - accuracy: 0.7357
Epoch 18/100
 - 2s - loss: 0.5426 - accuracy: 0.7344
Epoch 19/100
 - 2s - loss: 0.5424 - accuracy: 0.7354
Ep

<keras.callbacks.callbacks.History at 0x135aae950>

In [69]:
# Score NN_model_3 on the held-out test set (plain 0/1 labels) and report the
# loss together with the accuracy as a percentage rounded to two decimals.
NN_model_3_loss, NN_model_3_accuracy = NN_model_3.evaluate(X_test, y_test, verbose=2)
print(
    "Normal Neural Network - Loss: {}, Accuracy: {}".format(
        NN_model_3_loss, round(NN_model_3_accuracy*100, 2)
    )
)

Normal Neural Network - Loss: 0.5422250737121334, Accuracy: 73.25


In [81]:
#Test the saved model
import numpy as np
from keras.models import load_model

# Two hand-written example feature vectors.
# NOTE(review): each vector has 7 values, whereas the training frame in this
# notebook has 8 feature columns, and the values are on the raw scale rather
# than standardised — confirm which feature layout and scaling the saved
# 'NN_model.h5' expects.
Normal = [2, 120, 78, 1, 25, 130, 21.6]
Risk = [2, 200, 120, 3, 55, 210, 31]

# This cell used to read an undefined name `Z`, which only worked through
# leftover kernel state and raises NameError on a fresh kernel.
# NOTE(review): `Risk` is used here because the recorded output (~0.98) looks
# like the high-risk example — switch to `Normal` to score the other sample.
B = np.array(Risk)

# Keras expects a 2-D batch, so reshape the single sample to (1, n_features).
C = B.reshape(1, -1)

# Load the model saved to disk and score the sample.
Best_model = load_model('NN_model.h5')
predictions = Best_model.predict(C)
predictions

array([[0.97558844]], dtype=float32)