# Confidence scores: Heart Failure Clinical Records

In [2]:
# Import required dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Read the heart failure clinical records CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
csv_path = "../Proj_2/heart_failure_clinical_records.csv"
df = pd.read_csv(csv_path)

In [6]:

# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,55.0,0,748,0,45,0,263358.03,1.3,137,1,1,88,0
1,65.0,0,56,0,25,0,305000.0,5.0,130,1,0,207,0
2,45.0,0,582,1,38,0,319000.0,0.9,140,0,0,244,0
3,60.0,1,754,1,40,1,328000.0,1.2,126,1,0,90,0
4,95.0,1,582,0,30,0,461000.0,2.0,132,1,0,50,1


In [8]:
# Count how many records fall into each class of the "DEATH_EVENT" target,
# which shows how balanced (or imbalanced) the two outcomes are
target_counts = df["DEATH_EVENT"].value_counts()
target_counts

DEATH_EVENT
0    3432
1    1568
Name: count, dtype: int64

## Preprocess the data

In [10]:
# Check the data type of every column to see which ones are numeric
# and which (if any) would need encoding before modelling
df.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

In [12]:
# Separate the target variable: the "DEATH_EVENT" column is what the models predict
y = df.loc[:, "DEATH_EVENT"]
y

0       0
1       0
2       0
3       0
4       1
       ..
4995    0
4996    0
4997    1
4998    1
4999    0
Name: DEATH_EVENT, Length: 5000, dtype: int64

In [14]:
# Build the feature matrix: every column except the "DEATH_EVENT" target.
# drop() returns a new DataFrame, so df itself is left untouched.
X = df.drop(columns=["DEATH_EVENT"])
X.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,55.0,0,748,0,45,0,263358.03,1.3,137,1,1,88
1,65.0,0,56,0,25,0,305000.0,5.0,130,1,0,207
2,45.0,0,582,1,38,0,319000.0,0.9,140,0,0,244
3,60.0,1,754,1,40,1,328000.0,1.2,126,1,0,90
4,95.0,1,582,0,30,0,461000.0,2.0,132,1,0,50


In [16]:
# Split data into training and testing sets.
# No test_size is passed, so scikit-learn's default split proportion is used;
# random_state=42 makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
# Fit a OneHotEncoder on the training features and convert them to indicator columns.
# NOTE(review): the df.dtypes check earlier in this notebook reports only
# int64/float64 columns, so the features are already numeric (none are objects).
# The encoder therefore treats each distinct numeric value (each age, each
# follow-up time, ...) as its own category. handle_unknown='ignore' makes values
# not seen during fit encode as all zeros instead of raising an error.
# TODO confirm that one-hot encoding the continuous columns is really intended.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')
train_encoded_values = ohe.fit_transform(X_train)
X_train_encoded = pd.DataFrame(data=train_encoded_values, columns=ohe.get_feature_names_out())
X_train_encoded

Unnamed: 0,age_40.0,age_41.0,age_42.0,age_43.0,age_44.0,age_45.0,age_46.0,age_47.0,age_48.0,age_49.0,...,time_247,time_250,time_256,time_257,time_258,time_270,time_271,time_278,time_280,time_285
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Apply the encoder that was fitted on the training data to the test data
# (transform only, so the test set never influences the learned categories)
test_encoded_values = ohe.transform(X_test)
X_test_encoded = pd.DataFrame(data=test_encoded_values, columns=ohe.get_feature_names_out())
X_test_encoded

Unnamed: 0,age_40.0,age_41.0,age_42.0,age_43.0,age_44.0,age_45.0,age_46.0,age_47.0,age_48.0,age_49.0,...,time_247,time_250,time_256,time_257,time_258,time_270,time_271,time_278,time_280,time_285
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1247,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1248,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model and Fit to a Logistic Regression Classifier

In [24]:
# Instantiate the logistic regression classifier, seeded with random_state=42
lr_model = LogisticRegression(random_state=42)

# Train the classifier on the encoded training features and their labels
lr_model.fit(X=X_train_encoded, y=y_train)

In [26]:
# Measure mean accuracy on the training and test sets with model.score,
# then report both so over- or under-fitting is visible at a glance
lr_train_accuracy = lr_model.score(X_train_encoded, y_train)
lr_test_accuracy = lr_model.score(X_test_encoded, y_test)
print('Train Accuracy: %.3f' % lr_train_accuracy)
print('Test Accuracy: %.3f' % lr_test_accuracy)

Train Accuracy: 0.996
Test Accuracy: 0.989


## Model and Fit to a Support Vector Machine

In [28]:
# Instantiate a support vector classifier that uses a 'linear' kernel
svm_model = SVC(kernel='linear')

# Train the classifier on the encoded training features and their labels
svm_model.fit(X=X_train_encoded, y=y_train)

In [30]:
# Measure mean accuracy on the training and test sets with model.score,
# then report both so over- or under-fitting is visible at a glance
svm_train_accuracy = svm_model.score(X_train_encoded, y_train)
svm_test_accuracy = svm_model.score(X_test_encoded, y_test)
print('Train Accuracy: %.3f' % svm_train_accuracy)
print('Test Accuracy: %.3f' % svm_test_accuracy)

Train Accuracy: 0.997
Test Accuracy: 0.991


## Model and Fit to a KNN Model

In [32]:
# Instantiate a k-nearest-neighbours classifier that votes over 9 neighbours
knn_model = KNeighborsClassifier(n_neighbors=9)

# Train the classifier on the encoded training features and their labels
knn_model.fit(X=X_train_encoded, y=y_train)

In [34]:
# Measure mean accuracy on the training and test sets with model.score,
# then report both so over- or under-fitting is visible at a glance
knn_train_accuracy = knn_model.score(X_train_encoded, y_train)
knn_test_accuracy = knn_model.score(X_test_encoded, y_test)
print('Train Accuracy: %.3f' % knn_train_accuracy)
print('Test Accuracy: %.3f' % knn_test_accuracy)

Train Accuracy: 0.994
Test Accuracy: 0.994


## Model and Fit to a Decision Tree Classifier

In [36]:
# Create the decision tree classifier model.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so an unseeded tree can differ from run to run and make the
# reported accuracy irreproducible. 42 matches the seed used for the
# train/test split and the logistic regression model.
dt_model = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
dt_model.fit(X_train_encoded, y_train)

In [38]:
# Measure mean accuracy on the training and test sets with model.score,
# then report both so over- or under-fitting is visible at a glance
dt_train_accuracy = dt_model.score(X_train_encoded, y_train)
dt_test_accuracy = dt_model.score(X_test_encoded, y_test)
print('Train Accuracy: %.3f' % dt_train_accuracy)
print('Test Accuracy: %.3f' % dt_test_accuracy)

Train Accuracy: 0.999
Test Accuracy: 0.986


## Model and Fit to a Random Forest Classifier

In [40]:
# Instantiate a random forest classifier made of 128 trees,
# seeded with random_state=1 so the ensemble is reproducible
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

# Train the classifier on the encoded training features and their labels
rf_model.fit(X=X_train_encoded, y=y_train)

In [42]:
# Measure mean accuracy on the training and test sets with model.score,
# then report both so over- or under-fitting is visible at a glance
rf_train_accuracy = rf_model.score(X_train_encoded, y_train)
rf_test_accuracy = rf_model.score(X_test_encoded, y_test)
print('Train Accuracy: %.3f' % rf_train_accuracy)
print('Test Accuracy: %.3f' % rf_test_accuracy)

Train Accuracy: 0.999
Test Accuracy: 0.990
