# Data Preprocessing and Machine Learning Model Training

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import sqlite3
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

## Connect to the database

Connect to the database and select the data from the patients table. The selected data is loaded into a DataFrame so that it can be manipulated and used for training.

In [2]:
# File name of the SQLite database that is queried for the patients table below.
database_name = 'heart.db'
# One connection is shared by the read and write cells that follow and is
# closed in the last cell of the notebook.
conn = sqlite3.connect(database_name)

In [3]:
# Load every row and column of the patients table into a DataFrame.
query = "SELECT * FROM patients"
db = pd.read_sql_query(sql=query, con=conn)

# Preview the first rows as the cell output.
db.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Data Preprocessing

It is important to preprocess the data so that it suits the models it will be used to train. Here we are looking at models suited to binary classification, so the data should be numerical, non-ordinal and correctly scaled. The code section below runs through the basic processes to scale and encode the data and ensure that it is preprocessed correctly.

In [4]:
# Feature columns grouped by how they are encoded before training.
nominal_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']  # one-hot encoded
binomial_cols = ['sex', 'fbs', 'exang']  # left as-is via remainder='passthrough'
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']  # standardised

# Label column; it is removed from the features before the preprocessor is fitted.
target_col = 'target'

# Re-read the raw table rather than relying on `db`: this cell rebinds `db` to
# the transformed table at the end, so running it a second time against `db`
# would fail because the raw columns are no longer there.
patients_raw = pd.read_sql_query("SELECT * FROM patients", conn)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('nom', OneHotEncoder(), nominal_cols)
    ],
    remainder='passthrough',  # Leave other columns untouched
    # Always return a dense array: OneHotEncoder emits a sparse matrix and the
    # default density heuristic could otherwise hand back a sparse result that
    # cannot be wrapped in the DataFrame below.
    sparse_threshold=0
)

x = patients_raw.drop(columns=[target_col])
y = patients_raw[target_col]

# NOTE(review): the scaler and encoder are fitted on every row here, before the
# train/test split in a later cell, so the held-out rows influence the fitted
# statistics. Fitting on the training rows only would avoid that leakage.
X_preprocessed = preprocessor.fit_transform(x)

# Rebuild a labelled DataFrame using the transformer-prefixed column names
# (num__*, nom__*, remainder__*) and re-attach the label.
feature_names = preprocessor.get_feature_names_out()
df_preprocessed = pd.DataFrame(X_preprocessed, columns=feature_names)
df_preprocessed[target_col] = y.values

# Persist the transformed data next to the raw table, replacing any earlier copy.
df_preprocessed.to_sql('patients_transformed', conn, if_exists='replace', index=False)

# From here on `db` refers to the transformed table; the training cells read it.
query = "SELECT * FROM patients_transformed"
db = pd.read_sql_query(query, conn)

db.head()

Unnamed: 0,num__age,num__trestbps,num__chol,num__thalach,num__oldpeak,nom__cp_0,nom__cp_1,nom__cp_2,nom__cp_3,nom__restecg_0,...,nom__ca_3,nom__ca_4,nom__thal_0,nom__thal_1,nom__thal_2,nom__thal_3,remainder__sex,remainder__fbs,remainder__exang,target
0,0.949794,0.764066,-0.261285,0.018826,1.084022,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1
1,-1.928548,-0.091401,0.067741,1.636979,2.118926,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
2,-1.485726,-0.091401,-0.822564,0.980971,0.307844,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
3,0.174856,-0.661712,-0.203222,1.243374,-0.209608,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
4,0.285561,-0.661712,2.080602,0.587366,-0.382092,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1


## Training the Models

The preprocessed data (read back from the patients_transformed table) is first split 80/20 into a training data set and a testing data set: 80% of the data is used for training and 20% is used for testing.

In [5]:
# Separate the label column from the feature columns.
y = db['target']
x = db.drop(columns='target')

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In this instance three different machine learning models will be used: a random forest model, which combines many decision trees; a support vector machine model, which looks for the boundary that best separates the two classes in the feature space; and a logistic regression model, which uses probability to predict which of two possible outcomes applies.

In [7]:
# Initialize the models
# Candidate classifiers keyed by display name; the same name is used later to
# label the printed metrics and to build the file name each model is saved under.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; seeding it keeps the saved model's probability
    # estimates reproducible across runs, matching the seeded random forest.
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=42)
}

The models are trained, tested and then the results are compared to determine which model is best suited for the data set.

In [8]:
# Fit each candidate on the training split, report how it does on the held-out
# split, then write the fitted estimator to disk.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    # File name is the display name lower-cased with spaces turned into
    # underscores, e.g. random_forest_model.joblib
    model_path = f"{name.replace(' ', '_').lower()}_model.joblib"
    joblib.dump(model, model_path)

# Persist the fitted preprocessor as its own artifact
joblib.dump(preprocessor, 'preprocessor.joblib')

Logistic Regression Accuracy: 0.8852459016393442
              precision    recall  f1-score   support

           0       0.84      0.93      0.89        29
           1       0.93      0.84      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61

Random Forest Accuracy: 0.8688524590163934
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        29
           1       0.88      0.88      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

Support Vector Machine Accuracy: 0.8524590163934426
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.90      0.81      0.85        32

    accuracy                          

['preprocessor.joblib']

In [9]:
# Release the SQLite connection opened at the start of the notebook.
conn.close()